In [None]:
import pandas as pd
# load parquet

# Location of the item-level Veterans History Project Parquet file (relative to this notebook)
parquet_file_path = '../data/raw/loc/veterans_history_project.parquet'

# Load the entire file into an in-memory DataFrame
df = pd.read_parquet(path=parquet_file_path)

In [None]:
# column names checker: display the column labels of the loaded item-level frame
# note: the date/dates columns are related to dates of service / war campaigns
df.columns

In [None]:
# Extract the year of record creation as a proxy for the age of the media.
def _first_or_none(values):
    """Return the first element of an indexable cell, or None when it is missing or empty."""
    try:
        return values[0]
    except (IndexError, KeyError, TypeError):
        return None


# Only the first entry of each `number_date_created` cell is used.
df['number_date_created_first_itm'] = df['number_date_created'].apply(_first_or_none)

# expand=False yields a Series for the single capture group instead of a one-column DataFrame.
year_str = df['number_date_created_first_itm'].str.extract(r'^(\d{4})', expand=False)

# Fail with context rather than an opaque IndexError or NaN-to-int cast error.
n_unparsed = int(year_str.isna().sum())
if n_unparsed:
    raise ValueError(
        f"{n_unparsed} record(s) have no parsable 4-digit creation year in 'number_date_created'"
    )

df['year_record_created'] = year_str.astype(int)

# Oldest records first.
df = df.sort_values(by='year_record_created', ascending=True)

In [None]:
import matplotlib
# Histogram of records per creation year, with exactly one bin per calendar year.
# Using `max - min` as the bin count is off by one: the final bin is closed on the right,
# so the last two years would be merged, and a single-year dataset would request 0 bins.
first_year = int(df['year_record_created'].min())
last_year = int(df['year_record_created'].max())

# Explicit edges [y, y + 1) for every year from first_year through last_year.
bins = list(range(first_year, last_year + 2))
df['year_record_created'].hist(bins=bins)

In [None]:
# Records created in or before 2010 (the boundary year 2010 is included in this subset).
is_pre2010 = df['year_record_created'] <= 2010
df_pre2010 = df[is_pre2010]

# Persist the subset; the DataFrame index is not written to the file.
df_pre2010.to_parquet(
    '../data/raw/loc/veterans_history_project_pre2010.parquet',
    index=False,
)

In [None]:
# Records created strictly after 2010.
is_post2010 = df['year_record_created'] > 2010
df_post2010 = df[is_post2010]

# Persist the subset; the DataFrame index is not written to the file.
df_post2010.to_parquet(
    '../data/raw/loc/veterans_history_project_post2010.parquet',
    index=False,
)

In [None]:
# Inspect the first row (by position) of the pre-2010 subset
df_pre2010.iloc[0]

In [None]:
# Show the positional range 0..len-1 of the pre-2010 subset (i.e. its row count)
range(len(df_pre2010))

In [None]:
# Renumber rows 0..n-1 and discard the old labels, which are no longer contiguous
# after the year sort and the year filter applied above.
df_pre2010 = df_pre2010.reset_index(drop=True)

In [None]:
# Inspect the first row (by position) of the post-2010 subset
df_post2010.iloc[0]

In [None]:
# Show the positional range 0..len-1 of the post-2010 subset (i.e. its row count)
range(len(df_post2010))

In [None]:
# Renumber rows 0..n-1 and discard the old labels, which are no longer contiguous
# after the year sort and the year filter applied above.
df_post2010 = df_post2010.reset_index(drop=True)

In [None]:
# Load the resource-level DataFrame from its Parquet file
resources_parquet_path = '../data/raw/loc/veterans_history_project_resources.parquet'
df_resources = pd.read_parquet(path=resources_parquet_path)

In [None]:
# Collect the `collection_number` of every pre-2010 item (first step towards a frame
# where each row contains only one media resource).
# Iterating over the column's values, instead of looking rows up by integer label,
# keeps this correct even when the index is not 0..n-1 (e.g. after sorting/filtering),
# so the cell no longer depends on a prior reset_index having been run.
l_collection_numbers = [item['collection_number'] for item in df_pre2010['item']]

In [None]:
# Uniqueness check: the two printed counts match only if no collection number repeats
n_collection_numbers = len(l_collection_numbers)
n_distinct_collection_numbers = len(set(l_collection_numbers))
print(n_collection_numbers)
print(n_distinct_collection_numbers)

In [None]:
# Keep only the resource rows whose collection number is in the collected list
in_selected_collections = df_resources['collection_number'].isin(l_collection_numbers)
df_resources_filtered = df_resources[in_selected_collections]

In [None]:
# Write the filtered resources to Parquet; the DataFrame index is not stored
df_resources_filtered.to_parquet(
    '../data/raw/loc/veterans_history_project_resources_pre2010.parquet',
    index=False,
)

In [None]:
# post 2010:
# Collect the `collection_number` of every post-2010 item, keep only the resource rows
# belonging to those collections, and write them out.
# Iterating over the column's values, instead of looking rows up by integer label,
# keeps this correct even when the index is not 0..n-1 (e.g. after sorting/filtering),
# so the cell no longer depends on a prior reset_index having been run.
l_collection_numbers = [item['collection_number'] for item in df_post2010['item']]
df_resources_filtered = df_resources[df_resources['collection_number'].isin(l_collection_numbers)]
# save the DataFrame to a parquet file
df_resources_filtered.to_parquet('../data/raw/loc/veterans_history_project_resources_post2010.parquet', index=False)

Create train/validation splits, holding out the currently sampled set (random seed `42`) as the evaluation (test) set:

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the evaluation (test) sample from the pre-2010 resources, then split the
# remaining rows into training and validation sets.

SAMPLE_SIZE = 1000   # number of rows held out as the evaluation (test) set
RANDOM_SEED = 42     # used for both the evaluation sample and the train/val split
VAL_FRACTION = 0.2   # share of the non-evaluation rows assigned to validation

# Retrieve the pre-2010 resources.
# NOTE: from here on `df_pre2010` holds the resource-level table read below, not the
# item-level frame of the same name built earlier in the notebook.
df_pre2010 = pd.read_parquet('../data/raw/loc/veterans_history_project_resources_pre2010.parquet')

# Replicate sample set creation from current production config

# 1. Filter for items that have transcripts, then for items that have audio or video.
#    Both filters are applied only when the transcript column is present.
if 'fulltext_file_str' in df_pre2010.columns:
    df_pre2010 = df_pre2010[df_pre2010['fulltext_file_str'].notna()]
    print(f"Filtered to {len(df_pre2010)} items with transcripts")
    has_media = (df_pre2010['audio_url'].notna()) | (df_pre2010['video_url'].notna())
    df_pre2010 = df_pre2010[has_media]
    print(f"Filtered to {len(df_pre2010)} items with media")

# 2. Sort by index for deterministic order
df_pre2010 = df_pre2010.sort_index()

# 3. Random seed = 42, sample size = 1000
#    Rows must remain after sampling, otherwise there is nothing to train/validate on.
if len(df_pre2010) <= SAMPLE_SIZE:
    raise ValueError(
        f"Need more than {SAMPLE_SIZE} filtered rows to hold out an evaluation set "
        f"and still have training data; only {len(df_pre2010)} available"
    )
df_pre2010_sample1000 = df_pre2010.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

# Train/validation pool: every filtered row that is not in the evaluation sample
df_pre2010_train_val = df_pre2010.drop(df_pre2010_sample1000.index)

# Dropping by label removes exactly the sampled rows only when index labels are unique;
# this check guards against silently losing extra rows.
assert len(df_pre2010_sample1000) + len(df_pre2010_train_val) == len(df_pre2010), \
    "evaluation sample and train/val pool do not partition the filtered rows"

print("number of rows after filtering: " + str(len(df_pre2010)))
print("number of inference samples created (eval set): " + str(len(df_pre2010_sample1000)))
print("number of remaining rows used for training and validation: " + str(len(df_pre2010_train_val)))

# helper to check dataframe slice
# df_pre2010_sample1000.head()

# reserved for future use (e.g. feature engineering)
# NOTE(review): pd.concat([X, y]) stacks rows; rejoining features and targets
# column-wise would need axis=1 -- confirm before enabling.
# Separate features (X) and target (y)
# X = df_pre2010_train_val.drop(columns = ['fulltext_file_str', 'fulltext_file_str_cleaned', 'transcript_raw_text_only'], axis=1)
# y = df_pre2010_train_val[['fulltext_file_str', 'fulltext_file_str_cleaned', 'transcript_raw_text_only']]
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# df_pre2010_train = pd.concat([X_train, y_train])
# df_pre2010_val = pd.concat([X_val, y_val])

# Create train/val splits
df_pre2010_train, df_pre2010_val = train_test_split(
    df_pre2010_train_val, test_size=VAL_FRACTION, random_state=RANDOM_SEED
)

# The two splits together must cover the whole train/val pool.
assert len(df_pre2010_train) + len(df_pre2010_val) == len(df_pre2010_train_val)

print("number of rows for training: " + str(len(df_pre2010_train)))
print("number of rows for validation: " + str(len(df_pre2010_val)))



In [None]:
# Write the test, train and validation frames to their Parquet files (index not stored)
split_outputs = (
    (df_pre2010_sample1000, '../data/raw/loc/veterans_history_project_resources_pre2010_test.parquet'),
    (df_pre2010_train, '../data/raw/loc/veterans_history_project_resources_pre2010_train.parquet'),
    (df_pre2010_val, '../data/raw/loc/veterans_history_project_resources_pre2010_val.parquet'),
)
for split_frame, split_path in split_outputs:
    split_frame.to_parquet(split_path, index=False)