### Fetch original data from Hugging Face, drop a few metadata columns, and group pension file data by NAID

In [1]:
import pandas as pd
import re

##### Fetch data from Hugging Face (only run once ✅)

In [None]:
# df = pd.read_parquet("hf://datasets/RevolutionCrossroads/nara_revolutionary_war_pension_files/nara_pension_file_pages.parquet")

In [None]:
# Save the dataframe locally to avoid re-downloading
# df.to_parquet('nara_pension_file_pages.parquet', engine='pyarrow')

#### Load locally saved df

In [4]:
# Read the local parquet copy (the file written by the commented-out save cell above) instead of re-downloading
df = pd.read_parquet('nara_pension_file_pages.parquet')

In [5]:
# (rows, columns) of the loaded frame
df.shape

(2244629, 19)

In [None]:
# remove metadata columns that aren't needed here
# df = df.drop(columns=['transcriptionDate', 'transcriptionUserNames', 'transcriptionContributionCount', 'transcriptionID', 'ocrID', 'ocrUploadDate', 'ocrContributor', 'variantControlNumbers'])

In [None]:
# df.shape

(2244629, 11)

In [6]:
# Sanity check on the raw frame: count rows whose title is missing (expected to be zero)
missing_titles = df['title'].isna().sum()
print("Number of NaN values in title column:", missing_titles)

Number of NaN values in title column: 0


In [12]:
# df.head()
# df.tail()

#### Group by NAID

In [7]:
# Delimiter inserted between values when rows sharing a NAID are concatenated in the grouping step
separator = '||'

In [8]:
# Collapse the frame to one row per NAID.
# Every other column becomes a single string: the group's non-null values, cast to str,
# de-duplicated in first-seen order, and joined with `separator`.
# (Dropping the `.unique()` step would keep repeated values instead.)
# NOTE(review): de-duplication is done per column, so the joined strings of different
# columns may hold different numbers of items and are not guaranteed to line up
# position-by-position — confirm this is acceptable for downstream use.


def _join_distinct(values):
    """Return the distinct non-null values of one column in one group, joined by `separator`."""
    distinct = values.dropna().astype(str).unique()
    return separator.join(distinct)


df_grouped = df.groupby('NAID').agg(_join_distinct).reset_index()

In [11]:
# NAID was the grouping key, so each value should now sit on exactly one row (expect zero repeats)
naid_repeats = df_grouped['NAID'].duplicated()
print("Number of duplicate NAID values:", naid_repeats.sum())


Number of duplicate NAID values: 0


In [13]:
# Check the title column of the grouped df for missing values.
# The grouping step builds every cell with str.join, which yields '' (never NaN) when a
# group has no non-null values, so isna() on its own cannot flag a missing title here.
# Empty strings are counted as well so that this check is able to fail.
print("Number of NaN values in title column:", df_grouped['title'].isna().sum())
print("Number of empty titles:", df_grouped['title'].eq('').sum())

Number of NaN values in title column: 0


In [14]:
# A '||' inside a grouped title means either that one NAID carried more than one distinct
# title, or that a raw title already contained the delimiter. Literal (non-regex) match.
has_separator = df_grouped['title'].str.contains(separator, regex=False, na=False)
print("Number of titles containing separator '||':", has_separator.sum())

Number of titles containing separator '||': 0


In [15]:
# (rows, columns) after grouping: one row per distinct NAID
df_grouped.shape
# df_grouped.head()
# df_grouped.info()

(78926, 19)

In [16]:
# Order rows by title. sort_values returns a new frame, so df_grouped itself is unchanged.
# The original index labels travel with their rows, so the index is no longer in 0..n-1 order.
df_grouped_sorted = df_grouped.sort_values(by='title')

In [28]:
# df_grouped_sorted.head()
# df_grouped_sorted.tail()

### Save parquet for rows grouped by NAID and sorted by title

In [17]:
# Save df_grouped_sorted to parquet.
# sort_values leaves the frame with its shuffled pre-sort integer index; with the default
# index handling pandas would write that index into the file as an extra column.
# index=False keeps the file limited to the data columns (row order is unaffected).
df_grouped_sorted.to_parquet('df_grouped_NAID_sorted_title.parquet', engine='pyarrow', index=False)