### Fetch the original data from Hugging Face, drop a few metadata columns, and group the pension file data by NAID

In [None]:
import pandas as pd

##### Fetch data from Hugging Face (only run once ✅)

In [None]:
# df = pd.read_parquet("hf://datasets/RevolutionCrossroads/nara_revolutionary_war_pension_files/nara_pension_file_pages.parquet")

In [None]:
# Save the dataframe locally to avoid re-downloading
# df.to_parquet('original_nara_pension_file_pages.parquet', engine='pyarrow')

#### Load locally saved df

In [None]:
# Load the locally cached copy of the dataset (same file name as the one
# written by the commented-out `to_parquet` call above), so the data does not
# have to be downloaded again.
df = pd.read_parquet('original_nara_pension_file_pages.parquet')

In [None]:
# Show the (rows, columns) dimensions of the freshly loaded dataframe.
df.shape

In [None]:
# Drop the metadata columns that are not needed for the grouping below:
# transcriptionDate, transcriptionUserNames, transcriptionContributionCount,
# transcriptionID, logicalDate, ocrID, ocrUploadDate, ocrContributor,
# variantControlNumbers, pdfObjectID
df = df.drop(
    columns=[
        'transcriptionDate',
        'transcriptionUserNames',
        'transcriptionContributionCount',
        'transcriptionID',
        'logicalDate',
        'ocrID',
        'ocrUploadDate',
        'ocrContributor',
        'variantControlNumbers',
        'pdfObjectID',
    ]
)

In [None]:
# Count how many rows have a missing (NaN) title before any grouping happens
missing_title_count = df['title'].isna().sum()
print("Number of NaN values in title column:", missing_title_count)

In [None]:
# Preview the first rows of the dataframe after the column drop
# (switch to the commented-out `tail()` call to look at the last rows instead).
df.head()
# df.tail()

#### Group by NAID

In [None]:
# Delimiter placed between the per-row values that get concatenated when rows
# sharing a NAID are grouped; the de-duplication helpers split on this same string.
separator = '||'

In [None]:
# Collapse all rows that share a NAID into a single row of a new dataframe.
# Within each group, every column's non-null values are cast to str and
# concatenated with `separator` ("||") between them; NAID is then restored as
# a regular column.
df_grouped = (
    df.groupby('NAID')
    .agg(lambda column_values: separator.join(column_values.dropna().astype(str)))
    .reset_index()
)

In [None]:
# Check for missing titles in the new grouped df.
# The grouping step drops NaN values before joining, so a group whose titles
# were all NaN ends up as an empty string rather than NaN. Count empty strings
# too; the NaN count alone cannot reveal those groups.
print("Number of NaN values in title column:", df_grouped['title'].isna().sum())
print("Number of empty-string values in title column:", (df_grouped['title'] == '').sum())

In [None]:
# A notebook cell only echoes the value of its last expression, so bare
# `shape` / `head()` expressions placed before `info()` would be silently
# discarded. Print them explicitly; `info()` writes its summary itself.
print(df_grouped.shape)
print(df_grouped.head())
df_grouped.info()

##### Remove duplicate values created by grouping by NAID in selected columns, including `title`

In [None]:
def should_remove_duplicates(cell, sep=None):
    """Return True when *cell* is a string holding several joined values.

    Args:
        cell: A single dataframe cell value of any type.
        sep: Delimiter that marks joined values. When omitted, the
            module-level ``separator`` is used.

    Returns:
        True if ``cell`` is a ``str`` that contains the delimiter, otherwise
        False. Non-string cells (e.g. NaN or numbers) are never candidates.
    """
    if sep is None:
        # Fall back to the notebook-wide delimiter used when grouping.
        sep = separator
    return isinstance(cell, str) and sep in cell

def safe_remove_duplicates(cell, sep=None):
    """Collapse repeated values inside a delimiter-joined string.

    Args:
        cell: String of values joined by the delimiter.
        sep: Delimiter separating the values. When omitted, the module-level
            ``separator`` is used.

    Returns:
        The distinct values re-joined with the delimiter, each kept at the
        position of its first occurrence. A cell whose values are all equal
        collapses to that single value.
    """
    if sep is None:
        # Fall back to the notebook-wide delimiter used when grouping.
        sep = separator

    # dict.fromkeys keeps the first occurrence of every value in its original
    # position. A plain set() would emit the values in an arbitrary order that
    # can differ between runs, because str hashing is randomized per process.
    unique_vals = dict.fromkeys(cell.split(sep))

    # Joining a single value returns it unchanged, so no special case is needed.
    return sep.join(unique_vals)

In [None]:
# Work on a copy so the de-duplication steps below do not modify df_grouped.
df_remove_duplicates = df_grouped.copy()

In [None]:
# De-duplicate the joined values in each title cell; cells that are not
# strings or contain no separator are passed through unchanged.
df_remove_duplicates['title'] = df_remove_duplicates['title'].apply(
    lambda title: title if not should_remove_duplicates(title) else safe_remove_duplicates(title)
)

In [None]:
# Sanity-check the title column after de-duplication: which Python types it
# holds, what the first ten values look like, and how many NaNs remain.
print("Data types in title column after applying safe_remove_duplicates:")
title_type_counts = df_remove_duplicates['title'].apply(type).value_counts()
print(title_type_counts)

print("\nSample values and their types:")
sample_values_de_duplicated = df_remove_duplicates['title'].head(10)
for position, title_value in enumerate(sample_values_de_duplicated):
    value_type = type(title_value)
    is_missing = pd.isna(title_value)
    print(f"Index {position}: '{title_value}' (type: {value_type}, is NaN: {is_missing})")

remaining_nan_count = df_remove_duplicates['title'].isna().sum()
print(f"\nNaN count after: {remaining_nan_count}")


#### Remove duplicates from other columns

In [None]:
# De-duplicate the joined values in each pageImageType cell; cells that are
# not strings or contain no separator are passed through unchanged.
df_remove_duplicates['pageImageType'] = df_remove_duplicates['pageImageType'].apply(
    lambda image_type: (
        image_type
        if not should_remove_duplicates(image_type)
        else safe_remove_duplicates(image_type)
    )
)

In [None]:
# Preview the first rows of the de-duplicated dataframe.
df_remove_duplicates.head()