### Analyze categories of the overall dataset by title, and divide the pension application files into pre-determined categories

In [46]:
import pandas as pd
import re


In [47]:
df = pd.read_parquet('df_grouped_NAID_sorted_title.parquet')

##### Look at first x characters of titles to see groupings 

In [48]:
df['title_prefix'] = df['title'].str[:21].str.lower()
df.groupby('title_prefix').size().sort_values(ascending=False)

title_prefix
revolutionary war pen    78604
microfilm target shee      270
illustrated family re       24
copy of the affidavit        4
discharge certificate        4
pension affidavit of         3
newcomb family record        1
statement of john cro        1
sarah benjamin's eyew        1
sampler of chester go        1
sampler for the famil        1
petition of william s        1
pension application o        1
birth and baptismal r        1
copy of a letter to h        1
fraktur of the family        1
flying camp rations b        1
diary of levi stedman        1
daniel kimball's revo        1
copy of the testimony        1
copy of the statement        1
copy of the commissio        1
statement regarding p        1
dtype: int64

In [49]:
df['title_prefix'] = df['title'].str.replace('-', ' ')
df['title_prefix'] = df['title_prefix'].str[:66].str.lower()
df.groupby('title_prefix').size().sort_values(ascending=False)

title_prefix
revolutionary war pension and bounty land warrant application file    78604
microfilm target sheet                                                  270
illustrated family record (fraktur) found in revolutionary war pen       20
pension affidavit of prince vaughan                                       1
illustrated family record [fraktur] of john jenkins                       1
illustrated family record [fraktur] of john mersereau                     1
newcomb family record                                                     1
pension affidavit of cato greene                                          1
pension affidavit of cuff greene                                          1
birth and baptismal record of dorothea garecht sell                       1
illustrated family record (fraktur) for ezekiel root, connecticut         1
petition of william scott, late a major in the new hampshire line,        1
sampler for the family of peter and mary booth                            1

In [50]:
application_prefix = 'revolutionary war pension and bounty land warrant application file' 
family_record = 'family record' 
microfilm_target_sheet = 'microfilm target sheet'
other = 'other'

In [51]:
df['file_type'] = ''

### Filter data by title that starts with 'revolutionary war pension and bounty land warrant application file'

In [66]:
df_filtered = df.copy()

In [67]:
application_prefix = 'revolutionary war pension and bounty land warrant application file'

df_filtered['title_modified'] = (df_filtered['title']
    .str.replace('-', ' ')
    .str.lower()
    .str.replace(application_prefix + 's', application_prefix, case=False, regex=False)
)

# filter by title starts with application_prefix
df_filtered = df_filtered[df_filtered['title_modified'].str.startswith(application_prefix.lower())]

In [68]:
df_filtered.shape

(78604, 14)

### Data cleaning & sort into categories

In [69]:
df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(application_prefix, '', regex=True)

# short hand for registration and other words don't need
words = ["for", "file", "see"] #"reg", "rej", "red", 
for word in words:
    df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(word, '', regex=False)

# various punctuation
punctuation = [',', '.', '!', '?', ':', ';', '-', '_', '(', ')', '[', ']', '{', '}', '\'', '\"', '/', '\\', '|', '`', '~', '=', '+', '*', '#', '@', '%', '$', '^', '&']

for punct in punctuation:
    df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(punct, ' ', regex=False)

# Replace multiple consecutive spaces with a single space
df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(' +', ' ', regex=True)

In [70]:
# explore first 3 characters of titles
#  df_filtered.groupby(df_filtered['title'].str[len(application_prefix):len(application_prefix) + 3]).size().sort_values(ascending=False)

# pd.set_option('display.max_rows', 100)
df_filtered.groupby(df_filtered['title_modified'].str[:3]).size().sort_values(ascending=False)

title_modified
 s     36096
 w     25512
 r     11130
 b      3220
        1446
       ...  
 mc        1
 ba        1
 aa        1
 p         1
 m         1
Length: 69, dtype: int64

In [71]:
# investigate groupings

# df_filtered.groupby(df_filtered['title'].str[len(application_prefix):len(application_prefix) + 3]).size().sort_values(ascending=False)

# df_filtered['title'].str.contains(f'{application_prefix}  bl ').sum()

# Show both original and modified titles
# mask = df_filtered['title'].str.contains(' n ', na=False, regex=False)
# mask = df_filtered['title_modified'].str.contains(' s ', na=False, regex=False)
# df_filtered[mask].assign(modified_title=df_filtered[mask]['title_modified'].str.replace(application_prefix + ' ', '', regex=False))[['title_modified', 'modified_title']]

# df_filtered['title_modified'].str.startswith(' illegible ').sum()

# (df_filtered['title_modified'].str.strip() == '').sum()

df_filtered[df_filtered['title_modified'].str.contains('microfilm pamphlet')]


Unnamed: 0,NAID,naraURL,title,logicalDate,pdfObjectID,pdfURL,pageObjectId,pageURL,pageImageType,ocrText,transcriptionText,title_prefix,file_type,title_modified
9611,144028666,https://catalog.archives.gov/id/144028666,Revolutionary War Pension and Bounty Land Warr...,,,,144028667||144028668||144028669||144028670||14...,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Image (JPG),NATIONAL ARCHIVES MICROFILM PUBLICATIONS\nREVO...,[HEADING] NATIONAL ARCHIVES MICROFILM PUBLICA...,revolutionary war pension and bounty land warr...,,microfilm pamphlet


#### Add file_cat column and categorize based on dictionary

using "unkown" to group titles with no other info, that have "blank" and that have "illegible"

In [72]:
# Initialize file_cat column
df_filtered['file_cat'] = ''

In [73]:
# Set file_cat to "unknown" for empty or whitespace-only title_modified
mask = df_filtered['title_modified'].str.strip() == ''
df_filtered.loc[mask, 'file_cat'] = 'unknown'

In [74]:
category_dict = {
    "soldier": ['s', 'sur', 't'], # soldier (survived)
    'rejected': ['r', 'k', 'p', 'rej', 'rejected'], # rejected
    "widow": ['w', 'wid'], # widow
    "bounty land warrant": ['b', 'bl', 'wt', 'b l wt', 'bounty land', 'blwt'], # bounty land warrant
    "old war": ['old act', 'old war', 'o w', 'ow'], # old war
    "nara archival administrative sheets": ["nara archival administrative sheets"]
}

# if no other category is found and one of these is found, set to unknown
nara_accession = ['n a acc no', 'acc no', 'no'] # National Archives Accession
unknown_group = ["blank", "illegible", "ctf"] #(ctf = certificate)

In [75]:
# Initialize file_cat as empty string
df_filtered['file_cat'] = ''

# Multiple category method - append categories (allowing duplicates during iteration)
for key, values in category_dict.items():
    for value in values:
        if value:
            pattern = f' {value} '
            mask = df_filtered['title_modified'].str.contains(pattern, na=False)
            # Append category to existing file_cat (allowing duplicates)
            df_filtered.loc[mask, 'file_cat'] = df_filtered.loc[mask, 'file_cat'] + f'{key}||'

# Clean up trailing separators, remove duplicates, and sort alphabetically
df_filtered['file_cat'] = df_filtered['file_cat'].str.rstrip('||').apply(
    lambda x: '||'.join(sorted(list(set(x.split('||'))))) if x else x
)

In [76]:
# Combine both groups
all_unknown_values = unknown_group + nara_accession  # Now both are lists

# If df_filtered['file_cat'] is empty and contains one of the values, set to unknown
mask = df_filtered['file_cat'] == ''
for value in all_unknown_values:
    value_mask = df_filtered['title_modified'].str.contains(value, case=False, na=False, regex=False)
    df_filtered.loc[mask & value_mask, 'file_cat'] = 'unknown'

In [77]:
# and if file_cat is still empty (the rest appear to be names), then also set to unknown
df_filtered.loc[mask & ~df_filtered['file_cat'].isin(category_dict.keys()), 'file_cat'] = 'unknown'


In [64]:
# Show first 10 title_modified strings where file_cat is empty
# df_filtered[df_filtered['file_cat'] == '']['title_modified'].head(100)

In [78]:
df_filtered.groupby(df_filtered['file_cat']).size().sort_values(ascending=False)

file_cat
soldier                                            34960
widow                                              24253
rejected                                           10745
bounty land warrant                                 3060
unknown                                             2491
rejected||soldier                                   1411
rejected||widow                                      830
soldier||widow                                       475
bounty land warrant||soldier                         166
bounty land warrant||rejected                        105
bounty land warrant||widow                            34
old war                                               17
old war||rejected||widow                              12
old war||widow                                        12
old war||soldier                                       7
rejected||soldier||widow                               5
old war||rejected                                      5
old war||rejected||sol

In [402]:
df_filtered.groupby(df_filtered['file_cat']).size().sort_values(ascending=False)

file_cat
soldier                                            34960
widow                                              24253
rejected                                           10745
bounty land warrant                                 3060
unknown                                             2188
rejected||soldier                                   1411
rejected||widow                                      830
soldier||widow                                       475
nara archival administrative sheets                  303
bounty land warrant||soldier                         166
bounty land warrant||rejected                        105
bounty land warrant||widow                            34
old war                                               17
old war||widow                                        12
old war||rejected||widow                              12
old war||soldier                                       7
rejected||soldier||widow                               5
old war||rejected     

In [None]:
# deal with microfilm pamphlet

In [423]:
# Debug: Check what strings are actually being found
application_prefix = 'revolutionary war pension and bounty land warrant application file' 
family_record = 'family record' 
microfilm_target_sheet = 'microfilm target sheet'

print("Checking each string individually:")
print(f"Application prefix matches: {df['title_prefix'].str.contains(application_prefix, case=False, na=False).sum()}")
print(f"Family record matches: {df['title_prefix'].str.contains(family_record, case=False, na=False).sum()}")
print(f"Microfilm target sheet matches: {df['title_prefix'].str.contains(microfilm_target_sheet, case=False, na=False).sum()}")

# Check what unique values are in title_prefix
print("\nUnique values in title_prefix (first 20):")
print(df['title_prefix'].value_counts().head(20))

Checking each string individually:
Application prefix matches: 78624
Family record matches: 25
Microfilm target sheet matches: 270

Unique values in title_prefix (first 20):
title_prefix
Revolutionary War Pension and Bounty Land Warrant Application File                                                      1446
Revolutionary War Pension and Bounty Land Warrant Application Files   NARA Archival Administrative Sheets                303
Microfilm target sheet                                                                                                   270
Revolutionary War Pension and Bounty Land Warrant Application File [Blank], for [Blank] [Blank], [Blank]                  10
Revolutionary War Pension and Bounty Land Warrant Application File [ILLEGIBLE], [ILLEGIBLE] [ILLEGIBLE], [ILLEGIBLE]       8
Revolutionary War Pension and Bounty Land Warrant Application File S. 46,466, for Benoni Pen, New Jersey                   2
Revolutionary War Pension and Bounty Land Warrant Application F

In [425]:
# Check if family_record and microfilm_target_sheet are independent or part of application_prefix
family_record = 'family record'
microfilm_target_sheet = 'microfilm target sheet'
application_prefix = 'revolutionary war pension and bounty land warrant application file'

# Check for rows that contain family_record but NOT application_prefix
family_only = df['title_prefix'].str.contains(family_record, case=False, na=False) & ~df['title_prefix'].str.contains(application_prefix, case=False, na=False)
print(f"Family record only (not in application prefix): {family_only.sum()}")

# Check for rows that contain microfilm_target_sheet but NOT application_prefix  
microfilm_only = df['title_prefix'].str.contains(microfilm_target_sheet, case=False, na=False) & ~df['title_prefix'].str.contains(application_prefix, case=False, na=False)
print(f"Microfilm target sheet only (not in application prefix): {microfilm_only.sum()}")

# Show some examples
print("\nExamples of family_record matches:")
print(df[df['title_prefix'].str.contains(family_record, case=False, na=False)]['title_prefix'].head())

print("\nExamples of microfilm_target_sheet matches:")
print(df[df['title_prefix'].str.contains(microfilm_target_sheet, case=False, na=False)]['title_prefix'].head())

Family record only (not in application prefix): 5
Microfilm target sheet only (not in application prefix): 270

Examples of family_record matches:
20220                            Illustrated Family Record
39759    Illustrated Family Record (Fraktur) Found in R...
39760    Illustrated Family Record (Fraktur) Found in R...
39761    Illustrated Family Record (Fraktur) Found in R...
39762    Illustrated Family Record (Fraktur) Found in R...
Name: title_prefix, dtype: object

Examples of microfilm_target_sheet matches:
3413    Microfilm target sheet
3444    Microfilm target sheet
3474    Microfilm target sheet
3503    Microfilm target sheet
3540    Microfilm target sheet
Name: title_prefix, dtype: object


## Import function from set_categories developed from testing above

In [1]:
import pandas as pd
df = pd.read_parquet('df_grouped_NAID_sorted_title.parquet')

from set_categories import set_categories, set_application_categories

# Run the function
df_categories = set_categories(df)

# Check results
print("File type distribution:")
print(df_categories['file_type'].value_counts())

# revolutionary war pension and bounty land warrant application file should have count 78604

File type distribution:
file_type
revolutionary war pension and bounty land warrant application file    78604
microfilm target sheet                                                  270
other                                                                    27
family record                                                            25
Name: count, dtype: int64


In [3]:
df_categories_2 = set_application_categories(df_categories)

# Check results
print("File cat distribution:")
print(df_categories_2['file_cat'].value_counts())

File cat distribution:
file_cat
soldier                                            34960
widow                                              24253
rejected                                           10745
bounty land warrant                                 3060
unknown                                             2491
rejected||soldier                                   1411
rejected||widow                                      830
soldier||widow                                       475
non_application                                      322
bounty land warrant||soldier                         166
bounty land warrant||rejected                        105
bounty land warrant||widow                            34
old war                                               17
old war||rejected||widow                              12
old war||widow                                        12
old war||soldier                                       7
old war||rejected                                      5