### Analyze categories of the overall dataset by title, and divide the pension application files into pre-determined categories

In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_parquet('df_grouped_NAID_sorted_title.parquet')

##### Look at first x characters of titles to see groupings 

In [23]:
df['title_prefix'] = df['title'].str[:21].str.lower()
df.groupby('title_prefix').size().sort_values(ascending=False)

title_prefix
revolutionary war pen    78604
microfilm target shee      270
illustrated family re       24
copy of the affidavit        4
discharge certificate        4
pension affidavit of         3
newcomb family record        1
statement of john cro        1
sarah benjamin's eyew        1
sampler of chester go        1
sampler for the famil        1
petition of william s        1
pension application o        1
birth and baptismal r        1
copy of a letter to h        1
fraktur of the family        1
flying camp rations b        1
diary of levi stedman        1
daniel kimball's revo        1
copy of the testimony        1
copy of the statement        1
copy of the commissio        1
statement regarding p        1
dtype: int64

In [31]:
df['title_prefix'] = df['title'].str[:25].str.lower()
df.groupby('title_prefix').size().sort_values(ascending=False)

title_prefix
revolutionary war pension    78604
microfilm target sheet         270
illustrated family record       24
copy of the affidavit of         4
discharge certificate for        2
discharge certificate of         2
pension affidavit of cato        1
statement of john crosley        1
sarah benjamin's eyewitne        1
sampler of chester goodal        1
sampler for the family of        1
petition of william scott        1
pension application of pr        1
pension affidavit of prin        1
pension affidavit of cuff        1
birth and baptismal recor        1
newcomb family record            1
copy of a letter to henry        1
fraktur of the family of         1
flying camp rations broad        1
diary of levi stedman of         1
daniel kimball's revoluti        1
copy of the testimony of         1
copy of the statement by         1
copy of the commission is        1
statement regarding princ        1
dtype: int64

In [199]:
df['title_prefix'] = df['title'].str.replace('-', ' ')
df['title_prefix'] = df['title_prefix'].str[:66].str.lower()
df.groupby('title_prefix').size().sort_values(ascending=False)

title_prefix
revolutionary war pension and bounty land warrant application file    78604
microfilm target sheet                                                  270
illustrated family record (fraktur) found in revolutionary war pen       20
pension affidavit of prince vaughan                                       1
illustrated family record [fraktur] of john jenkins                       1
illustrated family record [fraktur] of john mersereau                     1
newcomb family record                                                     1
pension affidavit of cato greene                                          1
pension affidavit of cuff greene                                          1
birth and baptismal record of dorothea garecht sell                       1
illustrated family record (fraktur) for ezekiel root, connecticut         1
petition of william scott, late a major in the new hampshire line,        1
sampler for the family of peter and mary booth                            1

### Filter data by title that starts with 'revolutionary war pension and bounty land warrant application file'

In [None]:
df_filtered = df.copy()

In [261]:
application_prefix = 'revolutionary war pension and bounty land warrant application file'

df_filtered['title_modified'] = (df_filtered['title']
    .str.replace('-', ' ')
    .str.lower()
    .str.replace(application_prefix + 's', application_prefix, case=False, regex=False)
)

# filter by title starts with application_prefix
df_filtered = df_filtered[df_filtered['title_modified'].str.startswith(application_prefix.lower())]

In [262]:
df_filtered.shape

(78604, 14)

### Data cleaning & sort into categories

In [None]:
df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(application_prefix, '', regex=True)

# short hand for registration and other words don't need
words = ["reg", "rej", "red", "for", "file", "see"]
for word in words:
    df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(word, '', regex=False)

# various punctuation
punctuation = [',', '.', '!', '?', ':', ';', '-', '_', '(', ')', '[', ']', '{', '}', '\'', '\"', '/', '\\', '|', '`', '~', '=', '+', '*', '#', '@', '%', '$', '^', '&']

for punct in punctuation:
    df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(punct, ' ', regex=False)

# Replace multiple consecutive spaces with a single space
df_filtered['title_modified'] = df_filtered['title_modified'].str.replace(' +', ' ', regex=True)

In [266]:
# df_filtered.groupby(df_filtered['title'].str[len(application_prefix):len(application_prefix) + 3]).size().sort_values(ascending=False)

pd.set_option('display.max_rows', 100)
df_filtered.groupby(df_filtered['title_modified'].str[:3]).size().sort_values(ascending=False)

title_modified
 s     36096
 w     25512
 r     11130
 b      3227
        1446
 bl      538
 na      304
 il       98
 ol       45
 jo       20
 wt       19
 o        16
 ct       15
 wi       11
 n        10
 ja       10
 th        6
 su        5
 ro        5
 4         5
 no        4
 he        4
 ch        4
 da        4
 di        4
 se        3
 sa        3
 18        3
 ab        2
 ad        2
 s4        2
 ne        2
 je        2
 bo        2
 mi        2
 ec        2
 eb        2
 sp        1
 14        1
 s1        1
 s2        1
 s3        1
 ev        1
 sh        1
 st        1
 23        1
 36        1
 sy        1
 t         1
 ac        1
 ur        1
 va        1
 13        1
 co        1
 am        1
 hu        1
 17        1
 pe        1
 ba        1
 ge        1
 10        1
 k         1
 ka        1
 m         1
 mc        1
 aa        1
 39        1
 20        1
 fe        1
 ni        1
 ca        1
 an        1
 ez        1
 p         1
 p5        1
 ri       

In [328]:
# investigate groupings

# df_filtered.groupby(df_filtered['title'].str[len(application_prefix):len(application_prefix) + 3]).size().sort_values(ascending=False)

# df_filtered['title'].str.contains(f'{application_prefix}  bl ').sum()


# Show both original and modified titles
# mask = df_filtered['title'].str.contains(' n ', na=False, regex=False)
# mask = df_filtered['title_modified'].str.contains(' s ', na=False, regex=False)
# df_filtered[mask].assign(modified_title=df_filtered[mask]['title_modified'].str.replace(application_prefix + ' ', '', regex=False))[['title_modified', 'modified_title']]

# df_filtered['title_modified'].str.startswith(' illegible ').sum()

# (df_filtered['title_modified'].str.strip() == '').sum()

df_filtered[df_filtered['title_modified'].str.contains('microfilm pamphlet')]


Unnamed: 0,NAID,naraURL,title,logicalDate,pdfObjectID,pdfURL,pageObjectId,pageURL,pageImageType,ocrText,transcriptionText,title_prefix,file_cat,title_modified
9611,144028666,https://catalog.archives.gov/id/144028666,Revolutionary War Pension and Bounty Land Warr...,,,,144028667||144028668||144028669||144028670||14...,https://s3.amazonaws.com/NARAprodstorage/lz/mi...,Image (JPG),NATIONAL ARCHIVES MICROFILM PUBLICATIONS\nREVO...,[HEADING] NATIONAL ARCHIVES MICROFILM PUBLICA...,revolutionary war pension and bounty land warr...,,microfilm pamphlet


#### Add file_cat column and categorize based on dictionary

using "unkown" to group titles with no other info, that have "blank" and that have "illegible"

In [321]:
# Initialize file_cat column
df_filtered['file_cat'] = ''

In [322]:
# Set file_cat to "unknown" for empty or whitespace-only title_modified
mask = df_filtered['title_modified'].str.strip() == ''
df_filtered.loc[mask, 'file_cat'] = 'unknown'

In [None]:
category_dict = {
    "soldier": ['s', 'sur'], # soldier (survived)
    'rejected': ['r', 'k'], # rejected
    "widow": ['w', 'wid'], # widow
    "bounty land warrant": ['b', 'bl', 'wt', 'b l wt', 'bounty land', 'blwt'], # bounty land warrant
    "old war": ['old act', 'old war', 'o w', 'ow'], # old war
    "nara accession": ['n a acc no', 'acc no', 'no'], # National Archives Accession
    "unknown": ["blank", "illegible", "ctf"], #(ctf = certificate)
    "nara archival administrative sheets": ["nara archival administrative sheets"]
}

# Handle the special case for "nara archival administrative sheets" first
nara_value = "nara archival administrative sheets"
nara_mask = df_filtered['title_modified'].str.contains(nara_value, na=False)
df_filtered.loc[nara_mask, 'file_cat'] = nara_value

# For all other key-value pairs in category_dict (excluding the nara case)
for key, values in category_dict.items():
    if key != "nara archival administrative sheets":  # Skip the special case
        for value in values:
            if value:  # Skip empty lists
                # Check if title_modified starts with " " + value + " "
                pattern = f' {value} '
                mask = df_filtered['title_modified'].str.startswith(pattern, na=False)
                df_filtered.loc[mask, 'file_cat'] = key



# # For each key-value pair in category_dict
# for key, values in category_dict.items():
#     for value in values:
#         if value:  # Skip empty lists
#             # Check if title_modified starts with " " + value + " "
#             pattern = f' {value} '
#             mask = df_filtered['title_modified'].str.startswith(pattern, na=False)
#             df_filtered.loc[mask, 'file_cat'] = key



# # handle these separately because different pattern
# nara_mask = df_filtered['title_modified'].str.startswith(category_dict["nara archival administrative sheets"][0], na=False)
# df_filtered.loc[nara_mask, 'file_cat'] = category_dict["nara archival administrative sheets"]

In [338]:
df_filtered.groupby(df_filtered['file_cat']).size().sort_values(ascending=False)

file_cat
soldier                                36101
widow                                  25516
rejected                               11131
bounty land warrant                     3261
unknown                                 2084
nara archival administrative sheets      303
                                         132
old war                                   61
nara accession                            15
dtype: int64

In [339]:
# Show first 10 title_modified strings where file_cat is empty
df_filtered[df_filtered['file_cat'] == '']['title_modified'].head(100)



# Handle a few other special cases
# contains "ow", " b l " "  l wt  "





58024                     17 692 william cremer new jersey
72250                                 1828 david hall n c 
77760            36585 ezekiel hotchkiss continental conn 
42624                              4 365 enoch bagley n h 
56430                         4 2 217 nicholas conly penn 
56417     4 2 218 thomas conley new jersey indian war w...
56425                             4 2 221 neill conly n c 
56531                    4 3 041 william connor new jersey
18301       abraham t abraham titcomb sweatt new hampshire
54897                             bateson john clarke del 
57461                      covert bergun covert new jersey
61287                          dickson miles dickson conn 
66767                                dickson peter ney n c
47770                             dis henry bouce new york
62844     dis william dunton navy pennsylvania residenc...
18691                   ferson daniel b tarr massachusetts
40526                           jej b l john alexander v

In [None]:
# for any rows that have file_cat = "" then check if the title_modified contains any values in category_dict for "bounty land warrant" and "old war" and if so, set file_cat to the key
