# Get frequency count for widows text

In [1]:
# !pip install nltk==3.5

In [2]:
import re
import pandas as pd
import nltk

In [3]:
# NLTK pieces used throughout the notebook, imported together.
import string

from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on (no-op when already present).
for resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(resource)

# Shared lemmatizer instance reused by every pipeline below.
lemmatizer = WordNetLemmatizer()

# Filter set: English stop words plus every single ASCII punctuation character.
stop_words = set(stopwords.words("english"))
punctuation = set(string.punctuation)
stop_words_with_punct = stop_words | punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliviakasmin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oliviakasmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oliviakasmin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Load the first parquet input. Later cells read `transcriptionText` from this
# frame to build the comparison corpus.
# NOTE(review): the file name suggests rows grouped by NAID — confirm.
df = pd.read_parquet('df_grouped_NAID_sorted_title_with_file_cat.parquet')

In [5]:
# Load the table that the widow subset is filtered from (via its `file_cat` column).
# NOTE(review): presumably one row per pension-file page, per the file name — confirm.
df_full = pd.read_parquet('nara_pension_file_pages_with_file_cat.parquet')


In [6]:
# Keep rows whose file category mentions "widow" (case-insensitive); rows with
# a missing category are excluded. The copy lets later cells add columns
# without touching df_full.
widow_mask = df_full['file_cat'].str.contains('widow', case=False, na=False)
df_widow = df_full.loc[widow_mask].copy()

In [7]:
df_widow.shape

(1035404, 20)

In [8]:
# Share of widow rows that carry a non-empty transcription.
# Both conditions are evaluated on df_widow: the previous version took the
# not-null mask from `df`, a different frame with a different index, so the
# two masks did not describe the same rows.
widow_transcriptions = df_widow['transcriptionText']
rows_with_transcription = (widow_transcriptions.notna() & (widow_transcriptions != '')).sum()
total_rows = len(df_widow)
# Guard against an empty subset so the cell reports 0% instead of dividing by zero.
percentage = (rows_with_transcription / total_rows) * 100 if total_rows else 0.0

print(f"Total rows: {total_rows}")
print(f"Rows with transcriptionText: {rows_with_transcription}")
print(f"Percentage: {percentage:.2f}%")

Total rows: 25517
Rows with transcriptionText: 6756
Percentage: 26.48%


### Clean ocrText

1. split on "||"
2. tokenize words
3. filter to remove stop words
4. lemmatize words to get more consistent results 

In [8]:
# Use the OCR text of the first widow row as a worked example, then flatten it:
# the "||" delimiter and newlines each become a single space.
sample = df_widow['ocrText'].iloc[0]
sample_str = sample
for separator in ("||", "\n"):
    sample_str = sample_str.replace(separator, " ")

In [None]:
# Step 1: split the flattened sample into tokens.
words = word_tokenize(sample_str)

# Step 2: drop stop words and punctuation. The comparison is case-insensitive,
# but surviving tokens keep their original casing.
meaningful_words = []
for token in words:
    if token.casefold() in stop_words_with_punct:
        continue
    meaningful_words.append(token)

# Step 3: reduce each remaining token to its lemma.
lemmatized_words = list(map(lemmatizer.lemmatize, meaningful_words))

# Step 4: count how often each lemma occurs.
frequency_distribution = FreqDist(lemmatized_words)

In [54]:
frequency_distribution

FreqDist({'Livingston': 94, 'York': 34, 'Henry': 33, 'Jane': 31, 'New': 30, 'said': 28, 'day': 25, '1840': 19, 'Major': 18, 'County': 18, ...})

Collocations

In [71]:
# Wrap the lemmatized sample tokens in an nltk.Text and list its collocations
# (word pairs that tend to occur together in the sample).
new_text = nltk.Text(lemmatized_words)
new_text.collocations()

New York; Smith Thompson; Henry Livingston; day March; Commissioner
Pensions; United Colonies; duly sworn; hand seal; Know men; Notary
Public; Livingston decd; Livingston Junior; pension allowed; State
New; constitute appoint; men present; Henry Gilbert; Robert Gilbert;
April 1840; Edwards Esq


In [100]:
def analyze_ocr_text(ocrText, top_n=20):
    """Summarise one OCR text: word frequencies plus top bigrams and trigrams.

    The text is flattened ("||" and newlines become spaces), tokenized,
    filtered against ``stop_words_with_punct`` (compared case-insensitively)
    and lemmatized before anything is counted.

    Args:
        ocrText: Raw OCR string. ``None``, non-string and empty values are
            treated as a text with no words.
        top_n: Number of bigram and trigram collocations to return, ranked by
            raw frequency. Defaults to 20.

    Returns:
        dict with the keys
        ``frequency_distribution`` (FreqDist over the lemmas),
        ``frequency_dict`` (plain dict of the same counts, highest first),
        ``bigram_collocations`` and ``trigram_collocations`` (lists of tuples).
    """
    # Nothing to analyse: return the same shape of result with empty values so
    # callers can index every key unconditionally.
    if not isinstance(ocrText, str) or not ocrText:
        return {
            'frequency_distribution': FreqDist(),
            'frequency_dict': {},
            'bigram_collocations': [],
            'trigram_collocations': [],
        }

    # Clean the text
    text_str = ocrText.replace("||", " ").replace("\n", " ")

    # Tokenize, drop stop words / punctuation, then lemmatize
    words = word_tokenize(text_str)
    meaningful_words = [word for word in words if word.casefold() not in stop_words_with_punct]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in meaningful_words]

    # Frequency distribution over the lemmas
    frequency_distribution = FreqDist(lemmatized_words)

    # Collocations are part of the result again: the notebook reads
    # result['trigram_collocations'], which raised KeyError while these
    # computations were commented out.
    bigram_finder = BigramCollocationFinder.from_words(lemmatized_words)
    bigram_collocations = bigram_finder.nbest(BigramAssocMeasures.raw_freq, top_n)

    trigram_finder = TrigramCollocationFinder.from_words(lemmatized_words)
    trigram_collocations = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, top_n)

    # Plain-dict view of the counts, ordered from most to least frequent
    sorted_freq_dict = dict(
        sorted(frequency_distribution.items(), key=lambda item: item[1], reverse=True)
    )

    return {
        'frequency_distribution': frequency_distribution,
        'frequency_dict': sorted_freq_dict,
        'bigram_collocations': bigram_collocations,
        'trigram_collocations': trigram_collocations,
    }

In [97]:
result = analyze_ocr_text(sample)

In [99]:
result['trigram_collocations']

[('Edwin', 'G.', 'Livingston'),
 ('Henry', 'Livingston', 'Junior'),
 ('Jane', 'M.', 'Livingston'),
 ('State', 'New', 'York'),
 ('Charles', 'P.', 'Livingston'),
 ('Helen', 'P.', 'Livingston'),
 ('Henry', 'Gilbert', 'Livingston'),
 ('Livingston', 'Helen', 'P.'),
 ('Major', 'Henry', 'Livingston'),
 ('Susan', 'C.', 'Livingston'),
 ('widow', 'Henry', 'Livingston'),
 ('Army', 'United', 'Colonies'),
 ('County', 'New', 'York'),
 ('Given', 'hand', 'seal'),
 ('Henry', 'Livingston', 'decd'),
 ('James', 'L.', 'Edwards'),
 ('Jane', 'Livingston', 'widow'),
 ('Know', 'men', 'present'),
 ('Livingston', 'New', 'York'),
 ('Livingston', 'deceased', 'widow')]

### Add lemmatizedWords column for all widows

In [17]:
def get_lemmatized_words(ocrText):
    """Return the lemmatized, stop-word-free tokens of ``ocrText`` joined by "||".

    The text is lower-cased, "||" and newlines are turned into spaces, and the
    result is tokenized. Tokens found in ``stop_words_with_punct`` are dropped
    and the remaining ones are lemmatized.

    Args:
        ocrText: OCR text to process. ``None``, NaN, any other non-string
            value and the empty string all yield "".

    Returns:
        str: the lemmas separated by "||", or "" when there is no text.
    """
    # A missing value may be None or a float NaN. NaN is truthy and has no
    # len(), so check the type instead of relying on truthiness alone.
    if not isinstance(ocrText, str) or not ocrText:
        return ""

    # "||" is the separator of the returned string, so it is removed from the
    # input first (the other pipelines in this notebook do the same).
    text = ocrText.replace("||", " ").replace("\n", " ").lower()

    # First tokenize the text into words
    words = word_tokenize(text)

    # Then filter out stop words and punctuation
    meaningful_words = [word for word in words if word.casefold() not in stop_words_with_punct]

    # Then lemmatize
    lemmatized_words = [lemmatizer.lemmatize(word) for word in meaningful_words]

    return "||".join(lemmatized_words)

In [14]:
lem_words = get_lemmatized_words(sample)

In [15]:
lem_words

'3||1776||service||mass||content||presence||woodbury||nathaniel||mary||number||w.||15496||estate'

In [18]:
# Run get_lemmatized_words over every row's ocrText and store the "||"-joined
# lemmas in a new lemmatizedWords column of df_widow.
df_widow['lemmatizedWords'] = df_widow['ocrText'].map(get_lemmatized_words)

In [19]:
df_widow.tail()[['ocrText', 'lemmatizedWords']]

Unnamed: 0,ocrText,lemmatizedWords
2244624,"R.\nWIDOW, &c.\n25381\nMaria Carson\nWid of\nJ...",r.||widow||c.||25381||maria||carson||wid||jame...
2244625,New York 2773\nMaria Carson\nwidow of James Ca...,new||york||2773||maria||carson||widow||james||...
2244626,State of New-York\nCITY AND COUNTY OF NEW-YORK...,state||new-york||city||county||new-york||s||be...
2244627,'-\n911\nMary Carson\n$120 per ann\nDavid Pye\...,'-||911||mary||carson||120||per||ann||david||p...
2244628,S\nli\nbefore\nState of New York\nRockland Cou...,li||state||new||york||rockland||county||⎬||s||...


In [20]:
# Persist the widow rows (all columns of df_widow at this point) to parquet using pyarrow.
df_widow.to_parquet('widow_ungrouped_with_lemmatizedWords.parquet', engine='pyarrow')

In [22]:
df_widow.columns

Index(['NAID', 'naraURL', 'title', 'logicalDate', 'variantControlNumbers',
       'pdfObjectID', 'pdfURL', 'pageObjectId', 'pageURL', 'pageImageType',
       'ocrID', 'ocrText', 'ocrUploadDate', 'ocrContributor',
       'transcriptionID', 'transcriptionText',
       'transcriptionContributionCount', 'transcriptionUserNames',
       'transcriptionDate', 'file_cat', 'lemmatizedWords'],
      dtype='object')

In [25]:
df_widow.iloc[0]['pageURL']

'https://s3.amazonaws.com/NARAprodstorage/lz/microfilm-publications/M804-RevolutionaryWarPensionAppFiles/0001/25/M804_2633/images/4177212_00545.jpg'

In [31]:
# Slim copy of df_widow holding only the identifier, link and text columns
# needed for the CSV export.
minimal_columns = ['NAID', 'naraURL', 'pageURL', 'transcriptionText', 'lemmatizedWords']
df_widow_minimal = df_widow.loc[:, minimal_columns].copy()

In [34]:
import re

def clean_text_for_csv(text):
    """Flatten ``text`` onto a single line with single-space separation.

    Missing values (per ``pd.isna``) and the empty string are handed back
    untouched. Anything else is converted with ``str()``; newlines, carriage
    returns, tabs, vertical tabs, form feeds and NUL characters become spaces,
    runs of spaces collapse to one, and surrounding whitespace is stripped.
    """
    is_blank = pd.isna(text) or text == ''
    if is_blank:
        return text

    as_text = str(text)

    # Line breaks, tabs and other control whitespace -> space; NUL -> space
    spaced = re.sub(r'[\n\r\t\x0b\x0c]', ' ', as_text).replace('\x00', ' ')

    # Squeeze runs of spaces down to one
    collapsed = re.sub(r' +', ' ', spaced)

    return collapsed.strip()

# Apply cleaning
df_widow_minimal['transcriptionText'] = df_widow_minimal['transcriptionText'].apply(clean_text_for_csv)

# Keep only rows that actually have transcription text. Nulls AND empty strings
# are dropped: cleaning strips whitespace-only transcriptions down to '', and a
# null-only filter would let those rows through.
cleaned_transcriptions = df_widow_minimal['transcriptionText']
has_transcription = cleaned_transcriptions.notna() & (cleaned_transcriptions != '')
df_widow_minimal = df_widow_minimal[has_transcription]

# When saving to CSV, pandas will automatically escape quotes with double quotes

In [35]:
# Write the minimal widow table to CSV.
df_widow_minimal.to_csv('widow_ungrouped_with_lemmatizedWords.csv')

## Process the full widows DataFrame to get the overall frequency distribution, bigrams and trigrams

In [121]:
# Build one corpus string from every non-null ocrText value in df_widow.
widow_ocr_texts = df_widow['ocrText'].dropna().astype(str)
all_text = " ".join(widow_ocr_texts)

# Normalise: "||" and newlines become spaces, then everything is lower-cased.
all_text_cleaned = all_text.replace("||", " ").replace("\n", " ").lower()

# Tokenize the whole corpus in one pass.
all_words = word_tokenize(all_text_cleaned)

# Drop stop words and punctuation.
all_meaningful_words = list(
    filter(lambda token: token.casefold() not in stop_words_with_punct, all_words)
)

# Reduce the surviving tokens to lemmas.
all_lemmatized_words = list(map(lemmatizer.lemmatize, all_meaningful_words))

# Word counts over the combined corpus.
combined_freq_dist = FreqDist(all_lemmatized_words)

# 20 most frequent bigrams (ranked by raw frequency).
bigram_finder = BigramCollocationFinder.from_words(all_lemmatized_words)
bigram_collocations = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 20)

# 20 most frequent trigrams (ranked by raw frequency).
trigram_finder = TrigramCollocationFinder.from_words(all_lemmatized_words)
trigram_collocations = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 20)

# Results
# print("Top 20 most common words (combined corpus):")
# print(combined_freq_dist.most_common(20))

In [122]:
# Results: most frequent lemmas in the combined widow corpus.
# The heading is built from the same number passed to most_common(), so the
# label cannot drift from the amount of data shown (it used to say "Top 20"
# while 40 entries were printed).
top_n_words = 40
print(f"Top {top_n_words} most common words (combined corpus):")
print(combined_freq_dist.most_common(top_n_words))

Top 20 most common words (combined corpus):
[('said', 1361693), ('county', 1154822), ('state', 970883), ('day', 852895), ('year', 710170), ('pension', 706477), ('court', 551557), ('service', 496139), ('act', 472814), ('widow', 425544), ('war', 390089), ('john', 383063), ('time', 370861), ('one', 359410), ('certify', 348172), ('new', 346566), ('--', 327790), ('sworn', 325878), ('march', 319647), ('served', 291514), ('clerk', 290833), ('justice', 290665), ('july', 272634), ('may', 270945), ('month', 267304), ('declaration', 266618), ('record', 265027), ('name', 264974), ('peace', 262056), ('certificate', 261764), ('office', 256602), ('made', 254589), ('revolutionary', 253770), ('1', 251145), ('aforesaid', 248774), ('subscribed', 239725), ('┃', 236500), ('june', 233587), ('york', 222730), ('company', 217343)]


In [None]:
# One bigram per line, words separated by a space and indented by two spaces.
print("\nTop 20 bigram collocations (combined corpus):")
for bigram in bigram_collocations:
    print("  " + " ".join(bigram))


In [102]:
# One trigram per line, words separated by a space and indented by two spaces.
print("\nTop 20 trigram collocations (combined corpus):")
for trigram in trigram_collocations:
    print("  " + " ".join(trigram))


Top 20 trigram collocations (combined corpus):
  State New York
  duly sworn according
  hereunto set hand
  whereof hereunto set
  sworn according law
  order obtain benefit
  testimony whereof hereunto
  first duly sworn
  Certificate Pension issued
  according law doth
  oath make following
  doth oath make
  4th day March
  per annum commence
  thousand eight hundred
  set hand affixed
  subscribed day year
  Cents per annum
  law doth oath
  Court Common Pleas


In [132]:
import json

# Collect the widow-corpus results in JSON-friendly form: the full
# (word, count) list plus each collocation tuple converted to a list.
results = dict(
    frequency_distribution=combined_freq_dist.most_common(),
    bigram_collocations=list(map(list, bigram_collocations)),
    trigram_collocations=list(map(list, trigram_collocations)),
)

# Write the results next to the notebook.
output_path = 'widow_full_text_analysis_results.json'
with open(output_path, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved to 'widow_full_text_analysis_results.json'")

Results saved to 'widow_full_text_analysis_results.json'


## Process all rows that have transcriptionText to get the frequency distribution, bigrams and trigrams, so the results can be compared with those from just the widows

In [123]:
# Build one corpus string from every non-null transcriptionText value in df.
transcription_texts = df['transcriptionText'].dropna().astype(str)
all_cats_text = " ".join(transcription_texts)

# Normalise: "||" and newlines become spaces, then everything is lower-cased.
all_cats_text_cleaned = all_cats_text.replace("||", " ").replace("\n", " ").lower()

# Tokenize the whole corpus in one pass.
all_cats_words = word_tokenize(all_cats_text_cleaned)

# Drop stop words and punctuation.
all_cats_meaningful_words = list(
    filter(lambda token: token.casefold() not in stop_words_with_punct, all_cats_words)
)

# Reduce the surviving tokens to lemmas.
all_cats_lemmatized_words = list(map(lemmatizer.lemmatize, all_cats_meaningful_words))

# Word counts over the combined transcription corpus.
combined_cats_freq_dist = FreqDist(all_cats_lemmatized_words)

# 20 most frequent bigrams (ranked by raw frequency).
bigram_cats_finder = BigramCollocationFinder.from_words(all_cats_lemmatized_words)
bigram_cats_collocations = bigram_cats_finder.nbest(BigramAssocMeasures.raw_freq, 20)

# 20 most frequent trigrams (ranked by raw frequency).
trigram_cats_finder = TrigramCollocationFinder.from_words(all_cats_lemmatized_words)
trigram_cats_collocations = trigram_cats_finder.nbest(TrigramAssocMeasures.raw_freq, 20)

In [124]:
print(combined_cats_freq_dist.most_common(40))

[('county', 658938), ('said', 641565), ('state', 608343), ('pension', 458630), ('day', 438489), ('service', 421794), ('year', 417024), ('court', 369497), ('war', 300698), ('illegible', 276604), ('--', 251536), ('john', 251028), ('act', 243030), ('served', 240179), ('time', 220085), ('one', 209059), ('month', 205624), ('revolutionary', 199902), ('sworn', 195004), ('name', 191195), ('new', 189591), ('certify', 185744), ('declaration', 177471), ('office', 176320), ('record', 172710), ('march', 171064), ('company', 164356), ("'s", 160403), ('clerk', 159214), ('certificate', 156325), ('claim', 156165), ('june', 155396), ('may', 155237), ('soldier', 151009), ('widow', 149523), ('justice', 147878), ('aforesaid', 146794), ('seal', 141979), ('made', 140151), ('``', 135929)]


In [125]:
print(bigram_cats_collocations)


[('--', '--'), ('revolutionary', 'war'), ('said', 'county'), ('justice', 'peace'), ('new', 'york'), ('united', 'state'), ('hereby', 'certify'), ('personally', 'appeared'), ('said', 'court'), ('state', 'new'), ('sworn', 'subscribed'), ('duly', 'sworn'), ('county', 'state'), ('act', 'congress'), ('testimony', 'whereof'), ('whereof', 'hereunto'), ('set', 'hand'), ('day', 'march'), ('according', 'law'), ('day', 'year')]


In [126]:
print(trigram_cats_collocations)

[('--', '--', '--'), ('testimony', 'whereof', 'hereunto'), ('state', 'new', 'york'), ('sworn', 'according', 'law'), ('hereunto', 'set', 'hand'), ('whereof', 'hereunto', 'set'), ('duly', 'sworn', 'according'), ('court', 'common', 'plea'), ('declaration', 'order', 'obtain'), ('sworn', 'subscribed', 'day'), ('subscribed', 'day', 'year'), ('according', 'law', 'doth'), ('order', 'obtain', 'benefit'), ('first', 'duly', 'sworn'), ('justice', 'peace', 'said'), ('thousand', 'eight', 'hundred'), ('law', 'doth', 'oath'), ('one', 'thousand', 'eight'), ('peace', 'said', 'county'), ('service', 'united', 'state')]


In [127]:
# Vocabulary of the 200 most frequent lemmas in each corpus.
# (The `top_20_*` names are kept because a later cell uses them; the cutoff
# applied here is 200, not 20.)
top_20_freq = {word for word, _ in combined_freq_dist.most_common(200)}
top_20_cats = {word for word, _ in combined_cats_freq_dist.most_common(200)}

# Lemmas in the top 200 of combined_freq_dist but not of combined_cats_freq_dist
unique_in_freq = top_20_freq.difference(top_20_cats)

# Lemmas in the top 200 of combined_cats_freq_dist but not of combined_freq_dist
unique_in_cats = top_20_cats.difference(top_20_freq)




In [137]:
display(combined_freq_dist.most_common(200))

[('said', 1361693),
 ('county', 1154822),
 ('state', 970883),
 ('day', 852895),
 ('year', 710170),
 ('pension', 706477),
 ('court', 551557),
 ('service', 496139),
 ('act', 472814),
 ('widow', 425544),
 ('war', 390089),
 ('john', 383063),
 ('time', 370861),
 ('one', 359410),
 ('certify', 348172),
 ('new', 346566),
 ('--', 327790),
 ('sworn', 325878),
 ('march', 319647),
 ('served', 291514),
 ('clerk', 290833),
 ('justice', 290665),
 ('july', 272634),
 ('may', 270945),
 ('month', 267304),
 ('declaration', 266618),
 ('record', 265027),
 ('name', 264974),
 ('peace', 262056),
 ('certificate', 261764),
 ('office', 256602),
 ('made', 254589),
 ('revolutionary', 253770),
 ('1', 251145),
 ('aforesaid', 248774),
 ('subscribed', 239725),
 ('┃', 236500),
 ('june', 233587),
 ('york', 222730),
 ('company', 217343),
 ('duly', 216278),
 ('town', 209265),
 ('hereby', 205810),
 ('claim', 203292),
 ('hand', 202696),
 ('oath', 201494),
 ('seal', 198274),
 ('first', 196439),
 ('person', 193727),
 ('law', 1

In [136]:
display(combined_cats_freq_dist.most_common(200))

[('county', 658938),
 ('said', 641565),
 ('state', 608343),
 ('pension', 458630),
 ('day', 438489),
 ('service', 421794),
 ('year', 417024),
 ('court', 369497),
 ('war', 300698),
 ('illegible', 276604),
 ('--', 251536),
 ('john', 251028),
 ('act', 243030),
 ('served', 240179),
 ('time', 220085),
 ('one', 209059),
 ('month', 205624),
 ('revolutionary', 199902),
 ('sworn', 195004),
 ('name', 191195),
 ('new', 189591),
 ('certify', 185744),
 ('declaration', 177471),
 ('office', 176320),
 ('record', 172710),
 ('march', 171064),
 ('company', 164356),
 ("'s", 160403),
 ('clerk', 159214),
 ('certificate', 156325),
 ('claim', 156165),
 ('june', 155396),
 ('may', 155237),
 ('soldier', 151009),
 ('widow', 149523),
 ('justice', 147878),
 ('aforesaid', 146794),
 ('seal', 141979),
 ('made', 140151),
 ('``', 135929),
 ('subscribed', 132531),
 ('hereby', 128739),
 ('peace', 126549),
 ('july', 124011),
 ('william', 123548),
 ('captain', 122730),
 ('person', 120407),
 ('officer', 117247),
 ('regiment',

In [135]:
# Alphabetical listing of the lemmas only combined_freq_dist has in its top 200,
# each with its count in that distribution.
print("Words in top 200 of combined_freq_dist but NOT in top 200 of combined_cats_freq_dist:")
for word, count in sorted((w, combined_freq_dist[w]) for w in unique_in_freq):
    print(f"  {word}: {count}")


Words in top 200 of combined_freq_dist but NOT in top 200 of combined_cats_freq_dist:
  1838: 114829
  1843: 79295
  a.d.: 74193
  annexed: 74625
  annum: 100018
  benefit: 80057
  c: 74065
  connecticut: 88931
  deceased: 73932
  elizabeth: 80697
  father: 79345
  full: 75075
  known: 77815
  late: 84758
  man: 78344
  massachusetts: 108438
  mr: 120530
  part: 76252
  rate: 99504
  six: 76078
  smith: 79034
  son: 104171
  thousand: 77869
  twenty: 85551
  w: 75963
  ⎬: 93569
  ┃: 236500
  ●: 74356


In [134]:
# Alphabetical listing of the lemmas only combined_cats_freq_dist has in its
# top 200, each with its count in that distribution.
print("\nWords in top 200 of combined_cats_freq_dist but NOT in top 200 of combined_freq_dist:")
for word, count in sorted((w, combined_cats_freq_dist[w]) for w in unique_in_cats):
    print(f"  {word}: {count}")


Words in top 200 of combined_cats_freq_dist but NOT in top 200 of combined_freq_dist:
  1818: 49116
  appears: 50083
  battle: 59644
  blank: 51523
  carolina: 79055
  col.: 53332
  colonel: 65700
  district: 58060
  esq: 48932
  every: 46474
  file: 62270
  general: 67435
  illegible: 276604
  left: 46515
  living: 46695
  marched: 45011
  militia: 66239
  mrs.: 47700
  must: 63098
  north: 52109
  opinion: 51572
  pennsylvania: 45765
  period: 50533
  rev: 46437
  signed: 75328
  term: 52704
  troop: 56047
  vermont: 45148


In [129]:
# Sizes of the two one-sided differences and of the overlap between the top-word sets.
summary_lines = [
    f"\nSummary:",
    f"  Unique in combined_freq_dist: {len(unique_in_freq)} words",
    f"  Unique in combined_cats_freq_dist: {len(unique_in_cats)} words",
    f"  Common words: {len(top_20_freq & top_20_cats)} words",
]
for line in summary_lines:
    print(line)


Summary:
  Unique in combined_freq_dist: 28 words
  Unique in combined_cats_freq_dist: 28 words
  Common words: 172 words


In [131]:
import json

# Collect the transcription-corpus results in JSON-friendly form: the full
# (word, count) list plus each collocation tuple converted to a list.
cats_results = dict(
    frequency_distribution=combined_cats_freq_dist.most_common(),
    bigram_collocations=list(map(list, bigram_cats_collocations)),
    trigram_collocations=list(map(list, trigram_cats_collocations)),
)

# Write the results next to the notebook.
cats_output_path = 'cats_text_analysis_results.json'
with open(cats_output_path, 'w') as f:
    json.dump(cats_results, f, indent=2)

print("Results saved to 'cats_text_analysis_results.json'")

Results saved to 'cats_text_analysis_results.json'
