# Get frequency count for widows text

In [4]:
# !pip install nltk==3.5

In [3]:
import re
import pandas as pd
import nltk

In [None]:
# --- NLTK helpers used throughout the notebook ---
import string

from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# --- Fetch the NLTK resources this notebook asks for (same order as before) ---
for resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(resource)

# --- Shared objects for the cleaning steps ---
# English stop words plus every character of string.punctuation; tokens are
# compared against this set to decide whether they are dropped.
stop_words = set(stopwords.words("english"))
punctuation = set(string.punctuation)
stop_words_with_punct = stop_words | punctuation

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliviakasmin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oliviakasmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oliviakasmin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Load the source table from a local parquet file; later cells read its
# 'file_cat', 'transcriptionText' and 'ocrText' columns.
df = pd.read_parquet('df_grouped_NAID_sorted_title_with_file_cat.parquet')

In [14]:
# Keep only rows whose file category contains "widow" (any letter case);
# rows with a missing file_cat count as non-matches (na=False).
widow_mask = df['file_cat'].str.contains('widow', case=False, na=False)
df_widow = df[widow_mask]

In [None]:
# (rows, columns) of the widow subset; re-enabled so the output shown for
# this cell is reproduced when the notebook is re-run.
df_widow.shape

(25517, 22)

In [8]:
# Share of widow-file rows that carry a non-empty transcription.
# Both tests are evaluated on df_widow itself, so the mask lines up
# row-for-row with the frame being counted instead of depending on index
# alignment between df and df_widow.
widow_transcriptions = df_widow['transcriptionText']
has_transcription = widow_transcriptions.notna() & (widow_transcriptions != '')
rows_with_transcription = has_transcription.sum()
total_rows = len(df_widow)
# Avoid dividing by zero when the widow filter matches no rows.
percentage = (rows_with_transcription / total_rows) * 100 if total_rows else 0.0

print(f"Total rows: {total_rows}")
print(f"Rows with transcriptionText: {rows_with_transcription}")
print(f"Percentage: {percentage:.2f}%")

Total rows: 25517
Rows with transcriptionText: 6756
Percentage: 26.48%


### Clean ocrText

1. split on "||"
2. tokenize words
3. filter to remove stop words
4. lemmatize words to get more consistent results 

In [None]:
# Take the OCR text of the first widow row as a worked example.
sample = df_widow['ocrText'].iloc[0]

# Replace the "||" separators and line breaks with spaces.
sample_str = sample.replace("||", " ")
sample_str = sample_str.replace("\n", " ")

In [None]:
# First tokenize the text into words
words = word_tokenize(sample_str)

# Then filter out stop words
meaningful_words = [word for word in words if word.casefold() not in stop_words_with_punct]

# Then lemmatize
lemmatized_words = [lemmatizer.lemmatize(word) for word in meaningful_words]

# Then get frequency distribution
frequency_distribution = FreqDist(lemmatized_words)

In [54]:
# Display the word counts computed for the sample document.
frequency_distribution

FreqDist({'Livingston': 94, 'York': 34, 'Henry': 33, 'Jane': 31, 'New': 30, 'said': 28, 'day': 25, '1840': 19, 'Major': 18, 'County': 18, ...})

Collocations

In [71]:
# Wrap the lemmatized sample tokens in an nltk.Text and list its collocations.
new_text = nltk.Text(lemmatized_words)
new_text.collocations()

New York; Smith Thompson; Henry Livingston; day March; Commissioner
Pensions; United Colonies; duly sworn; hand seal; Know men; Notary
Public; Livingston decd; Livingston Junior; pension allowed; State
New; constitute appoint; men present; Henry Gilbert; Robert Gilbert;
April 1840; Edwards Esq


In [None]:
def analyze_ocr_text(ocrText):
    """Return the word-frequency distribution of a single OCR text.

    The text is cleaned ("||" and newlines become spaces), tokenized with
    ``word_tokenize``, stripped of tokens found in ``stop_words_with_punct``
    (compared via ``casefold``), and lemmatized before counting. Letter case
    of the surviving tokens is left as-is.

    Parameters
    ----------
    ocrText : str
        Raw OCR text of one record. Any non-string value (e.g. ``None`` or
        ``NaN`` for a missing text) is treated as an empty document.

    Returns
    -------
    FreqDist
        Counts of each lemmatized token.
    """
    # Missing text has nothing to count; return an empty distribution
    # instead of failing on ``.replace``.
    if not isinstance(ocrText, str):
        return FreqDist()

    # Local name chosen so the builtin ``str`` is not shadowed.
    cleaned_text = ocrText.replace("||", " ").replace("\n", " ")

    # First tokenize the text into words
    words = word_tokenize(cleaned_text)

    # Then filter out stop words and punctuation tokens
    meaningful_words = [word for word in words if word.casefold() not in stop_words_with_punct]

    # Then lemmatize
    lemmatized_words = [lemmatizer.lemmatize(word) for word in meaningful_words]

    # Then get frequency distribution
    return FreqDist(lemmatized_words)

In [67]:
# Run the helper on the single sample document.
analyze = analyze_ocr_text(sample)

[FreqDist({'Livingston': 94, 'York': 34, 'Henry': 33, 'Jane': 31, 'New': 30, 'said': 28, 'day': 25, '1840': 19, 'Major': 18, 'County': 18, ...})]

In [100]:
def analyze_ocr_text(ocrText, top_n=20):
    """Compute word frequencies and n-gram collocations for one OCR text.

    The text is cleaned ("||" and newlines become spaces), tokenized with
    ``word_tokenize``, stripped of tokens found in ``stop_words_with_punct``
    (compared via ``casefold``), and lemmatized. Letter case of the surviving
    tokens is left as-is.

    Parameters
    ----------
    ocrText : str
        Raw OCR text of one record. Any non-string value (e.g. ``None`` or
        ``NaN`` for a missing text) is treated as an empty document.
    top_n : int, optional
        How many bigrams and trigrams to return, ranked by raw frequency.
        Defaults to 20.

    Returns
    -------
    dict
        ``'frequency_distribution'``: ``FreqDist`` of the lemmatized tokens.
        ``'frequency_dict'``: plain dict of the same counts, highest first.
        ``'bigram_collocations'``: list of 2-tuples of tokens.
        ``'trigram_collocations'``: list of 3-tuples of tokens.
    """
    # Missing text is analyzed as an empty document rather than raising.
    if not isinstance(ocrText, str):
        ocrText = ""

    # Clean the text
    text_str = ocrText.replace("||", " ").replace("\n", " ")

    # First tokenize the text into words
    words = word_tokenize(text_str)

    # Then filter out stop words and punctuation
    meaningful_words = [word for word in words if word.casefold() not in stop_words_with_punct]

    # Then lemmatize
    lemmatized_words = [lemmatizer.lemmatize(word) for word in meaningful_words]

    # Get frequency distribution
    frequency_distribution = FreqDist(lemmatized_words)

    # Get bigram collocations (callers read these keys from the result,
    # so they must always be present)
    bigram_finder = BigramCollocationFinder.from_words(lemmatized_words)
    bigram_collocations = bigram_finder.nbest(BigramAssocMeasures.raw_freq, top_n)

    # Get trigram collocations
    trigram_finder = TrigramCollocationFinder.from_words(lemmatized_words)
    trigram_collocations = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, top_n)

    # Plain dict of the counts, ordered from most to least frequent
    sorted_freq_dict = dict(
        sorted(frequency_distribution.items(), key=lambda item: item[1], reverse=True)
    )

    return {
        'frequency_distribution': frequency_distribution,
        'frequency_dict': sorted_freq_dict,
        'bigram_collocations': bigram_collocations,
        'trigram_collocations': trigram_collocations,
    }

In [97]:
# Analyze the sample document and keep the returned dict for inspection.
result = analyze_ocr_text(sample)

In [99]:
# NOTE(review): this lookup raises KeyError unless the dict returned by
# analyze_ocr_text contains a 'trigram_collocations' entry — confirm the
# function definition in use provides it.
result['trigram_collocations']

[('Edwin', 'G.', 'Livingston'),
 ('Henry', 'Livingston', 'Junior'),
 ('Jane', 'M.', 'Livingston'),
 ('State', 'New', 'York'),
 ('Charles', 'P.', 'Livingston'),
 ('Helen', 'P.', 'Livingston'),
 ('Henry', 'Gilbert', 'Livingston'),
 ('Livingston', 'Helen', 'P.'),
 ('Major', 'Henry', 'Livingston'),
 ('Susan', 'C.', 'Livingston'),
 ('widow', 'Henry', 'Livingston'),
 ('Army', 'United', 'Colonies'),
 ('County', 'New', 'York'),
 ('Given', 'hand', 'seal'),
 ('Henry', 'Livingston', 'decd'),
 ('James', 'L.', 'Edwards'),
 ('Jane', 'Livingston', 'widow'),
 ('Know', 'men', 'present'),
 ('Livingston', 'New', 'York'),
 ('Livingston', 'deceased', 'widow')]

## Process the whole widows df to get the overall frequency distribution, bigrams and trigrams

In [121]:
# Gather every non-missing ocrText of the widow rows into one string
ocr_documents = df_widow['ocrText'].dropna().astype(str)
all_text = " ".join(ocr_documents)

# Normalise separators, line breaks and letter case
all_text_cleaned = all_text.replace("||", " ")
all_text_cleaned = all_text_cleaned.replace("\n", " ")
all_text_cleaned = all_text_cleaned.lower()

# Split the whole corpus into word tokens
all_words = word_tokenize(all_text_cleaned)

# Keep only tokens that are neither stop words nor punctuation
all_meaningful_words = [
    token for token in all_words
    if token.casefold() not in stop_words_with_punct
]

# Reduce each token to its lemma
all_lemmatized_words = list(map(lemmatizer.lemmatize, all_meaningful_words))

# Word counts over the combined corpus
combined_freq_dist = FreqDist(all_lemmatized_words)

# 20 best bigrams, ranked by raw frequency
bigram_finder = BigramCollocationFinder.from_words(all_lemmatized_words)
bigram_collocations = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 20)

# 20 best trigrams, ranked by raw frequency
trigram_finder = TrigramCollocationFinder.from_words(all_lemmatized_words)
trigram_collocations = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 20)

In [122]:
# Results
# One shared value keeps the heading and the number of printed words in sync.
n_top_words = 40
print(f"Top {n_top_words} most common words (combined corpus):")
print(combined_freq_dist.most_common(n_top_words))

Top 20 most common words (combined corpus):
[('said', 1361693), ('county', 1154822), ('state', 970883), ('day', 852895), ('year', 710170), ('pension', 706477), ('court', 551557), ('service', 496139), ('act', 472814), ('widow', 425544), ('war', 390089), ('john', 383063), ('time', 370861), ('one', 359410), ('certify', 348172), ('new', 346566), ('--', 327790), ('sworn', 325878), ('march', 319647), ('served', 291514), ('clerk', 290833), ('justice', 290665), ('july', 272634), ('may', 270945), ('month', 267304), ('declaration', 266618), ('record', 265027), ('name', 264974), ('peace', 262056), ('certificate', 261764), ('office', 256602), ('made', 254589), ('revolutionary', 253770), ('1', 251145), ('aforesaid', 248774), ('subscribed', 239725), ('┃', 236500), ('june', 233587), ('york', 222730), ('company', 217343)]


In [None]:
# List each selected bigram of the widow corpus on its own indented line
print("\nTop 20 bigram collocations (combined corpus):")
for bigram in bigram_collocations:
    first, second = bigram
    print(f"  {first} {second}")


In [102]:
# List each selected trigram of the widow corpus on its own indented line
print("\nTop 20 trigram collocations (combined corpus):")
for trigram in trigram_collocations:
    first, second, third = trigram
    print(f"  {first} {second} {third}")


Top 20 trigram collocations (combined corpus):
  State New York
  duly sworn according
  hereunto set hand
  whereof hereunto set
  sworn according law
  order obtain benefit
  testimony whereof hereunto
  first duly sworn
  Certificate Pension issued
  according law doth
  oath make following
  doth oath make
  4th day March
  per annum commence
  thousand eight hundred
  set hand affixed
  subscribed day year
  Cents per annum
  law doth oath
  Court Common Pleas


In [132]:
import json

# Build a JSON-friendly payload: the full (word, count) ranking plus the
# n-gram tuples converted to lists
results = dict(
    frequency_distribution=combined_freq_dist.most_common(),
    bigram_collocations=list(map(list, bigram_collocations)),
    trigram_collocations=list(map(list, trigram_collocations)),
)

# Write the payload next to the notebook
output_path = 'widow_full_text_analysis_results.json'
with open(output_path, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved to 'widow_full_text_analysis_results.json'")

Results saved to 'widow_full_text_analysis_results.json'


## Process all rows that have transcriptionText to get the frequency distribution, bigrams and trigrams, so the results can be compared with those from the widow files alone

In [123]:
# Gather every non-missing transcriptionText of the full df (all file
# categories) into one string
# NOTE(review): the widow analysis is built from df_widow['ocrText'] while
# this one uses df['transcriptionText'] — confirm that comparing the two
# different text sources is intended.
transcription_documents = df['transcriptionText'].dropna().astype(str)
all_cats_text = " ".join(transcription_documents)

# Normalise separators, line breaks and letter case
all_cats_text_cleaned = all_cats_text.replace("||", " ")
all_cats_text_cleaned = all_cats_text_cleaned.replace("\n", " ")
all_cats_text_cleaned = all_cats_text_cleaned.lower()

# Split the whole corpus into word tokens
all_cats_words = word_tokenize(all_cats_text_cleaned)

# Keep only tokens that are neither stop words nor punctuation
all_cats_meaningful_words = [
    token for token in all_cats_words
    if token.casefold() not in stop_words_with_punct
]

# Reduce each token to its lemma
all_cats_lemmatized_words = list(map(lemmatizer.lemmatize, all_cats_meaningful_words))

# Word counts over the combined corpus
combined_cats_freq_dist = FreqDist(all_cats_lemmatized_words)

# 20 best bigrams, ranked by raw frequency
bigram_cats_finder = BigramCollocationFinder.from_words(all_cats_lemmatized_words)
bigram_cats_collocations = bigram_cats_finder.nbest(BigramAssocMeasures.raw_freq, 20)

# 20 best trigrams, ranked by raw frequency
trigram_cats_finder = TrigramCollocationFinder.from_words(all_cats_lemmatized_words)
trigram_cats_collocations = trigram_cats_finder.nbest(TrigramAssocMeasures.raw_freq, 20)

In [124]:
# Show the 40 most frequent tokens counted from the transcriptionText corpus.
print(combined_cats_freq_dist.most_common(40))

[('county', 658938), ('said', 641565), ('state', 608343), ('pension', 458630), ('day', 438489), ('service', 421794), ('year', 417024), ('court', 369497), ('war', 300698), ('illegible', 276604), ('--', 251536), ('john', 251028), ('act', 243030), ('served', 240179), ('time', 220085), ('one', 209059), ('month', 205624), ('revolutionary', 199902), ('sworn', 195004), ('name', 191195), ('new', 189591), ('certify', 185744), ('declaration', 177471), ('office', 176320), ('record', 172710), ('march', 171064), ('company', 164356), ("'s", 160403), ('clerk', 159214), ('certificate', 156325), ('claim', 156165), ('june', 155396), ('may', 155237), ('soldier', 151009), ('widow', 149523), ('justice', 147878), ('aforesaid', 146794), ('seal', 141979), ('made', 140151), ('``', 135929)]


In [125]:
# Show the bigrams selected from the transcriptionText corpus.
print(bigram_cats_collocations)


[('--', '--'), ('revolutionary', 'war'), ('said', 'county'), ('justice', 'peace'), ('new', 'york'), ('united', 'state'), ('hereby', 'certify'), ('personally', 'appeared'), ('said', 'court'), ('state', 'new'), ('sworn', 'subscribed'), ('duly', 'sworn'), ('county', 'state'), ('act', 'congress'), ('testimony', 'whereof'), ('whereof', 'hereunto'), ('set', 'hand'), ('day', 'march'), ('according', 'law'), ('day', 'year')]


In [126]:
# Show the trigrams selected from the transcriptionText corpus.
print(trigram_cats_collocations)

[('--', '--', '--'), ('testimony', 'whereof', 'hereunto'), ('state', 'new', 'york'), ('sworn', 'according', 'law'), ('hereunto', 'set', 'hand'), ('whereof', 'hereunto', 'set'), ('duly', 'sworn', 'according'), ('court', 'common', 'plea'), ('declaration', 'order', 'obtain'), ('sworn', 'subscribed', 'day'), ('subscribed', 'day', 'year'), ('according', 'law', 'doth'), ('order', 'obtain', 'benefit'), ('first', 'duly', 'sworn'), ('justice', 'peace', 'said'), ('thousand', 'eight', 'hundred'), ('law', 'doth', 'oath'), ('one', 'thousand', 'eight'), ('peace', 'said', 'county'), ('service', 'united', 'state')]


In [127]:
# Get top 20 words from each frequency distribution
top_20_freq = set([word for word, count in combined_freq_dist.most_common(200)])
top_20_cats = set([word for word, count in combined_cats_freq_dist.most_common(200)])

# Find unique words in top 20 of combined_freq_dist (not in top 20 of combined_cats_freq_dist)
unique_in_freq = top_20_freq - top_20_cats

# Find unique words in top 20 of combined_cats_freq_dist (not in top 20 of combined_freq_dist)
unique_in_cats = top_20_cats - top_20_freq




In [128]:
# Display results
# Both compared sets are built from most_common(200) of their distribution,
# so the heading names that cut-off on both sides.
print("Words in top 200 of combined_freq_dist but NOT in top 200 of combined_cats_freq_dist:")
for word in sorted(unique_in_freq):
    count = combined_freq_dist[word]
    print(f"  {word}: {count}")


Words in top 20 of combined_freq_dist but NOT in top 100 of combined_cats_freq_dist:
  1838: 114829
  1843: 79295
  a.d.: 74193
  annexed: 74625
  annum: 100018
  benefit: 80057
  c: 74065
  connecticut: 88931
  deceased: 73932
  elizabeth: 80697
  father: 79345
  full: 75075
  known: 77815
  late: 84758
  man: 78344
  massachusetts: 108438
  mr: 120530
  part: 76252
  rate: 99504
  six: 76078
  smith: 79034
  son: 104171
  thousand: 77869
  twenty: 85551
  w: 75963
  ⎬: 93569
  ┃: 236500
  ●: 74356


In [119]:
# Both compared sets are built from most_common(200) of their distribution,
# so the heading names that cut-off on both sides.
print("\nWords in top 200 of combined_cats_freq_dist but NOT in top 200 of combined_freq_dist:")
for word in sorted(unique_in_cats):
    count = combined_cats_freq_dist[word]
    print(f"  {word}: {count}")


Words in top 20 of combined_cats_freq_dist but NOT in top 100 of combined_freq_dist:
  Carolina: 78372
  Col.: 51666
  Colonel: 58035
  District: 48412
  General: 48735
  Henry: 43931
  L.: 44364
  Mrs.: 45556
  North: 49761
  Pennsylvania: 45392
  Rev: 45492
  appears: 49837
  blank: 48161
  called: 44073
  county: 94499
  court: 65679
  family: 44426
  given: 49537
  illegible: 262961
  letter: 58848
  living: 45816
  must: 62918
  opinion: 50963
  period: 44600
  regiment: 46142
  signed: 64555
  statement: 51350
  term: 47107


In [129]:
# Summarise how much the two top-word sets overlap
shared_words = top_20_freq & top_20_cats
print("\nSummary:")
print(f"  Unique in combined_freq_dist: {len(unique_in_freq)} words")
print(f"  Unique in combined_cats_freq_dist: {len(unique_in_cats)} words")
print(f"  Common words: {len(shared_words)} words")


Summary:
  Unique in combined_freq_dist: 28 words
  Unique in combined_cats_freq_dist: 28 words
  Common words: 172 words


In [131]:
import json

# Build a JSON-friendly payload: the full (word, count) ranking plus the
# n-gram tuples converted to lists
cats_results = dict(
    frequency_distribution=combined_cats_freq_dist.most_common(),
    bigram_collocations=list(map(list, bigram_cats_collocations)),
    trigram_collocations=list(map(list, trigram_cats_collocations)),
)

# Write the payload next to the notebook
cats_output_path = 'cats_text_analysis_results.json'
with open(cats_output_path, 'w') as f:
    json.dump(cats_results, f, indent=2)

print("Results saved to 'cats_text_analysis_results.json'")

Results saved to 'cats_text_analysis_results.json'
