## Topic Model Letter Prep

## Resources

In [197]:
# Packages
import pandas as pd
import gensim
import numpy as np
from gensim.utils import simple_preprocess
import spacy
import pickle
from collections import Counter

In [198]:
# Functions for stopwords, bigrams, trigrams, lemmatization, conversion of texts into list of lowercase tokens

def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and tokenized with gensim's
    ``simple_preprocess`` (default settings) before filtering. In this
    notebook the documents are already token lists, so it is their string
    representation that gets re-tokenized.

    The module-level ``stop_words`` list is read at call time, so edits made
    to that list between calls are honoured.

    Returns a list with one list of kept tokens per input document.
    """
    # Snapshot the stop list as a set once per call: testing membership
    # against the list itself is linear in its length for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to each document.

    Returns a list holding, for every document in ``texts`` (in order),
    the result of ``bigram_mod[doc]``.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document.

    Both models are looked up as module-level names at call time.

    NOTE(review): the only visible line that builds ``trigram_mod`` is
    commented out, so calling this would raise NameError until that model
    is created again.

    Returns a list with one transformed document per input document.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

def lemmatization(texts, allowed_postags=['NOUN', 'VERB']):
    """Lemmatize documents, keeping only tokens with the requested POS tags.

    Every document (an iterable of token strings) is joined with single
    spaces and run through the module-level spaCy pipeline ``nlp``. A
    token's lemma is kept when its part-of-speech tag (``token.pos_``) is
    listed in ``allowed_postags``.

    Tag reference: https://spacy.io/api/annotation

    Returns a list with one list of lemmas per input document.
    """
    def _kept_lemmas(tokens):
        # Re-assemble the tokens into one string so spaCy can tag it.
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]

def text_to_words(texts):
    """Lazily tokenize each text with gensim's ``simple_preprocess``.

    Every item of ``texts`` is converted with ``str()`` first, so non-string
    values are tokenized from their string form. Yields one list of tokens
    per input text, in order.
    """
    for raw in texts:
        tokens = gensim.utils.simple_preprocess(
            str(raw),
            deacc=True,  # removes accents
            min_len=3,   # removes tokens shorter than three characters
        )
        yield tokens

In [199]:
# Load the spaCy English pipeline used by lemmatization() via the global `nlp`.
# The model must be installed first:
#   python3 -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

In [200]:
# 19th century stopwords: take the 'word' column of the CSV as a list of plain strings
stop_words = (
    pd.read_csv("Jockers_19thCenturyStops.csv")['word']
    .values.astype(str)
    .tolist()
)
len(stop_words)

5631

In [201]:
# Narratives: load the letters and rename the 'Unnamed: 0' column to "docID-AT"
df = (
    pd.read_csv("20240608_PhD_Data4TopicModel-Letter.csv")
    .rename(columns={'Unnamed: 0': 'docID-AT'})
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          676 non-null    int64  
 1   docid             676 non-null    object 
 2   docyear           676 non-null    int64  
 3   docmonth          669 non-null    float64
 4   authorName        623 non-null    object 
 5   docauthorid       676 non-null    object 
 6   authorLocation    676 non-null    object 
 7   authorGender      676 non-null    object 
 8   nationalOrigin    676 non-null    object 
 9   irish             676 non-null    bool   
 10  otherUK           676 non-null    bool   
 11  relMin            339 non-null    object 
 12  catholic          339 non-null    object 
 13  otherChristian    339 non-null    object 
 14  U                 378 non-null    object 
 15  M                 387 non-null    object 
 16  S                 376 non-null    object 
 1

Code below adapted from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

## Prepare narratives

In [202]:
# Pull the 'text' column out of the frame as a plain Python list (one entry per letter)
data = df['text'].tolist()

In [203]:
# Tokenize every letter; list() drains the text_to_words generator so the
# token lists can be reused by the cells below.
data_words = list(text_to_words(data))

In [204]:
#data_words[0]

## Bigram (and trigram) models

In [205]:
# Build the bigram model from the tokenized letters (the trigram model is disabled).
# Only min_count is set here; no threshold argument is passed.
# NOTE(review): the narrative under "Bigrams & Lemmatization" says min=25,
# while this call uses min_count=24 — confirm which value is intended.
bigram = gensim.models.Phrases(data_words, min_count=24)
#trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
# (bigram_mod is the object make_bigrams() indexes into)
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

In [206]:
# See results of trigrams function

#trigrams1 = [] # Create list
#for item in data_words: # For each item (i.e., chunk) in data words
    #trigrams1.append([b for b in trigram[item] if b.count('_') == 1]) # add 2-grams to new list
#trigrams1 = list(np.concatenate(trigrams1)) # flatten list
#trigrams1 = list(dict.fromkeys(trigrams1)) # take unique values
#print(sorted(trigrams1)) # print in alphabetical order

#trigrams2 = [] # Create list
#for item in data_words: # For each item (i.e., chunk) in data words
    #trigrams2.append([b for b in trigram[item] if b.count('_') == 2]) # add 3-grams to new list
#trigrams2 = list(np.concatenate(trigrams2)) # flatten list
#trigrams2 = list(dict.fromkeys(trigrams2)) # take unique values
#print(sorted(trigrams2)) # print in alphabetical order

## Stop Words

In [207]:
# Remove stop words (first pass, using the stop list as loaded from the CSV)
data_words_nostops = remove_stopwords(data_words)

In [208]:
# What are the most common words? Tally every remaining token across all letters.
counts = Counter(word for tokens in data_words_nostops for word in tokens)
counts.most_common(1000)

[('dear', 1592),
 ('time', 1061),
 ('letter', 955),
 ('little', 908),
 ('day', 771),
 ('country', 734),
 ('write', 655),
 ('old', 579),
 ('hear', 549),
 ('give', 517),
 ('send', 469),
 ('place', 456),
 ('kind', 454),
 ('work', 454),
 ('week', 446),
 ('house', 446),
 ('take', 434),
 ('children', 427),
 ('came', 419),
 ('years', 410),
 ('land', 408),
 ('wish', 406),
 ('quite', 403),
 ('critchlow', 394),
 ('days', 390),
 ('family', 382),
 ('mother', 378),
 ('poor', 375),
 ('going', 370),
 ('year', 366),
 ('feel', 362),
 ('people', 359),
 ('health', 348),
 ('brother', 345),
 ('friends', 337),
 ('father', 337),
 ('find', 332),
 ('present', 316),
 ('god', 315),
 ('sister', 314),
 ('money', 304),
 ('sent', 303),
 ('received', 294),
 ('pounds', 287),
 ('glad', 280),
 ('friend', 280),
 ('dollars', 272),
 ('letters', 270),
 ('able', 269),
 ('life', 269),
 ('heard', 263),
 ('night', 262),
 ('wrote', 260),
 ('fine', 256),
 ('canada', 254),
 ('happy', 252),
 ('saw', 247),
 ('having', 242),
 ('says'

In [210]:
# Number of distinct tokens in the tally
len(counts)

15681

In [211]:
# Corpus-specific additions to the stop list: mostly proper names (people and
# places), month names, and conventional letter openings/closings.
# Entries that are commented out were considered but deliberately left in the texts.
new_items = ['critchlow',
             'moodie',
             'albuquerque',
             'ellin',  # was listed twice; one copy is enough
             'belleville',
             'toronto',
             'montreal',
             'bentley',
             'york',
             'london',
             'quebec',
             'hamilton',
             'carrothers',
             'blandina',
             'cincinnati',
             'california',
             'gasparri',
             'washington',
             'canada',
             'states', # in this corpus, refers to United States (leaving singular)
             'england',
             'ireland',
             'mexico',
             'scotland',
             'january',
             'february',
             'march',
             'april',
             'may',
             'june',
             'july',
             'august',
             'september',
             'october',
             'november',
             'december',
             'boston',
             'europe',
             #'monday',
             #'tuesday',
             #'wednesday',
             #'thursday',
             #'friday',
             #'saturday',
             #'sunday',
             # In this corpus, dear, regard, regards are conventional opening and closing language.
             # 'regard' used to sit at the end of the comment on the 'dear' line, so it was
             # never actually part of the list; it is now a real entry.
             'dear',
             'regard',
             'regards',
             #'mother', # not including plural forms so the base forms of the items below
             #'father', # still exist in the final texts
             #'brother',
             #'sister',
             #'uncle',
             #'aunt',
             #'son',
             #'daughter',
             #'friend',
             #'sir',
             #'letter', # allowing the plural form to capture to correspondence more generally
             #'time', # high frequency
             #'times',
             #'day', # high frequency
             #'days',
             #'week',
             #'weeks',
             #'month',
             #'months',
             #'year',
             #'years',
             'parr',
             'traill',
             'cumming',
             'philadelphia',
             'petersburg',
             'peterboro',
             'ohio',
             'orlebar',
             'haszard',
             'united', # in this corpus, frequently refers to united states
             'davies',
             'lamy',
             'forsyth',
             'chavez',
             'vickers',
             'liverpool',
             'albany',
             'weir',
             'baltimore',
             'dunbar',
             'campbell',
             'thorndike',
             'birmingham',
             'ontario',
             'columbia'
            ]

# Appends in place: re-running this cell without reloading stop_words adds the items again.
stop_words.extend(new_items)
len(stop_words)

5697

I noticed that *winter* was dropped as a stopword. Checking the Jockers list, I see that the other seasons are there too. I want to keep the seasons in the texts, as they are potentially associated with key topics. Also removing *here*, *there* and *home* from the stopword list. I checked the whole list and added quite a few more words that seem inappropriate as stopwords for this study.

In [212]:
# Words to take back OUT of the stop list so that they stay in the texts.
# NOTE(review): the narrative above also mentions 'here' and 'there', but
# neither is in this list — confirm whether they were meant to be restored.
x = [
    'winter', 'spring', 'summer', 'autumn',
    'home', 'fairy', 'faith', 'forest',
    'brook', 'gala', 'gay', 'hunter',
    'ivy', 'jewel',
    #'love', keeping out love because of it being a sign-off convention
    'maple', 'pages', 'page', 'research',
    'son', 'star', 'stormy', 'sun',
    'sunny', 'sunshine', 'temple', 'together',
    'velvet', 'blossom',
]
# Rebuild the stop list without the restored words
stop_words = [word for word in stop_words if word not in x]
len(stop_words)

5669

In [213]:
# Put the stop list in alphabetical order
stop_words = sorted(stop_words)

In [214]:
# Re-remove stop words, this time with the revised stop list
data_words_nostops = remove_stopwords(data_words)

# What are the most common words?
counts = Counter(word for tokens in data_words_nostops for word in tokens)
counts.most_common(1000)

[('time', 1061),
 ('letter', 955),
 ('little', 908),
 ('day', 771),
 ('country', 734),
 ('home', 695),
 ('write', 655),
 ('old', 579),
 ('hear', 549),
 ('give', 517),
 ('send', 469),
 ('place', 456),
 ('kind', 454),
 ('work', 454),
 ('week', 446),
 ('house', 446),
 ('take', 434),
 ('children', 427),
 ('came', 419),
 ('years', 410),
 ('land', 408),
 ('wish', 406),
 ('quite', 403),
 ('days', 390),
 ('family', 382),
 ('mother', 378),
 ('poor', 375),
 ('going', 370),
 ('year', 366),
 ('feel', 362),
 ('people', 359),
 ('health', 348),
 ('brother', 345),
 ('friends', 337),
 ('father', 337),
 ('find', 332),
 ('winter', 330),
 ('present', 316),
 ('god', 315),
 ('sister', 314),
 ('money', 304),
 ('sent', 303),
 ('received', 294),
 ('pounds', 287),
 ('glad', 280),
 ('friend', 280),
 ('dollars', 272),
 ('letters', 270),
 ('able', 269),
 ('life', 269),
 ('summer', 267),
 ('heard', 263),
 ('night', 262),
 ('wrote', 260),
 ('fine', 256),
 ('happy', 252),
 ('saw', 247),
 ('having', 242),
 ('says', 24

In [215]:
# Number of distinct tokens in the tally
len(counts)

15654

## Bigrams & Lemmatization

In [217]:
# See results of bigrams function: collect every distinct token with exactly
# one underscore that the bigram model yields on the stop-word-filtered letters.
# The flattening is done in plain Python. np.concatenate on these ragged
# lists turns letters without any bigram into empty float arrays and
# returns numpy string scalars instead of Python strings.
bigrams = list(dict.fromkeys(          # dict.fromkeys keeps first-seen order, drops repeats
    token
    for item in data_words_nostops     # each item is one letter's tokens
    for token in bigram[item]
    if token.count('_') == 1           # keep 2-grams only
))
print(sorted(bigrams)) # print in alphabetical order

['days_ago', 'glad_hear', 'god_bless', 'indian_corn', 'short_time', 'sorry_hear', 'thank_god', 'weeks_ago', 'welcome_letter', 'years_ago']


The bigram model was overactive at `min_count=5`, `threshold=100`. I tweaked this and settled on min=25 (to bring the bigrams into the most frequent word list) and no `threshold` setting (so that it would default to 10).

In [218]:
# Form bigrams: apply bigram_mod to the stop-word-filtered letters
data_words_bigrams = make_bigrams(data_words_nostops)

# Form trigrams (disabled)
# data_words_trigrams = make_trigrams(data_words_nostops)

In [219]:
# Do lemmatization keeping only nouns (tokens whose spaCy pos_ is 'NOUN')
data_lemmatizedNouns = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
#print(data_lemmatizedNouns[:1])

In [220]:
# Tally the noun lemmas and report how many distinct ones there are
counts = Counter(lemma for lemmas in data_lemmatizedNouns for lemma in lemmas)
len(counts)

6505

In [221]:
# Do lemmatization keeping only verbs
#data_lemmatizedVerbs = lemmatization(data_words_bigrams, allowed_postags=['VERB'])
#print(data_lemmatizedVerbs[:1])

In [222]:
#counts = Counter(x for sublist in data_lemmatizedVerbs for x in sublist)
#len(Counter(counts))

In [223]:
# Do lemmatization keeping only nouns and verbs (spaCy pos_ 'NOUN' or 'VERB')
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'VERB'])
#print(data_lemmatized[:1])

In [224]:
# Tally the noun and verb lemmas and report how many distinct ones there are
counts = Counter(lemma for lemmas in data_lemmatized for lemma in lemmas)
len(counts)

7834

## Saving work

In [225]:
# Pickle the lemmatized corpora (the files hold binary pickle data despite the .txt suffix)
for out_name, lemmas in (
    ("20240608_PhD_LtrLem-N.txt", data_lemmatizedNouns),
    # ("20240608_PhD_LtrLem-V.txt", data_lemmatizedVerbs),  # verbs-only output is disabled
    ("20240608_PhD_LtrLem-NV.txt", data_lemmatized),
):
    with open(out_name, "wb") as fp:
        pickle.dump(lemmas, fp)