## Topic Model Letter Prep

## Resources

In [2]:
# Packages
import pandas as pd
import gensim
import numpy as np
from gensim.utils import simple_preprocess
import spacy
import pickle
from collections import Counter

In [3]:
# Functions for stopwords, bigrams, trigrams, lemmatization, conversion of texts into list of lowercase tokens

def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is passed through ``str()`` and gensim's ``simple_preprocess``
    again before filtering, so the returned tokens are whatever
    ``simple_preprocess`` yields with its default settings, minus any token
    found in the notebook-level ``stop_words`` list.

    Parameters
    ----------
    texts : iterable
        Documents to filter (in this notebook, lists of tokens).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Build the lookup once per call: `stop_words` is a list with several thousand
    # entries, so testing each token against a set avoids a linear scan per token.
    # It is rebuilt on every call because the notebook edits `stop_words` between calls.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Run every document in ``texts`` through the notebook-level ``bigram_mod``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each document,
    in input order.

    NOTE: in this notebook the line that creates ``trigram_mod`` is commented
    out, so calling this with a non-empty input raises ``NameError`` until that
    line is restored.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

def lemmatization(texts, allowed_postags=('NOUN', 'VERB')):
    """Lemmatize tokenized documents, keeping only the requested parts of speech.

    Each document's tokens are joined with single spaces and passed to the
    notebook-level spaCy pipeline ``nlp``; the lemma of every resulting token
    whose coarse part-of-speech tag (``token.pos_``) is in ``allowed_postags``
    is kept.

    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents.
    allowed_postags : collection of str, optional
        Part-of-speech tags to keep. The default is an immutable tuple so the
        shared default object can never be modified between calls.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    texts_out = []
    for sent in texts:
        # spaCy expects running text, so rebuild a string from the token list
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def text_to_words(texts):
    """Lazily tokenize each text with gensim's ``simple_preprocess``.

    This is a generator: it yields one token list per input text, in input
    order. Every text is converted with ``str()`` first.
    """
    for raw_text in texts:
        tokens = gensim.utils.simple_preprocess(
            str(raw_text),
            deacc=True,  # removes accents
            min_len=3,   # removes tokens shorter than three characters
        )
        yield tokens

In [4]:
# Load spaCy's English pipeline `en_core_web_md`; `lemmatization` calls this `nlp` object on each document.
# One-time install of the model (shell command): python3 -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

In [5]:
# 19th century stopwords: read the CSV and keep its 'word' column as a list of strings
stops_df = pd.read_csv("Jockers_19thCenturyStops.csv")
word_array = stops_df['word'].values.astype(str)
stop_words = word_array.tolist()
len(stop_words)

5631

In [6]:
# Narratives: load the letters and give the unnamed leading CSV column the name "docID-AT"
letters_csv = "20240405_PhD_Data4TopicModel-Letter.csv"
df = pd.read_csv(letters_csv).rename(columns={'Unnamed: 0': 'docID-AT'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          492 non-null    int64  
 1   docauthorid       492 non-null    object 
 2   docauthorname     492 non-null    object 
 3   docid             492 non-null    object 
 4   docyear           489 non-null    float64
 5   docmonth          477 non-null    float64
 6   authorgender      492 non-null    object 
 7   agewriting        380 non-null    float64
 8   agedeath          365 non-null    float64
 9   relMin            396 non-null    object 
 10  nationalOrigin    491 non-null    object 
 11  authorLocation    492 non-null    object 
 12  U                 442 non-null    object 
 13  M                 442 non-null    object 
 14  S                 442 non-null    object 
 15  F                 442 non-null    object 
 16  L                 442 non-null    object 
 1

Code below adapted from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

## Prepare narratives

In [7]:
# Pull the contents of the 'text' column out of the DataFrame as a plain Python list
text_column = df['text']
data = text_column.values.tolist()

In [8]:
# Tokenize every text; text_to_words is a generator, so list() materializes all token lists at once
data_words = list(text_to_words(data))

In [9]:
#data_words[0]

## Bigram (and trigram) models

In [10]:
# Build the bigram model. The trigram lines below are commented out, so no trigram model is created.
# min_count=26 is set explicitly; no threshold is passed, so gensim's default threshold applies.
bigram = gensim.models.Phrases(data_words, min_count=26)
#trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Phraser wrapper: a faster way to apply the trained phrase model to a sentence (used by make_bigrams)
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

In [11]:
# See results of trigrams function

#trigrams1 = [] # Create list
#for item in data_words: # For each item (i.e., chunk) in data words
    #trigrams1.append([b for b in trigram[item] if b.count('_') == 1]) # add 2-grams to new list
#trigrams1 = list(np.concatenate(trigrams1)) # flatten list
#trigrams1 = list(dict.fromkeys(trigrams1)) # take unique values
#print(sorted(trigrams1)) # print in alphabetical order

#trigrams2 = [] # Create list
#for item in data_words: # For each item (i.e., chunk) in data words
    #trigrams2.append([b for b in trigram[item] if b.count('_') == 2]) # add 3-grams to new list
#trigrams2 = list(np.concatenate(trigrams2)) # flatten list
#trigrams2 = list(dict.fromkeys(trigrams2)) # take unique values
#print(sorted(trigrams2)) # print in alphabetical order

## Stop Words

In [12]:
# Remove Stop Words
# First pass: when cells run top to bottom, `stop_words` holds only the list loaded from the CSV at this point
data_words_nostops = remove_stopwords(data_words)

In [13]:
# What are the most common words?
# Tally tokens one document at a time, then show the 1000 most frequent
counts = Counter()
for tokens in data_words_nostops:
    counts.update(tokens)
counts.most_common(1000)

[('dear', 971),
 ('sister', 755),
 ('time', 696),
 ('little', 656),
 ('day', 647),
 ('country', 607),
 ('work', 526),
 ('old', 486),
 ('place', 477),
 ('came', 455),
 ('land', 447),
 ('give', 442),
 ('years', 422),
 ('house', 413),
 ('men', 399),
 ('letter', 396),
 ('critchlow', 396),
 ('people', 379),
 ('take', 372),
 ('children', 367),
 ('money', 343),
 ('poor', 338),
 ('week', 332),
 ('kind', 328),
 ('year', 320),
 ('find', 316),
 ('dollars', 307),
 ('school', 305),
 ('going', 296),
 ('god', 291),
 ('family', 279),
 ('feel', 279),
 ('days', 278),
 ('life', 271),
 ('father', 270),
 ('mother', 265),
 ('send', 258),
 ('wish', 257),
 ('quite', 257),
 ('canada', 252),
 ('sent', 249),
 ('saw', 241),
 ('hear', 241),
 ('write', 240),
 ('took', 232),
 ('states', 231),
 ('town', 229),
 ('large', 222),
 ('friends', 221),
 ('sisters', 220),
 ('room', 218),
 ('half', 218),
 ('present', 217),
 ('having', 215),
 ('fine', 207),
 ('brother', 205),
 ('england', 202),
 ('things', 201),
 ('seen', 200),

In [14]:
# Number of distinct tokens left after stop word removal
len(counts)

16298

In [15]:
# Corpus-specific stop words to add on top of the loaded stop word list
new_items = [
    # Proper nouns (apparently personal and place names)
    'critchlow', 'moodie', 'albuquerque', 'ellin', 'belleville', 'toronto',
    'montreal', 'bentley', 'york', 'london', 'quebec', 'hamilton',
    'carrothers', 'dunbar', 'blandina', 'cincinnati', 'california',
    'gasparri', 'washington',
    # Kinship terms
    'sister', 'sisters', 'mother', 'father', 'brother', 'brothers',
    # Countries
    'canada',
    'states',  # in this corpus, refers to United States
    'england', 'ireland', 'mexico', 'scotland',
    # Months
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    # Further place names
    'boston', 'europe',
    # Days of the week
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    # In this corpus, dear, regard, regards are conventional opening and closing language
    'dear', 'regard', 'regards',
    'parr', 'traill',
]

In [16]:
# Append the custom words to the stop word list.
# NOTE(review): this grows the existing list in place, so re-running the cell appends the same words again.
stop_words += new_items
len(stop_words)

5688

I noticed that winter was dropped as a stopword. Checking the Jockers list, I see that the other seasons are there too. I want to leave seasons as they are potentially associated with key topics.

In [17]:
# Take the seasons back out of the stop word list so they stay in the vocabulary
seasons_to_keep = {'winter', 'spring', 'summer', 'autumn'}
stop_words = [word for word in stop_words if word not in seasons_to_keep]
len(stop_words)

5684

In [18]:
# Put in alphabetical order (rebinds the name to a freshly sorted list)
stop_words = sorted(stop_words)

In [19]:
# Re-remove Stop Words, now that `stop_words` has been edited
data_words_nostops = remove_stopwords(data_words)

# What are the most common words?
counts = Counter()
for tokens in data_words_nostops:
    counts.update(tokens)
counts.most_common(1000)

[('time', 696),
 ('little', 656),
 ('day', 647),
 ('country', 607),
 ('work', 526),
 ('old', 486),
 ('place', 477),
 ('came', 455),
 ('land', 447),
 ('give', 442),
 ('years', 422),
 ('house', 413),
 ('men', 399),
 ('letter', 396),
 ('people', 379),
 ('take', 372),
 ('children', 367),
 ('money', 343),
 ('poor', 338),
 ('week', 332),
 ('kind', 328),
 ('year', 320),
 ('find', 316),
 ('dollars', 307),
 ('school', 305),
 ('going', 296),
 ('god', 291),
 ('family', 279),
 ('feel', 279),
 ('days', 278),
 ('life', 271),
 ('send', 258),
 ('wish', 257),
 ('quite', 257),
 ('sent', 249),
 ('saw', 241),
 ('hear', 241),
 ('write', 240),
 ('took', 232),
 ('town', 229),
 ('large', 222),
 ('friends', 221),
 ('room', 218),
 ('half', 218),
 ('present', 217),
 ('having', 215),
 ('fine', 207),
 ('things', 201),
 ('seen', 200),
 ('winter', 197),
 ('farm', 197),
 ('pay', 196),
 ('want', 193),
 ('small', 190),
 ('heard', 189),
 ('bring', 189),
 ('coming', 188),
 ('world', 186),
 ('away', 185),
 ('keep', 185),


In [20]:
# Number of distinct tokens left after stop word removal
len(counts)

16252

In [21]:
# Add more stopwords
# NOTE(review): the list is grown in place, so re-running this cell appends the same words again.
new_items = ['haszard', 'united', 'davies', 'lamy', 'forsyth', 'chavez']
stop_words += new_items
len(stop_words)

5690

In [22]:
# Remove Stop Words (repeated with the current `stop_words`)
data_words_nostops = remove_stopwords(data_words)

# What are the most common words?
counts = Counter()
for tokens in data_words_nostops:
    counts.update(tokens)
counts.most_common(1000)

[('time', 696),
 ('little', 656),
 ('day', 647),
 ('country', 607),
 ('work', 526),
 ('old', 486),
 ('place', 477),
 ('came', 455),
 ('land', 447),
 ('give', 442),
 ('years', 422),
 ('house', 413),
 ('men', 399),
 ('letter', 396),
 ('people', 379),
 ('take', 372),
 ('children', 367),
 ('money', 343),
 ('poor', 338),
 ('week', 332),
 ('kind', 328),
 ('year', 320),
 ('find', 316),
 ('dollars', 307),
 ('school', 305),
 ('going', 296),
 ('god', 291),
 ('family', 279),
 ('feel', 279),
 ('days', 278),
 ('life', 271),
 ('send', 258),
 ('wish', 257),
 ('quite', 257),
 ('sent', 249),
 ('saw', 241),
 ('hear', 241),
 ('write', 240),
 ('took', 232),
 ('town', 229),
 ('large', 222),
 ('friends', 221),
 ('room', 218),
 ('half', 218),
 ('present', 217),
 ('having', 215),
 ('fine', 207),
 ('things', 201),
 ('seen', 200),
 ('winter', 197),
 ('farm', 197),
 ('pay', 196),
 ('want', 193),
 ('small', 190),
 ('heard', 189),
 ('bring', 189),
 ('coming', 188),
 ('world', 186),
 ('away', 185),
 ('keep', 185),


In [23]:
# Count words (distinct tokens in the tally)
len(counts)

16246

In [24]:
# Add more stopwords
# NOTE(review): the list is grown in place, so re-running this cell appends the same words again.
new_items = ['liverpool', 'albany']
stop_words += new_items
len(stop_words)

5692

In [25]:
# Remove Stop Words (repeated with the current `stop_words`)
data_words_nostops = remove_stopwords(data_words)

# What are the most common words?
counts = Counter()
for tokens in data_words_nostops:
    counts.update(tokens)
counts.most_common(1000)

[('time', 696),
 ('little', 656),
 ('day', 647),
 ('country', 607),
 ('work', 526),
 ('old', 486),
 ('place', 477),
 ('came', 455),
 ('land', 447),
 ('give', 442),
 ('years', 422),
 ('house', 413),
 ('men', 399),
 ('letter', 396),
 ('people', 379),
 ('take', 372),
 ('children', 367),
 ('money', 343),
 ('poor', 338),
 ('week', 332),
 ('kind', 328),
 ('year', 320),
 ('find', 316),
 ('dollars', 307),
 ('school', 305),
 ('going', 296),
 ('god', 291),
 ('family', 279),
 ('feel', 279),
 ('days', 278),
 ('life', 271),
 ('send', 258),
 ('wish', 257),
 ('quite', 257),
 ('sent', 249),
 ('saw', 241),
 ('hear', 241),
 ('write', 240),
 ('took', 232),
 ('town', 229),
 ('large', 222),
 ('friends', 221),
 ('room', 218),
 ('half', 218),
 ('present', 217),
 ('having', 215),
 ('fine', 207),
 ('things', 201),
 ('seen', 200),
 ('winter', 197),
 ('farm', 197),
 ('pay', 196),
 ('want', 193),
 ('small', 190),
 ('heard', 189),
 ('bring', 189),
 ('coming', 188),
 ('world', 186),
 ('away', 185),
 ('keep', 185),


In [26]:
# Count words (distinct tokens in the tally)
len(counts)

16244

## Bigrams & Lemmatization

In [27]:
# See results of bigrams function
# Collect every token containing exactly one '_' after applying the phrase model,
# de-duplicated in first-seen order. The flattening is done in plain Python rather
# than with np.concatenate: that call raises on an empty list of documents and turns
# the tokens into NumPy string scalars, which print differently from built-in
# strings under NumPy 2.
bigrams = list(dict.fromkeys(
    token
    for item in data_words_nostops  # each item is one letter's token list
    for token in bigram[item]
    if token.count('_') == 1
))
print(sorted(bigrams)) # print in alphabetical order

['indian_corn', 'short_time', 'thank_god', 'years_ago']


The bigrams function was overactive at min_count=5, threshold=100. I tweaked this and settled on min_count=26 (to bring into the most frequent word list) and no threshold setting (so that it would default to 10). 

In [28]:
# Form Bigrams
# Applies bigram_mod to the stop-word-filtered token lists
data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
# Left disabled: make_trigrams needs trigram_mod, and the line that creates it is commented out in this notebook
# data_words_trigrams = make_trigrams(data_words_nostops)

In [29]:
# Do lemmatization keeping only nouns
# Input is the stop-word-filtered token lists after make_bigrams has been applied
data_lemmatizedNouns = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
#print(data_lemmatizedNouns[:1])

In [30]:
# Number of distinct noun lemmas
counts = Counter()
for lemma_list in data_lemmatizedNouns:
    counts.update(lemma_list)
len(counts)

6739

In [31]:
# Do lemmatization keeping only verbs
# Same input as the noun pass; only the part-of-speech filter differs
data_lemmatizedVerbs = lemmatization(data_words_bigrams, allowed_postags=['VERB'])
#print(data_lemmatizedVerbs[:1])

In [32]:
# Number of distinct verb lemmas
counts = Counter()
for lemma_list in data_lemmatizedVerbs:
    counts.update(lemma_list)
len(counts)

2961

In [33]:
# Do lemmatization keeping only noun, verb
# NOTE(review): this is the third call to lemmatization on the same input, so every document
# is sent through the spaCy pipeline again; only the part-of-speech filter differs.
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'VERB'])
#print(data_lemmatized[:1])

In [34]:
# Number of distinct noun and verb lemmas
counts = Counter()
for lemma_list in data_lemmatized:
    counts.update(lemma_list)
len(counts)

8083

## Saving work

In [35]:
# Pickle each lemmatized corpus to its own file (the files hold binary pickle data despite the .txt suffix)
for out_path, lemma_lists in (
    ("20240405_PhD_LtrLem-N.txt", data_lemmatizedNouns),
    ("20240405_PhD_LtrLem-V.txt", data_lemmatizedVerbs),
    ("20240405_PhD_LtrLem-NV.txt", data_lemmatized),
):
    with open(out_path, "wb") as fp:   #Pickling
        pickle.dump(lemma_lists, fp)