## Topic Model Letter Prep

## Resources

In [1]:
import pandas as pd
import gensim
import numpy as np
from gensim.utils import simple_preprocess
import spacy
import pickle

## Get Data

In [2]:
# Sentence Data
df = pd.read_csv("20240220_PhD_Data4TopicModel-Letter.csv") 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        576 non-null    int64  
 1   docauthorid       576 non-null    object 
 2   docauthorname     576 non-null    object 
 3   docid             576 non-null    object 
 4   sourcetitle       576 non-null    object 
 5   docyear           573 non-null    float64
 6   docmonth          519 non-null    float64
 7   docday            474 non-null    float64
 8   authorgender      576 non-null    object 
 9   agewriting        460 non-null    float64
 10  birthyear         463 non-null    float64
 11  deathyear         449 non-null    float64
 12  religionNew       450 non-null    object 
 13  relMin            474 non-null    object 
 14  nationalOrigin    575 non-null    object 
 15  britishEmpire_EU  573 non-null    object 
 16  translated        576 non-null    bool   
 1

In [3]:
# Change column name to "docID-AT"
df = df.rename(columns={'Unnamed: 0':'docID-AT'})
list(df)

['docID-AT',
 'docauthorid',
 'docauthorname',
 'docid',
 'sourcetitle',
 'docyear',
 'docmonth',
 'docday',
 'authorgender',
 'agewriting',
 'birthyear',
 'deathyear',
 'religionNew',
 'relMin',
 'nationalOrigin',
 'britishEmpire_EU',
 'translated',
 'authorLocation',
 'socialClass',
 'A',
 'I',
 'CCP',
 'Unknown',
 'wageLabour',
 'publicLetter',
 'text',
 'scoreNeg',
 'scorePos',
 'scoreNeu',
 'scoreCompound']

In [4]:
df['text'][0]

' TRINIDAD On Train from Steubenville, Ohio, to Cincinnati. Nov 30, 1872. My Darling Sister Justina: How interestedly you, Sister M Louis and myself read Eugénie de Guérin\'s Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine\'s letter: Mt St Vincent, O, Nov 27, 1872. Sister Blandina, Steubenville, O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly, Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice, and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to l

Code below adapated from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

In [5]:
# Convert values in text to list of strings (objects)
data = df.text.values.tolist()
data[0]

' TRINIDAD On Train from Steubenville, Ohio, to Cincinnati. Nov 30, 1872. My Darling Sister Justina: How interestedly you, Sister M Louis and myself read Eugénie de Guérin\'s Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine\'s letter: Mt St Vincent, O, Nov 27, 1872. Sister Blandina, Steubenville, O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly, Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice, and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to l

In [6]:
# Define function to convert texts into a list of lowercase tokens
def text_to_words(texts):
    for text in texts:
        yield(gensim.utils.simple_preprocess(str(text), 
                                             deacc=True, # removes accents
                                             min_len=3))   # removes tokens shorter than three characters

data_words = list(text_to_words(data))

print(data_words[:1])

[['trinidad', 'train', 'from', 'steubenville', 'ohio', 'cincinnati', 'nov', 'darling', 'sister', 'justina', 'how', 'interestedly', 'you', 'sister', 'louis', 'and', 'myself', 'read', 'eugenie', 'guerin', 'journal', 'and', 'her', 'daily', 'anxieties', 'save', 'her', 'brother', 'from', 'being', 'spiritual', 'outcast', 'this', 'journal', 'which', 'propose', 'keeping', 'for', 'you', 'will', 'deal', 'with', 'incidents', 'occurring', 'journey', 'trinidad', 'and', 'happenings', 'that', 'far', 'off', 'land', 'which', 'consigned', 'the', 'journal', 'will', 'begin', 'with', 'the', 'first', 'act', 'here', 'mother', 'josephine', 'letter', 'vincent', 'nov', 'sister', 'blandina', 'steubenville', 'dear', 'child', 'you', 'are', 'missioned', 'trinidad', 'you', 'will', 'leave', 'cincinnati', 'wednesday', 'and', 'alone', 'mother', 'regina', 'will', 'attend', 'your', 'needs', 'devotedly', 'mother', 'josephine', 'this', 'letter', 'thrilled', 'both', 'was', 'delighted', 'make', 'the', 'sacrifice', 'and', 'yo

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [8]:
# See results of bigrams function
bigrams = [] # Create list
for item in data_words: # For each item (i.e., letter) in data words
    bigrams.append([b for b in bigram[item] if b.count('_') == 1]) # add 2-grams to new list
bigrams = list(np.concatenate(bigrams)) # flatten list
bigrams = list(dict.fromkeys(bigrams)) # take unique values
print(sorted(bigrams)) # print in alphabetical order

['acres_cleared', 'alexander_robb', 'allen_ransome', 'archbishop_lamy', 'associate_presbytery', 'attorney_general', 'below_zero', 'bentley_belleville', 'big_jim', 'black_flies', 'both_sides', 'british_columbia', 'bushels_wheat', 'canal_boats', 'captain_orlebar', 'captain_thorndike', 'catharine_parr', 'cents_per', 'charles_haszard', 'circuit_court', 'colonel_chavez', 'commodious_seats', 'corrugated_iron', 'cup_tea', 'dark_cloud', 'dearest_catherine', 'degrees_below', 'difference_between', 'different_parts', 'discharged_soldiers', 'dollars_acre', 'don_santiago', 'dona_juanita', 'dona_nieves', 'eastern_townships', 'erie_canal', 'everything_else', 'father_gasparri', 'father_pinto', 'ferry_boat', 'few_lines', 'few_minutes', 'flora_lyndsay', 'flour_mills', 'fruit_trees', 'general_carleton', 'geoffrey_moncton', 'get_rid', 'glad_hear', 'god_bless', 'good_bye', 'grand_river', 'great_deal', 'greater_part', 'grist_mill', 'half_dozen', 'half_hour', 'half_past', 'hermana_dolores', 'house_assembly',

In [9]:
# See results of trigrams function

trigrams1 = [] # Create list
for item in data_words: # For each item (i.e., chunk) in data words
    trigrams1.append([b for b in trigram[item] if b.count('_') == 1]) # add 2-grams to new list
trigrams1 = list(np.concatenate(trigrams1)) # flatten list
trigrams1 = list(dict.fromkeys(trigrams1)) # take unique values
print(sorted(trigrams1)) # print in alphabetical order

trigrams2 = [] # Create list
for item in data_words: # For each item (i.e., chunk) in data words
    trigrams2.append([b for b in trigram[item] if b.count('_') == 2]) # add 2-grams to new list
trigrams2 = list(np.concatenate(trigrams2)) # flatten list
trigrams2 = list(dict.fromkeys(trigrams2)) # take unique values
print(sorted(trigrams2)) # print in alphabetical order

['acres_cleared', 'associate_presbytery', 'attorney_general', 'big_jim', 'black_flies', 'blessing_god', 'both_sides', 'british_columbia', 'bushels_wheat', 'canal_boats', 'captain_orlebar', 'captain_thorndike', 'cents_per', 'charles_haszard', 'circuit_court', 'colonel_chavez', 'commodious_seats', 'cup_tea', 'dark_cloud', 'days_ago', 'dear_bentley', 'dearest_catherine', 'degrees_below', 'difference_between', 'different_parts', 'discharged_soldiers', 'dollars_acre', 'don_santiago', 'dona_juanita', 'eastern_townships', 'erie_canal', 'everything_else', 'father_gasparri', 'father_pinto', 'ferry_boat', 'few_days', 'few_lines', 'few_minutes', 'flora_lyndsay', 'flour_mills', 'friend_allen', 'fruit_trees', 'general_carleton', 'get_rid', 'glad_hear', 'god_bless', 'good_bye', 'good_deal', 'got_rid', 'grand_river', 'great_deal', 'greater_part', 'grist_mill', 'half_dozen', 'half_hour', 'half_past', 'hermana_dolores', 'house_assembly', 'house_commons', 'human_nature', 'humble_servant', 'indian_corn',

In [13]:
# Get 19th century stopwords
stop_words = pd.read_csv("Jockers_19thCenturyStops.csv")
stop_words = stop_words['word'].values.astype(str).tolist()
len(stop_words)

5631

In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'VERB']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [15]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [16]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
data_words_trigrams = make_trigrams(data_words_nostops)

In [15]:
# Initialize spacy 'en' model
# python3 -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

In [16]:
# Do lemmatization keeping only nouns
data_lemmatizedNouns = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
print(data_lemmatizedNouns[:1])

[['sister', 'anxiety', 'brother', 'outcast', 'journal', 'deal', 'incident', 'journey', 'land', 'journal', 'mother', 'letter', 'child', 'mother', 'mother', 'letter', 'sacrifice', 'hiding', 'feeling', 'merit', 'find', 'map', 'island', 'destination', 'pupil', 'teacher', 'annoyance', 'class', 'catechism', 'story', 'schoolhouse', 'roof', 'hurrah', 'story', 'endurance', 'word', 'goodbye', 'crowd', 'station', 'godspeed', 'travel', 'plain', 'mining', 'town', 'destination', 'course', 'gentleman', 'plain', 'matter', 'cowboy', 'substance', 'conversation', 'sister', 'plain', 'assent', 'snow', 'traveler', 'snow', 'week', 'danger', 'gentleman', 'danger', 'cowboy', 'speaker', 'woman', 'cowboy', 'danger', 'snow', 'cowboy', 'dearest', 'distance', 'clock', 'baggage', 'checker', 'coach', 'pocketbook', 'cent', 'fare', 'mind', 'doorbell', 'response', 'stone', 'step', 'minute', 'doorbell', 'sister', 'child', 'cold', 'father', 'hospital', 'year', 'barge', 'coal', 'need', 'dollar', 'cent', 'hand', 'kind', 'di

In [17]:
# Do lemmatization keeping only verbs
data_lemmatizedVerbs = lemmatization(data_words_bigrams, allowed_postags=['VERB'])
print(data_lemmatizedVerbs[:1])

[['read', 'save', 'propose', 'keep', 'occur', 'happening', 'consign', 'missione', 'attend', 'need', 'thrill', 'lose', 'conclude', 'leave', 'cause', 'take', 'hopeful', 'go', 'disturb', 'dismiss', 'remember', 'wish', 'slip', 'speak', 'reach', 'know', 'travel', 'inform', 'bind', 'look', 'know', 'bind', 'wish', 'look', 'grasp', 'mean', 'give', 'try', 'understand', 'consider', 'frighten', 'travel', 'go', 'come', 'look', 'contain', 'ride', 'skirt', 'ring', 'sit', 'wait', 'hear', 'rise', 'wait', 'ring', 'come', 'come', 'die', 'send', 'die', 'possess', 'walk', 'accompany', 'take', 'wait', 'point', 'start', 'indicate', 'manage', 'find', 'send', 'meet', 'walk', 'stop', 'take', 'return', 'take', 'go', 'shoulder', 'show', 'hear', 'see', 'write', 'say', 'envy', 'make', 'send', 'receive', 'go', 'consult', 'send', 'live', 'spend', 'give', 'anticipate', 'spend', 'keep', 'come', 'miss', 'threaten', 'take', 'take', 'choose', 'choose', 'act', 'soothe', 'go', 'keep', 'permit', 'want', 'accompany', 'prefer

In [18]:
# Do lemmatization keeping only noun, verb
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'VERB'])
print(data_lemmatized[:1])

[['sister', 'read', 'anxiety', 'save', 'brother', 'outcast', 'journal', 'propose', 'keep', 'deal', 'incident', 'occur', 'journey', 'happening', 'land', 'consign', 'journal', 'mother', 'letter', 'child', 'missione', 'mother', 'attend', 'need', 'mother', 'letter', 'thrill', 'sacrifice', 'hiding', 'feeling', 'lose', 'merit', 'find', 'map', 'island', 'conclude', 'destination', 'leave', 'pupil', 'cause', 'teacher', 'annoyance', 'class', 'take', 'hopeful', 'catechism', 'go', 'story', 'schoolhouse', 'roof', 'disturb', 'hurrah', 'story', 'endurance', 'dismiss', 'word', 'goodbye', 'remember', 'crowd', 'station', 'wish', 'godspeed', 'slip', 'speak', 'travel', 'plain', 'reach', 'mining', 'town', 'know', 'destination', 'course', 'gentleman', 'travel', 'plain', 'matter', 'inform', 'cowboy', 'substance', 'conversation', 'sister', 'bind', 'plain', 'look', 'assent', 'know', 'snow', 'traveler', 'snow', 'bind', 'week', 'danger', 'wish', 'gentleman', 'danger', 'cowboy', 'look', 'speaker', 'grasp', 'mean'

## Saving work and re-importing

In [22]:
with open("20240220_PhD_LtrLem-N.txt", "wb") as fp:   #Pickling
    pickle.dump(data_lemmatizedNouns, fp)

with open("20240220_PhD_LtrLem-V.txt", "wb") as fp:   #Pickling
    pickle.dump(data_lemmatizedVerbs, fp)

with open("20240220_PhD_LtrLem-NV.txt", "wb") as fp:   #Pickling
    pickle.dump(data_lemmatized, fp)