## Topic Model Chunk Prep

## Resources

In [1]:
import pandas as pd
import gensim
import numpy as np
from gensim.utils import simple_preprocess
import spacy
import pickle

## Get Data

In [2]:
# Sentence Data
df = pd.read_csv("20240220_PhD_Data4TopicModel-Chunk.csv") 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3785 entries, 0 to 3784
Data columns (total 33 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        3785 non-null   int64  
 1   docauthorid       3785 non-null   object 
 2   docauthorname     3785 non-null   object 
 3   docid             3785 non-null   object 
 4   sourcetitle       3785 non-null   object 
 5   docyear           3741 non-null   float64
 6   docmonth          2824 non-null   float64
 7   docday            2215 non-null   float64
 8   authorgender      3785 non-null   object 
 9   agewriting        2817 non-null   float64
 10  birthyear         2861 non-null   float64
 11  deathyear         2849 non-null   float64
 12  religionNew       2501 non-null   object 
 13  relMin            3198 non-null   object 
 14  nationalOrigin    3780 non-null   object 
 15  britishEmpire_EU  3773 non-null   object 
 16  translated        3785 non-null   bool   


In [3]:
# Change column name to "docID-AT"
df = df.rename(columns={'Unnamed: 0':'docID-AT'})
df = df.rename(columns={'chunk':'text'})
list(df)

['docID-AT',
 'docauthorid',
 'docauthorname',
 'docid',
 'sourcetitle',
 'docyear',
 'docmonth',
 'docday',
 'authorgender',
 'agewriting',
 'birthyear',
 'deathyear',
 'religionNew',
 'relMin',
 'nationalOrigin',
 'britishEmpire_EU',
 'translated',
 'authorLocation',
 'socialClass',
 'A',
 'I',
 'CCP',
 'Unknown',
 'wageLabour',
 'publicLetter',
 'text',
 'sequence',
 'scoreNeg',
 'scorePos',
 'scoreNeu',
 'scoreCompound',
 'chunks',
 'position']

In [4]:
df['text'][0]

"TRINIDAD On Train from Steubenville, Ohio, to Cincinnati. Nov 30, 1872. My Darling Sister Justina: How interestedly you, Sister M Louis and myself read Eugénie de Guérin's Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine's letter: Mt St Vincent, O, Nov 27, 1872. Sister Blandina, Steubenville, O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly, Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice, and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to leav

Code below adapated from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

In [5]:
# Convert values in text to list of strings (objects)
data = df.text.values.tolist()
data[0]

"TRINIDAD On Train from Steubenville, Ohio, to Cincinnati. Nov 30, 1872. My Darling Sister Justina: How interestedly you, Sister M Louis and myself read Eugénie de Guérin's Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine's letter: Mt St Vincent, O, Nov 27, 1872. Sister Blandina, Steubenville, O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly, Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice, and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to leav

In [6]:
# Define function to convert texts into a list of lowercase tokens
def text_to_words(texts):
    for text in texts:
        yield(gensim.utils.simple_preprocess(str(text), 
                                             deacc=True, # removes accents
                                             min_len=3))   # removes tokens shorter than three characters

data_words = list(text_to_words(data))

print(data_words[:1])

[['trinidad', 'train', 'from', 'steubenville', 'ohio', 'cincinnati', 'nov', 'darling', 'sister', 'justina', 'how', 'interestedly', 'you', 'sister', 'louis', 'and', 'myself', 'read', 'eugenie', 'guerin', 'journal', 'and', 'her', 'daily', 'anxieties', 'save', 'her', 'brother', 'from', 'being', 'spiritual', 'outcast', 'this', 'journal', 'which', 'propose', 'keeping', 'for', 'you', 'will', 'deal', 'with', 'incidents', 'occurring', 'journey', 'trinidad', 'and', 'happenings', 'that', 'far', 'off', 'land', 'which', 'consigned', 'the', 'journal', 'will', 'begin', 'with', 'the', 'first', 'act', 'here', 'mother', 'josephine', 'letter', 'vincent', 'nov', 'sister', 'blandina', 'steubenville', 'dear', 'child', 'you', 'are', 'missioned', 'trinidad', 'you', 'will', 'leave', 'cincinnati', 'wednesday', 'and', 'alone', 'mother', 'regina', 'will', 'attend', 'your', 'needs', 'devotedly', 'mother', 'josephine', 'this', 'letter', 'thrilled', 'both', 'was', 'delighted', 'make', 'the', 'sacrifice', 'and', 'yo

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [8]:
# See results of bigrams function
bigrams = [] # Create list
for item in data_words: # For each item (i.e., letter) in data words
    bigrams.append([b for b in bigram[item] if b.count('_') == 1]) # add 2-grams to new list
bigrams = list(np.concatenate(bigrams)) # flatten list
bigrams = list(dict.fromkeys(bigrams)) # take unique values
print(sorted(bigrams)) # print in alphabetical order

['absolutely_necessary', 'albany_buffalo', 'alexander_robb', 'allen_ransome', 'ancient_civilization', 'angels_guard', 'annual_pass', 'archbishop_lamy', 'associate_presbytery', 'attorney_general', 'baby_christened', 'below_zero', 'bentley_belleville', 'bids_fair', 'big_jim', 'bishop_machebeuf', 'black_flies', 'black_walnut', 'blade_grass', 'both_sides', 'boundary_line', 'british_columbia', 'burying_ground', 'bushels_wheat', 'cab_hire', 'cabin_luggage', 'canal_boats', 'cape_breton', 'captain_ferrier', 'captain_orlebar', 'captain_thorndike', 'carrothers_lisbellaw', 'castor_oil', 'catharine_parr', 'cedar_swamp', 'cents_per', 'charles_haszard', 'circuit_court', 'civil_war', 'colonel_chavez', 'colonel_myers', 'colonial_building', 'commodious_habitations', 'commodious_seats', 'cooking_utensils', 'copy_roughing', 'corrugated_iron', 'county_durham', 'crazy_ann', 'crushing_cylinder', 'cup_tea', 'dark_cloud', 'degrees_below', 'delegate_congress', 'deo_gratias', 'dick_wooten', 'difference_between'

In [9]:
# See results of trigrams function

trigrams1 = [] # Create list
for item in data_words: # For each item (i.e., chunk) in data words
    trigrams1.append([b for b in trigram[item] if b.count('_') == 1]) # add 2-grams to new list
trigrams1 = list(np.concatenate(trigrams1)) # flatten list
trigrams1 = list(dict.fromkeys(trigrams1)) # take unique values
print(sorted(trigrams1)) # print in alphabetical order

trigrams2 = [] # Create list
for item in data_words: # For each item (i.e., chunk) in data words
    trigrams2.append([b for b in trigram[item] if b.count('_') == 2]) # add 2-grams to new list
trigrams2 = list(np.concatenate(trigrams2)) # flatten list
trigrams2 = list(dict.fromkeys(trigrams2)) # take unique values
print(sorted(trigrams2)) # print in alphabetical order

['absolutely_necessary', 'air_exercises', 'albany_buffalo', 'ancient_civilization', 'angels_guard', 'annual_pass', 'archbishop_lamy', 'associate_presbytery', 'attorney_general', 'baby_christened', 'below_zero', 'bids_fair', 'big_jim', 'bishop_durango', 'black_flies', 'blade_grass', 'both_sides', 'boundary_line', 'british_columbia', 'burying_ground', 'bushels_wheat', 'cab_hire', 'cabin_luggage', 'canal_boats', 'captain_orlebar', 'captain_thorndike', 'carrothers_lisbellaw', 'cedar_swamp', 'cents_bushel', 'cents_per', 'charles_haszard', 'christmas_tree', 'circuit_court', 'civil_war', 'colonel_chavez', 'colonel_myers', 'colonial_building', 'commodious_habitations', 'commodious_seats', 'cooking_utensils', 'copy_roughing', 'county_durham', 'crazy_ann', 'crushing_cylinder', 'cup_tea', 'dark_cloud', 'days_ago', 'dear_bentley', 'dearest_catherine', 'degrees_below', 'delegate_congress', 'dick_wooten', 'difference_between', 'different_parts', 'dining_room', 'discharged_soldiers', 'doctor_russ', '

In [10]:
# Get 19th century stopwords
stop_words = pd.read_csv("Jockers_19thCenturyStops.csv")
stop_words = stop_words['word'].values.astype(str).tolist()
len(stop_words)

5631

In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'VERB']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [13]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
data_words_trigrams = make_trigrams(data_words_nostops)

In [14]:
# Initialize spacy 'en' model
# python3 -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

In [15]:
# Do lemmatization keeping only nouns
data_lemmatizedNouns = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
print(data_lemmatizedNouns[:1])

[['sister', 'anxiety', 'brother', 'outcast', 'journal', 'deal', 'incident', 'journey', 'land', 'journal', 'mother', 'letter', 'child', 'mother', 'mother', 'letter', 'sacrifice', 'hiding', 'feeling', 'merit', 'find', 'map', 'island', 'destination', 'pupil', 'teacher']]


In [16]:
# Do lemmatization keeping only verbs
data_lemmatizedVerbs = lemmatization(data_words_bigrams, allowed_postags=['VERB'])
print(data_lemmatizedVerbs[:1])

[['read', 'save', 'propose', 'keep', 'occur', 'happening', 'consign', 'missione', 'attend', 'need', 'thrill', 'lose', 'conclude', 'leave', 'cause']]


In [17]:
# Do lemmatization keeping only noun, verb
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'VERB'])
print(data_lemmatized[:1])

[['sister', 'read', 'anxiety', 'save', 'brother', 'outcast', 'journal', 'propose', 'keep', 'deal', 'incident', 'occur', 'journey', 'happening', 'land', 'consign', 'journal', 'mother', 'letter', 'child', 'missione', 'mother', 'attend', 'need', 'mother', 'letter', 'thrill', 'sacrifice', 'hiding', 'feeling', 'lose', 'merit', 'find', 'map', 'island', 'conclude', 'destination', 'leave', 'pupil', 'cause', 'teacher']]


## Saving work

In [18]:
with open("20240220_PhD_ChkLem-N.txt", "wb") as fp:   #Pickling
    pickle.dump(data_lemmatizedNouns, fp)

with open("20240220_PhD_ChkLem-V.txt.txt", "wb") as fp:   #Pickling
    pickle.dump(data_lemmatizedVerbs, fp)

with open("20240220_PhD_ChkLem-NV.txt", "wb") as fp:   #Pickling
    pickle.dump(data_lemmatized, fp)