## Merge topic labels to letter dataframe

In [1]:
import pickle
import gensim
from gensim import models
import gensim.corpora as corpora
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
from pprint import pprint

## Get Resources

In [3]:
# Lemmatized letter texts. The file carries a .txt extension but is read as a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only open files you created yourself.
with open("20240220_PhD_LtrLem-NV.txt", mode="rb") as lemma_file:
    data_lemmatized = pickle.load(lemma_file)

In [4]:
# Restore the previously saved Mallet topic model from disk.
# NOTE(review): the `wrappers` subpackage was removed in gensim 4.0, so this cell needs gensim 3.x.
lda_mallet = models.wrappers.LdaMallet.load("20240221_PhD_TopicLtrMAL14")

# Convert the Mallet wrapper into a gensim LDA model for the topic queries below.
ldamodel = models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [5]:
# Show the ten highest-weighted words per topic; sorting the (topic id, words) pairs orders them by topic id.
pprint(
    sorted(
        ldamodel.print_topics(num_words=10)
    )
)

[(0,
  '0.032*"time" + 0.020*"people" + 0.017*"day" + 0.016*"give" + 0.016*"person" '
  '+ 0.014*"manner" + 0.014*"order" + 0.014*"case" + 0.014*"return" + '
  '0.014*"child"'),
 (1,
  '0.039*"place" + 0.026*"town" + 0.024*"day" + 0.023*"walk" + 0.020*"house" + '
  '0.016*"travel" + 0.015*"remain" + 0.015*"return" + 0.015*"reach" + '
  '0.015*"call"'),
 (2,
  '0.052*"land" + 0.046*"country" + 0.027*"acre" + 0.027*"farm" + '
  '0.017*"dollar" + 0.014*"farmer" + 0.014*"sell" + 0.014*"pound" + '
  '0.014*"wood" + 0.013*"price"'),
 (3,
  '0.048*"week" + 0.036*"child" + 0.029*"day" + 0.028*"critchlow" + '
  '0.019*"stay" + 0.019*"evening" + 0.018*"feel" + 0.015*"today" + 0.014*"boy" '
  '+ 0.014*"morning"'),
 (4,
  '0.065*"year" + 0.044*"letter" + 0.036*"send" + 0.033*"family" + '
  '0.032*"brother" + 0.030*"live" + 0.025*"hear" + 0.021*"money" + '
  '0.021*"friend" + 0.021*"write"'),
 (5,
  '0.088*"sister" + 0.018*"school" + 0.015*"man" + 0.014*"room" + '
  '0.012*"visit" + 0.011*"mother" 

In [6]:
# Letter metadata (one row per letter, including the raw text column)
df = pd.read_csv(filepath_or_buffer="20240220_PhD_Data4TopicModel-Letter.csv")

# Column overview: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        576 non-null    int64  
 1   docauthorid       576 non-null    object 
 2   docauthorname     576 non-null    object 
 3   docid             576 non-null    object 
 4   sourcetitle       576 non-null    object 
 5   docyear           573 non-null    float64
 6   docmonth          519 non-null    float64
 7   docday            474 non-null    float64
 8   authorgender      576 non-null    object 
 9   agewriting        460 non-null    float64
 10  birthyear         463 non-null    float64
 11  deathyear         449 non-null    float64
 12  religionNew       450 non-null    object 
 13  relMin            474 non-null    object 
 14  nationalOrigin    575 non-null    object 
 15  britishEmpire_EU  573 non-null    object 
 16  translated        576 non-null    bool   
 1

In [7]:
# Preview the first five metadata rows
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,docauthorid,docauthorname,docid,sourcetitle,docyear,docmonth,docday,authorgender,agewriting,...,I,CCP,Unknown,wageLabour,publicLetter,text,scoreNeg,scorePos,scoreNeu,scoreCompound
0,1,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D002,At the End of the Santa Fe Trail,1872.0,11.0,30.0,F,22.0,...,False,True,False,False,,"TRINIDAD On Train from Steubenville, Ohio, to...",0.051,0.131,0.818,0.9994
1,2,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D004,At the End of the Santa Fe Trail,1872.0,12.0,6.0,F,22.0,...,False,True,False,False,,"Kansas City, Dec 6, 1872. Number one! Number ...",0.055,0.111,0.834,0.9993
2,3,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D005,At the End of the Santa Fe Trail,1872.0,12.0,10.0,F,22.0,...,False,True,False,False,,"Trinidad, December 10, 1872. My dearest dear:...",0.056,0.101,0.843,0.9987
3,4,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D006,At the End of the Santa Fe Trail,1872.0,12.0,21.0,F,22.0,...,False,True,False,False,,December 21. Rumor is loud in predicting an a...,0.061,0.099,0.84,0.9981
4,5,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D007,At the End of the Santa Fe Trail,1873.0,3.0,1.0,F,23.0,...,False,True,False,False,,"March 1, 1873. My Dear Sister Justina: It is ...",0.071,0.081,0.848,0.967


In [8]:
# Give the unnamed first CSV column a meaningful name: "docID-AT"
df = df.rename({'Unnamed: 0': 'docID-AT'}, axis="columns")
df

Unnamed: 0,docID-AT,docauthorid,docauthorname,docid,sourcetitle,docyear,docmonth,docday,authorgender,agewriting,...,I,CCP,Unknown,wageLabour,publicLetter,text,scoreNeg,scorePos,scoreNeu,scoreCompound
0,1,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D002,At the End of the Santa Fe Trail,1872.0,11.0,30.0,F,22.0,...,False,True,False,False,,"TRINIDAD On Train from Steubenville, Ohio, to...",0.051,0.131,0.818,0.9994
1,2,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D004,At the End of the Santa Fe Trail,1872.0,12.0,6.0,F,22.0,...,False,True,False,False,,"Kansas City, Dec 6, 1872. Number one! Number ...",0.055,0.111,0.834,0.9993
2,3,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D005,At the End of the Santa Fe Trail,1872.0,12.0,10.0,F,22.0,...,False,True,False,False,,"Trinidad, December 10, 1872. My dearest dear:...",0.056,0.101,0.843,0.9987
3,4,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D006,At the End of the Santa Fe Trail,1872.0,12.0,21.0,F,22.0,...,False,True,False,False,,December 21. Rumor is loud in predicting an a...,0.061,0.099,0.840,0.9981
4,5,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D007,At the End of the Santa Fe Trail,1873.0,3.0,1.0,F,23.0,...,False,True,False,False,,"March 1, 1873. My Dear Sister Justina: It is ...",0.071,0.081,0.848,0.9670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,572,per0022894,"Hutchings, James Mason, 1820-1902",S9957-D014,Seeking the Elephant: James Mason Hutching's J...,1850.0,3.0,22.0,M,30.0,...,True,True,False,False,,"Hang Town, California March 22, 1850 Dear Sir...",0.083,0.156,0.761,0.9168
572,573,per0022894,"Hutchings, James Mason, 1820-1902",S9957-D015,Seeking the Elephant: James Mason Hutching's J...,1851.0,4.0,4.0,M,31.0,...,True,True,False,False,,"California Weaver Creek, April 4th 1851 Dear ...",0.098,0.149,0.754,0.9911
573,574,per0036192,"Thorpe, John, fl. 1828",S9974-D008,America's Immigrants: Adventures in Eyewitness...,1828.0,,,M,,...,,,,,,During the first three decades of the ninetee...,0.071,0.126,0.802,0.9970
574,575,per0036196,"Downe, John, fl. 1830",S9974-D010,America's Immigrants: Adventures in Eyewitness...,1830.0,8.0,12.0,M,,...,True,False,False,False,,"John Downe, a weaver of Frome, England, was o...",0.056,0.094,0.850,0.9881


In [11]:
# Raw letter texts as a plain Python list, in dataframe row order
data = df["text"].values.tolist()

# Print the first letter as a sanity check
print(data[0])

 TRINIDAD On Train from Steubenville, Ohio, to Cincinnati. Nov 30, 1872. My Darling Sister Justina: How interestedly you, Sister M Louis and myself read Eugénie de Guérin's Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine's letter: Mt St Vincent, O, Nov 27, 1872. Sister Blandina, Steubenville, O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly, Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice, and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to leav

## Create Functions & Objects

In [12]:
# Tokenised documents used to build both the dictionary and the corpus
texts = data_lemmatized

# Token <-> integer id mapping
# NOTE(review): this rebuilds the dictionary from the lemmas instead of reusing the one stored
# with the model; confirm the ids match the vocabulary the model was trained on.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation of every document
corpus = list(map(id2word.doc2bow, texts))

In [13]:
def format_topics(ldamodel=ldamodel, corpus=corpus, texts=texts):
    """Return one row per document describing its dominant topic.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` (yielding, per document, a list of
        ``(topic_id, proportion)`` pairs) and ``show_topic(topic_id)``
        (yielding ``(word, weight)`` pairs).
    corpus : iterable
        Bag-of-words documents passed to ``ldamodel[...]``.
    texts : list-like
        Values appended as the last column; expected to be in the same order
        as ``corpus``.

    Note: the defaults are bound to the notebook globals that exist when this
    cell is executed, so pass the arguments explicitly if those globals change.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to four
        decimals), ``Topic_Keywords`` (comma-separated), followed by one
        unnamed column (label ``0``) holding ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Keyword strings are identical for every document sharing a topic,
    # so compute each one only once.
    keywords_by_topic = {}
    records = []

    for row in ldamodel[corpus]:
        if not row:
            # Keep a placeholder so later rows stay aligned with `texts`
            # (skipping the document would shift every following row).
            records.append((float('nan'), float('nan'), None))
            continue

        # Dominant topic = pair with the highest proportion; on ties the first
        # pair wins, matching a stable descending sort.
        topic_num, prop_topic = max(row, key=lambda x: x[1])

        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])

        records.append(
            (int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending row by row: DataFrame.append was
    # removed in pandas 2.0 and is quadratic. Passing `columns=` also makes an
    # empty corpus yield an empty, correctly labelled frame.
    topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    topics_df = pd.concat([topics_df, contents], axis=1)
    return topics_df

In [14]:
# Dominant topic per document, paired with the raw letter text
df_topic_keywords = format_topics(ldamodel, corpus, data)

# Expose the row index as a document number and apply display headers
df_dominant_topic = df_topic_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Preview the first five documents
df_dominant_topic.head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,5,0.308,"sister, school, man, room, visit, mother, give...","TRINIDAD On Train from Steubenville, Ohio, to..."
1,1,1,0.3901,"place, town, day, walk, house, travel, remain,...","Kansas City, Dec 6, 1872. Number one! Number ..."
2,2,1,0.3827,"place, town, day, walk, house, travel, remain,...","Trinidad, December 10, 1872. My dearest dear:..."
3,3,5,0.3496,"sister, school, man, room, visit, mother, give...",December 21. Rumor is loud in predicting an a...
4,4,1,0.3119,"place, town, day, walk, house, travel, remain,...","March 1, 1873. My Dear Sister Justina: It is ..."


In [15]:
# Lookup table: letter text -> dominant topic number (filled in by the next cell)
map_Text_topicNumber = dict()

In [16]:
for doc_idx, letter_text in enumerate(data):
    # (topic number, proportion) pairs for the document at the same position in the corpus
    ranked_topics = list(ldamodel.get_document_topics(corpus[doc_idx]))

    # Largest proportion first
    ranked_topics.sort(key=lambda pair: pair[1], reverse=True)

    # Record the topic number of the top-ranked pair under the letter's text
    map_Text_topicNumber[letter_text] = ranked_topics[0][0]

In [17]:
# Peek at the first (text, topic number) entry of the lookup table
dict(list(map_Text_topicNumber.items())[:1])

{' TRINIDAD On Train from Steubenville, Ohio, to Cincinnati. Nov 30, 1872. My Darling Sister Justina: How interestedly you, Sister M Louis and myself read Eugénie de Guérin\'s Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine\'s letter: Mt St Vincent, O, Nov 27, 1872. Sister Blandina, Steubenville, O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly, Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice, and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to 

In [18]:
# Label every letter with its dominant topic by looking its text up in the mapping.
# NOTE(review): the mapping is keyed by text, so letters with identical text share one label.
df['topicNumber'] = df['text'].map(map_Text_topicNumber)

# Spot-check the first rows (.loc slices by label, end label included)
df.loc[
    :10,
    ['docid', 'docauthorid', 'text', 'topicNumber'],
]

Unnamed: 0,docid,docauthorid,text,topicNumber
0,S1019-D002,per0001043,"TRINIDAD On Train from Steubenville, Ohio, to...",5
1,S1019-D004,per0001043,"Kansas City, Dec 6, 1872. Number one! Number ...",1
2,S1019-D005,per0001043,"Trinidad, December 10, 1872. My dearest dear:...",1
3,S1019-D006,per0001043,December 21. Rumor is loud in predicting an a...,5
4,S1019-D007,per0001043,"March 1, 1873. My Dear Sister Justina: It is ...",1
5,S1019-D008,per0001043,"July, 1873. Dear Sister Justina: The last tim...",5
6,S1019-D009,per0001043,"September, 1873. Dearest Sister Justina: Our ...",5
7,S1019-D010,per0001043,"Dear Sister Justina: June 30, 1874. It is nea...",5
8,S1019-D011,per0001043,November 14. Three of my senior pupils have g...,5
9,S1019-D012,per0001043,"June, 1876. Dear Sister Justina: To-day I ask...",5


In [19]:
# Most representative document per topic: within each Dominant_Topic group,
# keep the single row with the highest Perc_Contribution.
topics_outdf_grpd = df_topic_keywords.groupby('Dominant_Topic')

# Collect one row per topic and concatenate once, rather than growing a
# DataFrame inside the loop (quadratic, and seeding with an empty frame
# triggers dtype warnings in newer pandas).
topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values('Perc_Contribution', ascending=False).head(1)
        for _, grp in topics_outdf_grpd
    ],
    axis=0,
).reset_index(drop=True)  # fresh 0..n-1 index, one row per topic

# Format
topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.5147,"time, people, day, give, person, manner, order...","Having, in my former letters, obeyed the prin..."
1,1,0.3901,"place, town, day, walk, house, travel, remain,...","Kansas City, Dec 6, 1872. Number one! Number ..."
2,2,0.5737,"land, country, acre, farm, dollar, farmer, sel...",Extract of a letter from Mr Birkbeck to his s...
3,3,0.3939,"week, child, day, critchlow, stay, evening, fe...","Sarah To Martha January 21, 1857 I cannot tel..."
4,4,0.4449,"year, letter, send, family, brother, live, hea...","January 4th 1860 Dear Brother, I am spared th..."
5,5,0.475,"sister, school, man, room, visit, mother, give...",November 14. Three of my senior pupils have g...
6,6,0.4868,"write, book, send, work, copy, receive, paper,...",78 To Richard Bentley Belleville April 24 / 6...
7,7,0.4524,"people, state, year, city, world, dollar, empl...","Wisconsin, United States of America, January ..."
8,8,0.6604,"day, ship, water, bed, passenger, pass, wind, ...",Our residence on the banks of Newfoundland co...
9,9,0.631,"place, year, church, country, settlement, sett...",For more than a year I preached in the large ...


In [20]:
# Number of Documents for Each Topic
topic_counts = df_topic_keywords['Dominant_Topic'].value_counts()
topic_counts

3     171
2      79
11     71
5      51
6      41
4      36
8      28
9      25
10     16
12     15
1      14
7      14
0       8
13      7
Name: Dominant_Topic, dtype: int64

In [21]:
# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)
topic_contribution

3     0.2969
2     0.1372
11    0.1233
5     0.0885
6     0.0712
4     0.0625
8     0.0486
9     0.0434
10    0.0278
12    0.0260
1     0.0243
7     0.0243
0     0.0139
13    0.0122
Name: Dominant_Topic, dtype: float64

In [22]:
# Topic Number and Keywords
topic_num_keywords = df_topic_keywords[['Dominant_Topic', 'Topic_Keywords']].drop_duplicates().sort_values(by = 'Dominant_Topic')
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
89,0,"time, people, day, give, person, manner, order..."
1,1,"place, town, day, walk, house, travel, remain,..."
83,2,"land, country, acre, farm, dollar, farmer, sel..."
185,3,"week, child, day, critchlow, stay, evening, fe..."
71,4,"year, letter, send, family, brother, live, hea..."
0,5,"sister, school, man, room, visit, mother, give..."
56,6,"write, book, send, work, copy, receive, paper,..."
158,7,"people, state, year, city, world, dollar, empl..."
84,8,"day, ship, water, bed, passenger, pass, wind, ..."
159,9,"place, year, church, country, settlement, sett..."


In [23]:
# Replace the leftover document row labels with a clean 0..n-1 index
topic_num_keywords = topic_num_keywords.reset_index(drop=True)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0,"time, people, day, give, person, manner, order..."
1,1,"place, town, day, walk, house, travel, remain,..."
2,2,"land, country, acre, farm, dollar, farmer, sel..."
3,3,"week, child, day, critchlow, stay, evening, fe..."
4,4,"year, letter, send, family, brother, live, hea..."
5,5,"sister, school, man, room, visit, mother, give..."
6,6,"write, book, send, work, copy, receive, paper,..."
7,7,"people, state, year, city, world, dollar, empl..."
8,8,"day, ship, water, bed, passenger, pass, wind, ..."
9,9,"place, year, church, country, settlement, sett..."


In [252]:
# Reordering (disabled: this cell is fully commented out and changes nothing)
# NOTE(review): the index list below has 21 entries (0-20), so it appears to belong to a
# different, 21-topic run; confirm or delete before re-enabling.
# topic_num_keywords = topic_num_keywords.reindex([11,17,14,5,3,2,4,0,18,20,7,1,15,10,16,19,9,6,8,12,13])
# topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
11,11.0,"week, day, weather, boy, baby, morning, doctor..."
17,17.0,"thing, time, people, year, lot, house, uncle, ..."
14,14.0,"land, country, acre, farm, year, farmer, wheat..."
5,5.0,"book, paper, work, dear, copy, friend, world, ..."
3,3.0,"letter, brother, time, wife, health, news, mai..."
2,2.0,"sister, man, room, hospital, mother, place, pa..."
4,4.0,"child, school, mother, boy, time, teacher, par..."
0,0.0,"day, morning, ship, hour, sea, night, time, wi..."
18,18.0,"year, wife, money, time, care, interest, husba..."
20,20.0,"work, dollar, month, day, money, cent, time, w..."


In [24]:
# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts.rename('counts'), topic_contribution.rename('contribution')], axis=1).sort_values(by = 'Dominant_Topic')
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,counts,contribution
0,0,"time, people, day, give, person, manner, order...",8,0.0139
1,1,"place, town, day, walk, house, travel, remain,...",14,0.0243
2,2,"land, country, acre, farm, dollar, farmer, sel...",79,0.1372
3,3,"week, child, day, critchlow, stay, evening, fe...",171,0.2969
4,4,"year, letter, send, family, brother, live, hea...",36,0.0625
5,5,"sister, school, man, room, visit, mother, give...",51,0.0885
6,6,"write, book, send, work, copy, receive, paper,...",41,0.0712
7,7,"people, state, year, city, world, dollar, empl...",14,0.0243
8,8,"day, ship, water, bed, passenger, pass, wind, ...",28,0.0486
9,9,"place, year, church, country, settlement, sett...",25,0.0434


In [25]:
# Persist the per-topic summary table (the row index is written as the first column).
# NOTE(review): the file name has a double underscore after the date, unlike the other files here; confirm it is intended.
df_dominant_topics.to_csv(path_or_buf="20240222__PhD_TopicsLtrMAL14.csv")

In [26]:
# Column overview of the letter dataframe after the topicNumber column was added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          576 non-null    int64  
 1   docauthorid       576 non-null    object 
 2   docauthorname     576 non-null    object 
 3   docid             576 non-null    object 
 4   sourcetitle       576 non-null    object 
 5   docyear           573 non-null    float64
 6   docmonth          519 non-null    float64
 7   docday            474 non-null    float64
 8   authorgender      576 non-null    object 
 9   agewriting        460 non-null    float64
 10  birthyear         463 non-null    float64
 11  deathyear         449 non-null    float64
 12  religionNew       450 non-null    object 
 13  relMin            474 non-null    object 
 14  nationalOrigin    575 non-null    object 
 15  britishEmpire_EU  573 non-null    object 
 16  translated        576 non-null    bool   
 1

In [27]:
# Save the letter dataframe, now including topicNumber.
# The row index is written too, so it comes back as an unnamed first column when the file is re-read.
df.to_csv(path_or_buf="20240222_PhD_Data4NER-Letter.csv")