## Merge topic labels to letter dataframe

In [19]:
import pickle
import gensim
from gensim import models
import gensim.corpora as corpora
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
from pprint import pprint

## Get Resources

In [57]:
# Lemmatized text: the file is a pickle (opened in binary mode) despite its .txt name
with open("20240405_PhD_LtrLem-N.txt", "rb") as pickled_tokens:
    data_lemmatized = pickle.load(pickled_tokens)

In [58]:
# Load the previously saved gensim LdaModel from disk
ldamodel = models.ldamodel.LdaModel.load('20240406_PhD_TopicLtrMAL10')

In [59]:
# Not needed because I converted from Mallet to LDA in 20240331_PhD_TopicLetterNV-Mallet.ipynb

# Load saved mallet
# lda_mallet = gensim.models.wrappers.LdaMallet.load("20240221_PhD_TopicLtrMAL14")

# Convert to LDA
# ldamodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [60]:
# Print up to 10 weighted words for each topic, ordered by topic id
pprint(sorted(ldamodel.print_topics(num_words=10)))

[(0,
  '0.035*"book" + 0.033*"work" + 0.032*"year" + 0.014*"difficulty" + '
  '0.014*"paper" + 0.013*"interest" + 0.013*"copy" + 0.011*"end" + '
  '0.010*"opinion" + 0.010*"term"'),
 (1,
  '0.070*"country" + 0.054*"land" + 0.040*"year" + 0.038*"acre" + 0.029*"farm" '
  '+ 0.022*"money" + 0.019*"pound" + 0.017*"wood" + 0.016*"wheat" + '
  '0.016*"price"'),
 (2,
  '0.045*"place" + 0.036*"dollar" + 0.031*"house" + 0.030*"day" + 0.029*"work" '
  '+ 0.023*"cent" + 0.020*"town" + 0.018*"board" + 0.017*"river" + '
  '0.016*"business"'),
 (3,
  '0.088*"letter" + 0.047*"friend" + 0.038*"time" + 0.031*"kind" + '
  '0.029*"family" + 0.023*"wife" + 0.022*"health" + 0.022*"child" + '
  '0.021*"winter" + 0.021*"month"'),
 (4,
  '0.040*"time" + 0.039*"man" + 0.032*"day" + 0.023*"room" + 0.020*"work" + '
  '0.016*"hospital" + 0.014*"place" + 0.013*"order" + 0.012*"patient" + '
  '0.012*"money"'),
 (5,
  '0.051*"school" + 0.017*"number" + 0.017*"door" + 0.015*"place" + '
  '0.012*"teacher" + 0.011*"gen

In [61]:
# Letter metadata; the unnamed first CSV column is relabelled 'docID-AT'
df = pd.read_csv("20240405_PhD_Data4TopicModel-Letter.csv").rename(
    columns={'Unnamed: 0': 'docID-AT'}
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          492 non-null    int64  
 1   docauthorid       492 non-null    object 
 2   docauthorname     492 non-null    object 
 3   docid             492 non-null    object 
 4   docyear           489 non-null    float64
 5   docmonth          477 non-null    float64
 6   authorgender      492 non-null    object 
 7   agewriting        380 non-null    float64
 8   agedeath          365 non-null    float64
 9   relMin            396 non-null    object 
 10  nationalOrigin    491 non-null    object 
 11  authorLocation    492 non-null    object 
 12  U                 442 non-null    object 
 13  M                 442 non-null    object 
 14  S                 442 non-null    object 
 15  F                 442 non-null    object 
 16  L                 442 non-null    object 
 1

In [62]:
# Raw letter texts as a plain Python list, in dataframe row order
data = df['text'].tolist()
# data[0]

# Create Functions & Objects

In [63]:
# Lemmatized documents that feed both the dictionary and the corpus
texts = data_lemmatized

# Dictionary built from the lemmatized documents
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: one doc2bow result per document, in the same order as texts
corpus = list(map(id2word.doc2bow, texts))

In [64]:
def format_topics(ldamodel=ldamodel, corpus=corpus, texts=texts):
    """Build a per-document table of the dominant topic.

    Parameters
    ----------
    ldamodel
        Topic model. It is indexed with ``corpus`` to obtain, for every
        document, a list of ``(topic id, proportion)`` pairs, and its
        ``show_topic`` method supplies the ``(word, weight)`` pairs of a topic.
    corpus
        Iterable of documents in the form the model accepts.
    texts
        Sequence of per-document contents; appended as the last column.
        It is expected to be in the same order as ``corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``'Dominant_Topic'``,
        ``'Perc_Contribution'`` (rounded to 4 decimals), ``'Topic_Keywords'``
        and a final unnamed column (label ``0``) holding ``texts``.
    """
    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # No topic reported for this document: keep an empty placeholder row so
            # that every later row still lines up with its entry in `texts`.
            records.append((None, None, None))
            continue

        # Dominant topic = pair with the largest proportion (first one wins on ties,
        # the same choice a stable descending sort would make).
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _weight in ldamodel.show_topic(topic_num)
            )

        # float() first so the rounded value is a plain Python float
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0 and growing a frame row by row is quadratic.
    topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    topics_df = pd.concat([topics_df, contents], axis=1)
    return topics_df

In [65]:
# Dominant topic, its contribution and its keywords for every document, next to the raw letter text
df_topic_keywords = format_topics(ldamodel=ldamodel, corpus=corpus, texts=data)

In [66]:
# Format: reset_index() moves the row index into a leading column, which is then named Document_No
df_dominant_topic = df_topic_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show the first five rows
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,4,0.2626,"time, man, day, room, work, hospital, place, o...",TRINIDAD On Train from Steubenville Ohio to C...
1,1,5,0.35,"school, number, door, place, teacher, gentlema...",Kansas City Dec 6 1872. Number one! Number tw...
2,2,5,0.4906,"school, number, door, place, teacher, gentlema...",Trinidad December 10 1872. My dearest dear: H...
3,3,5,0.521,"school, number, door, place, teacher, gentlema...",December 21. Rumor is loud in predicting an a...
4,4,5,0.3773,"school, number, door, place, teacher, gentlema...",March 1 1873. My Dear Sister Justina: It is s...


In [67]:
# Lookup from letter text to the id of its dominant topic (filled by the loop in the next cell)
map_Text_topicNumber = {}

In [68]:
for doc_position, letter_text in enumerate(data):
    # List of (topic number, proportion) tuples for the document at this position
    doc_topics = ldamodel.get_document_topics(corpus[doc_position])

    # Largest proportion first
    ranked_topics = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)

    # Keep only the topic number of the leading pair.
    # NOTE: the dict is keyed by the text itself, so letters with identical text
    # share one entry and the last one processed wins.
    map_Text_topicNumber[letter_text] = ranked_topics[0][0]

In [69]:
# View the first key value pair in dictionary (text, label)
# dict(list(map_Text_topicNumber.items())[0:1])

In [70]:
# Label each letter with its dominant topic by looking its text up in the mapping
df['topicNumber'] = df['text'].map(map_Text_topicNumber)
# .loc slices by label and includes the end label, so this selects rows 0 through 10
df.loc[:10,['docid', 'docauthorid', 'text', 'topicNumber']]

Unnamed: 0,docid,docauthorid,text,topicNumber
0,S1019-D002,per0001043,TRINIDAD On Train from Steubenville Ohio to C...,4
1,S1019-D004,per0001043,Kansas City Dec 6 1872. Number one! Number tw...,5
2,S1019-D005,per0001043,Trinidad December 10 1872. My dearest dear: H...,5
3,S1019-D006,per0001043,December 21. Rumor is loud in predicting an a...,5
4,S1019-D007,per0001043,March 1 1873. My Dear Sister Justina: It is s...,5
5,S1019-D008,per0001043,July 1873. Dear Sister Justina: The last time...,5
6,S1019-D009,per0001043,September 1873. Dearest Sister Justina: Our s...,5
7,S1019-D010,per0001043,Dear Sister Justina: June 30 1874. It is near...,4
8,S1019-D011,per0001043,November 14. Three of my senior pupils have g...,5
9,S1019-D012,per0001043,June 1876. Dear Sister Justina: To-day I aske...,5


In [71]:
# Most representative letter per topic: within each dominant-topic group,
# keep the single row with the highest Perc_Contribution
topics_outdf_grpd = df_topic_keywords.groupby('Dominant_Topic')

# Collect the per-topic rows first and concatenate once, instead of growing a
# DataFrame inside the loop starting from an empty frame
topics_sorteddf_mallet = pd.concat(
    [grp.sort_values(['Perc_Contribution'], ascending=[False]).head(1)
     for _topic, grp in topics_outdf_grpd],
    axis=0,
)

# Reset Index
topics_sorteddf_mallet = topics_sorteddf_mallet.reset_index(drop=True)

# Format
topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.5076,"book, work, year, difficulty, paper, interest,...",78 To Richard Bentley Belleville April 24 / 6...
1,1,0.629,"country, land, year, acre, farm, money, pound,...","Illinois. ""I have now however so far entered i..."
2,2,0.5342,"place, dollar, house, day, work, cent, town, b...",Wisconsin March 1850. Dear Patrick: Having in...
3,3,0.3731,"letter, friend, time, kind, family, wife, heal...",March the 22th 1867 Dear Brother I write thes...
4,4,0.5308,"time, man, day, room, work, hospital, place, o...","January 1881. My old acquaintance ""Billy the ..."
5,5,0.521,"school, number, door, place, teacher, gentlema...",December 21. Rumor is loud in predicting an a...
6,6,0.5773,"people, year, man, state, government, world, c...",Wisconsin United States of America January 1s...
7,7,0.3728,"week, day, child, morning, evening, boy, baby,...",Sarah To Martha January 21 1857 I cannot tell...
8,8,0.482,"life, hand, heart, time, woman, night, trouble...",To Richard Bentley [autumn 1855] Mr Doran's b...
9,9,0.5598,"day, ship, body, water, air, month, vessel, se...",Go in good time and secure a berth in the ship...


In [72]:
# Number of documents for which each topic is the dominant one (largest count first)
topic_counts = df_topic_keywords['Dominant_Topic'].value_counts()
topic_counts

7    148
1     86
3     72
0     36
4     30
8     30
5     29
2     21
6     21
9     19
Name: Dominant_Topic, dtype: int64

In [73]:
# Share of documents per dominant topic (fraction of all documents, 4 decimals)
topic_contribution = (topic_counts / topic_counts.sum()).round(4)
topic_contribution

7    0.3008
1    0.1748
3    0.1463
0    0.0732
4    0.0610
8    0.0610
5    0.0589
2    0.0427
6    0.0427
9    0.0386
Name: Dominant_Topic, dtype: float64

In [74]:
# One row per distinct (topic number, keyword string) pair, ordered by topic number
topic_num_keywords = (
    df_topic_keywords[['Dominant_Topic', 'Topic_Keywords']]
    .drop_duplicates()
    .sort_values(by='Dominant_Topic')
)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
21,0,"book, work, year, difficulty, paper, interest,..."
18,1,"country, land, year, acre, farm, money, pound,..."
83,2,"place, dollar, house, day, work, cent, town, b..."
58,3,"letter, friend, time, kind, family, wife, heal..."
0,4,"time, man, day, room, work, hospital, place, o..."
1,5,"school, number, door, place, teacher, gentlema..."
142,6,"people, year, man, state, government, world, c..."
117,7,"week, day, child, morning, evening, boy, baby,..."
64,8,"life, hand, heart, time, woman, night, trouble..."
138,9,"day, ship, body, water, air, month, vessel, se..."


In [75]:
# Drop the leftover document-row labels in favour of a fresh 0..n-1 index
topic_num_keywords = topic_num_keywords.reset_index(drop=True)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0,"book, work, year, difficulty, paper, interest,..."
1,1,"country, land, year, acre, farm, money, pound,..."
2,2,"place, dollar, house, day, work, cent, town, b..."
3,3,"letter, friend, time, kind, family, wife, heal..."
4,4,"time, man, day, room, work, hospital, place, o..."
5,5,"school, number, door, place, teacher, gentlema..."
6,6,"people, year, man, state, government, world, c..."
7,7,"week, day, child, morning, evening, boy, baby,..."
8,8,"life, hand, heart, time, woman, night, trouble..."
9,9,"day, ship, body, water, air, month, vessel, se..."


In [76]:
# Reordering
# topic_num_keywords = topic_num_keywords.reindex([11,17,14,5,3,2,4,0,18,20,7,1,15,10,16,19,9,6,8,12,13])
# topic_num_keywords

In [77]:
# Attach the document count and share to each topic row by looking up the topic
# number itself. Joining on the row index only lines up when every topic number
# 0..n-1 occurs as a dominant topic, because only then does the reset row index
# coincide with the topic number.
df_dominant_topics = topic_num_keywords.assign(
    counts=topic_num_keywords['Dominant_Topic'].map(topic_counts),
    contribution=topic_num_keywords['Dominant_Topic'].map(topic_contribution),
).sort_values(by='Dominant_Topic')
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,counts,contribution
0,0,"book, work, year, difficulty, paper, interest,...",36,0.0732
1,1,"country, land, year, acre, farm, money, pound,...",86,0.1748
2,2,"place, dollar, house, day, work, cent, town, b...",21,0.0427
3,3,"letter, friend, time, kind, family, wife, heal...",72,0.1463
4,4,"time, man, day, room, work, hospital, place, o...",30,0.061
5,5,"school, number, door, place, teacher, gentlema...",29,0.0589
6,6,"people, year, man, state, government, world, c...",21,0.0427
7,7,"week, day, child, morning, evening, boy, baby,...",148,0.3008
8,8,"life, hand, heart, time, woman, night, trouble...",30,0.061
9,9,"day, ship, body, water, air, month, vessel, se...",19,0.0386


In [78]:
# Save the per-topic summary (keywords, counts, contribution); the row index is written as the first column
df_dominant_topics.to_csv("20240411_PhD_TopicsLtrMAL10.csv")

In [79]:
# Check the letter dataframe after the topicNumber column was added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          492 non-null    int64  
 1   docauthorid       492 non-null    object 
 2   docauthorname     492 non-null    object 
 3   docid             492 non-null    object 
 4   docyear           489 non-null    float64
 5   docmonth          477 non-null    float64
 6   authorgender      492 non-null    object 
 7   agewriting        380 non-null    float64
 8   agedeath          365 non-null    float64
 9   relMin            396 non-null    object 
 10  nationalOrigin    491 non-null    object 
 11  authorLocation    492 non-null    object 
 12  U                 442 non-null    object 
 13  M                 442 non-null    object 
 14  S                 442 non-null    object 
 15  F                 442 non-null    object 
 16  L                 442 non-null    object 
 1

In [80]:
# Save the letters with their topic labels; the row index is written as an unnamed first column
df.to_csv("20240411_PhD_Data4NER-Letter.csv")