## Merge topic labels to letter dataframe

In [26]:
import pickle
import gensim
from gensim import models
import gensim.corpora as corpora
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
from pprint import pprint

## Get Resources

In [27]:
# Lemmatized letter text, unpickled from disk.
# NOTE(review): pickle.load can execute code embedded in the file - only load trusted files.
with open("20240608_PhD_LtrLem-N.txt", "rb") as pickle_file:
    data_lemmatized = pickle.load(pickle_file)

In [28]:
# Load the saved gensim LdaModel from disk.
ldamodel = models.ldamodel.LdaModel.load('20240611_PhD_TopicLtrMAL08i1ka5')

In [29]:
# Not needed because I converted from Mallet to LDA in 20240331_PhD_TopicLetterNV-Mallet.ipynb

# Load saved mallet
# lda_mallet = gensim.models.wrappers.LdaMallet.load("20240221_PhD_TopicLtrMAL14")

# Convert to LDA
# ldamodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [30]:
# Pretty-print the topics returned by print_topics (20 words each), sorted by topic id.
pprint(sorted(ldamodel.print_topics(num_words=20)))

[(0,
  '0.107*"letter" + 0.075*"time" + 0.026*"day" + 0.026*"month" + '
  '0.025*"friend" + 0.018*"pleasure" + 0.017*"news" + 0.015*"sister" + '
  '0.015*"home" + 0.014*"health" + 0.013*"return" + 0.013*"opportunity" + '
  '0.013*"happiness" + 0.012*"kind" + 0.012*"writing" + 0.011*"manner" + '
  '0.011*"trust" + 0.011*"father" + 0.011*"care" + 0.011*"weather"'),
 (1,
  '0.032*"dollar" + 0.030*"year" + 0.028*"man" + 0.027*"work" + 0.024*"people" '
  '+ 0.022*"money" + 0.019*"business" + 0.018*"country" + 0.015*"cent" + '
  '0.012*"company" + 0.011*"expense" + 0.011*"board" + 0.011*"pay" + '
  '0.010*"month" + 0.009*"government" + 0.009*"wage" + 0.009*"lot" + '
  '0.009*"number" + 0.008*"office" + 0.008*"state"'),
 (2,
  '0.052*"brother" + 0.051*"mother" + 0.051*"friend" + 0.048*"family" + '
  '0.041*"letter" + 0.032*"year" + 0.032*"sister" + 0.030*"health" + '
  '0.027*"child" + 0.026*"son" + 0.025*"death" + 0.025*"wife" + '
  '0.023*"daughter" + 0.018*"uncle" + 0.017*"home" + 0.017*"a

In [31]:
# Letter metadata; the 'Unnamed: 0' CSV column is relabelled as 'docID-AT'.
df = pd.read_csv("20240608_PhD_Data4TopicModel-Letter.csv").rename(
    columns={'Unnamed: 0': 'docID-AT'}
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          676 non-null    int64  
 1   docid             676 non-null    object 
 2   docyear           676 non-null    int64  
 3   docmonth          669 non-null    float64
 4   authorName        623 non-null    object 
 5   docauthorid       676 non-null    object 
 6   authorLocation    676 non-null    object 
 7   authorGender      676 non-null    object 
 8   nationalOrigin    676 non-null    object 
 9   irish             676 non-null    bool   
 10  otherUK           676 non-null    bool   
 11  relMin            339 non-null    object 
 12  catholic          339 non-null    object 
 13  otherChristian    339 non-null    object 
 14  U                 378 non-null    object 
 15  M                 387 non-null    object 
 16  S                 376 non-null    object 
 1

In [32]:
# Letter texts as a plain Python list, in dataframe row order.
data = df["text"].tolist()

# Create Functions & Objects

In [33]:
# Documents used to build the vocabulary and the corpus
texts = data_lemmatized

# Vocabulary: token <-> integer id
# NOTE(review): this dictionary is rebuilt here rather than taken from the loaded
# model; presumably it yields the same ids the model was trained with - confirm.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation of every document
corpus = [id2word.doc2bow(tokens) for tokens in texts]

In [34]:
def format_topics(ldamodel=ldamodel, corpus=corpus, texts=texts):
    """Build a per-document table describing each document's dominant topic.

    Parameters
    ----------
    ldamodel
        Topic model that can be indexed with a corpus (``ldamodel[corpus]``)
        and exposes ``show_topic``. Defaults to the notebook-level ``ldamodel``.
    corpus
        Documents in the form accepted by ``ldamodel[...]``. Defaults to the
        notebook-level ``corpus``.
    texts
        One entry per document, attached as the last column. Defaults to the
        notebook-level ``texts``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords``, followed by one unnamed column (label ``0``)
        holding ``texts``. Row ``i`` describes document ``i``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic number -> keyword string, computed once per topic
    records = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # No topic returned for this document: keep a placeholder row so
            # that later rows stay aligned with `texts`.
            records.append((None, None, None))
            continue

        # Dominant topic = highest proportion (first one wins on ties, as a
        # stable descending sort would).
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num, topn=20)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)

        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame in one go instead of appending row by row
    # (DataFrame.append is deprecated and removed in pandas 2.0).
    topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([topics_df, contents], axis=1)

In [35]:
# Dominant topic, its contribution and its keywords for every letter in `corpus`
# (all documents are processed, not only the first 10 rows).
df_topic_keywords = format_topics(ldamodel=ldamodel, corpus=corpus, texts=data)

In [36]:
# Turn the row index into a document-number column and relabel all columns
df_dominant_topic = df_topic_keywords.reset_index().set_axis(
    ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'],
    axis=1,
)

# Preview the first rows
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,0,0.1974,"letter, time, day, month, friend, pleasure, ne...",July 18 1891 Dear Sister I have waited until I...
1,1,0,0.1866,"letter, time, day, month, friend, pleasure, ne...",Nov 17th My dearest Kate I got this interestin...
2,2,0,0.1851,"letter, time, day, month, friend, pleasure, ne...",May 25 1892 Dear Sister I write once to Bid yo...
3,3,2,0.2126,"brother, mother, friend, family, letter, year,...",1891 Oct. 12th Miss Weir Dear friend I now ans...
4,4,4,0.1998,"time, day, winter, home, summer, place, people...",February 1st 90 Dear Cousin It is with pleasur...


In [37]:
# Lookup from letter text to its dominant topic number (filled by the loop in the next cell).
map_Text_topicNumber = {}

In [38]:
for doc_position, letter_text in enumerate(data):
    # List of (topic number, proportion) tuples for this letter
    doc_topics = ldamodel.get_document_topics(corpus[doc_position])

    # Rank by proportion, largest first
    ranked_topics = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)

    # Record the topic number of the leading entry under the letter's text
    map_Text_topicNumber[letter_text] = ranked_topics[0][0]

In [39]:
# View the first key value pair in dictionary (text, label)
# dict(list(map_Text_topicNumber.items())[0:1])

In [40]:
# Attach a topic number to every row by looking up the row's text in the mapping.
df['topicNumber'] = df['text'].map(map_Text_topicNumber)
# Preview; .loc slicing is label-based and inclusive, so this selects labels 0 through 10.
df.loc[:10,['docid', 'docauthorid', 'text', 'topicNumber']]

Unnamed: 0,docid,docauthorid,text,topicNumber
0,20910,IED0107,July 18 1891 Dear Sister I have waited until I...,0
1,21062,IED0179,Nov 17th My dearest Kate I got this interestin...,0
2,21324,IED0107,May 25 1892 Dear Sister I write once to Bid yo...,0
3,21334,IED0621,1891 Oct. 12th Miss Weir Dear friend I now ans...,2
4,21354,IED0958,February 1st 90 Dear Cousin It is with pleasur...,4
5,21470,IED0099,March 6th 1800 My dear Maggie I was sorry to l...,6
6,21549,IED0314,5th January 1839 My dear Sisters Christmas and...,6
7,21561,IED0314,24th 1838 My dear Peri Meri and Sisterhood We ...,6
8,21737,IED0657,3rd December 1844 My Dear Aunt Some time ago I...,2
9,21759,IED0831,April 5th 1880 Dear aunt I Received your lette...,2


In [41]:
# For each dominant topic, keep the single letter with the highest contribution
topics_outdf_grpd = df_topic_keywords.groupby('Dominant_Topic')

# Take the top row of every group and concatenate once, instead of growing a
# frame inside the loop; the index is renumbered 0..n-1 afterwards.
topics_sorteddf_mallet = pd.concat(
    [grp.sort_values('Perc_Contribution', ascending=False).head(1)
     for _, grp in topics_outdf_grpd],
    axis=0,
).reset_index(drop=True)

# Format
topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.4544,"letter, time, day, month, friend, pleasure, ne...",March 9th. 1814. My dearest Rachel I embrace w...
1,1,0.5531,"dollar, year, man, work, people, money, busine...",February 8 1863 THE last harvest was very heav...
2,2,0.4029,"brother, mother, friend, family, letter, year,...",May 3rd 1853 My Dear Friend In Jas James Ellis...
3,3,0.477,"life, hand, book, heart, world, interest, mind...",May 2 1858 My dear Mr Bentley CW I was truly g...
4,4,0.4246,"time, day, winter, home, summer, place, people...",August 2. — Took steam-boat to Albany where I ...
5,5,0.5619,"country, land, pound, year, acre, farm, money,...",December the 5 1853 My dear brother I take up ...
6,6,0.7054,"day, house, night, town, morning, water, hour,...",September 19th 1838 My Dear Sisters. I suppose...
7,7,0.5029,"week, child, boy, bed, baby, girl, school, vis...",Sarah To Martha January 21 1857 I cannot tell ...


In [42]:
# Number of documents per dominant topic (value_counts orders by count, largest first)
topic_counts = df_topic_keywords['Dominant_Topic'].value_counts()
topic_counts

7    168
2    117
5     96
3     71
0     68
1     56
4     50
6     50
Name: Dominant_Topic, dtype: int64

In [43]:
# Share of documents per dominant topic, as a fraction of 1 rounded to 4 decimals
topic_contribution = (topic_counts / topic_counts.sum()).round(4)
topic_contribution

7    0.2485
2    0.1731
5    0.1420
3    0.1050
0    0.1006
1    0.0828
4    0.0740
6    0.0740
Name: Dominant_Topic, dtype: float64

In [44]:
# One row per distinct (topic number, keyword string) pair, ordered by topic number
topic_num_keywords = (
    df_topic_keywords[['Dominant_Topic', 'Topic_Keywords']]
    .drop_duplicates()
    .sort_values(by='Dominant_Topic')
)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0,"letter, time, day, month, friend, pleasure, ne..."
18,1,"dollar, year, man, work, people, money, busine..."
3,2,"brother, mother, friend, family, letter, year,..."
13,3,"life, hand, book, heart, world, interest, mind..."
4,4,"time, day, winter, home, summer, place, people..."
70,5,"country, land, pound, year, acre, farm, money,..."
5,6,"day, house, night, town, morning, water, hour,..."
20,7,"week, child, boy, bed, baby, girl, school, vis..."


In [45]:
# Renumber the rows 0..n-1, discarding the previous index labels
topic_num_keywords = topic_num_keywords.reset_index(drop=True)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0,"letter, time, day, month, friend, pleasure, ne..."
1,1,"dollar, year, man, work, people, money, busine..."
2,2,"brother, mother, friend, family, letter, year,..."
3,3,"life, hand, book, heart, world, interest, mind..."
4,4,"time, day, winter, home, summer, place, people..."
5,5,"country, land, pound, year, acre, farm, money,..."
6,6,"day, house, night, town, morning, water, hour,..."
7,7,"week, child, boy, bed, baby, girl, school, vis..."


In [46]:
# Reordering
# topic_num_keywords = topic_num_keywords.reindex([11,17,14,5,3,2,4,0,18,20,7,1,15,10,16,19,9,6,8,12,13])
# topic_num_keywords

In [47]:
# Add the per-topic document count and contribution to the keyword table.
# topic_counts / topic_contribution are indexed by topic number, so they are
# joined on the 'Dominant_Topic' column rather than on the positional row index;
# the two only coincide when every topic number 0..n-1 is present.
df_dominant_topics = (
    topic_num_keywords
    .join(topic_counts.rename('counts'), on='Dominant_Topic')
    .join(topic_contribution.rename('contribution'), on='Dominant_Topic')
    .sort_values(by='Dominant_Topic')
)
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,counts,contribution
0,0,"letter, time, day, month, friend, pleasure, ne...",68,0.1006
1,1,"dollar, year, man, work, people, money, busine...",56,0.0828
2,2,"brother, mother, friend, family, letter, year,...",117,0.1731
3,3,"life, hand, book, heart, world, interest, mind...",71,0.105
4,4,"time, day, winter, home, summer, place, people...",50,0.074
5,5,"country, land, pound, year, acre, farm, money,...",96,0.142
6,6,"day, house, night, town, morning, water, hour,...",50,0.074
7,7,"week, child, boy, bed, baby, girl, school, vis...",168,0.2485


In [48]:
# Save the per-topic summary table (keywords, counts, contribution) to CSV.
df_dominant_topics.to_csv("20240611_PhD_TopicsLtrMAL08i1ka5.csv")

In [49]:
# Re-inspect the letter dataframe after the 'topicNumber' column was added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          676 non-null    int64  
 1   docid             676 non-null    object 
 2   docyear           676 non-null    int64  
 3   docmonth          669 non-null    float64
 4   authorName        623 non-null    object 
 5   docauthorid       676 non-null    object 
 6   authorLocation    676 non-null    object 
 7   authorGender      676 non-null    object 
 8   nationalOrigin    676 non-null    object 
 9   irish             676 non-null    bool   
 10  otherUK           676 non-null    bool   
 11  relMin            339 non-null    object 
 12  catholic          339 non-null    object 
 13  otherChristian    339 non-null    object 
 14  U                 378 non-null    object 
 15  M                 387 non-null    object 
 16  S                 376 non-null    object 
 1

In [50]:
# Save the letter dataframe, now including 'topicNumber', to CSV.
df.to_csv("20240611_PhD_Data4NER-Letter.csv")