## Merge topic labels to chunk dataframe

In [1]:
import pickle
import gensim
from gensim import models
import gensim.corpora as corpora
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
from pprint import pprint

## Get Resources

In [2]:
# Lemmatized text. The object is later passed to corpora.Dictionary and iterated
# with doc2bow, so it is expected to be a list of token lists (one per document).
# NOTE: despite the .txt extension this file is a binary pickle. pickle.load can
# execute arbitrary code, so only unpickle files you produced yourself.
with open("20240608_PhD_LtrChkLem-N.txt", "rb") as fp:   # Unpickling
    data_lemmatized = pickle.load(fp)

In [3]:
# Load the previously saved gensim LdaModel from disk
# (presumably the Mallet model already converted to LDA — confirm against the notebook that saved it).
ldamodel = models.ldamodel.LdaModel.load('20240610_PhD_TopicLtrMAL11a9i4k')

In [4]:
# Kept for reference only: the Mallet model was already converted to a gensim
# LDA model in 20240331_PhD_TopicLetterChkNV-Mallet, so the two steps below
# are not needed here.
# Load saved mallet
# lda_mallet = gensim.models.wrappers.LdaMallet.load("20240221_PhD_TopicChkMAL14")
# Convert to LDA
# ldamodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [5]:
# Show the model's topics with their 20 highest-weighted words, sorted by topic id
topic_word_summaries = ldamodel.print_topics(num_words=20)
pprint(sorted(topic_word_summaries))

[(0,
  '0.086*"day" + 0.054*"time" + 0.045*"winter" + 0.038*"summer" + '
  '0.037*"weather" + 0.030*"health" + 0.026*"week" + 0.023*"month" + '
  '0.020*"place" + 0.018*"country" + 0.018*"town" + 0.017*"spring" + '
  '0.017*"snow" + 0.012*"year" + 0.011*"rain" + 0.011*"fall" + 0.011*"frost" + '
  '0.011*"heat" + 0.009*"account" + 0.009*"climate"'),
 (1,
  '0.055*"dollar" + 0.053*"work" + 0.044*"money" + 0.031*"year" + 0.027*"day" '
  '+ 0.022*"time" + 0.021*"business" + 0.020*"pound" + 0.020*"week" + '
  '0.018*"month" + 0.017*"country" + 0.017*"cent" + 0.015*"house" + '
  '0.015*"board" + 0.014*"man" + 0.013*"place" + 0.012*"wage" + 0.012*"pay" + '
  '0.011*"property" + 0.011*"summer"'),
 (2,
  '0.046*"land" + 0.036*"pound" + 0.034*"acre" + 0.032*"farm" + '
  '0.026*"country" + 0.024*"year" + 0.016*"price" + 0.016*"crop" + '
  '0.016*"wheat" + 0.012*"bushel" + 0.012*"potato" + 0.012*"farmer" + '
  '0.011*"market" + 0.011*"wood" + 0.010*"money" + 0.010*"lot" + 0.010*"horse" '
  '+ 0.00

In [6]:
# Chunk-level metadata: read the CSV and give the unnamed first column
# (the index written by an earlier to_csv) an explicit name.
df = (
    pd.read_csv("20240608_PhD_Data4TopicModel-LetterChunk.csv")
    .rename(columns={'Unnamed: 0': 'docID-AT'})
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          2392 non-null   int64  
 1   docid             2392 non-null   object 
 2   docyear           2392 non-null   int64  
 3   docmonth          2364 non-null   float64
 4   authorName        2177 non-null   object 
 5   docauthorid       2392 non-null   object 
 6   authorLocation    2392 non-null   object 
 7   authorGender      2392 non-null   object 
 8   nationalOrigin    2392 non-null   object 
 9   irish             2392 non-null   bool   
 10  otherUK           2392 non-null   bool   
 11  relMin            1065 non-null   object 
 12  catholic          1065 non-null   object 
 13  otherChristian    1065 non-null   object 
 14  U                 1253 non-null   object 
 15  M                 1276 non-null   object 
 16  S                 1245 non-null   object 


In [7]:
# Raw chunk texts as a plain Python list, in dataframe row order
data = df['text'].values.tolist()
# data[0]  # uncomment to inspect the first chunk

## Create Functions & Objects

In [8]:
# Lemmatized token lists loaded above, under the name the later cells use
texts = data_lemmatized

# Token <-> integer id mapping built from those token lists
# NOTE(review): the dictionary is rebuilt here rather than reusing the one stored
# with the model; ids only match the model if it was trained on a dictionary
# built the same way from the same data — confirm.
id2word = corpora.Dictionary(texts)

# Bag-of-words (token id, count) representation of every document
corpus = list(map(id2word.doc2bow, texts))

In [9]:
def format_topics(ldamodel=ldamodel, corpus=corpus, texts=texts):
    """Return the dominant topic of every document next to its text.

    For each document in ``corpus`` the topic with the highest proportion is
    selected (the first one wins on ties) and reported with its rounded
    proportion and its 20 top keywords.

    Parameters
    ----------
    ldamodel : object supporting ``ldamodel[corpus]`` and ``show_topic``
        Topic model used for inference and keyword lookup.
    corpus : iterable
        Bag-of-words documents accepted by ``ldamodel[...]``.
    texts : sequence
        Values placed in the trailing column, aligned by position with the
        result rows.

    The defaults are the notebook-level objects as they existed when this
    cell was executed; pass the arguments explicitly if those were rebound.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution``, ``Topic_Keywords``
        and an unnamed column labelled ``0`` holding ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Keyword strings depend only on the topic, so compute each one once.
    keywords_by_topic = {}
    records = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # No topic reported for this document: nothing to record.
            continue
        # max() keeps the first of equal proportions, like a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num, topn=20)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)
        records.append(
            (int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([topics_df, contents], axis=1)

In [10]:
# Dominant topic, its share and its keywords for every chunk (all rows, not just
# the first 10), with the raw chunk text in the last column.
df_topic_keywords = format_topics(ldamodel=ldamodel, corpus=corpus, texts=data)

# Expose the row index as a document number, then name all five columns
dominant_topic_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic = df_topic_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns

# Preview the first rows
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,3,0.2828,"letter, time, home, news, friend, month, write...",July 18 1891 Dear Sister I have waited until I...
1,1,3,0.206,"letter, time, home, news, friend, month, write...",Nov 17th My dearest Kate I got this interestin...
2,2,4,0.2601,"brother, family, friend, mother, letter, siste...",better account since he went. The poor parents...
3,3,3,0.2388,"letter, time, home, news, friend, month, write...",May 25 1892 Dear Sister I write once to Bid yo...
4,4,4,0.3123,"brother, family, friend, mother, letter, siste...",1891 Oct. 12th Miss Weir Dear friend I now ans...


In [11]:
# Lookup table filled below: chunk text -> dominant topic number
map_Text_topicNumber = dict()

In [12]:
for doc_position, doc_text in enumerate(data):
    # (topic number, proportion) pairs for this chunk's bag-of-words
    doc_topics = list(ldamodel.get_document_topics(corpus[doc_position]))

    # Largest proportion first
    doc_topics.sort(key=lambda pair: pair[1], reverse=True)

    # The leading pair is the dominant topic. Identical texts share one key,
    # so a later chunk with the same text overwrites the earlier entry.
    map_Text_topicNumber[doc_text] = doc_topics[0][0]

In [13]:
# View the first key value pair in dictionary (text, label)
# dict(list(map_Text_topicNumber.items())[0:1])

In [14]:
# Label every chunk with its dominant topic by looking its text up in the mapping
df['topicNumber'] = df['text'].map(map_Text_topicNumber)

# Preview the identifying columns next to the new label (.loc slices are inclusive)
preview_columns = ['docid', 'docauthorid', 'text', 'topicNumber']
df.loc[:10, preview_columns]

Unnamed: 0,docid,docauthorid,text,topicNumber
0,20910,IED0107,July 18 1891 Dear Sister I have waited until I...,3
1,21062,IED0179,Nov 17th My dearest Kate I got this interestin...,3
2,21062,IED0179,better account since he went. The poor parents...,4
3,21324,IED0107,May 25 1892 Dear Sister I write once to Bid yo...,3
4,21334,IED0621,1891 Oct. 12th Miss Weir Dear friend I now ans...,4
5,21334,IED0621,is that big strong girl Miss Harrison got on h...,4
6,21354,IED0958,February 1st 90 Dear Cousin It is with pleasur...,0
7,21354,IED0958,now so I think So I think I will soon be able ...,2
8,21470,IED0099,March 6th 1800 My dear Maggie I was sorry to l...,5
9,21470,IED0099,not go in. it was after 5 Pm and the doors are...,5


In [15]:
# Most representative chunk per topic: within each dominant-topic group keep the
# single row with the highest Perc_Contribution (head(1), i.e. one row per topic).
topics_outdf_grpd = df_topic_keywords.groupby('Dominant_Topic')

# Collect the top row of every group first and concatenate once. Growing a frame
# with pd.concat inside the loop copies it on every pass, and seeding it with an
# empty DataFrame relies on concat behaviour that pandas has deprecated.
top_rows_per_topic = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, grp in topics_outdf_grpd
]

# ignore_index=True renumbers the rows 0..n-1 (same as reset_index(drop=True))
topics_sorteddf_mallet = pd.concat(top_rows_per_topic, axis=0, ignore_index=True)

# Format
topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.7281,"day, time, winter, summer, weather, health, we...",durning the night. this day fine and clear. 12...
1,1,0.7375,"dollar, work, money, year, day, time, business...",At Chippawa I received employment for seventee...
2,2,0.7958,"land, pound, acre, farm, country, year, price,...",increase is about 20 sowing only one bushel Wi...
3,3,0.6189,"letter, time, home, news, friend, month, write...",to hear Mary Cumming stayed so long with you. ...
4,4,0.7057,"brother, family, friend, mother, letter, siste...",he left a wife and 2 children and one born sin...
5,5,0.7566,"room, house, water, tree, side, day, morning, ...",and should you look around to examine the tree...
6,6,0.7586,"book, work, hand, paper, life, copy, thing, mi...",as a writer. His fame like many of the great e...
7,7,0.681,"week, child, day, school, boy, home, night, ev...",Sarah to Martha January 4 This being the anniv...
8,8,0.7638,"ship, boat, day, vessel, gold, sea, thing, boa...",were fellow sufferers. All the passengers part...
9,9,0.7638,"heart, life, child, friend, mother, death, hus...",greeting. Her letter informed me of your sad b...


In [16]:
# How many chunks have each topic as their dominant topic
dominant_topic_labels = df_topic_keywords['Dominant_Topic']
topic_counts = dominant_topic_labels.value_counts()
topic_counts

3     299
7     292
4     266
9     263
2     227
10    214
5     182
0     173
6     171
1     158
8     147
Name: Dominant_Topic, dtype: int64

In [17]:
# Share of chunks per dominant topic, as a fraction of all chunks (0-1, 4 decimals)
total_documents = topic_counts.sum()
topic_contribution = round(topic_counts / total_documents, 4)
topic_contribution

3     0.1250
7     0.1221
4     0.1112
9     0.1099
2     0.0949
10    0.0895
5     0.0761
0     0.0723
6     0.0715
1     0.0661
8     0.0615
Name: Dominant_Topic, dtype: float64

In [18]:
# One row per (topic number, keyword string) pair, ordered by topic number
topic_keyword_pairs = df_topic_keywords[['Dominant_Topic', 'Topic_Keywords']]
unique_topic_keyword_pairs = topic_keyword_pairs.drop_duplicates()
topic_num_keywords = unique_topic_keyword_pairs.sort_values(by='Dominant_Topic')
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
6,0,"day, time, winter, summer, weather, health, we..."
70,1,"dollar, work, money, year, day, time, business..."
7,2,"land, pound, acre, farm, country, year, price,..."
0,3,"letter, time, home, news, friend, month, write..."
2,4,"brother, family, friend, mother, letter, siste..."
8,5,"room, house, water, tree, side, day, morning, ..."
11,6,"book, work, hand, paper, life, copy, thing, mi..."
57,7,"week, child, day, school, boy, home, night, ev..."
19,8,"ship, boat, day, vessel, gold, sea, thing, boa..."
25,9,"heart, life, child, friend, mother, death, hus..."


In [19]:
# Renumber the rows 0..n-1, discarding the labels inherited from the document rows
topic_num_keywords = topic_num_keywords.reset_index(drop=True)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0,"day, time, winter, summer, weather, health, we..."
1,1,"dollar, work, money, year, day, time, business..."
2,2,"land, pound, acre, farm, country, year, price,..."
3,3,"letter, time, home, news, friend, month, write..."
4,4,"brother, family, friend, mother, letter, siste..."
5,5,"room, house, water, tree, side, day, morning, ..."
6,6,"book, work, hand, paper, life, copy, thing, mi..."
7,7,"week, child, day, school, boy, home, night, ev..."
8,8,"ship, boat, day, vessel, gold, sea, thing, boa..."
9,9,"heart, life, child, friend, mother, death, hus..."


In [20]:
# Reordering
# topic_num_keywords = topic_num_keywords.reindex([11,17,14,5,3,2,4,0,18,20,7,1,15,10,16,19,9,6,8,12,13])
# topic_num_keywords

In [21]:
# Attach the count and contribution of each topic to its keyword row.
# topic_counts and topic_contribution are indexed by topic number, whereas
# topic_num_keywords is indexed by row position, so look the values up by the
# Dominant_Topic value. A plain column-wise concat only lines up when the
# dominant topics happen to be exactly 0..n-1 in order.
topic_ids = topic_num_keywords['Dominant_Topic']
df_dominant_topics = topic_num_keywords.assign(
    counts=topic_ids.map(topic_counts),
    contribution=topic_ids.map(topic_contribution),
).sort_values(by='Dominant_Topic')
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,counts,contribution
0,0,"day, time, winter, summer, weather, health, we...",173,0.0723
1,1,"dollar, work, money, year, day, time, business...",158,0.0661
2,2,"land, pound, acre, farm, country, year, price,...",227,0.0949
3,3,"letter, time, home, news, friend, month, write...",299,0.125
4,4,"brother, family, friend, mother, letter, siste...",266,0.1112
5,5,"room, house, water, tree, side, day, morning, ...",182,0.0761
6,6,"book, work, hand, paper, life, copy, thing, mi...",171,0.0715
7,7,"week, child, day, school, boy, home, night, ev...",292,0.1221
8,8,"ship, boat, day, vessel, gold, sea, thing, boa...",147,0.0615
9,9,"heart, life, child, friend, mother, death, hus...",263,0.1099


In [22]:
# Save the per-topic summary (keywords, counts, contribution); the row index is written as the first CSV column
df_dominant_topics.to_csv("20240611_PhD_TopicsLtrChkMAL11a9i4k.csv")

In [22]:
# Check the metadata frame again now that the topicNumber column has been added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          2392 non-null   int64  
 1   docid             2392 non-null   object 
 2   docyear           2392 non-null   int64  
 3   docmonth          2364 non-null   float64
 4   authorName        2177 non-null   object 
 5   docauthorid       2392 non-null   object 
 6   authorLocation    2392 non-null   object 
 7   authorGender      2392 non-null   object 
 8   nationalOrigin    2392 non-null   object 
 9   irish             2392 non-null   bool   
 10  otherUK           2392 non-null   bool   
 11  relMin            1065 non-null   object 
 12  catholic          1065 non-null   object 
 13  otherChristian    1065 non-null   object 
 14  U                 1253 non-null   object 
 15  M                 1276 non-null   object 
 16  S                 1245 non-null   object 


In [23]:
# Save the chunk dataframe with its topic labels; the row index is written as an unnamed first CSV column
df.to_csv("20240611_PhD_Data4NER-LtrChk.csv")