## Merge topic labels to chunk dataframe

In [1]:
import pickle
import gensim
from gensim import models
import gensim.corpora as corpora
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
from pprint import pprint

## Get Resources

In [2]:
# Lemmanized text
with open("20240405_PhD_LtrChkLem-N.txt", "rb") as fp:   # Unpickling
    data_lemmatized = pickle.load(fp)

In [3]:
ldamodel = models.ldamodel.LdaModel.load('20240406_PhD_TopicLtrChkMAL23')

In [4]:
# Not needed because I converted from Mallet to LDA in 20240331_PhD_TopicLetterChkNV-Mallet
# Load saved mallet
# lda_mallet = gensim.models.wrappers.LdaMallet.load("20240221_PhD_TopicChkMAL14")
# Convert to LDA
# ldamodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [5]:
pprint(sorted(ldamodel.print_topics(num_words=10)))

[(0,
  '0.127*"life" + 0.051*"death" + 0.043*"mind" + 0.028*"manner" + 0.024*"soul" '
  '+ 0.021*"nature" + 0.020*"trial" + 0.017*"truth" + 0.016*"coal" + '
  '0.016*"matter"'),
 (1,
  '0.132*"land" + 0.079*"acre" + 0.073*"year" + 0.071*"farm" + 0.034*"wheat" + '
  '0.032*"dollar" + 0.029*"farmer" + 0.024*"crop" + 0.024*"price" + '
  '0.023*"bushel"'),
 (2,
  '0.035*"body" + 0.033*"air" + 0.032*"food" + 0.023*"night" + 0.022*"head" + '
  '0.022*"arm" + 0.020*"blood" + 0.019*"pain" + 0.017*"drink" + 0.016*"heat"'),
 (3,
  '0.154*"child" + 0.145*"letter" + 0.035*"friend" + 0.032*"heart" + '
  '0.031*"baby" + 0.031*"kind" + 0.022*"write" + 0.021*"boy" + 0.020*"news" + '
  '0.020*"girl"'),
 (4,
  '0.047*"water" + 0.033*"tea" + 0.025*"pork" + 0.023*"bread" + 0.022*"thing" '
  '+ 0.020*"piece" + 0.019*"box" + 0.016*"pound" + 0.016*"butter" + '
  '0.016*"sugar"'),
 (6,
  '0.123*"school" + 0.057*"church" + 0.038*"number" + 0.030*"teacher" + '
  '0.024*"pupil" + 0.023*"town" + 0.022*"service" +

In [6]:
# Metadata
df = pd.read_csv("20240405_PhD_Data4TopicModel-LetterChunk.csv") 
df = df.rename(columns={'Unnamed: 0':'docID-AT'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2270 entries, 0 to 2269
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          2270 non-null   int64  
 1   docauthorid       2270 non-null   object 
 2   docauthorname     2270 non-null   object 
 3   docid             2270 non-null   object 
 4   docyear           2235 non-null   float64
 5   docmonth          2171 non-null   float64
 6   authorgender      2270 non-null   object 
 7   agewriting        1536 non-null   float64
 8   agedeath          1525 non-null   float64
 9   relMin            1870 non-null   object 
 10  nationalOrigin    2266 non-null   object 
 11  authorLocation    2270 non-null   object 
 12  U                 2076 non-null   object 
 13  M                 2076 non-null   object 
 14  S                 2076 non-null   object 
 15  F                 2076 non-null   object 
 16  L                 2076 non-null   object 


In [8]:
data = df.text.values.tolist()
# data[0]

# Create Functions & Objects

In [9]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [10]:
def format_topics(ldamodel=ldamodel, corpus=corpus, texts=texts):
    # Init output
    topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                topics_df = topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    topics_df = pd.concat([topics_df, contents], axis=1)
    return(topics_df)

In [11]:
# Check the topic for the first 10 rows. 
df_topic_keywords = format_topics(ldamodel=ldamodel, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,7,0.0855,"mind, order, call, man, companion, judge, hour...",TRINIDAD On Train from Steubenville Ohio to Ci...
1,1,10,0.086,"man, mine, time, reason, number, distance, wag...",in the island of Cuba. So we concluded that Cu...
2,2,10,0.1045,"man, mine, time, reason, number, distance, wag...","Trinidad."" ""Where is Trinidad?"" ""A little mini..."
3,3,21,0.09,"room, hand, door, table, hour, fear, stage, ge...",frighten me any more than others who will be t...
4,4,21,0.1162,"room, hand, door, table, hour, fear, stage, ge...",good to anyone in need. He died possessed of t...


In [12]:
map_Text_topicNumber = {}

In [13]:
for index, item in enumerate(data):
    topic_dist = ldamodel.get_document_topics(corpus[index])

    # topic_dist is a list of tuples: each tuple has a topic number with its corresponding proportion
    sorted_topic_dist = sorted(topic_dist, key=lambda x: (x[1]), reverse=True) # Sort it in descending order
    
    # Pick the top one:
    topic_number = sorted_topic_dist[0][0]
    map_Text_topicNumber[item] = topic_number

In [14]:
# View the first key value pair in dictionary (text, label)
# dict(list(map_Text_topicNumber.items())[0:1])

In [16]:
df['topicNumber'] = df['text'].map(map_Text_topicNumber)
df.loc[:10,['docid', 'docauthorid', 'text', 'topicNumber']]

Unnamed: 0,docid,docauthorid,text,topicNumber
0,S1019-D002,per0001043,TRINIDAD On Train from Steubenville Ohio to Ci...,7
1,S1019-D002,per0001043,in the island of Cuba. So we concluded that Cu...,10
2,S1019-D002,per0001043,"Trinidad."" ""Where is Trinidad?"" ""A little mini...",10
3,S1019-D002,per0001043,frighten me any more than others who will be t...,21
4,S1019-D002,per0001043,good to anyone in need. He died possessed of t...,21
5,S1019-D002,per0001043,I met Sisters Gabriella and Delphina walking i...,12
6,S1019-D002,per0001043,interest. You know that up to date the Sisters...,6
7,S1019-D002,per0001043,mother kept quiet. When we were permitted a lo...,15
8,S1019-D002,per0001043,to visit our Ecclesiastical Superior. The Most...,2
9,S1019-D002,per0001043,must not go on this far away mission! Are you ...,16


In [17]:
# Group top 5 sentences under each topic
topics_sorteddf_mallet = pd.DataFrame()

topics_outdf_grpd = df_topic_keywords.groupby('Dominant_Topic')

for i, grp in topics_outdf_grpd:
    topics_sorteddf_mallet = pd.concat([topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.3772,"life, death, mind, manner, soul, nature, trial...",to all that mediums tell you. They are often d...
1,1,0.5846,"land, acre, year, farm, wheat, dollar, farmer,...",that crop 485 Harvesting and threshing wheat 1...
2,2,0.5005,"body, air, food, night, head, arm, blood, pain...",minutes or about fourteen times in an hour. I ...
3,3,0.2874,"child, letter, friend, heart, baby, kind, writ...",wrote quite in a huff ‘Supposing I was home an...
4,4,0.4382,"water, tea, pork, bread, thing, piece, box, po...",pleasant and salt cod particularly so when eat...
5,5,0.3006,"work, day, man, business, trade, hand, carpent...",may lose much time and money (to him how preci...
6,6,0.3353,"school, church, number, teacher, pupil, town, ...",draft a bill that will be tabled. Do listen to...
7,7,0.3374,"mind, order, call, man, companion, judge, hour...",to walk between the sheriff and Sister to the ...
8,8,0.1748,"time, day, week, thing, trouble, return, chang...",Sarah to Martha November 13 The tidings of dea...
9,9,0.3132,"book, paper, copy, interest, letter, story, te...",and possesses much dramatic effect and it coul...


In [18]:
# Number of Documents for Each Topic
topic_counts = df_topic_keywords['Dominant_Topic'].value_counts()
topic_counts

3     156
1     154
18    149
9     133
15    131
22    120
10    118
6     109
16    102
4      94
13     94
20     93
11     90
21     87
12     82
7      81
14     78
19     70
5      69
0      69
17     69
8      65
2      57
Name: Dominant_Topic, dtype: int64

In [19]:
# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)
topic_contribution

3     0.0687
1     0.0678
18    0.0656
9     0.0586
15    0.0577
22    0.0529
10    0.0520
6     0.0480
16    0.0449
4     0.0414
13    0.0414
20    0.0410
11    0.0396
21    0.0383
12    0.0361
7     0.0357
14    0.0344
19    0.0308
5     0.0304
0     0.0304
17    0.0304
8     0.0286
2     0.0251
Name: Dominant_Topic, dtype: float64

In [20]:
# Topic Number and Keywords
topic_num_keywords = df_topic_keywords[['Dominant_Topic', 'Topic_Keywords']].drop_duplicates().sort_values(by = 'Dominant_Topic')
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
151,0,"life, death, mind, manner, soul, nature, trial..."
641,1,"land, acre, year, farm, wheat, dollar, farmer,..."
8,2,"body, air, food, night, head, arm, blood, pain..."
81,3,"child, letter, friend, heart, baby, kind, writ..."
42,4,"water, tea, pork, bread, thing, piece, box, po..."
19,5,"work, day, man, business, trade, hand, carpent..."
6,6,"school, church, number, teacher, pupil, town, ..."
0,7,"mind, order, call, man, companion, judge, hour..."
152,8,"time, day, week, thing, trouble, return, chang..."
100,9,"book, paper, copy, interest, letter, story, te..."


In [21]:
topic_num_keywords.reset_index(drop=True, inplace=True)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0,"life, death, mind, manner, soul, nature, trial..."
1,1,"land, acre, year, farm, wheat, dollar, farmer,..."
2,2,"body, air, food, night, head, arm, blood, pain..."
3,3,"child, letter, friend, heart, baby, kind, writ..."
4,4,"water, tea, pork, bread, thing, piece, box, po..."
5,5,"work, day, man, business, trade, hand, carpent..."
6,6,"school, church, number, teacher, pupil, town, ..."
7,7,"mind, order, call, man, companion, judge, hour..."
8,8,"time, day, week, thing, trouble, return, chang..."
9,9,"book, paper, copy, interest, letter, story, te..."


In [22]:
# Reordering
# topic_num_keywords = topic_num_keywords.reindex([11,17,14,5,3,2,4,0,18,20,7,1,15,10,16,19,9,6,8,12,13])
# topic_num_keywords

In [23]:
# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts.rename('counts'), topic_contribution.rename('contribution')], axis=1).sort_values(by = 'Dominant_Topic')
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,counts,contribution
0,0,"life, death, mind, manner, soul, nature, trial...",69,0.0304
1,1,"land, acre, year, farm, wheat, dollar, farmer,...",154,0.0678
2,2,"body, air, food, night, head, arm, blood, pain...",57,0.0251
3,3,"child, letter, friend, heart, baby, kind, writ...",156,0.0687
4,4,"water, tea, pork, bread, thing, piece, box, po...",94,0.0414
5,5,"work, day, man, business, trade, hand, carpent...",69,0.0304
6,6,"school, church, number, teacher, pupil, town, ...",109,0.048
7,7,"mind, order, call, man, companion, judge, hour...",81,0.0357
8,8,"time, day, week, thing, trouble, return, chang...",65,0.0286
9,9,"book, paper, copy, interest, letter, story, te...",133,0.0586


In [24]:
df_dominant_topics.to_csv("20240411_PhD_TopicsLtrChkMAL23.csv")

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2270 entries, 0 to 2269
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          2270 non-null   int64  
 1   docauthorid       2270 non-null   object 
 2   docauthorname     2270 non-null   object 
 3   docid             2270 non-null   object 
 4   docyear           2235 non-null   float64
 5   docmonth          2171 non-null   float64
 6   authorgender      2270 non-null   object 
 7   agewriting        1536 non-null   float64
 8   agedeath          1525 non-null   float64
 9   relMin            1870 non-null   object 
 10  nationalOrigin    2266 non-null   object 
 11  authorLocation    2270 non-null   object 
 12  U                 2076 non-null   object 
 13  M                 2076 non-null   object 
 14  S                 2076 non-null   object 
 15  F                 2076 non-null   object 
 16  L                 2076 non-null   object 


In [26]:
df.to_csv("20240411_PhD_Data4NER-LtrChk.csv")