## Merge topic labels to chunk dataframe

In [1]:
import pickle
import gensim
from gensim import models
import gensim.corpora as corpora
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
from pprint import pprint

## Get Resources

In [2]:
# Lemmatized text: unpickle the documents that are later turned into the
# dictionary and bag-of-words corpus.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open("20240701_PhD_AltDiaChkLem-N.txt", "rb") as pickled_lemmas:
    data_lemmatized = pickle.load(pickled_lemmas)

In [3]:
# Load the previously saved LDA model from disk.
ldamodel = models.ldamodel.LdaModel.load('20240701_PhD_TopicDiaChkMAL10a5i25o13')

In [4]:
# Not needed because I converted from Mallet to LDA in 20240331_PhD_TopicLetterChkNV-Mallet
# Load saved mallet
# lda_mallet = gensim.models.wrappers.LdaMallet.load("20240221_PhD_TopicChkMAL14")
# Convert to LDA
# ldamodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [5]:
# Pretty-print the model's topics (20 words each), ordered by topic id.
topic_word_summaries = ldamodel.print_topics(num_words=20)
pprint(sorted(topic_word_summaries))

[(0,
  '0.130*"dinner" + 0.128*"tea" + 0.061*"breakfast" + 0.039*"bath" + '
  '0.034*"bread" + 0.029*"clothe" + 0.023*"washing" + 0.021*"cake" + '
  '0.019*"lot" + 0.019*"fish" + 0.017*"meat" + 0.015*"job" + 0.012*"pain" + '
  '0.011*"beach" + 0.010*"milk" + 0.009*"bird" + 0.009*"goat" + 0.009*"mutton" '
  '+ 0.008*"potato" + 0.008*"garden"'),
 (1,
  '0.090*"town" + 0.080*"port" + 0.058*"pound" + 0.044*"train" + 0.028*"mill" '
  '+ 0.026*"church" + 0.022*"meeting" + 0.021*"land" + 0.015*"ship" + '
  '0.014*"account" + 0.014*"wife" + 0.012*"doctor" + 0.011*"board" + '
  '0.009*"college" + 0.009*"school" + 0.008*"arrangement" + 0.008*"steamer" + '
  '0.008*"bower" + 0.008*"committee" + 0.007*"purchase"'),
 (2,
  '0.086*"letter" + 0.048*"wheat" + 0.034*"flour" + 0.031*"club" + '
  '0.027*"wife" + 0.022*"office" + 0.022*"bank" + 0.018*"pound" + '
  '0.017*"party" + 0.016*"price" + 0.015*"ton" + 0.015*"paper" + 0.014*"sale" '
  '+ 0.013*"whist" + 0.012*"share" + 0.012*"news" + 0.011*"lunche

In [6]:
# Metadata: read the chunk table and give the unnamed first CSV column
# a proper name in the same step.
metadata_csv = "20240701_PhD_Data4TopicModel-DiaryChunk.csv"
df = pd.read_csv(metadata_csv).rename(columns={'Unnamed: 0': 'docID-AT'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023 entries, 0 to 1022
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          1023 non-null   int64  
 1   docid             1023 non-null   object 
 2   docyear           1023 non-null   int64  
 3   docmonth          0 non-null      float64
 4   authorName        1023 non-null   object 
 5   docauthorid       1023 non-null   object 
 6   authorLocation    1023 non-null   object 
 7   authorGender      1023 non-null   object 
 8   nationalOrigin    921 non-null    object 
 9   irish             921 non-null    object 
 10  otherUK           921 non-null    object 
 11  relMin            1023 non-null   bool   
 12  catholic          1023 non-null   bool   
 13  otherChristian    1023 non-null   bool   
 14  U                 1023 non-null   bool   
 15  M                 1023 non-null   bool   
 16  S                 1023 non-null   bool   


In [7]:
# Raw texts as a plain Python list, in dataframe row order.
data = df['text'].values.tolist()
# data[0]

# Create Functions & Objects

In [8]:
# Documents to model: the lemmatized texts loaded earlier
texts = data_lemmatized

# Dictionary: maps every token to an integer id
# NOTE(review): the dictionary is rebuilt here instead of being taken from the
# loaded model; the ids presumably match the ones used in training only if the
# dictionary was built the same way then -- confirm.
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: term frequencies for each document
corpus = [id2word.doc2bow(tokens) for tokens in texts]

In [9]:
def format_topics(ldamodel=ldamodel, corpus=corpus, texts=texts):
    """Build a per-document table of dominant topic, weight, keywords and text.

    Parameters
    ----------
    ldamodel : LDA model
        Indexed with ``corpus`` to get each document's ``(topic, proportion)``
        pairs, and queried with ``show_topic`` for the topic keywords.
    corpus : iterable of bag-of-words documents
        One entry per document.
    texts : list-like
        Values attached as the last column; aligned with ``corpus`` by position.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4
        decimals), ``Topic_Keywords`` (top 20 words, comma separated) and a
        final column labelled ``0`` holding ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # keyword string per topic, computed once
    records = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep a placeholder row so later rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # First pair with the highest proportion, i.e. the head of a stable
        # descending sort on the proportion.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            top_words = ldamodel.show_topic(topic_num, topn=20)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in top_words)

        # float() so the rounded value is stored as a plain Python float.
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once; DataFrame.append in a loop is quadratic and was
    # removed in pandas 2.0. Passing `columns` also keeps an empty corpus valid.
    topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    topics_df = pd.concat([topics_df, contents], axis=1)
    return topics_df

In [10]:
# Dominant topic of every document, paired with the raw (un-lemmatized) text.
df_topic_keywords = format_topics(ldamodel=ldamodel, corpus=corpus, texts=data)

# Turn the row index into an explicit document-number column and relabel.
dominant_topic_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic = df_topic_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns

# Preview the first rows
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,8,0.2199,"horse, boy, water, fire, thing, buggy, head, s...",Charra Wednesday Nov 4th 1883 A nice cool day ...
1,1,8,0.2587,"horse, boy, water, fire, thing, buggy, head, s...",night so that I can't get up in the morning ha...
2,2,8,0.2568,"horse, boy, water, fire, thing, buggy, head, s...",Frank did not arrive till 2P.M so after he had...
3,3,8,0.1785,"horse, boy, water, fire, thing, buggy, head, s...",to Camp caused it. Mrs Roberts lent me two pai...
4,4,8,0.3044,"horse, boy, water, fire, thing, buggy, head, s...",which I can't understand but hope to know toni...


In [11]:
# Lookup table: text -> dominant topic number (filled in by the next cell)
map_Text_topicNumber = dict()

In [12]:
# Record the dominant topic of every text, keyed by the text itself.
for position, chunk_text in enumerate(data):
    # (topic number, proportion) pairs for this document
    ranked_topics = list(ldamodel.get_document_topics(corpus[position]))

    # Highest proportion first
    ranked_topics.sort(key=lambda pair: pair[1], reverse=True)

    # Keep only the topic number of the leading pair
    map_Text_topicNumber[chunk_text] = ranked_topics[0][0]

In [13]:
# View the first key value pair in dictionary (text, label)
# dict(list(map_Text_topicNumber.items())[0:1])

In [14]:
# Label each row by looking its text up in the text -> topic map.
# NOTE(review): rows with identical text share a single map entry -- confirm
# that is acceptable for this dataset.
df['topicNumber'] = df['text'].map(map_Text_topicNumber)

# Preview the identifying columns next to the new label (.loc slicing is label-inclusive)
preview_columns = ['docid', 'docauthorid', 'text', 'topicNumber']
df.loc[:10, preview_columns]

Unnamed: 0,docid,docauthorid,text,topicNumber
0,D0002,D0002,Charra Wednesday Nov 4th 1883 A nice cool day ...,8
1,D0002,D0002,night so that I can't get up in the morning ha...,8
2,D0002,D0002,Frank did not arrive till 2P.M so after he had...,8
3,D0002,D0002,to Camp caused it. Mrs Roberts lent me two pai...,8
4,D0002,D0002,which I can't understand but hope to know toni...,8
5,D0002,D0002,his stores Friday and Saturday at Laura Bay. t...,8
6,D0002,D0002,could hardly keep up had dinner Mrs Roberts ki...,5
7,D0002,D0002,all the work Frank let out her traces then Bil...,0
8,D0002,D0002,to drive me if he wanted to catch the Vessel a...,0
9,D0002,D0002,Wenyss came in and gave me the offer of horses...,0


In [15]:
# Most representative text per topic: within each dominant topic keep the
# single row with the highest Perc_Contribution.
topics_outdf_grpd = df_topic_keywords.groupby('Dominant_Topic')

# Collect the best row of every group, then concatenate once instead of
# growing a DataFrame inside the loop.
top_rows = [
    grp.sort_values('Perc_Contribution', ascending=False).head(1)
    for _, grp in topics_outdf_grpd
]
if top_rows:
    topics_sorteddf_mallet = pd.concat(top_rows, axis=0)
else:
    # No groups at all: keep an empty frame with the expected columns
    topics_sorteddf_mallet = pd.DataFrame(columns=df_topic_keywords.columns)

# Reset Index
topics_sorteddf_mallet = topics_sorteddf_mallet.reset_index(drop=True)

# Format
topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.3701,"dinner, tea, breakfast, bath, bread, clothe, w...",no use laying in bed pretting about it so jump...
1,1,0.418,"town, port, pound, train, mill, church, meetin...",the long delayed dividend of pounds 153. 9. T...
2,2,0.4167,"letter, wheat, flour, club, wife, office, bank...",after some discussion we agreed to put in a te...
3,3,0.3697,"home, walk, dress, drive, garden, talk, child,...",hack and am now going to read Macanll tor an h...
4,4,0.4256,"home, work, truck, boat, wood, game, aunt, gun...",A nice day I went to the jetty there are 6 Ste...
5,5,0.306,"weather, rain, wind, dinner, thing, black, ves...",came in with us and stayed the day we had some...
6,6,0.3821,"man, people, horse, place, station, road, home...",the grown with the Campbells every one very ki...
7,7,0.5183,"house, question, motion, govt, office, club, c...",House A long question of priviledges raised on...
8,8,0.4399,"horse, boy, water, fire, thing, buggy, head, s...",hot day we were up by 5.30am but for the horse...
9,9,0.3766,"bed, room, sleep, mail, child, breakfast, work...",and was groaning and rolling About for hours M...


In [16]:
# How many documents have each topic as their dominant topic
dominant_topic_column = df_topic_keywords['Dominant_Topic']
topic_counts = dominant_topic_column.value_counts()
topic_counts

3    144
0    135
1    111
7    109
4    107
2    100
8     93
6     89
5     71
9     64
Name: Dominant_Topic, dtype: int64

In [17]:
# Share of documents per dominant topic, rounded to 4 decimal places
topic_contribution = (topic_counts / topic_counts.sum()).round(4)
topic_contribution

3    0.1408
0    0.1320
1    0.1085
7    0.1065
4    0.1046
2    0.0978
8    0.0909
6    0.0870
5    0.0694
9    0.0626
Name: Dominant_Topic, dtype: float64

In [18]:
# One row per topic: its number and keyword string, ordered by topic number
topic_keyword_pairs = df_topic_keywords[['Dominant_Topic', 'Topic_Keywords']]
topic_num_keywords = topic_keyword_pairs.drop_duplicates().sort_values(by='Dominant_Topic')
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
7,0,"dinner, tea, breakfast, bath, bread, clothe, w..."
549,1,"town, port, pound, train, mill, church, meetin..."
566,2,"letter, wheat, flour, club, wife, office, bank..."
33,3,"home, walk, dress, drive, garden, talk, child,..."
90,4,"home, work, truck, boat, wood, game, aunt, gun..."
6,5,"weather, rain, wind, dinner, thing, black, ves..."
25,6,"man, people, horse, place, station, road, home..."
562,7,"house, question, motion, govt, office, club, c..."
0,8,"horse, boy, water, fire, thing, buggy, head, s..."
30,9,"bed, room, sleep, mail, child, breakfast, work..."


In [19]:
# Renumber the rows from 0, discarding the old row labels
topic_num_keywords = topic_num_keywords.reset_index(drop=True)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0,"dinner, tea, breakfast, bath, bread, clothe, w..."
1,1,"town, port, pound, train, mill, church, meetin..."
2,2,"letter, wheat, flour, club, wife, office, bank..."
3,3,"home, walk, dress, drive, garden, talk, child,..."
4,4,"home, work, truck, boat, wood, game, aunt, gun..."
5,5,"weather, rain, wind, dinner, thing, black, ves..."
6,6,"man, people, horse, place, station, road, home..."
7,7,"house, question, motion, govt, office, club, c..."
8,8,"horse, boy, water, fire, thing, buggy, head, s..."
9,9,"bed, room, sleep, mail, child, breakfast, work..."


In [20]:
# Reordering
# topic_num_keywords = topic_num_keywords.reindex([11,17,14,5,3,2,4,0,18,20,7,1,15,10,16,19,9,6,8,12,13])
# topic_num_keywords

In [21]:
# Add the document count and share of each topic to the keyword table.
# The values are looked up by topic number (the index of topic_counts and
# topic_contribution) rather than aligned on the row index, so the result stays
# correct even when the row labels of topic_num_keywords are not the topic numbers.
df_dominant_topics = (
    topic_num_keywords
    .assign(
        counts=topic_num_keywords['Dominant_Topic'].map(topic_counts),
        contribution=topic_num_keywords['Dominant_Topic'].map(topic_contribution),
    )
    .sort_values(by='Dominant_Topic')
)
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,counts,contribution
0,0,"dinner, tea, breakfast, bath, bread, clothe, w...",135,0.132
1,1,"town, port, pound, train, mill, church, meetin...",111,0.1085
2,2,"letter, wheat, flour, club, wife, office, bank...",100,0.0978
3,3,"home, walk, dress, drive, garden, talk, child,...",144,0.1408
4,4,"home, work, truck, boat, wood, game, aunt, gun...",107,0.1046
5,5,"weather, rain, wind, dinner, thing, black, ves...",71,0.0694
6,6,"man, people, horse, place, station, road, home...",89,0.087
7,7,"house, question, motion, govt, office, club, c...",109,0.1065
8,8,"horse, boy, water, fire, thing, buggy, head, s...",93,0.0909
9,9,"bed, room, sleep, mail, child, breakfast, work...",64,0.0626


In [22]:
# Save the per-topic summary (keywords, counts, contribution) to CSV.
df_dominant_topics.to_csv("20240701_PhD_TopicsDiaChkMAL10a5i25o13.csv")

In [23]:
# Re-inspect the metadata frame now that the topicNumber column has been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023 entries, 0 to 1022
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          1023 non-null   int64  
 1   docid             1023 non-null   object 
 2   docyear           1023 non-null   int64  
 3   docmonth          0 non-null      float64
 4   authorName        1023 non-null   object 
 5   docauthorid       1023 non-null   object 
 6   authorLocation    1023 non-null   object 
 7   authorGender      1023 non-null   object 
 8   nationalOrigin    921 non-null    object 
 9   irish             921 non-null    object 
 10  otherUK           921 non-null    object 
 11  relMin            1023 non-null   bool   
 12  catholic          1023 non-null   bool   
 13  otherChristian    1023 non-null   bool   
 14  U                 1023 non-null   bool   
 15  M                 1023 non-null   bool   
 16  S                 1023 non-null   bool   


In [25]:
# Save the metadata together with the new topicNumber column to CSV.
df.to_csv("20240701_PhD_Data4NER-DiaChk.csv")