<span style="color: red; font-family: Calibri Light;">
  <h1><b>Topic Modelling with LDA:</b></h1>
    <p style = "color: black"> Using LDA model on pre-processed data. Stop word removal during pre-processing is limited only to common english stop words.<br>Any further stop word removal will be done when creating bag of words with the gensim library 
</span>

---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>I. Setting Up Environment</b></h2>
</span>

In [1]:
#pre-processing libraries

import re #regular expressions library for text manipulation
import string
import unicodedata
import itertools
from autocorrect import Speller
import emoji
import contractions

#data transformation libraries
import pandas as pd
import numpy as np
import ast
import csv

#NLP specific libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
from spacy.lang.en import English
import spacymoji


#topic modelling libraries
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel, LdaMulticore


#for visualization
import matplotlib.pyplot as plt
from prettytable import PrettyTable
import pyLDAvis
import pyLDAvis.gensim_models

from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

#others
import time
import os
import random
from glob import glob


In [None]:
#additional nlp models
#!python -m spacy download en_core_web_md

In [2]:
#set seed so that code output is deterministic
random.seed(200)  # Set the seed for Python's random module
np.random.seed(200)  # Set the seed for NumPy's random module

---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>II. Import Data into DataFrame</b></h2>
</span>

In [3]:
def clean_comments (filepath):
    """
    import comments from csv file into pandas dataframe,
    
    and carry out initial cleaning including removing deleted comments,
    
    correcting datetime data type, remove unwanted columns like `isSubmitter`,
    
    remove duplicates, etc
    
    """
    #import data
    df = pd.read_csv(filepath, low_memory = False)
    
    #remove deleted comments if any
    df = df[df.Body != '[deleted]']
    df = df[df.Body != '[removed]']
    df = df.dropna(subset=['Body'])
    
    #remove comments with missing id
    drop_index = df[df.isSubmitter.isnull()].index
    df.drop(drop_index, inplace = True)
    
    #remove duplicates if any
    df = df.drop_duplicates(subset =['ID'], ignore_index = True)
    df.reset_index (drop = True, inplace = True)
    
    #correct data types and column label
    df['Date_Created'] = pd.to_datetime(df['Date_Created'])
    df['year'] = df['Date_Created'].dt.year
    df['Score'] = df['Score'].astype('int') 
    df.rename(columns = {'Author_ID': "Author"}, inplace = True) 
    
    #rename 'Body' column to text
    df.rename(columns = {'Body': 'long_text',
                        'Date_Created': 'date_created'}, inplace = True)
    
    #remove unnecessary columns
    df.drop(columns = ['Unnamed: 0', 'Author', 'Score',
       'Parent_ID', 'Submission_ID', 'Subreddit', 'isParent', 'isSubmitter'], inplace = True)
    
    #remove any extra whitespace in column labels
    df.columns = df.columns.str.strip()
    
    #include column to denote row is comment entry
    df['text_type'] = 'comment'
    
    #rearrange column order
    df = df[['text_type','ID','date_created', 'year', 'long_text']]
        
    return df

def clean_submissions(filepath):
    """
    import submissions/posts from csv file into pandas dataframe,
    
    and carry out initial cleaning including removing deleted comments,
    
    correcting datetime data type, remove unwanted columns like `Unnamed`,
    
    remove duplicates, etc
    
    """
    df = pd.read_csv(filepath, low_memory = False)
    
    #drop duplicate posts
    df = df.drop_duplicates(subset =['ID'], ignore_index = True)
    df.reset_index (drop = True, inplace = True)
    
    #create column for post title + post text
    df['long_text'] = df['Title']+ " " +df['Post Text'].fillna('')
    
    #adjust data types
    df['Date Created'] = pd.to_datetime(df['Date Created'])
    df['year'] = df['year'].astype('int')
    
    #rename columns
    df.rename(columns = {'Date Created': 'date_created'}, inplace = True)
    
    #remove unwanted columns
    df = df.drop(columns = ['Unnamed: 0', 'Title','Post Text', 'Score',
       'Total Comments', 'Post URL', 'SubReddit','Unnamed: 0.1'])
    
    #include column to denote row is comment entry
    df['text_type'] = 'submission'
    
    #reorder columns - 'ID', 'Post Text'
    df  = df[['text_type','ID', 'date_created','year', 'long_text']]
    
    return df  

In [4]:
# Access dataset files
folder_path = os.path.join("..","..", "..", "Data")  # Adjust the path accordingly
file_type = "*.csv"

# List of dataset file paths
document_path = glob(os.path.join(folder_path, file_type))

document_path

['../../../Data/subset_sample_no_label.csv',
 '../../../Data/filtered_corpus.csv',
 '../../../Data/full_posts.csv',
 '../../../Data/full_data_no_preprocessing.csv',
 '../../../Data/training_data.csv',
 '../../../Data/lda_train.csv',
 '../../../Data/vocabulary.csv',
 '../../../Data/emoji_subset.csv',
 '../../../Data/sampled_subset.csv',
 '../../../Data/subs_topics.csv',
 '../../../Data/comments.csv']

In [5]:
comments_filepath = '' #file path for csv file of scraped comments 
submissions_filepath = '' #file path for csv file of scrapped submissions

for path in document_path:
    if "comments" in path:
        comments_filepath = comments_filepath + path
    elif "full_posts" in path:
        submissions_filepath = submissions_filepath + path
    

In [6]:
print (comments_filepath, submissions_filepath)

../../../Data/comments.csv ../../../Data/full_posts.csv


In [7]:
data = pd.concat([clean_comments(comments_filepath), 
                  clean_submissions(submissions_filepath)], 
                 ignore_index = True)

data

Unnamed: 0,text_type,ID,date_created,year,long_text
0,comment,gtfo2hl,2021-04-05 13:00:32,2021,"*Cuntry roads, take me hoem*"
1,comment,gtfqkbv,2021-04-05 13:41:40,2021,"That’s been there for several years, sent a pi..."
2,comment,gtfou07,2021-04-05 13:13:23,2021,I am single and I have not traveled to any cun...
3,comment,gtfrgpe,2021-04-05 13:56:09,2021,What happens when you shop at dragon mart...
4,comment,gtg5mwv,2021-04-05 16:51:54,2021,I am cunting on them to do so 😅
...,...,...,...,...,...
128878,submission,14f46ji,2023-06-21 14:40:54,2023,"Best beauty saloons in Dubai? Hello fellas, I ..."
128879,submission,14f4uyi,2023-06-21 15:15:27,2023,Found the r/dubai redditors who kept telling m...
128880,submission,14f4ri3,2023-06-21 15:10:25,2023,Scam ? Healthy.line My sister has a CBD debit ...
128881,submission,14f4k3r,2023-06-21 15:00:34,2023,Thoughts on Expo City properties? Anyone else ...


<span style="color: red; font-family: Calibri Light;">
  <h2><b>III. Text Preprocessing</b></h2>
</span>

<span style="color: red; font-family: Calibri Light;">
  <h3><b>a. convert text to lowercase</b></h3>
</span>

In [8]:
data['clean_text'] = data['long_text'].apply(lambda text: text.lower())

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
52407,comment,jby7q9s,2023-03-12 21:20:25,2023,If I’m on SZR doing 140 (and I’m on there ofte...,if i’m on szr doing 140 (and i’m on there ofte...
91135,comment,jo3w5e6,2023-06-14 18:40:37,2023,So forcing LGBTQ down peoples throat is ok wit...,so forcing lgbtq down peoples throat is ok wit...
18066,comment,g57s994,2020-09-14 13:38:27,2020,Sorry mate. I only bought 1 pack. It was a mix...,sorry mate. i only bought 1 pack. it was a mix...
34113,comment,hkgr4lz,2021-11-13 19:26:35,2021,Not sure any consulting companies will look at...,not sure any consulting companies will look at...
24513,comment,esoqfqw,2019-07-03 20:48:42,2019,Yuck,yuck


<span style="color: red; font-family: Calibri Light;">
  <h3><b>b. expand word contractions</b></h3>
</span>

In [9]:
data['clean_text'] = data['clean_text'].apply(lambda text: contractions.fix(text)) 

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
72442,comment,gd2xczj,2020-11-21 21:29:32,2020,Pwnd,pwnd
81671,comment,ji25q9t,2023-04-28 18:42:20,2023,"If a traffic violation can be excused, so can ...","if a traffic violation can be excused, so can ..."
12803,comment,i0uv3pr,2022-03-16 11:08:03,2022,Is this near wafi?,is this near wafi?
5790,comment,j8ows67,2023-02-16 02:07:49,2023,Oh! Got it,oh! got it
60256,comment,i8mqltf,2022-05-15 03:01:12,2022,"What's the situation with noon food, I've been...","what is the situation with noon food, i have b..."


<span style="color: red; font-family: Calibri Light;">
  <h3><b>c. remove URLs</b></h3>
</span>

In [10]:
#index of rows with urls
html_index = data[data['long_text'].str.contains("https")].index
data.loc[html_index].head()

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
17,comment,gtgyqzz,2021-04-05 20:49:12,2021,Meditation And Relaxation Music https://youtu....,meditation and relaxation music https://youtu....
83,comment,gpqwzwc,2021-03-05 10:34:53,2021,I am curious about this Apollo Fintech GSX coi...,i am curious about this apollo fintech gsx coi...
93,comment,gpqwrek,2021-03-05 10:31:45,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,https://www.instagram.com/p/ckhzbwph0c4/?igshi...
171,comment,gq3tah4,2021-03-07 19:49:41,2021,"With all due respect, do you expect a law enfo...","with all due respect, do you expect a law enfo..."
290,comment,jb5kpz4,2023-03-06 20:35:11,2023,Here: Dubai Festival City\nhttps://maps.app.go...,here: dubai festival city\nhttps://maps.app.go...


In [11]:
#regex pattern for urls
url_pattern = r'https?://\S+'
#replace url with empty string
data['clean_text'] = data['clean_text'].apply(lambda text: re.sub(url_pattern, ' ', text, flags=re.MULTILINE))

data.loc[html_index].head()

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
17,comment,gtgyqzz,2021-04-05 20:49:12,2021,Meditation And Relaxation Music https://youtu....,meditation and relaxation music
83,comment,gpqwzwc,2021-03-05 10:34:53,2021,I am curious about this Apollo Fintech GSX coi...,i am curious about this apollo fintech gsx coi...
93,comment,gpqwrek,2021-03-05 10:31:45,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,\n\nhere is a video explaining the same.
171,comment,gq3tah4,2021-03-07 19:49:41,2021,"With all due respect, do you expect a law enfo...","with all due respect, do you expect a law enfo..."
290,comment,jb5kpz4,2023-03-06 20:35:11,2023,Here: Dubai Festival City\nhttps://maps.app.go...,here: dubai festival city\n \n\nthen follow th...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>d. remove accents from characters</b></h3>
</span>

In [12]:
data['clean_text'] = data['clean_text'].apply(lambda text: 
                                              unicodedata.normalize('NFKD', text).
                                              encode('ASCII', 'ignore').decode('utf-8'))

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
20846,comment,gekzqi7,2020-12-04 14:46:42,2020,They’re keeping a 2m distance horizontally but...,they are keeping a 2m distance horizontally bu...
90704,comment,jolw1gi,2023-06-18 21:40:26,2023,The indoor sports thing at Dubai World Trade c...,the indoor sports thing at dubai world trade c...
5786,comment,j8mroan,2023-02-15 17:37:07,2023,Confirmed; it’s true. Just need to have the ac...,confirmed; it is true. just need to have the a...
76136,comment,gjx3t2n,2021-01-20 10:19:43,2021,"Wait a minute....mask? handwashing, social dis...","wait a minute....mask? handwashing, social dis..."
2828,comment,gdo37cv,2020-11-26 19:15:24,2020,But I heard girls like bad boys & I’m bad at e...,but i heard girls like bad boys & i am bad at ...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>e. remove punctuations</b></h3>
</span>

In [13]:
#index of some rows with punctuations
checker_list = ['ifquow','gzl2ec','147gsfl','vtelex',
 '12pqx6m','fuxrd2','2ui6wu','l4gz0u','14f4uyi','14f8d30']

rows_to_check = data[data['ID'].isin(checker_list)].index.tolist()

rows_to_check.extend([32003, 116022,18460,5786,30109])

rows_to_check.extend(html_index)

print(rows_to_check[:5])

[124628, 125097, 126727, 126869, 127216]


In [14]:
#regex pattern for punctuations
punctuation_pattern = r'[^\w\s_]'

#remove punctuations using `re.sub() method
data['clean_text'] = data['clean_text'].apply(lambda text: re.sub(punctuation_pattern, ' ', text))

data.iloc[rows_to_check].head()

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,guys we made it yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i am currently looking for jobs t...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>f. remove new line & tab characters</b></h3>
</span>

In [15]:
#remove `\n` from text
data['clean_text'] = data['clean_text'].str.replace('\n', ' ')
#remove `\t` from text
data['clean_text'] = data['clean_text'].str.replace('\t', ' ')

data.iloc[rows_to_check].head()

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,guys we made it yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i am currently looking for jobs t...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>g. remove digits</b></h3>
</span>

In [16]:
data['clean_text'] = data['clean_text'].apply(lambda text:
                                              ' '.join (word for word in text.split() 
                                                        if word.isalpha()))

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
33736,comment,ik6k7ib,2022-08-14 02:16:45,2022,I think the system is too big. You could insta...,i think the system is too big you could instal...
52558,comment,icps7pg,2022-06-17 18:53:45,2022,bro,bro
127352,submission,tjwnth,2022-03-22 10:31:32,2022,Traveller's question - how/where to buy female...,traveller s question how where to buy female l...
33414,comment,g1abqxl,2020-08-13 06:32:20,2020,Are looking into buying in bulk ?,are looking into buying in bulk
77424,comment,eug9zto,2019-07-22 17:52:30,2019,"That generation of Gulf Arabs spoke Urdu well,...",that generation of gulf arabs spoke urdu well ...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>h. Remove words with less than three characters</b></h3>
</span>

In [17]:
data['clean_text'] = data['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if len(word) >= 3))

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
6095,comment,el2q29a,2019-04-17 07:22:15,2019,"As a previous poster mentioned, angel investor...",previous poster mentioned angel investors inve...
62668,comment,ga4h28s,2020-10-26 05:21:03,2020,Bruh really thought he could pass on a VW comm...,bruh really thought could pass commercial his own
58150,comment,jop76jh,2023-06-19 17:06:39,2023,Such a good point. \n \nSide note.. I’m gratef...,such good point side note grateful have two hu...
71470,comment,gfwulud,2020-12-15 15:23:48,2020,Start small and work hard! Apply everywhere a...,start small and work hard apply everywhere and...
91935,comment,fvn05re,2020-06-22 18:14:42,2020,Filli is the real deal,filli the real deal


<span style="color: red; font-family: Calibri Light;">
  <h3><b>i. Lemmatization</b></h3>
</span>

In [18]:
nlp = spacy.load('en_core_web_md')

In [19]:
start_time = time.time()

data['clean_text'] = data['clean_text'].apply(lambda text:
                                              ' '.join(token.lemma_ for token in nlp(text)))

print (f"This code block took {(time.time() - start_time)/60 :.2f} minutes to complete")

data.iloc[rows_to_check].head()

This code block took 76.68 minutes to complete


Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,guy make yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration canada australia hello dubai r...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,how reach people who be ask for money grocery ...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,how you plan spend your eid holiday any fun ac...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",everyone currently look for job temporary perm...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>j. Remove common stop words</b></h3>
</span>

In [20]:
start_time = time.time()

data['clean_text'] = data['clean_text'].apply(lambda text:
                                              ' '.join([word.text for word in nlp(text) 
                                                        if not word.is_stop]))

print (f"This code block took {(time.time() - start_time)/60 :.2f} minutes to complete")

data.sample(n=5)

KeyboardInterrupt: 

In [None]:
#check top words after removal of common stop words

#list of all words in the dataframe
all_words = [word for text in data['clean_text'] for word in text.split()]

#frequency of word occurrence
fdist = FreqDist(all_words)

common_words_tuples= fdist.most_common(100)
common_words = [word for word, freq in common_words_tuples]

#rare_words_dict = fdist.most_common()[-20:-1]
#rare_words = [word for word, freq in fdist.items() if freq <= 10]

#table of common words
#common_words_table = PrettyTable(['word', 'count'])
#for word, count in common_words.items():
#    common_words_table.add_row([word, count])

#print (len(common_words),'\n\n',rare_words)
print (f'Common words: The top 20 most common words in the dataset are: {common_words}')
#print ('\n')
#print (f'Rare words: There are {len(rare_words)} words that occur less than or equal to 10 times in the dataset')

In [None]:
#wordcloud of most frequent words


# Generate the word cloud
wordcloud = WordCloud(
                width=800, 
                height=400,  
                background_color="black", 
                colormap="Paired").generate_from_frequencies(#dictionary of word and their frequency of occurrence
                                                        FreqDist(
                                                            [word for text in data['clean_text'] for word in text.split()])
                        )

# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#create custom stop words list
#custom_sw = rare_words + common_words #create list holding common and rare words
#custom_sw = set(custom_sw) #remove any duplicates

#len(custom_sw)

In [None]:
#remove custom stop words from dataset
#data['clean_text'] = data['clean_text'].apply(lambda text: ' '.join([word for word in text.split() if word not in custom_sw]))

#data.sample(n=5)                                    

<span style="color: red; font-family: Calibri Light;">
  <h3><b>k. remove extra whitespaces</b></h3>
</span> 

In [None]:
data['clean_text'] = data['clean_text'].str.strip().str.replace(r'\s+', ' ', regex = True)

sample_rows = [5786,18460, 103391]

data.loc[sample_rows]

<span style="color: red; font-family: Calibri Light;">
  <h3><b>l. word tokenization</b></h3>
</span>

In [None]:
start_time = time.time()

data['tokens'] = data['clean_text'].apply(lambda text: word_tokenize(text))

print (f"This code block took {(time.time() - start_time)/60 :.2f} minutes to complete")

data.iloc[rows_to_check].head()

<span style="color: red; font-family: Calibri Light;">
  <h3><b>m. insert word count</b></h3>
</span>

In [None]:
data['word_count'] = data['tokens'].apply (lambda tokens_list: len(tokens_list))

data.sort_values(by='word_count', ascending = False).head()

<span style="color: red; font-family: Calibri Light;">
  <h3><b>n. remove subset for manual labelling</b></h3>
</span>

In [None]:
#load list of sample_subset indices

#sampling done previously by randomly selecting entries from each year

with open('../../../Data/sample_subset_index.txt', 'r') as file:
    subset_ids = [line.strip() for line in file]

subset_ids[:5]

In [None]:
#create unlabelled sample subset 
subset_data = data[data['ID'].isin(subset_ids)]
subset_data.head()

In [None]:
#get index of entries in sampled subset
subset_index = data[data['ID'].isin(subset_ids)].index.to_list()

#remove sample subset from data

training_data = data.drop(subset_index, axis = 0)


training_data.head()

In [None]:
training_data.info()

<span style="color: red; font-family: Calibri Light;">
  <h3><b>o. remove short entries</b></h3>
</span>

In [None]:
#select only rows with more than 3 word
lda_training = training_data[training_data['word_count'] > 3]

lda_training.shape

<span style="color: red; font-family: Calibri Light;">
  <h3><b>p. save training dataset for LDAModel</b></h3>
</span>

In [None]:
#save data
filename = '../../../Data/lda_train.csv'

def export_csv():
    '''
    export pre-processed data to CSV
    '''
    lda_training.to_csv(filename, index_label = 'index', quoting = csv.QUOTE_ALL, header = True)

export_csv()

print ('file saved')

---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>IV. Feature Extraction</b></h2>
</span>

In [None]:
#import cleaned data

def list_converter(text):
    #to revert list->str conversion from pd.read_csv
    return ast.literal_eval(text)


lda_data = pd.read_csv('../../../Data/lda_train.csv', converters ={'tokens':list_converter})
lda_data = lda_data.drop(columns = ['index'])
lda_data.head()

In [None]:
lda_data.info()

In [None]:
#convert df['tokens'] to list of strings for bag-of-words model
docs = lda_data['tokens'].tolist()

In [None]:
docs[:5]

In [None]:
#check number of unique words

unique_words = set([word for text in docs for word in text])

print (f'There are {len(unique_words)} unique words in the dataset')


In [None]:
# Create bigrams - code from gensim documentation page
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [None]:
print(docs[2])

In [None]:
#code from gensim bag of words documentation page

# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=50, no_above=0.50)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>V. TRAIN MODEL</b></h2>
</span>

In [None]:
#retrain, filter extreme to no_abovr = 70, topics = limit to 50

In [None]:
np.arange(5, 51, 5)

In [None]:
#from gensim documentation at https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html#sphx-glr-auto-examples-tutorials-run-lda-py
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/?expand_article=1#9createbigramandtrigrammodels
#https://github.com/clevyclev/Deep-Learning-Projects/blob/master/Latent%20Dirichlet%20Allocation%20-%20Bag%20of%20Words%20and%20TF-IDF/Latent_dirichlet_allocation.py

#training parameters
chunksize = 5000
passes = 10
iterations = 400
eval_every = None
id2word = dictionary.id2token
temp = dictionary[0] #to "load" the dictionary

#range of topics
topics_range = np.arange (5,51,step = 5)

# Lists to hold metrics
model_coherence_cv = []
#model_coherence_umass = []
#model_coherence_cnpmi = []
#model_perplexity = []
#topic_diversities = []

start_time = time.time()
for num_topics in topics_range:
    
    # Train LDA model
    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         chunksize=chunksize,
                         alpha='auto',
                         eta='auto',
                         passes=passes,
                         iterations=iterations,
                         num_topics=num_topics,
                         per_word_topics=True,
                         random_state=80)
    
    # Compute c_v score
    c_v = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
    cv_lda = c_v.get_coherence()
    model_coherence_cv.append(cv_lda)
    
    # Compute u_mass score
    #u_mass = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='u_mass')
    #umass_lda = u_mass.get_coherence()
    #model_coherence_umass.append(umass_lda)
    
    # Compute c_npmi score
    #c_npmi = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_npmi')
    #cnpmi_lda = c_npmi.get_coherence()
    #model_coherence_cnpmi.append(cnpmi_lda)
    
    # Compute perplexity
    #perplexity = lda_model.log_perplexity(corpus)
    #model_perplexity.append(perplexity)
    
    # Compute topic diversity
    #top_n = 10  # You can adjust this value
    #top_words = [word for topic_id in range(num_topics) for word, _ in lda_model.show_topic(topic_id, topn=top_n)]
    #diversity = len(set(top_words)) / (num_topics * top_n)
    #topic_diversities.append(diversity)

print(f"This model took {(time.time() - start_time)/60 :.2f} minutes to train")

In [None]:
plt.plot(topics_range, model_coherence_cv)
plt.xlabel('number of topics')
plt.ylabel('coherence score')
plt.title('Coherence Score vs. Number of Topics')
plt.xticks(topics_range)

plt.show()

In [None]:
#training parameters
num_topics = 5
chunksize = 5000
passes = 10
iterations = 400
eval_every = None
id2word = dictionary.id2token
temp = dictionary[0] #to "load" the dictionary




    
#train LDA model
lda_model = LdaModel(corpus = corpus,
                             id2word = dictionary,
                             chunksize = chunksize,
                             alpha = 'auto',
                             eta = 'auto',
                             passes = passes,
                             iterations = iterations,
                             num_topics = num_topics,
                             per_word_topics = True,
                             random_state = 180,
                            )

In [None]:
#check coherence and perplexity

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

#number of topics
num_topics = lda_model.num_topics

# Number of words to display for each topic
num_words = 10

#define color map
colors = cm.viridis(np.linspace(0,1, num_topics))

#subplot layout
rows = 3
cols = int(np.ceil(num_topics/rows))

plt.figure(figsize=(15,10))


# Iterate through the topics
for topic_num in range(num_topics):
    # top words and their probabilities for each topic
    top_words = lda_model.show_topic(topic_num, topn=num_words)
    
    # Separate the words and probabilities
    topic_words, probs = zip(*top_words)
    
    #create subplot
    plt.subplot(rows, cols, topic_num +1)
    
    # Plot the words and probabilities as a horizontal bar chart
    plt.barh(topic_words, probs, color = colors[topic_num])
    plt.xlabel('Probability')
    plt.title(f'Topic {topic_num}')
    plt.gca().invert_yaxis()  # Invert y-axis to have the highest probability at the top

plt.tight_layout()
plt.show()

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary, sort_topics = False)
pyLDAvis.display(vis)