# **Latent Dirichlet Allocation (LDA)-based Topic Modeling and Clustering**

In [1]:
import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
from gensim import corpora,models
import time
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\niranjans3ln\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\niranjans3ln\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#importing the CSV file of the master reports from google drive into a dataframe
master_reports = pd.read_csv('dataset/master_reports.csv')
master_reports = master_reports.drop(columns=['Unnamed: 0'])

In [3]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 5:
            result.append(lemmatize(token))
    return result

In [4]:
master_reports['Description'] = master_reports['Description'].map(preprocess)

### **Creating Bag of Words (BoW)**

In [5]:
#Creating a dictionary using gensim library
dictionary = gensim.corpora.Dictionary(master_reports['Description'])
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [6]:
#Print top 20 words from the dictionary
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 20:
        break

0 actually
1 change
2 comment
3 compare
4 complete
5 consider
6 contain
7 default
8 document
9 editor
10 effect
11 ensure
12 external
13 inconsistent
14 internal
15 modify
16 navigator
17 problem
18 project
19 refresh
20 relate


In [7]:
#Creating BoW using the the dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in master_reports['Description']]

In [8]:
#Printing the BoW for single document
bow_doc_8 = bow_corpus[8]
for i in range(len(bow_doc_8)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_8[i][0], 
                                               dictionary[bow_doc_8[i][0]], 
bow_doc_8[i][1]))

Word 1 ("change") appears 1 time.
Word 22 ("resource") appears 1 time.
Word 31 ("expect") appears 1 time.
Word 67 ("future") appears 1 time.
Word 89 ("implementation") appears 1 time.


In [9]:
# open a file, where you stored the pickled data
f= open('dataset/bow_corpus.pickle', 'wb')

# dump information to that file
pickle.dump(bow_corpus, f)

In [10]:
# open a file, where you stored the pickled data
file = open('dataset/dictionary.pickle', 'wb')

# dump information to that file
pickle.dump(dictionary, file)

### **LDA-based Topic Modeling**

In [11]:
#Preparing the parameters for LDA model
corpus = bow_corpus
no_of_topics = 10
dictionary = dictionary
p = 20
k = 2
epochs = 100

#Training the LDA model on the BoW corpus
lda_model = gensim.models.LdaMulticore(corpus, num_topics=no_of_topics, id2word=dictionary, passes=p, workers=k, iterations=epochs)

In [12]:
# save model to disk (no need to use pickle module)
lda_model.save('dataset/lda_model.model')

In [13]:
# Printing the topics and the propability distributions of words in those topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.261*"editor" + 0.053*"editors" + 0.035*"change" + 0.030*"action" + 0.027*"workbench" + 0.024*"problem" + 0.022*"create" + 0.021*"content" + 0.015*"active" + 0.015*"exception"
Topic: 1 
Words: 0.124*"version" + 0.117*"stream" + 0.050*"project" + 0.043*"compare" + 0.029*"replace" + 0.026*"workspace" + 0.023*"history" + 0.023*"release" + 0.023*"operation" + 0.019*"versions"
Topic: 2 
Words: 0.054*"change" + 0.050*"eclipse" + 0.048*"release" + 0.044*"support" + 0.026*"server" + 0.025*"plugin" + 0.023*"resource" + 0.019*"update" + 0.018*"resources" + 0.017*"ignore"
Topic: 3 
Words: 0.244*"project" + 0.064*"create" + 0.046*"delete" + 0.042*"select" + 0.031*"workspace" + 0.029*"resource" + 0.028*"repository" + 0.028*"location" + 0.024*"dialog" + 0.022*"prompt"
Topic: 4 
Words: 0.096*"perspective" + 0.035*"property" + 0.032*"method" + 0.032*"public" + 0.028*"return" + 0.027*"properties" + 0.025*"perspectives" + 0.023*"plugin" + 0.022*"object" + 0.021*"reference"
Topic: 5 
Wo

In [14]:
master_reports['Description']

0       [project, contain, resource, release, project,...
1       [repository, resource, default, editor, doesnt...
2       [deletion, indicator, viewer, subtle, vision, ...
3       [synchronize, project, repository, different, ...
4       [gettingsetting, manage, resource, methods, is...
                              ...                        
1032    [editor, singleuse, accelerators, associate, e...
1033    [perspective, perspective, windows, desktop, w...
1034    [create, project, create, editor, delete, chan...
1035    [compare, project, folder, stream, resource, c...
1036    [navigator, refresh, accelerator, refresh, sup...
Name: Description, Length: 1037, dtype: object

In [15]:
#Let's evaluate the model using Perplexity and Coherence Bag of words- Title
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.



Perplexity:  -4.988241227659295


In [16]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=master_reports['Description'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3238917558808005


In [19]:
pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 10.5 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
     ------------------------------------- 829.2/829.2 kB 10.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyldavis, future
  Building wheel for pyldavis (pyproject.toml): 

In [21]:
!pip install pyLDAvis



In [24]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
# Visualize the topics for LDA model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
vis

### **Clustering premised on Topic Modeling**

In [28]:
#Creating 10 empty clusters and pushing master reports in each of them based on topic modeling and saving them in individual csv file
for c in range(10):
    exec('topic_{} = pd.DataFrame()'.format(c))
    for i in range(len(master_reports)):
        topic=lda_model[dictionary.doc2bow(master_reports.Description[i])]
        topic= np.asarray(topic)
        if int(topic[np.argmax(topic[:,1]),0])== c:
            exec('topic_{} = topic_{}.append(master_reports.loc[[i]])'.format(c,c))
            exec('topic_{} = topic_{}.reset_index(drop=True)'.format(c,c))
            exec('topic_{}.to_csv("dataset/topic_{}.csv")'.format(c,c))