<a href="https://colab.research.google.com/github/mayanksethi31/Text_Annotating_System/blob/main/Main_LDA_codefile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#!pip install pyLDAvis
import nltk
import tarfile

from nltk.corpus import brown, stopwords
from nltk.probability import FreqDist
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import brown

#Gensim components
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import models

In [21]:
nltk.download('brown')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
#Plotting tools for LDA
#!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()

import matplotlib.pyplot as plt
from pprint import pprint
import pyLDAvis.gensim_models as gensimvis

import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os
import string

  from collections import Iterable


In [8]:
from multiprocessing import Process, freeze_support

from wordcloud import WordCloud
import matplotlib.colors as mcolors
import seaborn as sns

In [9]:
###Method to pre-process the documents, i.e. convert to lowercase, remove punctuations, stopwords, perform stemming and lematization
def pre_processing(article):
    ###Initializing the stopwords corpus
    stop_words = stopwords.words('english')
    ###Initializing the stemmer
    stemmer = SnowballStemmer('english')
    
    pre_processed_doc = []
    ###Gensim pre_process lowercases the words and then tokenizes the words
    for token in gensim.utils.simple_preprocess(article):
        if token not in stop_words:
            stemmed_lematized_token = stemmer.stem(WordNetLemmatizer().lemmatize(token,pos='v'))
            pre_processed_doc.append(stemmed_lematized_token)
    
    ###To check the pre_processed result set this condition to True
    check_preprocess_result=False
    if check_preprocess_result:
        words = []
        for word in article:
            words.append(word)
        print(words)
        print('\n\n tokenized and lemmatized document: ')
        print(pre_processed_doc)
    return pre_processed_doc

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [14]:
editorial_sents = brown.sents(categories=['editorial'])
editorials=[]
for i in range(len(editorial_sents)):
    editorials.append(" ".join(editorial_sents[i]))

In [24]:
preprocessed_documents = []
for i in range(len(editorials)):
    preprocessed_documents.append(pre_processing(editorials[i]))
data_compiled['Pre_processed']=preprocessed_documents
dictionary = gensim.corpora.Dictionary(preprocessed_documents)

In [25]:
dictionary.filter_extremes(no_below=20,no_above=0.5,keep_n=100000)

In [26]:
bag_of_words_corpus = [dictionary.doc2bow(document) for document in preprocessed_documents]
print("Length of bag of words corpus:{} must equal the total number of documents".format(len(bag_of_words_corpus)))
print("The first pre processed document")
print(preprocessed_documents[0])
print("Bag of words for the first document")
print(bag_of_words_corpus[0])
print("Length of bag of words for first document = {}".format(len(bag_of_words_corpus[0])))
print("Length of bag of words for second document = {}".format(len(bag_of_words_corpus[1])))

Length of bag of words corpus:2997 must equal the total number of documents
The first pre processed document
['assembl', 'session', 'bring', 'much', 'good']
Bag of words for the first document
[(0, 1), (1, 1), (2, 1), (3, 1)]
Length of bag of words for first document = 4
Length of bag of words for second document = 5


In [27]:
bow_first_doc = bag_of_words_corpus[0]
for i in range(len(bow_first_doc)):
    print("Word {} (\"{}\") appears {} time.".format(bow_first_doc[i][0],dictionary[bow_first_doc[i][0]],bow_first_doc[i][1]))


Word 0 ("assembl") appears 1 time.
Word 1 ("bring") appears 1 time.
Word 2 ("good") appears 1 time.
Word 3 ("much") appears 1 time.


In [28]:
tfidf = models.TfidfModel(bag_of_words_corpus)##fitting thr tf idf model on the bag of words corpus
corpus_tf_idf = tfidf[bag_of_words_corpus]

In [29]:
for doc_tf_idf_vector in corpus_tf_idf:
    pprint(doc_tf_idf_vector)
    break

[(0, 0.5607874010024522),
 (1, 0.5403780026749527),
 (2, 0.44160179334881866),
 (3, 0.44552997790614596)]


In [58]:
compute_using_bag_of_words=True
if compute_using_bag_of_words:
        ###Running LDA using bag of words
        #lda_final_model = gensim.models.ldamodel.LdaModel(bag_of_words_corpus, num_topics=10, id2word=dictionary, passes=50, chunksize=250,update_every=0,alpha='auto',iterations=50,minimum_probability=0.2)
    lda_final_model = gensim.models.LdaMulticore(bag_of_words_corpus, num_topics=6, id2word=dictionary, passes=50, chunksize=250,iterations=50,minimum_probability=0.2,workers=4)
else:
        ###Running LDA using TF-IDF
    lda_final_model = gensim.models.LdaMulticore(corpus_tf_idf, num_topics=10, id2word=dictionary, passes=50, chunksize=250,iterations=50,minimum_probability=0.2,workers=4)    
    ###Save and load LDA model to save time
    #lda_final_model.save('lda.model')
    #lda_final_model = models.LdaMulticore.load('lda.model')
print("The topics and the top words in each topic with weights")
ten_topics=lda_final_model.print_topics(num_words=10)
for topic in ten_topics:
    print(topic)
    
    ##words occuring in each topic with the weights to the words
for idx, topic in lda_final_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    
    ###Performance evaluation, i.e. to check which topic a document belongs to laong with the score/probability
print("The first pre-processed document")
print(preprocessed_documents[0])
    
print("Checking the score as in to which topic this document belongs to")
for index, score in sorted(lda_final_model[bag_of_words_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_final_model.print_topic(index, 10)))
    

The topics and the top words in each topic with weights
(0, '0.067*"state" + 0.042*"nation" + 0.040*"unit" + 0.033*"citi" + 0.028*"work" + 0.026*"need" + 0.023*"new" + 0.022*"school" + 0.020*"right" + 0.020*"industri"')
(1, '0.041*"war" + 0.039*"like" + 0.032*"world" + 0.032*"mr" + 0.032*"time" + 0.031*"peac" + 0.031*"american" + 0.028*"know" + 0.025*"west" + 0.025*"editor"')
(2, '0.065*"make" + 0.057*"would" + 0.041*"year" + 0.028*"seem" + 0.025*"peopl" + 0.024*"help" + 0.023*"call" + 0.023*"old" + 0.022*"use" + 0.021*"fact"')
(3, '0.058*"say" + 0.044*"public" + 0.040*"mani" + 0.036*"go" + 0.031*"give" + 0.029*"even" + 0.024*"question" + 0.020*"servic" + 0.019*"way" + 0.019*"get"')
(4, '0.040*"new" + 0.039*"come" + 0.036*"us" + 0.028*"leav" + 0.026*"test" + 0.025*"mr" + 0.025*"take" + 0.022*"presid" + 0.021*"see" + 0.021*"must"')
(5, '0.099*"one" + 0.029*"day" + 0.028*"man" + 0.027*"year" + 0.025*"two" + 0.022*"general" + 0.017*"back" + 0.017*"put" + 0.017*"church" + 0.017*"forc"')
To

In [31]:
data_compiled= pd.DataFrame({'Index_Sent': pd.Series(dtype='int'),
                             'Sentence': pd.Series(dtype='str'),
                             'LDA_topic': pd.Series(dtype='int'),
                            'LDA_score': pd.Series(dtype='float')})
for i in range(len(bag_of_words_corpus)):
    try:
        data_compiled=data_compiled.append({'Index_Sent': i,
                                        'Sentence':editorials[i],
                                        'LDA_topic': 
                                        sorted(lda_final_model[bag_of_words_corpus[i]], key=lambda tup: -1*tup[1])[0][0],
                                            'LDA_score': 
                                        sorted(lda_final_model[bag_of_words_corpus[i]], key=lambda tup: -1*tup[1])[0][1]
                                           }, ignore_index=True)
    except:
        continue

In [57]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

if compute_using_bag_of_words:
    lda_vizualization = gensimvis.prepare(lda_final_model,bag_of_words_corpus,dictionary,sort_topics=False)
    pyLDAvis.save_html(lda_vizualization,'LDA_visualization_bow.html')
else:
    lda_vizualization = gensimvis.prepare(lda_final_model,corpus_tf_idf,dictionary,sort_topics=False)
    pyLDAvis.save_html(lda_vizualization,'LDA_visualization_tf_idf.html')

In [32]:
pd.set_option('display.max_colwidth', None)

In [44]:
data_compiled[data_compiled['LDA_topic']==5].sort_values(by='LDA_score', ascending=False).head(10)

Unnamed: 0,Index_Sent,Sentence,LDA_topic,LDA_score
40,43,"It includes a raise in the county minimum wage , creation of several new jobs at the executive level , financing of beefed-up industrial development efforts , and increased expenditures for essential services such as health and welfare , fire protection , sanitation and road maintenance .",5,0.923529
1464,1614,"These programs are volumes of waste paper and lost hours if the citizens of a community must stand aside while land developers tell them when , where , and in what manner the community shall grow .",5,0.915767
903,970,This was just Richard's way of saying that last year the Birds opened spring training with a lot of jobs wide open .,5,0.895292
1120,1214,"In case of a deadlock between prison boards and inmates , a federal arbitration board to include a `` lifer '' and two escapees should decide the issue .",5,0.880744
2087,2311,"Since appeals to morality , to humanity , and to sanity have had such small effect , perhaps our last recourse is the deterrent example .",5,0.879961
2206,2434,"Our complaint is that in many crucial areas the Kennedy programs are not too large but too small , most seriously in regard to the conventional arms build-up and in aid and welfare measures .",5,0.879365
2493,2755,The next days may show where things stand .,5,0.878852
273,290,"The board of suspension of the Interstate Commerce commission has ordered a group of railroads not to reduce their freight rates on grain , as they had planned to do this month .",5,0.860653
1677,1858,"It is a revelation of what has been done , what is being done and what will be done in Newark as shown by architects' plans , models and pictures .",5,0.860491
270,286,"The same can be said about the half-hearted Cuban invasion mounted by the administration last April , which , we trust , is not symptomatic of the methods to be invoked in holding off the felonious Khrushchev .",5,0.860486


In [45]:
topic_names= pd.DataFrame({'LDA_topic': pd.Series(dtype='int'),
                             'topic_name': pd.Series(dtype='str'),
                             })
topic_names=topic_names.append({'LDA_topic': 0, 'topic_name':  "Governments & Nations" }, ignore_index=True)
topic_names=topic_names.append({'LDA_topic': 1, 'topic_name':  "People & Political Parties" }, ignore_index=True)
topic_names=topic_names.append({'LDA_topic': 2, 'topic_name':  "War & Power" }, ignore_index=True)
topic_names=topic_names.append({'LDA_topic': 3, 'topic_name':  "Presidents and Politics" }, ignore_index=True)
topic_names=topic_names.append({'LDA_topic': 4, 'topic_name':  "General Debates of People" }, ignore_index=True)
topic_names=topic_names.append({'LDA_topic': 5, 'topic_name':  "Developement adn Industry" }, ignore_index=True)

In [47]:
data_compiled=data_compiled.merge(topic_names, on='LDA_topic', how='left')

In [56]:
data_compiled.groupby('LDA_topic').head(5).to_csv("Sentences for Grounded_Coding.csv")