# In [None]:
#Import required packages
import pickle # for saving and loading objects
import gensim
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from nltk.tag import pos_tag
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
import os
import time
import datetime
import tarfile
import re
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from collections import OrderedDict

# Extract the 20 Newsgroups dataset ("20news-bydate") from the local tarball.
# The context manager closes the archive even if extraction fails.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this against a tarball obtained from a trusted source.
with tarfile.open("20news-bydate.tar.gz") as tar:
    tar.extractall()

# List the folders in the training split, skipping hidden entries.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# document order (and everything derived from it) reproducible between runs.
directory = sorted(
    f for f in os.listdir('./20news-bydate-train') if not f.startswith('.')
)

# Timestamp appended to the output files written later in the script.
dt_stamp = datetime.datetime.now().strftime('%Y%m%d_%H_%M_%S')

texts = []        # raw text of every training document
text_corpus = []  # token lists per document (filled by the cleansing step)
texts_mod = []    # cleaned text per document (filled by the cleansing step)

for folder_name in directory:
    folder_path = os.path.join('./20news-bydate-train', folder_name)

    # Sorted so documents are read in a stable order on every run.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)

        # `with` closes each handle promptly instead of leaking one per file;
        # undecodable bytes are dropped, as before.
        with open(file_path, 'r', errors='ignore') as handle:
            texts.append(handle.read())

# Data Cleansing
#
# Every raw document is passed through, in order: proper-noun removal,
# lower-casing, punctuation / @handle / URL stripping, lemmatisation, a set of
# residual regex clean-ups, digit removal and stop-word removal.  The cleaned
# string is stored in `texts_mod` and its tokens in `text_corpus`.

# Resources that do not depend on the document are built once, not per document.
rmv_wrd_lst = stopwords.words('english')
# Space-padded variant retained because code after this section refers to it.
rmv_stop_word = [" " + str(wrd) + " " for wrd in rmv_wrd_lst]
_STOP_WORD_SET = set(rmv_wrd_lst)
_LEMMATIZER = WordNetLemmatizer()

_PROPER_NOUN_TAGS = frozenset(['NNP', 'NNPS'])

# Penn Treebank tag -> WordNet part of speech; unknown tags fall back to noun.
_WORDNET_POS = {}
_WORDNET_POS.update(dict.fromkeys(['NN', 'NNS', 'NNP', 'NNPS'], 'n'))
_WORDNET_POS.update(dict.fromkeys(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'], 'v'))
_WORDNET_POS.update(dict.fromkeys(['JJ', 'JJR', 'JJS'], 'a'))
_WORDNET_POS.update(dict.fromkeys(['RB', 'RBR', 'RBS'], 'r'))

# @handles, any character outside [0-9A-Za-z space tab], and scheme://... URLs.
# The previous pattern had a stray "[" inside the handle class and "\/\/\/S+"
# where "//\S+" was meant, so the URL branch could never match a real URL.
_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)")

# Residual clean-ups applied after lemmatisation, in this exact order.
_RESIDUAL_SUBS = (
    (re.compile(r'(\\u[0-9A-Fa-f]+)'), ' '),                  # escaped unicode
    (re.compile(r'[^\x00-\x7f]'), ' '),                       # non-ASCII
    (re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))'), ' '),  # URLs
    (re.compile(r'#([^\s]+)'), r'\1'),                        # hashtag marker
    (re.compile(r'@[^\s]+'), ' '),                            # @user
    (re.compile(r'#([^\s]+)'), r'\1'),                        # hashtag marker
    (re.compile(r'[0-9]'), ''),                               # digits
    (re.compile(r"(\!)\1+"), ' '),                            # repeated "!"
    (re.compile(r"(\?)\1+"), ' '),                            # repeated "?"
    (re.compile(r"(\.)\1+"), '.'),                            # repeated "."
)


def _drop_proper_nouns(text):
    """Return *text* without the tokens that ``pos_tag`` labels NNP/NNPS."""
    # NOTE(review): tokens are split on single spaces only, as before, so a
    # token may still contain newlines or tabs when it is tagged.
    tokens = [token for token in text.split(" ") if token]
    return " ".join(
        token for token, tag in pos_tag(tokens)
        if tag not in _PROPER_NOUN_TAGS
    )


def _lemmatize(text):
    """Return *text* with every token lemmatised according to its POS tag."""
    return " ".join(
        _LEMMATIZER.lemmatize(word=word, pos=_WORDNET_POS.get(tag, 'n'))
        for word, tag in pos_tag(text.split())
    )


def _clean_document(raw_text):
    """Run the full cleansing pipeline on one raw document and return a string."""
    text = _drop_proper_nouns(raw_text)

    # Converting to lower case and removing unwanted information.
    text = text.lower()
    text = text.replace("-", " ").replace(".com", " ")
    text = ' '.join(_NOISE_RE.sub(" ", text).split())

    text = _lemmatize(text)

    for pattern, replacement in _RESIDUAL_SUBS:
        text = pattern.sub(replacement, text)

    # Filter stop words token by token.  Replacing " word " substrings missed
    # the first/last token and every second occurrence of a repeated stop word.
    # Joining on a single space also collapses any repeated whitespace.
    return ' '.join(
        word for word in text.split() if word not in _STOP_WORD_SET
    )


for raw_document in texts:
    cleaned_document = _clean_document(raw_document)
    text_corpus.append(gensim.utils.simple_preprocess(cleaned_document))
    texts_mod.append(cleaned_document)

# Grid-search the number of LDA components (scikit-learn) on a TF-IDF matrix.
#
# The stop words are passed as plain tokens.  The space-padded entries built
# during cleansing (" the ", " of ", ...) can never equal a token produced by
# the vectorizer, so they filtered nothing and can trigger scikit-learn's
# "stop_words may be inconsistent with your preprocessing" warning.
vectorizer = TfidfVectorizer(analyzer='word',
                             stop_words=stopwords.words('english'))
data_vectorized = vectorizer.fit_transform([' '.join(x) for x in text_corpus])

# Candidate topic counts evaluated by the grid search.
search_params = {'n_components': [1, 2, 3, 4, 5, 10, 15, 20]}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized)
print("best model params", model.best_params_)

# Cluster Number optimization using LDA Coherence Score
#
# Train one gensim LDA model per candidate topic count and record its c_v
# coherence so the counts can be compared on a plot.
dictionary = corpora.Dictionary(text_corpus)
corpus = [dictionary.doc2bow(text) for text in text_corpus]
coherence_values = []  # c_v score per candidate, parallel to model_list
model_list = []        # the topic counts that were evaluated
i_model_list = []
i_cv_list = []
limit = 21
start = 2
step = 1

for num_topics in range(start, limit, step):
    print('Number of Topics', num_topics)
    model = gensim.models.LdaModel(corpus, num_topics=num_topics,
                                   id2word=dictionary, random_state=100,
                                   chunksize=10000, passes=2, alpha='auto')

    coherencemodel = CoherenceModel(model=model, texts=text_corpus,
                                    corpus=corpus, dictionary=dictionary,
                                    coherence='c_v')
    # Evaluate the coherence once and reuse it; it used to be computed twice
    # (once for the print, once for the append).
    coherence = coherencemodel.get_coherence()
    print(' CV ', coherence)
    coherence_values.append(coherence)
    model_list.append(num_topics)
            
            
# Plot coherence against the number of topics and save the figure as a PNG
# in the current working directory.
get_ipython().run_line_magic('matplotlib', 'inline')
limit = 21
start = 2
step = 1
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
# The labels must be a sequence of strings: a bare string is iterated
# character by character, which labelled the line just "C".
plt.legend(["Coherence_values"], loc='best')
plt.grid()
work_dir = os.getcwd()
title_name = "20_newsgroup_plot"
img_name = os.path.join(work_dir, title_name + ".png")
# Save before show(): the figure may be empty once it has been displayed.
plt.savefig(img_name, dpi=300, bbox_inches='tight')
plt.show()
plt.clf()

# Final LDA Model with optimized number of clusters as identified in the above coherence plot
# NOTE(review): num_topics=5 is hard-coded; update it if the plot suggests otherwise.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                            num_topics=5, update_every=1,
                                            chunksize=10000, passes=1,
                                            random_state=100)
print(lda_model.print_topics(12))

# Saving the Final LDA Model and Data as Pickle file.
# The file names previously ended in "<timestamp>pkl" with no dot, so the
# files had no usable extension; they now end in ".pkl".
with open('text_corpus_20Newgroup_' + dt_stamp + '.pkl', 'wb') as pkl_fl:
    pickle.dump(text_corpus, pkl_fl)

with open('lda_model_20Newgroup_' + dt_stamp + '.pkl', 'wb') as pkl_fl:
    pickle.dump(lda_model, pkl_fl)
        
# LDA Visualization: prepare the pyLDAvis view of the final model, display it
# in the notebook and write it out as a standalone HTML file.
pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(viz)

# Refresh the timestamp so the files written from here on share a new tag.
dt_stamp = datetime.datetime.now().strftime('%Y%m%d_%H_%M_%S')
html_name = "".join(['lda_visualization_20Newgroup_', dt_stamp, '.html'])
pyLDAvis.save_html(viz, html_name)

# LDA output: export the per-topic keyword table to Excel.
freq_df = viz.topic_info
# Prevalence ratio: a term's frequency within the topic over its total frequency.
freq_df['prevalence'] = freq_df['Freq'].div(freq_df['Total'])
# Keep only the rows whose category is not 'Default'.
not_default = freq_df['Category'].ne('Default')
freq_df = freq_df[not_default]
keywords_file = "".join(['20Newgroup_keywords_', dt_stamp, '.xlsx'])
freq_df.to_excel(keywords_file)

# Build one row per document holding its dominant topic, that topic's share
# of the document and the topic's keywords, then export the table to Excel.
#
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append() was removed in pandas 2.0 and copied the whole frame on
# every call.
topic_rows = []
for row_list in lda_model[corpus]:
    # With per_word_topics enabled the first element holds the topic list.
    row = row_list[0] if lda_model.per_word_topics else row_list

    if not row:
        # Keep a placeholder so rows stay aligned with the document texts
        # that are concatenated below.
        topic_rows.append((None, None, None))
        continue

    # Highest-probability topic; ties resolve to the earliest entry, which is
    # what a stable descending sort followed by taking the first item gives.
    topic_num, prop_topic = max(row, key=lambda pair: pair[1])
    topic_keywords = ", ".join(
        word for word, _ in lda_model.show_topic(topic_num)
    )
    topic_rows.append(
        (int(topic_num), round(float(prop_topic), 4), topic_keywords)
    )

sent_topics_df = pd.DataFrame(
    topic_rows,
    columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
)
contents = pd.Series(texts_mod)
contents_act = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents_act, contents], axis=1)

# reset_index() turns the positional index into the document-number column.
df_dominant_topic = sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'keywords', 'Original Text', 'Clean Text']
df_dominant_topic.to_excel('20Newgroup_topic_by_document_' + dt_stamp + '.xlsx')
df_dominant_topic.head(10)


# [notebook output residue]  'stop_words.' % sorted(inconsistent))
