In [None]:
import pandas as pd

data = pd.read_excel('textLDA.xlsx', error_bad_lines=False);
data_text = data[['Summary']]
documents = data_text

In [None]:
len(documents)

In [27]:
documents[:5]

Unnamed: 0,Summary
0,ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...
1,ENGLISH TRANSLATION: SHEIKH FATIH AL JAWLANI '...
2,ENGLISH TRANSLATION: FIRST AUDIO MEETING WITH ...
3,ENGLISH TRANSLATION: SHEIKH NASIR AL WUHAYSHI ...
4,ENGLISH TRANSLATION: AQAP: 'RESPONSE TO SHEIKH...


In [45]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [46]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fayikanova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [47]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [48]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [64]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [65]:
processed_docs = documents['Summary'].astype(str).map(preprocess)

In [66]:
processed_docs[:10]

0    [english, translat, messag, truth, syria, shei...
1    [english, translat, sheikh, fatih, jawlani, pe...
2    [english, translat, audio, meet, sheikh, fatih...
3    [english, translat, sheikh, nasir, wuhayshi, l...
4    [english, translat, aqap, respons, sheikh, bag...
5    [second, clip, seri, soldier, video, link, htt...
6    [english, transcript, murabit, http, hujlj, kg...
7    [english, translat, collect, word, lama, dawla...
8    [aslm, share, account, previous, suspend, khal...
9    [english, translat, aqap, statement, bless, ra...
Name: Summary, dtype: object

In [67]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [68]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 bzcscxzq
1 english
2 http
3 maqdisi
4 messag
5 muham
6 sheikh
7 syria
8 translat
9 truth
10 xfszsjvr


In [69]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [70]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(219, 1), (464, 1), (623, 1), (915, 1), (1066, 1), (1148, 1), (1420, 1)]

In [71]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 219 ("die") appears 1 time.
Word 464 ("bomb") appears 1 time.
Word 623 ("cluster") appears 1 time.
Word 915 ("wayf") appears 1 time.
Word 1066 ("explod") appears 1 time.
Word 1148 ("student") appears 1 time.
Word 1420 ("rer__") appears 1 time.


In [72]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)


In [73]:
corpus_tfidf = tfidf[bow_corpus]

In [74]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.34493054413034635),
 (1, 0.497045147662217),
 (2, 0.4168438606487528),
 (3, 0.3508802280734451),
 (4, 0.2820665771118998),
 (5, 0.1376582816633768),
 (6, 0.3521443313090525),
 (7, 0.33849123443705564)]


In [107]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=dictionary, passes=2, workers=2)


In [108]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.023*"kill" + 0.018*"break" + 0.016*"armi" + 0.016*"scotsmaninfidel" + 0.015*"isi" + 0.015*"soldier" + 0.014*"iraqi" + 0.013*"forc" + 0.013*"amaqag" + 0.012*"spicylatt"
Topic: 1 
Words: 0.038*"isi" + 0.022*"syria" + 0.020*"islam" + 0.016*"state" + 0.015*"iraq" + 0.013*"ramiallolah" + 0.012*"assad" + 0.011*"report" + 0.009*"allah" + 0.009*"armi"
Topic: 2 
Words: 0.019*"islamicst" + 0.019*"kill" + 0.014*"isi" + 0.011*"muslim" + 0.010*"amaqag" + 0.009*"bomb" + 0.009*"allah" + 0.008*"support" + 0.008*"syria" + 0.008*"break"


In [86]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [87]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"kill" + 0.007*"allah" + 0.007*"soldier" + 0.006*"muslim" + 0.006*"islam" + 0.006*"isi" + 0.005*"alhamdulillah" + 0.005*"islamicst" + 0.005*"know" + 0.005*"syria"
Topic: 1 Word: 0.010*"like" + 0.008*"syria" + 0.008*"muslim" + 0.008*"kill" + 0.008*"isi" + 0.007*"bomb" + 0.007*"today" + 0.007*"assad" + 0.006*"year" + 0.006*"attack"
Topic: 2 Word: 0.014*"allah" + 0.008*"syria" + 0.008*"islam" + 0.008*"arab" + 0.007*"isi" + 0.007*"sparksofirhabi" + 0.007*"state" + 0.007*"true" + 0.006*"http" + 0.006*"iraq"
Topic: 3 Word: 0.012*"break" + 0.010*"islamicst" + 0.010*"kill" + 0.010*"aleppo" + 0.010*"isi" + 0.008*"syria" + 0.006*"forc" + 0.006*"explos" + 0.006*"soldier" + 0.006*"amaqag"
Topic: 4 Word: 0.012*"syria" + 0.010*"assad" + 0.008*"isi" + 0.008*"kill" + 0.007*"state" + 0.007*"islam" + 0.007*"russia" + 0.007*"today" + 0.007*"airstrik" + 0.006*"follow"
Topic: 5 Word: 0.012*"isi" + 0.012*"support" + 0.010*"syria" + 0.009*"islam" + 0.008*"attack" + 0.007*"assad" + 0.007*

In [88]:
processed_docs[4310]

['wayf', 'rer__', 'student', 'die', 'cluster', 'bomb', 'explod']

In [105]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))



Score: 0.7248592972755432	 
Topic: 0.040*"amaqag" + 0.033*"iraqi" + 0.027*"forc" + 0.026*"break" + 0.019*"kill" + 0.017*"ramadi" + 0.016*"destroy" + 0.015*"area" + 0.015*"citi" + 0.013*"armi"

Score: 0.17508241534233093	 
Topic: 0.019*"saudi" + 0.017*"kill" + 0.014*"aleppo" + 0.013*"soldier" + 0.012*"peigneacheveux" + 0.012*"today" + 0.012*"islam" + 0.011*"peopl" + 0.011*"oper" + 0.010*"assad"

Score: 0.012512454763054848	 
Topic: 0.023*"attack" + 0.014*"amaqag" + 0.013*"hit" + 0.013*"gather" + 0.012*"break" + 0.012*"forc" + 0.010*"isi" + 0.010*"syria" + 0.009*"area" + 0.009*"islamic_st"

Score: 0.01250859908759594	 
Topic: 0.021*"support" + 0.020*"follow" + 0.018*"isi" + 0.015*"warreport" + 0.013*"peopl" + 0.013*"kill" + 0.013*"think" + 0.013*"syria" + 0.012*"rebel" + 0.010*"say"

Score: 0.012507976032793522	 
Topic: 0.042*"isi" + 0.038*"islam" + 0.031*"state" + 0.026*"ramadi" + 0.024*"iraq" + 0.020*"attack" + 0.019*"armi" + 0.017*"north" + 0.015*"kill" + 0.014*"report"

Score: 0.012

In [90]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.7110638618469238	 
Topic: 0.010*"kill" + 0.010*"isi" + 0.009*"muslim" + 0.008*"ameen" + 0.008*"syria" + 0.008*"amaqag" + 0.006*"fight" + 0.006*"forc" + 0.006*"wound" + 0.006*"armi"

Score: 0.18890295922756195	 
Topic: 0.012*"syria" + 0.010*"assad" + 0.008*"isi" + 0.008*"kill" + 0.007*"state" + 0.007*"islam" + 0.007*"russia" + 0.007*"today" + 0.007*"airstrik" + 0.006*"follow"

Score: 0.01250555831938982	 
Topic: 0.010*"like" + 0.008*"syria" + 0.008*"muslim" + 0.008*"kill" + 0.008*"isi" + 0.007*"bomb" + 0.007*"today" + 0.007*"assad" + 0.006*"year" + 0.006*"attack"

Score: 0.012505009770393372	 
Topic: 0.012*"break" + 0.010*"islamicst" + 0.010*"kill" + 0.010*"aleppo" + 0.010*"isi" + 0.008*"syria" + 0.006*"forc" + 0.006*"explos" + 0.006*"soldier" + 0.006*"amaqag"

Score: 0.01250418834388256	 
Topic: 0.009*"isi" + 0.008*"news" + 0.007*"kill" + 0.006*"khair" + 0.006*"syria" + 0.006*"polic" + 0.006*"report" + 0.005*"dan" + 0.005*"caliphate_new" + 0.005*"photo"

Score: 0.012504110112

In [106]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs , dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.108053377606274

Coherence Score:  0.3856902700970771


In [92]:
#Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus_tfidf))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=processed_docs , dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.452541413625205

Coherence Score:  0.37149279955818904
