In [2]:
import pandas as pd

data = pd.read_excel('textLDA.xlsx', error_bad_lines=False);
data_text = data[['Summary']]
documents = data_text

In [3]:
len(documents)

16565

In [4]:
documents[:5]

Unnamed: 0,Summary
0,ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...
1,ENGLISH TRANSLATION: SHEIKH FATIH AL JAWLANI '...
2,ENGLISH TRANSLATION: FIRST AUDIO MEETING WITH ...
3,ENGLISH TRANSLATION: SHEIKH NASIR AL WUHAYSHI ...
4,ENGLISH TRANSLATION: AQAP: 'RESPONSE TO SHEIKH...


In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fayikanova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [8]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [9]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [14]:
processed_docs = documents['Summary'].astype(str).map(preprocess)

In [15]:
processed_docs[:10]

0    [english, translat, messag, truth, syria, shei...
1    [english, translat, sheikh, fatih, jawlani, pe...
2    [english, translat, audio, meet, sheikh, fatih...
3    [english, translat, sheikh, nasir, wuhayshi, l...
4    [english, translat, aqap, respons, sheikh, bag...
5    [second, clip, seri, soldier, video, link, htt...
6    [english, transcript, murabit, http, hujlj, kg...
7    [english, translat, collect, word, lama, dawla...
8    [aslm, share, account, previous, suspend, khal...
9    [english, translat, aqap, statement, bless, ra...
Name: Summary, dtype: object

In [16]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [17]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 bzcscxzq
1 english
2 http
3 maqdisi
4 messag
5 muham
6 sheikh
7 syria
8 translat
9 truth
10 xfszsjvr


In [18]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [19]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(219, 1), (464, 1), (623, 1), (915, 1), (1066, 1), (1148, 1), (1420, 1)]

In [20]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 219 ("die") appears 1 time.
Word 464 ("bomb") appears 1 time.
Word 623 ("cluster") appears 1 time.
Word 915 ("wayf") appears 1 time.
Word 1066 ("explod") appears 1 time.
Word 1148 ("student") appears 1 time.
Word 1420 ("rer__") appears 1 time.


In [21]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)


In [22]:
corpus_tfidf = tfidf[bow_corpus]

In [23]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.34493054413034635),
 (1, 0.497045147662217),
 (2, 0.4168438606487528),
 (3, 0.3508802280734451),
 (4, 0.2820665771118998),
 (5, 0.1376582816633768),
 (6, 0.3521443313090525),
 (7, 0.33849123443705564)]


In [24]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)


In [25]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.033*"ramadi" + 0.032*"iraqi" + 0.030*"armi" + 0.027*"near" + 0.027*"kill" + 0.019*"amaqag" + 0.019*"forc" + 0.018*"area" + 0.018*"north" + 0.017*"destroy"
Topic: 1 
Words: 0.020*"attack" + 0.018*"isi" + 0.015*"kill" + 0.015*"like" + 0.013*"report" + 0.011*"syrian" + 0.011*"aleppo" + 0.010*"scotsmaninfidel" + 0.010*"spicylatt" + 0.009*"guy"
Topic: 2 
Words: 0.026*"islam" + 0.019*"kill" + 0.019*"state" + 0.019*"fight" + 0.018*"saudi" + 0.014*"work" + 0.013*"hezbollah" + 0.013*"mosul" + 0.012*"today" + 0.011*"break"
Topic: 3 
Words: 0.030*"muslim" + 0.015*"warreport" + 0.012*"know" + 0.011*"isi" + 0.011*"support" + 0.011*"follow" + 0.010*"word" + 0.009*"like" + 0.009*"scotsmaninfidel" + 0.009*"time"
Topic: 4 
Words: 0.024*"isi" + 0.022*"captur" + 0.020*"islam" + 0.015*"think" + 0.014*"state" + 0.013*"jund" + 0.012*"aqsa" + 0.012*"video" + 0.010*"iraq" + 0.009*"group"
Topic: 5 
Words: 0.052*"break" + 0.046*"amaqag" + 0.043*"islamicst" + 0.030*"kill" + 0.014*"soldier" + 0

In [26]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [27]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"kill" + 0.009*"syria" + 0.008*"break" + 0.008*"isi" + 0.007*"video" + 0.007*"armi" + 0.006*"iraqi" + 0.006*"civilian" + 0.006*"look" + 0.006*"gaza"
Topic: 1 Word: 0.014*"syria" + 0.010*"isi" + 0.010*"islam" + 0.009*"amaqag" + 0.009*"support" + 0.009*"kill" + 0.008*"assad" + 0.007*"ramiallolah" + 0.007*"attack" + 0.006*"forc"
Topic: 2 Word: 0.016*"allah" + 0.011*"islam" + 0.011*"state" + 0.009*"sparksofirhabi" + 0.009*"kill" + 0.007*"scotsmaninfidel" + 0.007*"send" + 0.006*"report" + 0.006*"airstrik" + 0.005*"syria"
Topic: 3 Word: 0.019*"islamicst" + 0.016*"break" + 0.011*"ameen" + 0.009*"amaqag" + 0.007*"isi" + 0.006*"pour" + 0.006*"kill" + 0.005*"destroy" + 0.005*"russian" + 0.005*"mosul"
Topic: 4 Word: 0.012*"isi" + 0.009*"syria" + 0.008*"turkey" + 0.007*"sparksofirhabi" + 0.007*"islam" + 0.007*"caliphate_new" + 0.006*"state" + 0.006*"syrian" + 0.006*"kill" + 0.005*"citi"
Topic: 5 Word: 0.015*"isi" + 0.011*"iraq" + 0.010*"aleppo" + 0.009*"rebel" + 0.009*"syria" 

In [28]:
processed_docs[4310]

['wayf', 'rer__', 'student', 'die', 'cluster', 'bomb', 'explod']

In [29]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))



Score: 0.8874092102050781	 
Topic: 0.056*"scotsmaninfidel" + 0.047*"sassysassyr" + 0.041*"spicylatt" + 0.040*"kafirkati" + 0.016*"jailamx" + 0.013*"assad" + 0.011*"isi" + 0.009*"children" + 0.009*"africa" + 0.009*"kill"

Score: 0.012513812631368637	 
Topic: 0.052*"break" + 0.046*"amaqag" + 0.043*"islamicst" + 0.030*"kill" + 0.014*"soldier" + 0.013*"citi" + 0.013*"isi" + 0.013*"fighter" + 0.013*"kurdish" + 0.012*"syrian"

Score: 0.01251175720244646	 
Topic: 0.020*"attack" + 0.018*"isi" + 0.015*"kill" + 0.015*"like" + 0.013*"report" + 0.011*"syrian" + 0.011*"aleppo" + 0.010*"scotsmaninfidel" + 0.010*"spicylatt" + 0.009*"guy"

Score: 0.012511327862739563	 
Topic: 0.026*"islam" + 0.019*"kill" + 0.019*"state" + 0.019*"fight" + 0.018*"saudi" + 0.014*"work" + 0.013*"hezbollah" + 0.013*"mosul" + 0.012*"today" + 0.011*"break"

Score: 0.012511054053902626	 
Topic: 0.044*"isi" + 0.023*"syria" + 0.017*"turkey" + 0.016*"aleppo" + 0.012*"kill" + 0.010*"rebel" + 0.009*"north" + 0.009*"fighter" + 0.0

In [30]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8874506950378418	 
Topic: 0.012*"isi" + 0.009*"syria" + 0.008*"turkey" + 0.007*"sparksofirhabi" + 0.007*"islam" + 0.007*"caliphate_new" + 0.006*"state" + 0.006*"syrian" + 0.006*"kill" + 0.005*"citi"

Score: 0.012509149499237537	 
Topic: 0.008*"isi" + 0.007*"attack" + 0.006*"syria" + 0.006*"nidalgazaui" + 0.006*"kill" + 0.005*"aleppo" + 0.005*"today" + 0.005*"follow" + 0.005*"statement" + 0.005*"islam"

Score: 0.012506229802966118	 
Topic: 0.011*"isi" + 0.010*"kill" + 0.008*"armi" + 0.008*"soldier" + 0.007*"assad" + 0.007*"near" + 0.007*"syria" + 0.006*"hom" + 0.006*"saudi" + 0.006*"report"

Score: 0.012505878694355488	 
Topic: 0.010*"armi" + 0.010*"kill" + 0.010*"isi" + 0.008*"iraqi" + 0.007*"near" + 0.007*"follow" + 0.007*"today" + 0.007*"syria" + 0.007*"report" + 0.007*"iraq"

Score: 0.012505430728197098	 
Topic: 0.009*"kill" + 0.009*"syria" + 0.008*"break" + 0.008*"isi" + 0.007*"video" + 0.007*"armi" + 0.006*"iraqi" + 0.006*"civilian" + 0.006*"look" + 0.006*"gaza"

Score: 