**Assignment Gensim Topic Modeling**  
**Yigao Li**  
*Oct 14, 2018*

In [1]:
import nltk
import gensim
from gensim import corpora
from gensim import models
import os
import string
from collections import defaultdict
import random



In [2]:
directory = "text"
documents = []
for filename in os.listdir(directory):
    with open(directory + "/" + filename, encoding = "utf-8") as f:
        documents.append(f.read().split())

In [3]:
stopwords = nltk.corpus.stopwords.words("english")
texts = [[w for w in document if w.lower() not in stopwords
          and w.lower() not in string.punctuation] for document in documents]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token.lower()] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

In [4]:
def topic_number(doc):
    n = doc[0][0]
    m = doc[0][1]
    for i in range(1, len(doc)):
        if doc[i][1] > m:
            n = i
            m = doc[i][1]
    return n

lsi_model = models.LsiModel(corpus_tfidf, id2word = dictionary)
corpus_lsi = lsi_model[corpus_tfidf]
for doc in corpus_lsi:
    print(doc)
    print(topic_number(doc))

[(0, 0.2321251273920189), (1, -0.33579563777857274), (2, 0.3915224828409042), (3, -0.06764039063756883), (4, 0.1651434436884204), (5, -0.24711087713098262), (6, 0.2457444792400856), (7, -0.13649591692719215), (8, -0.03521410608498562), (9, 0.19857983785988065), (10, -0.15520800019383757), (11, 0.578145337156231), (12, 0.06112505599280676), (13, -0.26332886751903783), (14, 0.14103758780350723), (15, 0.07826810863835869), (16, -0.04530683969050799), (17, 0.022555662600226786), (18, 0.07310005451004774), (19, 0.046043705769048156)]
11
[(0, 0.35739280955119074), (1, -0.05048937763888813), (2, -0.37478989791279127), (3, -0.17627546508945666), (4, 0.008610144298313098), (5, -0.40795130157973436), (6, -0.03550280833322678), (7, -0.13330164729263386), (8, -0.24402046712447237), (9, 0.1655376664483241), (10, 0.10369780924115037), (11, -0.24703372992945088), (12, -0.3498296983735147), (13, -0.3596135747761595), (14, 0.08788152036849904), (15, 0.19551811367650002), (16, 0.1153908826430276), (17, 

Above result shows that many documents are topic 0 and some are topic 2, 4, 6, 10, 11, 13 and 16.

In [5]:
lsi_model.show_topic(0)

[('missile', 0.12459100530239182),
 ('2003', 0.12238506219744012),
 ('agents', 0.10836267872326862),
 ('plant', 0.10318525178662478),
 ('enrichment', 0.09950024164537559),
 ('uranium', 0.09741909164585387),
 ('nuclear', 0.09482510030807947),
 ('missiles', 0.0948219499767438),
 ('range', 0.09386931658579684),
 ('program', 0.09244981273666449)]

In [6]:
lsi_model.show_topic(2)

[('terrorism', 0.20292917073628064),
 ('inherited', -0.19054614744504259),
 ('terrorist', 0.17498747109291654),
 ('groups', 0.17170259874605637),
 ('said', 0.13214572156481777),
 ('missile', -0.13068516387809567),
 ('export', -0.12375279922873894),
 ('trash', 0.12319269630489971),
 ('drill', 0.10559373968991409),
 ('money', 0.10559373968991409)]

In [7]:
lsi_model.show_topic(4)

[('inherited', 0.28314505765143966),
 ('nerve', -0.13243877172580776),
 ('destroyed', 0.13108479919913887),
 ('mm', -0.12098486761119776),
 ('dismantled', 0.11357176540283813),
 ('transferred', 0.11235117604067908),
 ('collapsed', 0.10668589971184332),
 ('precursors', -0.1032867662538035),
 ('artillery', -0.1024145350163572),
 ('cyclotron', 0.10225170887186648)]

In [8]:
lsi_model.show_topic(6)

[('diplomat', -0.2932991647683368),
 ('cameras', -0.2019665873595518),
 ('halls', -0.17597949886100206),
 ('install', -0.17597949886100206),
 ('plutonium', 0.1566546731703277),
 ('enrichment', -0.1408471689897621),
 ('agency', -0.13346725178386173),
 ('groups', 0.127612030125762),
 ('installation', -0.12747554663308308),
 ('talks', 0.11819704666975497)]

In [9]:
lsi_model.show_topic(10)

[('19', 0.1841782061185558),
 ('2003', 0.17115973981232638),
 ('announcement', 0.12894112666313756),
 ('plant', -0.12048984740111257),
 ('payload', -0.11451557153347668),
 ('coast', -0.11451557153347668),
 ('apartheid', 0.11044745272524649),
 ('groups', -0.10820591663580051),
 ('pledged', 0.10632512273551706),
 ('black', 0.101922601490882)]

In [10]:
lsi_model.show_topic(11)

[('groups', 0.35073653423843665),
 ('terrorism', 0.1440434830458724),
 ('terrorist', 0.13654705074385792),
 ('motivated', 0.12857166735896214),
 ('trash', -0.12534035560288803),
 ('export', -0.12496195422036249),
 ('violence', 0.11493292121873926),
 ('money', -0.10743459051676117),
 ('drill', -0.10743459051676117),
 ('inflict', 0.09851393247320506)]

In [11]:
lsi_model.show_topic(13)

[('export', -0.1969578922618021),
 ('groups', -0.15851827737700233),
 ('launched', 0.1341913488826324),
 ('upon', 0.12160127617597685),
 ('regulations', -0.11341253367455638),
 ('rockets', 0.11136115494243436),
 ('rocket', 0.10180104200638099),
 ('missile', 0.09782721251968816),
 ('artillery', 0.09715633789722698),
 ('nerve', -0.09284507114470177)]

In [12]:
lsi_model.show_topic(16)

[('massive', 0.18027304552691073),
 ('inherited', -0.16634053809186797),
 ('class', 0.13048710899190094),
 ('destroyed', -0.13002944959836096),
 ('lewisite', 0.12018203035127381),
 ('basing', 0.12018203035127381),
 ('submarines', 0.12018203035127381),
 ('reduced', 0.12018203035127381),
 ('anti', 0.11839073983757663),
 ('plague', 0.10814267314206115)]

LSI Topics for each document:  
NTI_BWTutorial_chapter1: group, terrorism, motivate, violence, inflict  
NTI_ChinaOverview: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Iran_Biological: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Iran_Chemical: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Iran_Introduction: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Iran_Missile: launch, rocket, missile, artillery  
NTI_Iran_Nuclear: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Kazakhstan: inherit, destroy, dismantle, transfer, collapse, cyclotron  
NTI_LibyaCountry1: 2003, announcement, apartheid, pledge, black  
NTI_NorthKorea_ChemicalOverview: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_NorthKorea_Introduction: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_NorthKorea_NuclearCapabilities: plutonium, group, talk  
NTI_NorthKorea_NuclearOverview: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Russia_Introduction: massive, class, lewisite, basing, submarine, anti, plague  
NTI_SouthAfrica_Introduction: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Syria_NuclearOverview: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_Taiwan_Introduction: missile, agent, plant, enrichment, uranium, nuclear, range, program  
NTI_WMDNews_042106: terrorism, trash, drill, money  
NTI_WMDNews_062606: terrorism, trash, drill, money  
NTI_workAdvances: terrorism, trash, drill, money

In [13]:
lda_model = models.LdaModel(corpus_tfidf, id2word = dictionary, num_topics = 55, random_state = 9907)
corpus_lda = lda_model[corpus_tfidf]
for doc in corpus_lda:
    print(doc)
    print(topic_number(doc))

[(3, 0.032454796), (12, 0.02608318), (24, 0.07351397), (25, 0.038422093), (30, 0.64010864), (37, 0.021805478), (42, 0.024473354), (46, 0.05190487), (51, 0.030670607)]
4
[(3, 0.08007345), (24, 0.05415756), (37, 0.76828897), (42, 0.025213055)]
2
[(3, 0.04069466), (5, 0.012764614), (12, 0.025795013), (24, 0.7539511), (25, 0.032846577), (42, 0.021508388), (46, 0.019152576), (51, 0.032728974), (52, 0.014139824)]
3
[(3, 0.053063698), (12, 0.019836083), (16, 0.025859827), (24, 0.13146664), (25, 0.04022062), (30, 0.03168088), (37, 0.029701829), (42, 0.026091047), (46, 0.023740206), (51, 0.03363884), (52, 0.54221016)]
10
[(3, 0.050188437), (5, 0.010740786), (16, 0.02360518), (24, 0.079828985), (25, 0.6562783), (37, 0.02585554), (42, 0.04035366), (51, 0.06497542)]
4
[(3, 0.055276297), (5, 0.5322336), (12, 0.032272287), (16, 0.01982039), (24, 0.079657495), (25, 0.041065328), (30, 0.026238887), (37, 0.02782067), (39, 0.016276492), (42, 0.027514948), (46, 0.033267993), (51, 0.052383732), (52, 0.016

In [14]:
lda_model.show_topic(1)

[('central', 0.0004828585),
 ('challenges', 0.0004828585),
 ('built', 0.0004828585),
 ('caliber', 0.0004828585),
 ('came', 0.0004828585),
 ('capital', 0.0004828585),
 ('cease-fire', 0.0004828585),
 ('bringing', 0.0004828585),
 ('class', 0.0004828585),
 ('coastal', 0.0004828585)]

In [15]:
lda_model.show_topic(3)

[('inherited', 0.007838653),
 ('cyclotron', 0.0055641406),
 ('plant', 0.005418295),
 ('destroyed', 0.0042535914),
 ('phosphoric', 0.004248958),
 ('transferred', 0.0042404574),
 ('dismantled', 0.003407586),
 ('collapsed', 0.0034075857),
 ('reactor', 0.0030600508),
 ('isotopes', 0.002933775)]

In [16]:
lda_model.show_topic(5)

[('missile', 0.0045439904),
 ('launched', 0.003848428),
 ('rockets', 0.0037728571),
 ('rocket', 0.00352651),
 ('upon', 0.003495985),
 ('artillery', 0.003409524),
 ('range', 0.003082492),
 ('long', 0.0030557285),
 ('missiles', 0.002889921),
 ('ballistic', 0.0028296183)]

In [17]:
lda_model.show_topic(10)

[('grade', 0.0040063616),
 ('plutonium', 0.0038537635),
 ('clandestine', 0.0032436776),
 ('reliable', 0.0031656208),
 ('require', 0.0029308875),
 ('mill', 0.0029143204),
 ('explosive', 0.0026178367),
 ('inputs', 0.0026006266),
 ('estimates', 0.0025734769),
 ('yield', 0.0024843859)]

In [18]:
lda_model.show_topic(51)

[('diplomat', 0.007209145),
 ('enrichment', 0.0066736345),
 ('cameras', 0.00554595),
 ('install', 0.0044379723),
 ('halls', 0.0044379723),
 ('agency', 0.004206511),
 ('uranium', 0.0034891008),
 ('installation', 0.0034761885),
 ('conversion', 0.0033902072),
 ('cascade', 0.0032417225)]

LDA Topics for each document:  
NTI_BWTutorial_chapter1: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_ChinaOverview: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_Iran_Biological: inherit, cyclotron, plant, destroy, phosphoric, transfer, dismantle, collapse, reactor, isotopes  
NTI_Iran_Chemical: grade, plutonium, clandestine, reliable, mill, explosive, yield  
NTI_Iran_Introduction: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_Iran_Missile: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_Iran_Nuclear: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_Kazakhstan: inherit, cyclotron, plant, destroy, phosphoric, transfer, dismantle, collapse, reactor, isotopes  
NTI_LibyaCountry1: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_NorthKorea_ChemicalOverview: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_NorthKorea_Introduction: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_NorthKorea_NuclearCapabilities:central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_NorthKorea_NuclearOverview: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_Russia_Introduction: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_SouthAfrica_Introduction: inherit, cyclotron, plant, destroy, phosphoric, transfer, dismantle, collapse, reactor, isotopes  
NTI_Syria_NuclearOverview: inherit, cyclotron, plant, destroy, phosphoric, transfer, dismantle, collapse, reactor, isotopes  
NTI_Taiwan_Introduction: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_WMDNews_042106: central, challenge, build, caliber, capital, cease-fire, class, coastal  
NTI_WMDNews_062606: missile, launch, rocket, artillery, range, long, ballistic  
NTI_workAdvances: diplotmat, enrichment, cameras, install, halls, agency, uranium, installation, conversion, cascade

Documents are about missiles, nuclear technology and chemical weapons in countries or regions such as China, Iran, Kazakhstan, Libya, North Korea, Russia, South Africa, Syria and Taiwan.  
LSI model provides both positive and negative values of document's relation to a topic.  
LDA model results differs due to its random process. All result numbers are probabilies that a document belongs to each topic. Even though we have more number of topics, many of them are exactly the same set of topics.