## Topic modeling

In [1]:
import string
import collections
import glob
import codecs
import numpy as np
import scipy as sp
from scipy.sparse import lil_matrix

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
translator = str.maketrans("","", string.punctuation)

In [3]:
import nltk

In [52]:
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.SnowballStemmer('english')
def my_tokenizer(s):
    return [ stemmer.stem(x) for x in nltk.tokenize.word_tokenize(s.lower().translate(translator)) if not x in stopwords ]

In [53]:
cv = CountVectorizer(input='filename', encoding='latin1', min_df=20, tokenizer=my_tokenizer)
X = cv.fit_transform([x for directory in glob.glob('20news-bydate-train/*') for x in glob.glob(directory + '/*')])

In [73]:
tfv = TfidfVectorizer(input='filename', encoding='latin1', min_df=10, max_df=200, ngram_range=(1, 2), tokenizer=my_tokenizer)
X = tfv.fit_transform([x for directory in glob.glob('20news-bydate-train/*') for x in glob.glob(directory + '/*')])

In [74]:
X

<11314x22004 sparse matrix of type '<class 'numpy.float64'>'
	with 623753 stored elements in Compressed Sparse Row format>

In [6]:
import gensim

In [75]:
from gensim.corpora.dictionary import Dictionary
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)
dictionary = Dictionary.from_corpus(corpus, id2word=dict((id, word) for word, id in tfv.vocabulary_.items()))

In [76]:
dictionary.token2id['atheism']

2321

In [89]:
# lsi_model = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=5)
lda_model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=5, alpha=.02)

In [90]:
lda_model.print_topics()

[(0,
  '0.001*"netcom" + 0.001*"organ netcom" + 0.001*"guest" + 0.001*"netcom onlin" + 0.001*"gordon bank" + 0.001*"communic servic" + 0.001*"abc" + 0.001*"keenan" + 0.001*"gari" + 0.001*"gordon"'),
 (1,
  '0.003*"gordon bank" + 0.002*"gordon" + 0.002*"gebcspittedu" + 0.002*"gebcspittedu gordon" + 0.001*"nhl" + 0.001*"playoff" + 0.001*"ticket" + 0.001*"captain" + 0.001*"msg" + 0.001*"diet"'),
 (2,
  '0.002*"msg" + 0.002*"diseas" + 0.001*"penguin" + 0.001*"cancer" + 0.001*"leaf" + 0.001*"patient" + 0.001*"devil" + 0.001*"gordon" + 0.001*"gerald" + 0.001*"ice"'),
 (3,
  '0.002*"playoff" + 0.001*"stat" + 0.001*"gari l" + 0.001*"0 1" + 0.001*"leaf" + 0.001*"dare" + 0.001*"1 1" + 0.001*"espn" + 0.001*"wing" + 0.001*"keller"'),
 (4,
  '0.002*"armenian" + 0.002*"turkish" + 0.001*"nhl" + 0.001*"arab" + 0.001*"greek" + 0.001*"armenia" + 0.001*"palestinian" + 0.001*"firearm" + 0.001*"soldier" + 0.001*"orbit"')]

In [91]:
lsi_model.print_topics()

[(0,
  '-0.199*"armenian" + -0.152*"gordon" + -0.150*"gordon bank" + -0.102*"gebcspittedu gordon" + -0.102*"gebcspittedu" + -0.100*"turkish" + -0.067*"scsi" + -0.064*"armenia" + -0.061*"muslim" + -0.061*"turk"'),
 (1,
  '0.395*"gordon bank" + 0.378*"gordon" + 0.269*"gebcspittedu gordon" + 0.269*"gebcspittedu" + -0.147*"armenian" + 0.135*"n3jxp skeptic" + 0.135*"chastiti intellect" + 0.135*"n3jxp" + 0.135*"bank n3jxp" + 0.135*"skeptic chastiti"'),
 (2,
  '0.520*"armenian" + 0.252*"turkish" + 0.171*"armenia" + 0.170*"argic" + 0.168*"serdar" + 0.166*"serdar argic" + 0.157*"turk" + 0.112*"turkey" + 0.111*"genocid" + 0.103*"serazumauucp"'),
 (3,
  '-0.462*"scsi" + -0.268*"ide" + -0.127*"hard drive" + -0.113*"simm" + -0.104*"isa" + 0.101*"islam" + -0.098*"motherboard" + -0.096*"printer" + -0.095*"armenian" + -0.091*"scsi2"'),
 (4,
  '0.242*"islam" + 0.239*"scsi" + 0.168*"sandviknewtonapplecom" + 0.164*"kent" + 0.141*"ide" + 0.112*"muslim" + 0.104*"schneider" + 0.099*"sandvik" + 0.099*"kent s

In [88]:
model[doc]

[(1, 0.42333399318482828), (4, 0.56645592009405998)]