# Understanding LDA

In [73]:
import pprint
from gensim import corpora, models, similarities

In [138]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."
doc6 = "I drive myself crazy with work. I can't take it anymore."

In [139]:
doc_complete = [doc1, doc2, doc3, doc4, doc5, doc6]

In [140]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]        

In [37]:
import gensim
from gensim import corpora

In [143]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)
print(dictionary)
print(dictionary.token2id)

Dictionary(40 unique tokens: ['sugar', 'bad', 'consume', 'sister', 'like']...)
{'sugar': 0, 'bad': 1, 'consume': 2, 'sister': 3, 'like': 4, 'father': 5, 'spends': 6, 'lot': 7, 'time': 8, 'driving': 9, 'around': 10, 'dance': 11, 'practice': 12, 'doctor': 13, 'suggest': 14, 'may': 15, 'cause': 16, 'increased': 17, 'stress': 18, 'blood': 19, 'pressure': 20, 'sometimes': 21, 'feel': 22, 'perform': 23, 'well': 24, 'school': 25, 'never': 26, 'seems': 27, 'drive': 28, 'better': 29, 'health': 30, 'expert': 31, 'say': 32, 'good': 33, 'lifestyle': 34, 'crazy': 35, 'work': 36, 'cant': 37, 'take': 38, 'anymore': 39}


In [39]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [None]:
tsen='i want to see how '

In [40]:
Lda = gensim.models.ldamodel.LdaModel

In [45]:
ldamodel = Lda(doc_term_matrix, num_topics=4, id2word = dictionary, passes=50)

In [50]:
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(ldamodel.print_topics(num_topics=4, num_words=3))

[ (0, '0.025*"drive" + 0.025*"driving" + 0.025*"father"'),
  (1, '0.050*"driving" + 0.050*"pressure" + 0.050*"doctor"'),
  (2, '0.141*"sugar" + 0.054*"consume" + 0.054*"bad"'),
  (3, '0.073*"sister" + 0.073*"father" + 0.040*"drive"')]


# Tutorial 1

In [51]:
documents = ["Human machine interface for lab abc computer applications",
>>>              "A survey of user opinion of computer system response time",
>>>              "The EPS user interface management system",
>>>              "System and human system engineering testing of EPS",
>>>              "Relation of user perceived response time to error measurement",
>>>              "The generation of random binary unordered trees",
>>>              "The intersection graph of paths in trees",
>>>              "Graph minors IV Widths of trees and well quasi ordering",
>>>              "Graph minors A survey"]

In [53]:
>>> # remove common words and tokenize
>>> stoplist = set('for a of the and to in'.split())
>>> texts = [[word for word in document.lower().split() if word not in stoplist]
>>>          for document in documents]
>>>
>>> # remove words that appear only once
>>> from collections import defaultdict
>>> frequency = defaultdict(int)
>>> for text in texts:
>>>     for token in text:
>>>         frequency[token] += 1
>>>
>>> texts = [[token for token in text if frequency[token] > 1]
>>>          for text in texts]
>>>
>>> from pprint import pprint  # pretty-printer
>>> pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [62]:
dictionary = corpora.Dictionary(texts)
print(dictionary)

Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)


In [63]:
print(dictionary.token2id)

{'human': 0, 'interface': 1, 'computer': 2, 'survey': 3, 'user': 4, 'system': 5, 'response': 6, 'time': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [64]:
new_doc="human computer interaction"

In [71]:
print(dictionary.token2id)

{'human': 0, 'interface': 1, 'computer': 2, 'survey': 3, 'user': 4, 'system': 5, 'response': 6, 'time': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [68]:
new_vec=dictionary.doc2bow(new_doc.lower().split())
pprint(new_vec)

[(0, 1), (2, 1)]


In [70]:
corpus = [dictionary.doc2bow(text) for text in texts]
pp.pprint(corpus)

[ [(0, 1), (1, 1), (2, 1)],
  [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
  [(1, 1), (4, 1), (5, 1), (8, 1)],
  [(0, 1), (5, 2), (8, 1)],
  [(4, 1), (6, 1), (7, 1)],
  [(9, 1)],
  [(9, 1), (10, 1)],
  [(9, 1), (10, 1), (11, 1)],
  [(3, 1), (10, 1), (11, 1)]]


In [74]:
tfidf=models.TfidfModel(corpus)

In [77]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [85]:
corpus_tfidf=tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(2, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.3244870206138555), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.44424552527467476)]
[(1, 0.5710059809418182), (4, 0.4170757362022777), (5, 0.4170757362022777), (8, 0.5710059809418182)]
[(0, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(4, 0.45889394536615247), (6, 0.6282580468670046), (7, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(3, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [79]:
lsi = models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics=2)
corpus_lsi=lsi[corpus_tfidf]

In [83]:
lsi.print_topics(2)

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [84]:
for doc in corpus_lsi:
    print(doc)

[(0, 0.066007833960905371), (1, -0.52007033063618457)]
[(0, 0.19667592859142696), (1, -0.76095631677000375)]
[(0, 0.089926399724466588), (1, -0.72418606267525032)]
[(0, 0.075858476521784068), (1, -0.63205515860034256)]
[(0, 0.10150299184980247), (1, -0.57373084830029497)]
[(0, 0.70321089393783076), (1, 0.16115180214026015)]
[(0, 0.87747876731198282), (1, 0.16758906864659703)]
[(0, 0.90986246868185749), (1, 0.14086553628719331)]
[(0, 0.61658253505692828), (1, -0.053929075663891324)]


In [131]:
doc="human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi=lsi[vec_bow]
print(vec_lsi)

[(0, 0.079104751174450733), (1, -0.57328352430793994)]


In [132]:
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi]
print(list(enumerate(sims)))

[(0, 0.99994081), (1, 0.99467081), (2, 0.99994278), (3, 0.999879), (4, 0.99935204), (5, -0.08804217), (6, -0.0515742), (7, -0.023664713), (8, 0.1938726)]


In [137]:
sims = sorted(sims, key=lambda item: -item[1])
print(sims)

IndexError: invalid index to scalar variable.

In [116]:
sorted(sims, key=lambda item)

TypeError: must use keyword argument for key function

In [146]:

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))


Loading dataset...
done in 1.902s.


In [166]:
a=' '.join(data_samples)

In [None]:
#data_samples = a list of many entries

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)