In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups
twenty_newsgroups = fetch_20newsgroups()

In [3]:
len(twenty_newsgroups)

6

In [4]:
# View 20 newsgroups 
twenty_newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Print first entry in dataset
print("\n".join(twenty_newsgroups.data[0].split("\n")))

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating tf-idf matrix
vectorizer = TfidfVectorizer(stop_words='english')
twenty_newsgroups_tfidf = vectorizer.fit_transform(twenty_newsgroups.data)

# Getting the word list.
terms = vectorizer.get_feature_names()

# Number of topics.
ntopics=20

# Linking words to topics
def word_topic(tfidf,solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic=tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components=pd.DataFrame(words_by_topic,index=wordlist)
    
    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    index= np.repeat(n_topics, n_top_words, axis=0)
    topwords=pd.Series(index=index)
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords=components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen=sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist=chosen.index +" "+round(chosen,2).map(str) 
        topwords.loc[column]=chosenlist
    return(topwords)

# Number of words to look at for each topic.
n_top_words = 10

In [7]:
# LSA

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd= TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
twenty_newsgroups_lsa = lsa.fit_transform(twenty_newsgroups_tfidf)

components_lsa = word_topic(twenty_newsgroups_tfidf, twenty_newsgroups_lsa, terms)

topwords=pd.DataFrame()
topwords['LSA']=top_words(components_lsa, n_top_words)                

In [8]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_topics=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )

twenty_newsgroups_lda = lda.fit_transform(twenty_newsgroups_tfidf) 

components_lda = word_topic(twenty_newsgroups_tfidf, twenty_newsgroups_lda, terms)

topwords['LDA']=top_words(components_lda, n_top_words)




In [9]:
# NNMF

from sklearn.decomposition import NMF

nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
twenty_newsgroups_nmf = nmf.fit_transform(twenty_newsgroups_tfidf) 

components_nmf = word_topic(twenty_newsgroups_tfidf, twenty_newsgroups_nmf, terms)

topwords['NNMF']=top_words(components_nmf, n_top_words)

In [10]:
for topic in range(ntopics):
    print('Topic {}:'.format(topic))
    print(topwords.loc[topic])

Topic 0:
                  LSA                LDA          NNMF
0          edu 222.97           edu 2.34      car 2.45
0          com 154.56           com 1.58      edu 1.74
0        subject 95.9       subject 1.09      com 1.72
0         lines 95.12         lines 1.08     bike 1.24
0  organization 94.05  organization 1.07     like 1.08
0        writes 91.32    university 0.96     just 1.06
0       article 89.22        writes 0.86       don 1.0
0    university 84.47       article 0.86     good 0.99
0           don 79.59            ca 0.84  article 0.97
0          like 78.45       posting 0.83   writes 0.96
Topic 1:
                LSA                LDA             NNMF
1         god 47.18            edu 2.3         god 8.44
1      people 24.58           com 1.62       jesus 3.67
1       jesus 22.46       subject 1.08       bible 2.37
1       bible 14.77         lines 1.07         edu 2.32
1   christian 14.15  organization 1.06      people 2.31
1  christians 14.12    university 0.94   

In [11]:
# The words to look at.
targetwords=['marriage','love','emma','oh']

# Storing the loadings.
wordloadings=pd.DataFrame(columns=targetwords)

# For each word, extracting and string the loadings for each method.
for word in targetwords:
    loadings=components_lsa.loc[word].append(
        components_lda.loc[word]).append(
            components_nmf.loc[word])
    wordloadings[word]=loadings

# Labeling the data by method and providing an ordering variable for graphing purposes. 
wordloadings['method']=np.repeat(['LSA','LDA','NNMF'], 5, axis=0)
wordloadings['loading']=[0,1,2,3,4]*3

sns.set(style="darkgrid")

for word in targetwords:
    sns.barplot(x="method", y=word, hue="loading", data=wordloadings)
    plt.title(word)
    plt.ylabel("")
    plt.show()

ValueError: Length of values does not match length of index