### Topic modelling with LDA and NMF via scikit-learn

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

import pandas as pd

In [2]:
# Fetch the 20 newsgroups corpus with headers, footers and quoted replies
# stripped, shuffled with a fixed seed so the document order is repeatable.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    random_state=1,
    shuffle=True,
)

# Raw post texts; these are what the vectorizers are fitted on.
documents = dataset.data

In [3]:
no_features = 1000


def _feature_names(vectorizer):
    """Return the fitted vocabulary of `vectorizer` as a list, in column order.

    Uses `get_feature_names_out()` when the vectorizer provides it and falls
    back to the older `get_feature_names()` otherwise, so the cell runs on
    scikit-learn releases from either side of the rename.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        # The newer accessor returns an ndarray; convert to a plain list so
        # downstream cells see the same type as with the old accessor.
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA is fitted on raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _feature_names(tf_vectorizer)

In [4]:
no_topics = 20

# Run NMF on the tf-idf matrix.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` (with different scaling) -- confirm against the
# installed version before upgrading.
nmf_params = dict(
    n_components=no_topics,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=1,
)
nmf = NMF(**nmf_params).fit(tfidf)

# Run LDA on the raw term-count matrix.
lda_params = dict(
    n_components=no_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_params).fit(tf)

In [5]:
no_top_words = 30

# LDA
# Populate terms and columns for doc2vec input
# Column labels term0 .. term{no_top_words - 1}, one per ranked term.
lda_columns = ['term{}'.format(rank) for rank in range(no_top_words)]

# For every topic row in lda.components_, argsort ascending and walk the last
# `no_top_words` indices backwards, i.e. highest-weighted terms first.
lda_terms = [
    [tf_feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    for topic in lda.components_
]

# The default 0..n-1 row index already is the topic id; naming it avoids the
# previously hard-coded range(0, 20), which broke whenever the number of
# topics was changed, and avoids mutating the frame in place.
lda_df = pd.DataFrame(data=lda_terms, columns=lda_columns).rename_axis('topic_id')
lda_df.head()

Unnamed: 0_level_0,term0,term1,term2,term3,term4,term5,term6,term7,term8,term9,...,term20,term21,term22,term23,term24,term25,term26,term27,term28,term29
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,people,gun,state,control,right,guns,crime,states,law,police,...,anti,make,government,rate,military,public,case,person,carry,power
1,time,question,book,years,did,like,don,space,answer,just,...,actually,earth,read,make,post,ask,theory,second,books,large
2,mr,line,rules,science,stephanopoulos,title,current,define,int,yes,...,know,original,question,term,right,ed,discussion,write,job,post
3,key,chip,keys,clipper,encryption,number,des,algorithm,use,bit,...,block,unit,using,security,80,secure,dc,product,technology,data
4,edu,com,cs,vs,w7,cx,mail,uk,17,send,...,ma,article,org,jim,picture,27,john,internet,apr,24


In [6]:
# NMF
# Column labels term0 .. term{no_top_words - 1}, one per ranked term.
nmf_columns = ['term{}'.format(rank) for rank in range(no_top_words)]

# For every topic row in nmf.components_, argsort ascending and walk the last
# `no_top_words` indices backwards, i.e. highest-weighted terms first.
nmf_terms = [
    [tfidf_feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    for topic in nmf.components_
]

# The default 0..n-1 row index already is the topic id; naming it avoids the
# previously hard-coded range(0, 20), which broke whenever the number of
# topics was changed, and avoids mutating the frame in place.
nmf_df = pd.DataFrame(data=nmf_terms, columns=nmf_columns).rename_axis('topic_id')
nmf_df.head()

Unnamed: 0_level_0,term0,term1,term2,term3,term4,term5,term6,term7,term8,term9,...,term20,term21,term22,term23,term24,term25,term26,term27,term28,term29
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,people,time,right,did,good,said,say,make,way,government,...,want,things,question,didn,true,case,law,thing,gun,better
1,window,problem,using,server,application,screen,display,motif,manager,running,...,sun,line,try,time,works,tried,memory,mode,having,mit
2,god,jesus,bible,christ,faith,believe,christian,christians,sin,church,...,son,existence,word,atheism,religion,john,true,paul,exist,says
3,game,team,year,games,season,players,play,hockey,win,league,...,time,night,goal,great,won,bad,series,did,years,chicago
4,new,00,sale,10,price,offer,shipping,condition,20,15,...,email,old,excellent,100,16,000,40,best,14,box


In [21]:
# Write each topic/term table to its own CSV file in the working directory.
for frame, csv_path in (
    (lda_df, 'lda-sklearn-output.csv'),
    (nmf_df, 'nmf-sklearn-output.csv'),
):
    frame.to_csv(csv_path)