In [1]:
from sklearn.decomposition import LatentDirichletAllocation

# Need to convert the doc into numerical representation. Most common numerical representation are counts or TF-IDF's
# To compute the counts of each word present in the document, we can use count vectorizer

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

cvectorizer = CountVectorizer()

In [2]:
corpus = ["i love cooking", "I have prepared a cake today","he is going to new place", "he will learn cooking there"]

In [3]:
cvz = cvectorizer.fit_transform(corpus)

cvz

<4x15 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

4 Rows and 15 Columns 

In [4]:
vocab = cvectorizer.get_feature_names()
vocab

['cake',
 'cooking',
 'going',
 'have',
 'he',
 'is',
 'learn',
 'love',
 'new',
 'place',
 'prepared',
 'there',
 'to',
 'today',
 'will']

In [5]:
# Implementing LDA model, this cannot run on text form

# n_components topics to be extracted, max_iter = no of iterations to run before finding the optimal representation of topics and 
# document topics
# random_state - represents seed, useful for reproducibility

lda_model = LatentDirichletAllocation(n_components = 3, max_iter = 20,random_state=20)
X_topics = lda_model.fit_transform(cvz) 
topic_words = lda_model.components_ # gives topic distribution

topic_words

array([[0.33409872, 1.3520179 , 0.33426983, 0.33409872, 0.3344864 ,
        0.33426983, 0.33484162, 1.33184251, 0.33426983, 0.33426983,
        0.33409872, 0.33484162, 0.33426983, 0.33409872, 0.33484162],
       [1.33225166, 0.33510505, 0.33404224, 1.33225166, 0.33407486,
        0.33404224, 0.33419528, 0.33426477, 0.33404224, 0.33404224,
        1.33225166, 0.33419528, 0.33404224, 1.33225166, 0.33419528],
       [0.33364962, 1.31287705, 1.33168793, 0.33364962, 2.33143874,
        1.33168793, 1.33096309, 0.33389272, 1.33168793, 1.33168793,
        0.33364962, 1.33096309, 1.33168793, 0.33364962, 1.33096309]])

Now, LDA model is applied on Count Vectorizer

In [6]:
# Viewing the obtained topics

# Print the number of words present in every topic

n_top_words = 4

for i, topic_dist in enumerate(topic_words):
    sorted_topic_dist = np.argsort(topic_dist) # "argsort" is useful to sort an vector/array/list/metrics
    topic_words = np.array(vocab)[sorted_topic_dist]
    #To view the actual words present in the index. Array conversion helps to access the element by index
    topic_words = topic_words[:-n_top_words:-1] # -n_top_words = reversing the sign of n_top_words
    print("Topic",str(i+1),topic_words)

Topic 1 ['cooking' 'love' 'will']
Topic 2 ['today' 'prepared' 'have']
Topic 3 ['he' 'to' 'place']


Notice that these are not accurate, when we have more amount of data and run for large number of iterations then we often obtain very good representation of topics and words

In [7]:
# Viewing the topics assigned to each document

doc_topic = lda_model.transform(cvz)
for n in range (doc_topic.shape[0]):# Iterating in every possible value of this document till its end. shape[0] represents number of rows in the document
    topic_doc = doc_topic[n].argmax()
    print("Document",n+1,"Topic:---",topic_doc)
    


Document 1 Topic:--- 0
Document 2 Topic:--- 1
Document 3 Topic:--- 2
Document 4 Topic:--- 2


Other Libraries such as Gensem and SpaCy also can be used to implement Topic Modeling and LDA Algorithms

## Topic Modeling using NNMF

LSA is not available. Search in free time

In [8]:
# Importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_colwidth", 200)

In [9]:
# Using 20 Newsgroup dataset from sklearn

from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle = True, random_state =1, remove = ('headers','footers','quotes'))
documents = dataset.data
len(documents)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


11314

In [10]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

The dataset has 11314 text documents distributed across 20 different newsgroup

## Data Preprocessing

In [12]:
news_df = pd.DataFrame({'document': documents})

#removing everything except alphabets

news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z#]", " ")

#removing short words

news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

#make all text lowercase

news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x:x.lower())

In [14]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# tokenization
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) 

# remove stop-words
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
    
news_df['clean_doc'] = detokenized_doc

## Document-Term Matrix

This is the first step towards topic modeling

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features= 1000, # keep top 1000 terms 
                             max_df = 0.5, 
                             smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])
X_feature_names = vectorizer.get_feature_names()


X.shape

(11314, 1000)

In [17]:
from sklearn.decomposition import NMF

# number of topics
num_topics = 20

nmf = NMF(n_components=num_topics, random_state=1, 
          alpha=.1, l1_ratio=.5, init='nndsvd').fit(X)

## Display and Evaluate Topics

In [18]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic "+str(topic_idx)+": ")
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [19]:
top_n_words = 10
display_topics(nmf, X_feature_names, top_n_words)

Topic 0: 
right believe make really said point want jesus things question
Topic 1: 
thanks mail advance looking info help information address email appreciated
Topic 2: 
game team year games season players play hockey league teams
Topic 3: 
drive scsi hard drives disk floppy controller internal tape cable
Topic 4: 
windows file files program version directory using running software graphics
Topic 5: 
chip clipper encryption government keys phone data escrow algorithm chips
Topic 6: 
like sounds looks look sound things thing make sure really
Topic 7: 
card video monitor cards drivers driver color memory board mode
Topic 8: 
know anybody want need program appreciated sure maybe help getting
Topic 9: 
people government rights person world country force guns society life
Topic 10: 
think wrong really pretty steve remember makes wait agree original
Topic 11: 
problem problems using apple screen fine error work worked solution
Topic 12: 
good thing pretty better year looking quality world id