In [1]:
#imports
import string
import numpy as np
import pandas as pd
from glob import glob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

nltk.download()
ps = PorterStemmer()
dictionary = []

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [2]:
#pre-process files
def extract(folderpath):
    docs = glob(folderpath)    
    for d in docs:
        file = open(d,'r', errors='ignore')
        readfile = file.readlines()
        readfile = ' '.join(readfile)
        #filter text to remove stopwords, punctuation, numbers
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(readfile)
        punctuation = set(string.punctuation)
        filtered_sentence = ' '.join(w for w in word_tokens if not w in stop_words)
        filtered_sentence = ''.join(ch for ch in filtered_sentence if ch not in punctuation)
        filtered_sentence = ''.join([i for i in filtered_sentence if not i.isdigit()])
        filtered_sentence = filtered_sentence.lower()
        #unigram -- tokenize
        unigram = word_tokenize(filtered_sentence)
        #stemming
        stemming = []
        for u in unigram:
            stemming = stemming+[(ps.stem(u))]
            stemmed = ' '.join(stemming)
        #append into dictionary
        dictionary.append(stemmed)
    return(dictionary)

In [3]:
#display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [4]:
#read files
dictionary = extract('C:/Users/aksha/Desktop/Text Analytics/Homework3/Split train/*.txt')

In [5]:
#tf matrix for LDA, NMF
tfidf = TfidfVectorizer(min_df = 0.01, max_df = 0.99, max_features = 1000)
matrix = np.asarray(tfidf.fit_transform(dictionary).todense())
feature_names = tfidf.get_feature_names()

In [6]:
#Latent Dirichlet Allocation
lda = LatentDirichletAllocation(n_topics = 20, max_iter = 5, learning_method = 'online', learning_offset = 50.,random_state = 1)
lda.fit(matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=20, perp_tol=0.1, random_state=1,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [7]:
#Non-Negative Matrix Factorization
nmf = NMF(n_components = 20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf.fit(matrix)

NMF(alpha=0.1, beta=1, eta=0.1, init='nndsvd', l1_ratio=0.5, max_iter=200,
  n_components=20, nls_max_iter=2000, random_state=1, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [8]:
#lda topics
display_topics(lda, feature_names, 10)

Topic 0:
work one year power famili make anoth noth show rather
Topic 1:
perform movi much entertain filmmak the actor scene nt make
Topic 2:
documentari still think you nt may it would make even
Topic 3:
charact like best play minut need want hard nt film
Topic 4:
enough well even though make film it movi might screen
Topic 5:
film would beauti mani ultim real the stori like it
Topic 6:
bad what without humor know movi film like the it
Topic 7:
director take realli made big long film it time screen
Topic 8:
it rrb lrb funni end hollywood far becom heart movi
Topic 9:
an someth action as lot effect film movi the there
Topic 10:
watch pictur audienc but often find everi screen re film
Topic 11:
thi time plot yet enjoy gener movi film one it
Topic 12:
good could also first might live movi film it one
Topic 13:
come never thriller charm world human keep less tri laugh
Topic 14:
seem cast tale origin ever the make movi film made
Topic 15:
the feel look nt get thing film quit ca like
Topic 

In [9]:
#nmf topics
display_topics(nmf, feature_names, 10)

Topic 0:
the script thing filmmak cast action emot direct seem never
Topic 1:
film made never best come love director there year way
Topic 2:
movi bad made best year kind mani go everi if
Topic 3:
it hard great direct may see also yet look someth
Topic 4:
rrb lrb rather audienc human mani live well love us
Topic 5:
nt ca enough get go if you would want quit
Topic 6:
like feel play look watch seem life minut if thing
Topic 7:
one year best ever anoth thing great mani big world
Topic 8:
stori love littl take also us filmmak give if director
Topic 9:
comedi charm come gener would kind laugh humor enjoy hollywood
Topic 10:
make seem what lack enough sens entertain look director us
Topic 11:
charact plot never audienc interest life us script two cast
Topic 12:
an drama entertain thriller tale ultim documentari point someth look
Topic 13:
thi thing drama pictur long life thriller bad minut kind
Topic 14:
work well may actor interest yet script end often love
Topic 15:
time minut long watch m