In [35]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups 
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
import pandas as pd
import numpy as np
import nltk
import re
#!pip install nltk

In [46]:
# Plot summaries: one list entry per line of the file (line endings are kept)
with open('movie_summaries_bottom_250.txt', 'r') as summaries_file:
    summary = list(summaries_file)
# Movie titles: one per line, with surrounding whitespace removed
with open('movie_titles_bottom_250.txt', 'r') as titles_file:
    title = [line.strip() for line in titles_file]
from nltk.stem.snowball import SnowballStemmer
# English stop-word list and stemmer shared by the preprocessing cells below
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

### Data Preprocessing

In [47]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of the tokens that contain a letter.

    The text is split into sentences first and each sentence into words, so
    that punctuation ends up as its own token. Tokens without any ASCII
    letter (e.g. numbers, bare punctuation) are skipped; every remaining
    token is passed through the module-level ``stemmer``.

    Args:
        text: The raw string to process.

    Returns:
        A list of stemmed tokens, in the order they occur in ``text``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # guard: nothing to stem when the token holds no letter at all
            if re.search('[a-zA-Z]', token) is None:
                continue
            stems.append(stemmer.stem(token))
    return stems


def tokenize_only(text):
    """Tokenize ``text`` into lowercased word tokens, without stemming.

    Sentences are split first and then words, so punctuation becomes a
    separate token. Tokens that contain no ASCII letter (e.g. numbers, bare
    punctuation) are left out of the result.

    Args:
        text: The raw string to process.

    Returns:
        A list of lowercased tokens, in the order they occur in ``text``.
    """
    lowered_tokens = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [token for token in lowered_tokens if re.search('[a-zA-Z]', token)]

In [48]:
# Strip proper names from a text by keeping only all-lowercase tokens.
# Known limitation: this also removes the capitalized first word of every sentence.
import string
def strip_proppers(text):
    """Return ``text`` rebuilt from only its all-lowercase tokens.

    The text is tokenized by sentence and then by word. A token is kept only
    when ``str.islower()`` is true for it, which drops any token containing
    an uppercase letter (proper names, but also the capitalized first word
    of a sentence) as well as tokens with no cased character at all.

    Kept tokens are joined with a single space, except that a token starting
    with an apostrophe (such as ``'s``) or found in ``string.punctuation``
    is attached directly to the preceding token.

    Args:
        text: The raw string to filter.

    Returns:
        The filtered text as one string, without leading or trailing whitespace.
    """
    pieces = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            if not token.islower():
                continue
            attach_to_previous = token.startswith("'") or token in string.punctuation
            pieces.append(token if attach_to_previous else " " + token)
    return "".join(pieces).strip()

In [49]:
#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    """Drop the words that NLTK's part-of-speech tagger marks as proper nouns.

    ``text`` is split on whitespace and the resulting words are tagged with
    ``pos_tag``. Words tagged ``NNP`` (proper noun) or ``NNPS`` (plural
    proper noun) are removed.

    Args:
        text: The raw string to filter.

    Returns:
        A list of the remaining words in their original order. Note that this
        is a list of words, not a re-joined string.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    kept_words = []
    for word, tag in pos_tag(text.split()):
        if tag not in proper_noun_tags:
            kept_words.append(word)
    return kept_words

In [50]:
from gensim import corpora, models, similarities 

#remove proper names
%time preprocess = [strip_proppers(doc) for doc in summary]
#tokenize
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]
#remove stop words
%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

Wall time: 422 ms
Wall time: 552 ms
Wall time: 45.6 ms


In [51]:
# Build the gensim Dictionary (token <-> integer id mapping) from the filtered documents
dictionary = corpora.Dictionary(texts)

# Prune the vocabulary in place, comparable to the min_df/max_df step of a tf-idf matrix:
# no_below is the minimum number of documents a token must occur in,
# no_above is the maximum fraction of documents it may occur in.
dictionary.filter_extremes(no_above=0.8, no_below=1)

# Encode each document as a bag of words using the pruned dictionary
corpus = list(map(dictionary.doc2bow, texts))

In [53]:
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

Wall time: 21.3 s


In [54]:
# Display the fitted topics as formatted 'weight*"term"' strings, using the default arguments
lda.show_topics()

[(0,
  '0.009*"\'s" + 0.005*"stori" + 0.005*"father" + 0.004*"power" + 0.004*"young" + 0.004*"becom" + 0.004*"want" + 0.004*"time" + 0.004*"love" + 0.004*"transform"'),
 (1,
  '0.011*"\'s" + 0.005*"tri" + 0.005*"ship" + 0.005*"find" + 0.004*"famili" + 0.004*"mask" + 0.004*"girlfriend" + 0.004*"town" + 0.004*"world" + 0.003*"make"'),
 (2,
  '0.009*"\'s" + 0.006*"evil" + 0.006*"get" + 0.006*"find" + 0.005*"treasur" + 0.005*"back" + 0.005*"year" + 0.004*"one" + 0.004*"make" + 0.004*"world"'),
 (3,
  '0.018*"\'s" + 0.009*"one" + 0.007*"world" + 0.007*"evil" + 0.006*"take" + 0.005*"life" + 0.005*"kill" + 0.005*"must" + 0.005*"famili" + 0.004*"friend"'),
 (4,
  '0.019*"\'s" + 0.010*"friend" + 0.010*"girl" + 0.007*"find" + 0.007*"get" + 0.006*"shark" + 0.006*"tri" + 0.006*"live" + 0.005*"decid" + 0.005*"love"')]

In [63]:
# Request the topics unformatted, with 20 terms per topic
topics_matrix = lda.show_topics(formatted=False, num_words=20)
# Wrap the result in an object array so the per-topic term lists can be selected as a column
topics_matrix = np.asarray(topics_matrix,  dtype=object)


In [65]:
# Sanity check of the array layout: one row per topic, two columns
topics_matrix.shape

(5, 2)

In [66]:
# Second column of the topic array: the term entries of each topic
topic_words = topics_matrix[:, 1]
# Print one list per topic, each entry rendered with str(), followed by a blank line
for topic_terms in topic_words:
    print(list(map(str, topic_terms)), end="\n\n")

['("\'s", 0.0091705536)', "('stori', 0.0053786486)", "('father', 0.0045391265)", "('power', 0.0042730854)", "('young', 0.0042711836)", "('becom', 0.0041157184)", "('want', 0.0040469915)", "('time', 0.0039891871)", "('love', 0.0038913589)", "('transform', 0.0038911509)", "('find', 0.0036602519)", "('evil', 0.0036095756)", "('fall', 0.0035109497)", "('friend', 0.0034600263)", "('turn', 0.0032869924)", "('group', 0.0031278566)", "('vampir', 0.0031276133)", "('kill', 0.0031270958)", "('world', 0.0031262017)", "('one', 0.0031257286)"]

['("\'s", 0.010719056)', "('tri', 0.0051570642)", "('ship', 0.0051210546)", "('find', 0.004917311)", "('famili', 0.0041483366)", "('mask', 0.003956818)", "('girlfriend', 0.0038997678)", "('town', 0.0038307717)", "('world', 0.0036804907)", "('make', 0.0034367095)", "('get', 0.0034306513)", "('want', 0.003411059)", "('plan', 0.0034040792)", "('one', 0.0033717749)", "('fight', 0.0031821656)", "('name', 0.0031819777)", "('save', 0.0031777148)", "('game', 0.003149