In [19]:
import re
import pandas as pd
import numpy as np
import gensim
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [21]:
def preprocess_text(text, stop_words=None, stemmer=None, alpha_only=False):
    """Lower-case and tokenize a document, with optional token clean-up.

    Called with only ``text``, the result is exactly
    ``word_tokenize(text.lower())``, so existing callers are unaffected.
    The optional steps run in this order: alphabetic filter, stop-word
    removal, stemming.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Tokens to drop, e.g. ``set(stopwords.words('english'))``. Build the
        collection once and pass it in for every document; a ``set`` gives
        the fastest membership checks. ``None`` (default) keeps every token.
    stemmer : object with a ``stem(token)`` method, optional
        For example ``PorterStemmer()``. ``None`` (default) leaves tokens
        unstemmed.
    alpha_only : bool, default False
        When true, keep only tokens for which ``str.isalpha()`` is true.
        This removes punctuation tokens, numbers and mixed tokens such as
        ``"'s"``; hyphenated words are removed as well.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())
    if alpha_only:
        tokens = [token for token in tokens if token.isalpha()]
    if stop_words is not None:
        # Tokens are already lower-cased, so stop_words should be lower-case too.
        # Filter before stemming: a stemmed token may no longer match the list.
        tokens = [token for token in tokens if token not in stop_words]
    if stemmer is not None:
        tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [23]:
def run_lda_model(data, num_topics=10, random_state=42):
    """Train a gensim LDA topic model on raw documents and print its topics.

    Each document is tokenized with ``preprocess_text``, the tokens are
    mapped to integer ids by a ``corpora.Dictionary``, and every document is
    converted to a bag-of-words before the model is fitted.

    Parameters
    ----------
    data : iterable of str
        The raw documents (e.g. the ``Plot`` column).
    num_topics : int, default 10
        Number of topics to fit.
    random_state : int, default 42
        Seed passed to ``LdaModel`` so repeated runs give the same topics.

    Returns
    -------
    gensim.models.LdaModel
        The fitted model. Its topics are also printed, one per line.
    """
    tokenized_docs = [preprocess_text(plot) for plot in data]
    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
    lda_model = models.LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
    )
    # -1 requests every topic; the topic id in each pair is not needed here.
    for _, topic in lda_model.print_topics(-1):
        print(topic)
    return lda_model

Load the data from the Wikipedia movie plots dataset and have a look at what's in the dataset.

In [4]:
# Read the movie-plot CSV; the path is relative to the current working directory.
data = pd.read_csv('wiki_movie_plots.csv')

In [5]:
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [6]:
# Peek at the plot summaries, the text column the topic model is built from.
data.Plot.head()

0    A bartender is working at a saloon, serving dr...
1    The moon, painted with a smiling face hangs ov...
2    The film, just over a minute long, is composed...
3    Lasting just 61 seconds and consisting of two ...
4    The earliest known adaptation of the classic f...
Name: Plot, dtype: object

In [7]:
# Number of rows in the dataset.
len(data)

34886

We are only interested in the `Plot` column, so we'll extract it on its own. Selecting a single column gives a pandas Series containing just the plot text.

In [8]:
# Keep just the plot text; selecting a single column yields a pandas Series.
plot_data = data['Plot']

Let's now run the LDA model.

First preprocess the data so that it is tokenized and cleaned.

In [9]:
# Tokenize every plot summary; the result is one token list per document.
preprocessed_data = list(map(preprocess_text, plot_data))

We now build a dictionary. It gives a numerical id to every word in our dataset.

In [11]:
# Assign an integer id to every distinct token found in the tokenized documents.
dictionary = corpora.Dictionary(preprocessed_data)

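Finally, we build the corpus in bag-of-words form: each document becomes a list of (token id, count) pairs, so word order is discarded and only frequencies are kept.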

In [12]:
# Convert each token list to its bag-of-words form using the dictionary's ids.
corpus = list(map(dictionary.doc2bow, preprocessed_data))

Let's now run the LDA model and print out the topics.

In [13]:
# Fit a 10-topic LDA model; the fixed seed keeps the topics reproducible.
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionary,
    random_state=42,
)

In [15]:
# Show every topic as its weighted top-word string, one topic per line.
for _topic_id, topic_terms in lda_model.print_topics(-1):
    print(topic_terms)

0.075*"." + 0.068*"," + 0.018*"'s" + 0.014*"(" + 0.014*")" + 0.012*"raja" + 0.004*"get" + 0.004*"money" + 0.004*"back" + 0.003*"gets"
0.085*"," + 0.057*"." + 0.021*"'s" + 0.012*")" + 0.011*"(" + 0.008*"police" + 0.005*"team" + 0.004*"kim" + 0.004*"gang" + 0.003*"one"
0.042*")" + 0.042*"(" + 0.022*"," + 0.019*"." + 0.009*"raju" + 0.009*"'s" + 0.008*"radha" + 0.007*"shiva" + 0.006*"kumar" + 0.005*"rahul"
0.074*"." + 0.048*"," + 0.020*"(" + 0.019*")" + 0.018*"'s" + 0.005*"priya" + 0.005*"ravi" + 0.003*"kill" + 0.003*"babu" + 0.003*"lakshmi"
0.091*"." + 0.088*"," + 0.019*"'s" + 0.006*"police" + 0.005*"house" + 0.005*"tells" + 0.004*"one" + 0.004*"man" + 0.004*"car" + 0.003*"later"
0.096*"," + 0.067*"." + 0.014*"'s" + 0.004*")" + 0.004*"(" + 0.003*"one" + 0.003*"group" + 0.003*"war" + 0.002*"battle" + 0.002*"killed"
0.082*"," + 0.068*"." + 0.020*"'s" + 0.009*")" + 0.007*"(" + 0.007*"father" + 0.007*"family" + 0.007*"son" + 0.007*"king" + 0.005*"mother"
0.067*"," + 0.059*"." + 0.011*"'s" + 0

What do you notice? What do we need to do first?

Go back to the `preprocess_text` function and think about what you could change to get better topics.

In [20]:
# Re-run the whole pipeline so that changes to preprocess_text take effect.
preprocessed_data = list(map(preprocess_text, plot_data))
dictionary = corpora.Dictionary(preprocessed_data)
corpus = list(map(dictionary.doc2bow, preprocessed_data))
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionary,
    random_state=42,
)
for _topic_id, topic_terms in lda_model.print_topics(-1):
    print(topic_terms)

0.008*"family" + 0.008*"love" + 0.008*"father" + 0.007*"mother" + 0.007*"life" + 0.005*"one" + 0.005*"son" + 0.004*"also" + 0.004*"day" + 0.004*"wife"
0.009*"love" + 0.006*"father" + 0.005*"get" + 0.004*"vijay" + 0.004*"marriage" + 0.004*"also" + 0.004*"married" + 0.004*"friend" + 0.004*"tells" + 0.004*"family"
0.008*"house" + 0.006*"back" + 0.006*"mother" + 0.005*"one" + 0.005*"tells" + 0.005*"love" + 0.005*"day" + 0.005*"girl" + 0.004*"home" + 0.004*"goes"
0.010*"village" + 0.006*"son" + 0.006*"father" + 0.006*"king" + 0.006*"rao" + 0.004*"love" + 0.004*"one" + 0.004*"family" + 0.004*"arjun" + 0.004*"kill"
0.004*"one" + 0.004*"life" + 0.003*"body" + 0.003*"death" + 0.003*"time" + 0.003*"anna" + 0.003*"father" + 0.003*"woman" + 0.003*"later" + 0.002*"people"
0.018*"police" + 0.006*"kill" + 0.006*"gang" + 0.006*"killed" + 0.005*"one" + 0.004*"murder" + 0.004*"man" + 0.004*"officer" + 0.004*"kills" + 0.004*"case"
0.007*"film" + 0.005*"one" + 0.003*"people" + 0.003*"hari" + 0.003*"two" +

In [24]:
# Preprocess, build the dictionary and corpus, fit LDA and print the topics in one call.
lda_model = run_lda_model(plot_data)

0.004*"one" + 0.004*"alien" + 0.004*"godzilla" + 0.004*"kill" + 0.004*"use" + 0.003*"take" + 0.003*"back" + 0.003*"get" + 0.003*"fli" + 0.003*"space"
0.011*"raja" + 0.009*"get" + 0.007*"money" + 0.007*"polic" + 0.006*"raju" + 0.006*"famili" + 0.006*"find" + 0.005*"take" + 0.005*"father" + 0.005*"meet"
0.008*"find" + 0.005*"back" + 0.005*"kill" + 0.005*"one" + 0.004*"use" + 0.004*"bodi" + 0.004*"take" + 0.004*"attack" + 0.004*"tri" + 0.004*"see"
0.020*"kill" + 0.016*"polic" + 0.009*"rao" + 0.008*"gang" + 0.008*"get" + 0.006*"murder" + 0.006*"take" + 0.006*"find" + 0.006*"one" + 0.005*"escap"
0.007*"love" + 0.006*"get" + 0.006*"day" + 0.006*"friend" + 0.005*"tell" + 0.005*"one" + 0.005*"find" + 0.005*"famili" + 0.005*"home" + 0.005*"mother"
0.014*"love" + 0.010*"marri" + 0.009*"father" + 0.007*"famili" + 0.006*"son" + 0.006*"daughter" + 0.005*"villag" + 0.005*"come" + 0.005*"marriag" + 0.005*"get"
0.010*"get" + 0.009*"friend" + 0.008*"school" + 0.008*"father" + 0.007*"love" + 0.006*"girl