In this notebook, we'll explore topic modeling for discovering broad themes in a collection of **movie summaries**.

Source code from:
https://github.com/dbamman/anlp21/blob/main/5.eda/TopicModel.ipynb

# Topic Modeling

In [1]:
pip install gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/37/16/9266c7e205d344cd6bea5074ed769e878c9b3919ab4e1e6adf0ad6370eb8/gensim-4.3.2-cp38-cp38-macosx_11_0_arm64.whl.metadata
  Downloading gensim-4.3.2-cp38-cp38-macosx_11_0_arm64.whl.metadata (8.5 kB)
Downloading gensim-4.3.2-cp38-cp38-macosx_11_0_arm64.whl (24.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.0/24.0 MB 28.6 MB/s eta 0:00:00
Installing collected packages: gensim
Successfully installed gensim-4.3.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
import re
import gensim
from gensim import corpora
import operator

nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import random

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/owenmonroe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def read_stopwords(filename):
    """Load a stopword list from a text file with one entry per line.

    Args:
        filename: Path to the stopword file; it is decoded as UTF-8.

    Returns:
        A dict mapping each line (with trailing whitespace removed) to 1,
        in file order. A blank line produces the empty-string key.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding; read_docs opens its input files the same way.
    with open(filename, encoding="utf-8") as file:
        return {line.rstrip(): 1 for line in file}

Since we're running topic modeling on text with lots of names, we'll add the Jockers list of stopwords, which includes character names, to our stopword list.

In [4]:
# Merge three sources into one de-duplicated list, preserving first-seen order:
# NLTK's English stopwords, the Jockers list (read from disk), and the "'s" token.
stop_words = list({
    **dict.fromkeys(stopwords.words('english'), 1),
    **read_stopwords("Datasets/movies_data/jockers.stopwords"),
    "'s": 1,
})

In [5]:
# function to exclude words from a text
def filter(word, stopwords):
    """Decide whether a token should be kept.

    Args:
        word: The token to check.
        stopwords: Container of tokens to reject (anything supporting `in`).

    Returns:
        True when `word` is not in `stopwords` and contains at least one
        ASCII letter; False otherwise.
    """
    # Short-circuit: the regex is only evaluated for non-stopwords.
    return word not in stopwords and re.search("[A-Za-z]", word) is not None

In [6]:
def read_docs(plotFile, metadataFile, stopwords, top_n=5000):
    """Tokenize the plot summaries of the movies with the highest box office.

    Args:
        plotFile: UTF-8, tab-separated file; column 0 is the movie id and
            column 1 is the summary text.
        metadataFile: UTF-8, tab-separated file; column 0 is the movie id,
            column 2 the movie name and column 4 the box office figure
            (rows where that column is empty are ignored).
        stopwords: Iterable of tokens to drop from every summary.
        top_n: How many movies to keep, ranked by box office, highest first.
            Defaults to 5000.

    Returns:
        (docs, names): two parallel lists in plot-file order. docs[i] is the
        list of lower-cased, filtered tokens for one movie and names[i] is
        that movie's name.
    """
    box = {}
    titles = {}

    with open(metadataFile, encoding="utf-8") as file:
        for line in file:
            cols = line.rstrip().split("\t")
            if len(cols) < 5:
                # Blank or truncated row: there is no box office column to read.
                continue
            idd, name, boxoffice = cols[0], cols[2], cols[4]
            if len(boxoffice) != 0:
                box[idd] = int(boxoffice)
                titles[idd] = name

    # Stable sort, so movies with equal box office keep their file order.
    top_ids = sorted(box, key=box.get, reverse=True)[:top_n]
    target_movies = {idd: titles[idd] for idd in top_ids}

    # Membership is tested once per token; a set makes each test O(1) instead
    # of a linear scan when the caller passes a list.
    stopword_set = set(stopwords)

    docs = []
    names = []

    with open(plotFile, encoding="utf-8") as file:
        for line in file:
            cols = line.rstrip().split("\t")
            if len(cols) < 2:
                # Blank row, or an id with no summary text after it.
                continue
            idd, text = cols[0], cols[1]

            if idd in target_movies:
                tokens = nltk.word_tokenize(text.lower())
                docs.append([tok for tok in tokens if filter(tok, stopword_set)])
                names.append(target_movies[idd])
    return docs, names

We'll read in summaries of the 5,000 movies with the highest box office revenues and convert them into a bag-of-words representation using gensim's `corpora.Dictionary` class.

In [7]:
metadataFile = "Datasets/movies_data/movie.metadata.tsv"
plotFile = "Datasets/movies_data/plot_summaries.txt"
data, doc_names = read_docs(plotFile, metadataFile, stop_words)

# Build the vocabulary from the tokenized summaries, then prune it: keep only
# terms that appear in at least 5 documents and in no more than 50% of all
# documents, capped at the top 10K terms.
dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)

# Re-express every summary as a bag of words over the vocabulary's numeric ids;
# words that are not in the pruned vocabulary are left out.
corpus = list(map(dictionary.doc2bow, data))
num_topics = 20

Now let's run topic modeling on this data using gensim's built-in LDA.

In [8]:
# LDA training starts from a random initialization, so without a fixed seed
# the topics (and everything printed from them below) change on every run.
# random_state pins the seed to make the notebook's results repeatable.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           passes=10,
                                           alpha='auto',
                                           random_state=42)

Let's get a sense of the topics by printing the top 10 words with highest P(word|topic) for each topic.

In [None]:
# One line per topic: its index followed by its 10 highest-weighted terms.
for i in range(num_topics):
    top_terms = [term for term, _ in lda_model.show_topic(i, topn=10)]
    print("topic %s:\t%s" % (i, ' '.join(top_terms)))

What do you observe from the topics above?

Another way to understand topics is to print out the documents that have the highest topic representation -- i.e., for a given topic, the documents with highest P(topic=k|document). How much do the documents listed below align with your understanding of the topics?

In [None]:
topic_model = lda_model

# topic_docs[t] maps a document index to the probability of topic t in that
# document, for every (topic, probability) pair get_document_topics returns.
topic_docs = [{} for _ in range(num_topics)]
for doc_id, bow in enumerate(corpus):
    for topic_num, topic_prob in topic_model.get_document_topics(bow):
        topic_docs[topic_num][doc_id] = topic_prob

# For each topic, print its top 10 terms and then the 5 documents in which the
# topic has the highest probability.
for i, doc_probs in enumerate(topic_docs):
    top_terms = [term for term, _ in topic_model.show_topic(i, topn=10)]
    print("%s\n" % ' '.join(top_terms))
    ranked = sorted(doc_probs.items(), key=lambda pair: pair[1], reverse=True)
    for k, v in ranked[:5]:
        print("%s\t%.3f\t%s" % (i, v, doc_names[k]))
    print()