## Upload datasets and join them

In [13]:
path_to_imdb_dataset = 'title.basics.tsv.gz'
path_to_plots_dataset = 'wiki_movie_plots_deduped.csv'
path_to_details_dataset = 'IMDB_movie_details.json'

In [14]:
#!pip install -U pyspark

In [15]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [16]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as F

In [17]:
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [18]:
# reading the IMDB dataset
imdb = spark.read.options(header = True, inferSchema = True, delimiter = "\t").csv(path_to_imdb_dataset)

# filter the imdb dataset so that only movies are included
imdb = imdb.filter("titleType = 'movie'")\
  .filter("primaryTitle != ''")\
    .select('tconst', 'primaryTitle', 'startYear')\
      .withColumnRenamed('startYear', 'Year')

print('there is a total of ', imdb.count(), ' movies in the imdb dataset')
imdb.show(3)

[Stage 19:>                                                         (0 + 1) / 1]

there is a total of  653179  movies in the imdb dataset
+---------+--------------------+----+
|   tconst|        primaryTitle|Year|
+---------+--------------------+----+
|tt0000009|          Miss Jerry|1894|
|tt0000147|The Corbett-Fitzs...|1897|
|tt0000502|            Bohemios|1905|
+---------+--------------------+----+
only showing top 3 rows



                                                                                

In [19]:
details = spark.read.json(path_to_details_dataset)
details = details.select('movie_id','plot_synopsis')\
  .filter("plot_synopsis != ''")
print('there is a total of ', details.count(), ' plot summaries left in the details dataset')
details.show(3)

there is a total of  1339  plot summaries left in the details dataset
+---------+--------------------+
| movie_id|       plot_synopsis|
+---------+--------------------+
|tt0105112|Jack Ryan (Ford) ...|
|tt1204975|Four boys around ...|
|tt0040897|Fred Dobbs (Humph...|
+---------+--------------------+
only showing top 3 rows



In [20]:
# join the imdb with details by matching the unique identifier(e.g. tt0000000)
imdb_join_details = imdb.join(details, imdb.tconst == details.movie_id, 'inner')\
  .withColumnRenamed('plot_synopsis', 'Plot')\
    .withColumnRenamed('primaryTitle', 'Title')\
      .withColumnRenamed('tconst', 'id')\
        .select('id', 'Title', 'Plot')

print("The joined dataset has ", imdb_join_details.count(), " entries")

# inspect one entry
imdb_join_details.filter("tconst == 'tt0472062'").show(truncate = False)

                                                                                

The joined dataset has  1324  entries


[Stage 33:>                                                         (0 + 1) / 1]

+---------+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id       |Title               |Plot                                                                                                            

                                                                                

In [21]:
plot = spark.read.options(header = True, inferSchema = True, quote = '"', escape = '"', multiLine = True).csv(path_to_plots_dataset)
plot = plot.select('Title', 'Release Year','Plot').withColumnRenamed('Release Year', 'Year')
print('there is a total of ', plot.count(), ' plot summaries in the plot dataset')
plot.show(3)

there is a total of  34886  plot summaries in the plot dataset
+--------------------+----+--------------------+
|               Title|Year|                Plot|
+--------------------+----+--------------------+
|Kansas Saloon Sma...|1901|A bartender is wo...|
|Love by the Light...|1901|The moon, painted...|
|The Martyred Pres...|1901|The film, just ov...|
+--------------------+----+--------------------+
only showing top 3 rows



In [22]:
# join the imdb with the plot dataset by matching movie titles and release year
imdb_join_plot = imdb.join(plot, [imdb.primaryTitle == plot.Title, imdb.Year == plot.Year], 'inner')\
  .withColumnRenamed('tconst', 'id')\
    .select('id', 'Title', 'Plot')

print("The joined dataset has ", imdb_join_plot.count(), " entries")

# inspect the joined dataset
imdb_join_plot.show(5, truncate=False)

                                                                                

The joined dataset has  26958  entries


[Stage 49:>                                                         (0 + 1) / 1]

+---------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [23]:
# drop primaryTitle and movie_id because they provide redundant information 
df = imdb_join_plot.union(imdb_join_details)

print('after merging & cleaning, there is a total of ', df.count(), ' movie plot entries left in the merged dataset')

# inspect the combined new dataset
df.show(3, truncate = False)

                                                                                

after merging & cleaning, there is a total of  28282  movie plot entries left in the merged dataset


[Stage 64:>                                                         (0 + 1) / 1]

+---------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

## LDA with Gensim

In [35]:
!pip install -U gensim


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [36]:
from nltk.corpus import stopwords 
stopwords = list(set(stopwords.words('english')))

print('The first 10 stopwords in the list:', stopwords[:10])

The first 10 stopwords in the list: ['aren', 'from', 'because', "shan't", 'your', 'where', 'yours', "haven't", 'into', 'what']


In [37]:
import string
punctuation = list(set(string.punctuation))

print('The punctuations are:', punctuation)

The punctuations are: ['>', '}', '\\', '&', '+', "'", ':', '<', '|', '`', '$', '#', '_', '/', '"', '[', ']', '(', '.', '-', ',', ';', '?', '!', '=', '%', '{', '*', '^', '~', ')', '@']


In [38]:
import re
from nltk.stem.wordnet import WordNetLemmatizer

def cleanup(text):
    '''
        input: raw text
        output: a list of words
    '''
    
    text = " ".join([i for i in text.lower().split() if i not in stopwords])          
    text = "".join(re.sub(r'[^a-zA-Z ]', '', i) for i in text if i not in punctuation) 
    text = [WordNetLemmatizer().lemmatize(i) for i in text.split()]                    
    return text

In [39]:
from gensim.corpora.dictionary import Dictionary

In [40]:
plot_text = pandasdf['Plot'].apply(cleanup)
print('List of tokens:')
plot_text[:10]

List of tokens:


0    [film, mainly, focus, yearold, dave, peck, une...
1    [gary, hook, new, recruit, british, army, take...
2    [total, stranger, dan, hardesty, george, brent...
3    [film, begin, john, christie, murdering, neigh...
4    [balan, k, nair, dulquer, salman, call, bkn, r...
5    [exactly, noon, dadar, railway, station, bomba...
6    [movie, revolves, around, dr, ajay, aj, kumar,...
7    [movie, revolves, around, dr, ajay, aj, kumar,...
8    [story, take, place, pakistan, six, year, bang...
9    [maru, marasigan, darwin, yu, sixteenyear, old...
Name: Plot, dtype: object

In [41]:
dictionary = Dictionary(plot_text)

In [42]:
print('Count of raw tokens: ' + str(len(dictionary.items())))

Count of raw tokens: 164943


In [43]:
## can change the filter extreme values

print('Filter out tokens that appear less than 80 times and more than 80% of the titles')
dictionary.filter_extremes(no_below=80, no_above=0.8)
print('Count of tokens: '+ str(len(dictionary.items())))

Filter out tokens that appear less than 80 times and more than 80% of the titles
Count of tokens: 7290


In [44]:
corpora = [dictionary.doc2bow(doc) for doc in plot_text]

In [45]:
# can change the number of topics  and passes
from gensim.models import ldamodel
lda_model = ldamodel.LdaModel(corpora, num_topics=5, id2word = dictionary, passes=50)

In [46]:

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpora, dictionary)
vis