## Word2vec Model Training
- Using the movie plots and summary dataset, train a word2vec model that will recommend movies with similar plot lines 

In [3]:
# Enter the location of the dataset directory here.
# All four dataset paths are derived from DATA_DIR, so switching machines only
# requires changing this one assignment.

# Mike's desktop:
# DATA_DIR = 'C:/Users/123/OneDrive/Academic/5430/data'

# Mike's laptop:
DATA_DIR = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data'

path_to_imdb_dataset = f'{DATA_DIR}/title.basics.tsv.gz'
path_to_reviews_dataset = f'{DATA_DIR}/IMDB_reviews.json'
path_to_plots_dataset = f'{DATA_DIR}/wiki_movie_plots_deduped.csv'
path_to_details_dataset = f'{DATA_DIR}/IMDB_movie_details.json'

In [2]:
""" # load imdb dataset
import gzip
with gzip.open(path_to_imdb_dataset, 'rt', encoding='utf-8') as f:
    df_imdb = pd.read_csv(f, delimiter='\t') """

" # load imdb dataset\nimport gzip\nwith gzip.open(path_to_imdb_dataset, 'rt', encoding='utf-8') as f:\n    df_imdb = pd.read_csv(f, delimiter='\t') "

Load & inspect the details dataset

In [16]:
import pandas as pd

# Read the movie-details file (one JSON record per line) into a pandas frame.
plot_details = pd.read_json(path_to_details_dataset, lines=True)

# Preview the first five records.
plot_details.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in..."
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...
2,tt0243655,"The setting is Camp Firewood, the year 1981. I...",1h 37min,"[Comedy, Romance]",6.7,2002-04-11,
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...


Load & inspect the plot dataset

In [15]:
# Read the Wikipedia movie-plots CSV into a pandas frame.
plot = pd.read_csv(path_to_plots_dataset)

# Preview the first five records.
plot.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [18]:
# Summary statistics for the pandas `plot` frame; the output below covers only
# the numeric `Release Year` column.
plot.describe()

Unnamed: 0,Release Year
count,34886.0
mean,1981.314252
std,27.815174
min,1901.0
25%,1957.0
50%,1988.0
75%,2007.0
max,2017.0


Three data sources:
1. IMDB: used for matching movie titles & IDs
2. Spoiler (movie details): contains plots & movie IDs, used for training
3. Plot: contains plots & movie names, used for training

Plot_synopsis is the movies' plot summaries with spoilers. Since we want to analyze the similarity of the plot line of the movies, we will use this variable to train our word2vec model. 

To train the model, let's first start a Spark session and load the datasets into Spark DataFrames.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if there is one, otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 12:20:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [23]:
# Read the tab-separated IMDB title file, keep only rows that are movies with a
# non-empty title, and retain just the id, title and year columns
# (`startYear` is exposed as `Year`).
imdb_reader = spark.read.options(header = True, inferSchema = True, delimiter = "\t")
imdb = (
    imdb_reader.csv(path_to_imdb_dataset)
    .filter("titleType = 'movie'")
    .filter("primaryTitle != ''")
    .select('tconst', 'primaryTitle', 'startYear')
    .withColumnRenamed('startYear', 'Year')
)

print('there is a total of ', imdb.count(), ' movies left in the imdb dataset')
imdb.show(3)

[Stage 166:>                                                        (0 + 1) / 1]

there is a total of  651281  movies left in the imdb dataset
+---------+--------------------+----+
|   tconst|        primaryTitle|Year|
+---------+--------------------+----+
|tt0000009|          Miss Jerry|1894|
|tt0000147|The Corbett-Fitzs...|1897|
|tt0000502|            Bohemios|1905|
+---------+--------------------+----+
only showing top 3 rows



                                                                                

In [21]:
# Read the movie-details JSON and keep only the id and the synopsis,
# discarding rows whose synopsis is an empty string.
details = (
    spark.read.json(path_to_details_dataset)
    .select('movie_id', 'plot_synopsis')
    .filter("plot_synopsis != ''")
)
print('there is a total of ', details.count(), ' plot summaries left in the details dataset')
details.show(3)

there is a total of  1339  plot summaries left in the details dataset
+---------+--------------------+
| movie_id|       plot_synopsis|
+---------+--------------------+
|tt0105112|Jack Ryan (Ford) ...|
|tt1204975|Four boys around ...|
|tt0040897|Fred Dobbs (Humph...|
+---------+--------------------+
only showing top 3 rows



In [22]:
# Join the IMDB titles with the details dataset on the shared IMDB identifier
# (e.g. tt0000000), then expose the result under the common column names
# id / Title / Plot.
imdb_join_details = (
    imdb.join(details, imdb.tconst == details.movie_id, 'inner')
    .withColumnRenamed('plot_synopsis', 'Plot')
    .withColumnRenamed('primaryTitle', 'Title')
    .withColumnRenamed('tconst', 'id')
    .select('id', 'Title', 'Plot')
)

print("The joined dataset has ", imdb_join_details.count(), " entries")

# Spot-check a single movie. The identifier column is named `id` at this point
# (`tconst` was renamed above and is no longer among the selected columns), so
# filter on the column that is actually part of the frame.
imdb_join_details.filter("id == 'tt0472062'").show(truncate = False)

                                                                                

The joined dataset has  1324  entries


[Stage 163:>                                                        (0 + 1) / 1]

+---------+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id       |Title               |Plot                                                                                                            

                                                                                

In [26]:
# Read the plots CSV with Spark. multiLine plus the quote/escape settings are
# needed because the Plot field is quoted text.
# NOTE: this rebinds `plot`, which an earlier cell used for the pandas frame.
plots_reader = spark.read.options(
    header = True,
    inferSchema = True,
    quote = '"',
    escape = '"',
    multiLine = True,
)
plot = (
    plots_reader.csv(path_to_plots_dataset)
    .select('Title', 'Release Year', 'Plot')
    .withColumnRenamed('Release Year', 'Year')
)
print('there is a total of ', plot.count(), ' plot summaries in the plot dataset')
plot.show(3)

there is a total of  34886  plot summaries in the plot dataset
+--------------------+----+--------------------+
|               Title|Year|                Plot|
+--------------------+----+--------------------+
|Kansas Saloon Sma...|1901|A bartender is wo...|
|Love by the Light...|1901|The moon, painted...|
|The Martyred Pres...|1901|The film, just ov...|
+--------------------+----+--------------------+
only showing top 3 rows



In [27]:
# Match IMDB titles to the plot dataset on (title, release year) and expose the
# result under the common column names id / Title / Plot.
same_title = imdb.primaryTitle == plot.Title
same_year = imdb.Year == plot.Year
imdb_join_plot = (
    imdb.join(plot, [same_title, same_year], 'inner')
    .withColumnRenamed('tconst', 'id')
    .select('id', 'Title', 'Plot')
)

print("The joined dataset has ", imdb_join_plot.count(), " entries")
imdb_join_plot.show(5, truncate=False)

                                                                                

The joined dataset has  26953  entries


[Stage 185:>                                                        (0 + 1) / 1]

+---------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [28]:
# Stack the two joined frames (both expose id / Title / Plot) into one dataset.
# union() does not de-duplicate: the count printed below is the sum of the two
# inputs, so a movie present in both sources appears more than once.
df = imdb_join_plot.union(imdb_join_details)

n_entries = df.count()
print('after merging & cleaning, there is a total of ', n_entries, ' movie plot entries left in the merged dataset')

df.show(3, truncate = False)

                                                                                

after merging & cleaning, there is a total of  28277  movie plot entries left in the merged dataset


[Stage 200:>                                                        (0 + 1) / 1]

+---------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [7]:
# As you can see, there might be multiple plot entries for the same movie.
# The merged frame exposes the identifier as `id` (both joined frames rename
# `tconst` before selecting id / Title / Plot), so query that column.
df.filter("id == 'tt0472062'").select('id', 'Title', 'Plot').show(truncate = False)

NameError: name 'df' is not defined

In [None]:

# Build the training frame from the details dataset only: copy the synopsis
# into `inputText`, the column the tokenizer reads.
# NOTE: this rebinds `df`; it no longer refers to the merged (union) frame.
synopsis_col = F.col('plot_synopsis')
df = details.withColumn('inputText', synopsis_col)
df.show(3)

+---------+--------------------+--------------------+
| movie_id|       plot_synopsis|           inputText|
+---------+--------------------+--------------------+
|tt0105112|Jack Ryan (Ford) ...|Jack Ryan (Ford) ...|
|tt1204975|Four boys around ...|Four boys around ...|
|tt0040897|Fred Dobbs (Humph...|Fred Dobbs (Humph...|
+---------+--------------------+--------------------+
only showing top 3 rows



## Training the Model

In [None]:
# Tokenize the input text and remove stop words in this cell.
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

# Regular-expression tokenizer: with gaps=False the pattern describes the tokens
# themselves (runs of word characters) rather than the separators.
# The pattern is a raw string so that `\w` reaches the regex engine verbatim and
# is not treated as an (invalid) Python string escape.
regextok = RegexTokenizer(gaps = False, pattern = r'\w+', inputCol = 'inputText', outputCol = 'tokens')

# StopWordsRemover drops stop words from the token lists.
stopwrmv = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')

# NOTE: `df` is rebound with the added columns, so this cell is meant to be run
# once per freshly built `df`.
df = regextok.transform(df)
df = stopwrmv.transform(df)
df.show(3)

+---------+--------------------+--------------------+--------------------+--------------------+
| movie_id|       plot_synopsis|           inputText|              tokens|   tokens_sw_removed|
+---------+--------------------+--------------------+--------------------+--------------------+
|tt0105112|Jack Ryan (Ford) ...|Jack Ryan (Ford) ...|[jack, ryan, ford...|[jack, ryan, ford...|
|tt1204975|Four boys around ...|Four boys around ...|[four, boys, arou...|[four, boys, arou...|
|tt0040897|Fred Dobbs (Humph...|Fred Dobbs (Humph...|[fred, dobbs, hum...|[fred, dobbs, hum...|
+---------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
# Train the word2vec model on the stop-word-filtered tokens.
# vectorSize: dimensionality of the learned vectors.
# minCount:   minimum occurrences for a token to be included in the vocabulary.
# seed:       pinned so that re-running the notebook is intended to reproduce
#             the same embeddings (and therefore the same recommendations).
word2vec = Word2Vec(
    vectorSize = 100,
    minCount = 5,
    seed = 42,
    inputCol = 'tokens_sw_removed',
    outputCol = 'wordvectors',
)
model = word2vec.fit(df)

23/07/31 20:43:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [None]:
# Add the `wordvectors` column produced by the trained model, then bring at
# most 30000 rows (id, synopsis, vector) back to the driver for the search.
df = model.transform(df)
vector_rows = df.select('movie_id', 'plot_synopsis', 'wordvectors').limit(30000)
chunks = vector_rows.collect()

                                                                                

In [None]:
# Push the search query through the same tokenizer -> stop-word remover -> model
# pipeline as the plots, and pull its vector out of the single resulting row.
SEARCH_QUERY = "Space"
query_df = spark.createDataFrame([(1, SEARCH_QUERY)]).toDF('index','inputText')
query_tok = regextok.transform(query_df)
query_swr = stopwrmv.transform(query_tok)
query_vec_df = model.transform(query_swr)
query_vec = query_vec_df.select('wordvectors').collect()[0]['wordvectors']
query_vec

                                                                                

DenseVector([-0.1093, 0.1738, 0.1203, -0.0619, 0.0404, -0.1352, 0.0359, -0.1146, 0.0247, -0.1153, 0.0172, 0.0099, -0.0225, 0.0746, 0.2018, -0.0797, -0.1509, 0.0516, 0.0658, 0.0586, -0.1578, -0.1075, -0.0821, -0.0891, -0.2407, 0.0961, -0.1569, 0.0241, -0.0993, -0.1362, -0.1827, 0.0139, -0.1461, 0.112, -0.0165, 0.0126, 0.2531, -0.1752, -0.1197, -0.1175, 0.1581, 0.0465, -0.2404, -0.0014, 0.0457, -0.0476, 0.0756, -0.0037, 0.0915, -0.0976, 0.1124, -0.1103, 0.1618, 0.1009, 0.0595, 0.0667, 0.1741, 0.0599, -0.0769, -0.021, 0.0503, 0.0713, -0.1032, -0.1199, 0.2389, -0.1366, 0.0665, -0.0203, -0.0498, -0.2332, 0.0816, 0.0641, 0.1549, 0.1165, -0.0377, -0.0062, -0.1135, 0.1762, -0.0706, -0.0273, 0.1018, 0.1247, 0.1136, -0.0268, 0.1612, 0.0541, -0.0347, 0.1305, 0.0515, 0.094, -0.0534, 0.076, 0.0699, 0.0649, -0.0717, -0.0674, 0.1135, 0.0902, 0.0147, 0.0287])

In [None]:
# Cosine-similarity helper used to rank plots against the query vector.
import numpy as np
def cossim(v1, v2):
    '''
        Return the cosine similarity between v1 and v2 as a Python float.

        If either vector has zero squared length (v . v == 0), the similarity
        is defined here as 0.0 instead of dividing by zero.
    '''
    sq_len_1 = np.dot(v1, v1)
    if sq_len_1 == 0:
        return 0.0
    sq_len_2 = np.dot(v2, v2)
    if sq_len_2 == 0:
        return 0.0
    # Divide by each norm in turn, in the same order as the original expression.
    return float(np.dot(v1, v2) / np.sqrt(sq_len_1) / np.sqrt(sq_len_2))

In [None]:
# Score every collected row against the query: (movie id, similarity, synopsis).
data = [
    (movie_id, float(cossim(query_vec, vector)), synopsis)
    for movie_id, synopsis, vector in chunks
]


In [None]:
# Turn the scored tuples back into a Spark frame, most similar plots first.
sim_df = (
    spark.createDataFrame(data)
    .toDF('movie_id', 'similarity', 'text')
    .orderBy('similarity', ascending=False)
)
sim_df.show(5, truncate = False)

23/07/31 20:44:00 WARN TaskSetManager: Stage 78 contains a task of very large size (1589 KiB). The maximum recommended task size is 1000 KiB.


+---------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------