In [1]:
#!pip install -U pyspark
from pyspark.sql import SparkSession
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

## Section 1: Data Cleaning

There are three data sources used in this section: 
1. IMDB: used for matching movie title & ID
2. Details: contains plots&movie ID, used for trainning
3. Wiki_Plot: contains plots& movie name, used for trainning

In [2]:
path_to_imdb_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/title.basics.tsv.gz'
path_to_plots_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/wiki_movie_plots_deduped.csv'
path_to_details_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/IMDB_movie_details.json'
path_to_reviews_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/IMDB_reviews.json'

In [3]:
""" path_to_imdb_dataset = 'title.basics.tsv.gz'
path_to_reviews_dataset = 'IMDB_reviews.json'
path_to_plots_dataset = 'wiki_movie_plots_deduped.csv'
path_to_details_dataset = 'IMDB_movie_details.json' """

" path_to_imdb_dataset = 'title.basics.tsv.gz'\npath_to_reviews_dataset = 'IMDB_reviews.json'\npath_to_plots_dataset = 'wiki_movie_plots_deduped.csv'\npath_to_details_dataset = 'IMDB_movie_details.json' "

In [4]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
print("Using Apache Spark Version", spark.version)

23/08/10 12:58:18 WARN Utils: Your hostname, Yus-MacBook-Air-2.local resolves to a loopback address: 127.0.0.1; using 192.168.181.65 instead (on interface en0)
23/08/10 12:58:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/10 12:58:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Using Apache Spark Version 3.4.1


In [5]:
# clean & combine the IMDB dataset with details dataset 
# reading the IMDB dataset
imdb = spark.read.options(header = True, inferSchema = True, delimiter = "\t")\
  .csv(path_to_imdb_dataset)
# filter the imdb dataset so that only movies are included
imdb = imdb.filter("titleType = 'movie'")\
  .select('tconst', 'primaryTitle', 'startYear')\
    .withColumnRenamed('startYear', 'Year')\
      .withColumnRenamed('primaryTitle', 'Title')\
        .dropDuplicates(['Title', 'Year'])


# reading the details dataset, preserving only three important variables
details_summary = spark.read.json(path_to_details_dataset)
details_summary = details_summary\
  .select('movie_id','plot_summary')\
    .withColumnRenamed('plot_summary','Plot')

# reading the details dataset, preserving only three important variables
details_synopsis = spark.read.json(path_to_details_dataset)
details_synopsis = details_synopsis.select('movie_id','plot_synopsis')\
  .filter("plot_synopsis != ''")\
    .withColumnRenamed('plot_synopsis', 'Plot')

details = details_summary.union(details_synopsis)


from pyspark.sql.functions import lit
# join the imdb with details by matching the unique identifier(e.g. tt0000000)
imdb_join_details = imdb.join(details, imdb.tconst == details.movie_id, 'inner')\
  .withColumnRenamed('tconst', 'id')\
    .select('id', 'Title', 'Plot')\
      .withColumn("Source", lit("imdb_details"))

print("The joined dataset has ", imdb_join_details.count(), " entries")

[Stage 7:>                                                          (0 + 8) / 8]

The joined dataset has  2857  entries


                                                                                

In [6]:
# clean and combine wiki plot dataset with imdb dataset
from pyspark.sql.functions import length
# reading the plot dataset, preserving only three important variables
wiki_plot = spark.read.options(header = True, inferSchema = True, quote = '"', escape = '"', multiLine = True).csv(path_to_plots_dataset)
wiki_plot = wiki_plot.select('Title', 'Release Year','Plot')\
  .withColumnRenamed('Release Year', 'Year')\
    .filter(length(wiki_plot['Plot']) >= 200) # filter out the very short plot descriptions


# join the imdb with the plot dataset by matching movie titles and release year
imdb_join_plot = imdb.join(wiki_plot, ["Title", "Year"], 'inner')\
  .withColumnRenamed('tconst', 'id')\
    .select('id', 'Title', 'Plot')\
      .withColumn("Source", lit("wiki_plot"))

print("The joined dataset has ", imdb_join_plot.count(), " entries")

[Stage 18:>                                                         (0 + 8) / 8]

The joined dataset has  25361  entries


                                                                                

In [7]:
# combine the above two dataset to get the dataset that we will train the model on
df = imdb_join_plot.union(imdb_join_details)

print('after merging & cleaning, there is a total of ', df.count(), ' movie plot entries left in the merged dataset')
# inspect the combined new dataset
df.show(1)

                                                                                

after merging & cleaning, there is a total of  28218  movie plot entries left in the merged dataset


[Stage 40:>                                                         (0 + 8) / 8]

+---------+-----+--------------------+---------+
|       id|Title|                Plot|   Source|
+---------+-----+--------------------+---------+
|tt0790799|$9.99|The film mainly f...|wiki_plot|
+---------+-----+--------------------+---------+
only showing top 1 row



                                                                                

## Section 2: NER

In [8]:
import json


In [9]:
with open("nyt_articles2.json", 'r') as f:
    newsfeeds = json.load(f)

len(newsfeeds)

1158

In [10]:
import pandas as pd
df_news = pd.DataFrame(newsfeeds)


In [11]:
import requests

In [12]:
df_news.keys()

Index(['Headline', 'Abstract', 'Lead_Paragraph', 'Snippet', 'Published_date',
       'Author', 'News_desk', 'URL', 'Source'],
      dtype='object')

In [13]:
all_URL = []
for index, row in df_news.iterrows():
    all_URL.append(row["URL"])

In [14]:
all_headlines = []
for index, row in df_news.iterrows():
    all_headlines.append(row["Headline"])

In [15]:
all_snippets = []
for index, row in df_news.iterrows():
    all_snippets.append(row["Snippet"])

In [16]:
with open("NER_Tagging.json", "r") as json_file:
    all_JsonResponses = json.load(json_file)

In [17]:
#Get Entities
entity_dict = {}

print('====Entities====')
print('Article_No, Type, Name, Article Title')
art_no = 0
dict_key = 0
for TRITJsonResponse in all_JsonResponses:
    art_no += 1
    for key in TRITJsonResponse:
        dict_key += 1
        if ('_typeGroup' in TRITJsonResponse[key]):
            if TRITJsonResponse[key]['_typeGroup'] == 'entities':
                entities = str(art_no) + ", " + TRITJsonResponse[key]['_type'] + ", " + TRITJsonResponse[key]['name'] + ", " + all_headlines[art_no]
                entity_dict[dict_key] = entities

====Entities====
Article_No, Type, Name, Article Title


In [18]:
def print_output (dictionary, search_value):
    keys_with_search_value = []
    for key, value in dictionary.items():
        if search_value in value:
            keys_with_search_value.append(key)

    output_list = []
    for key in keys_with_search_value:
        output_list.append(f"Dict_Key: {key}, Article: {dictionary[key]}")    

    print('=====Article Titles=====')
    for values in output_list:
        result = values.split(',')[-1].strip()
        print(result)

In [19]:
value = 'warner'

print_output(entity_dict, value)

=====Article Titles=====
The 25 Greatest Actors of the 21st Century (So Far)
‘Shaun the Sheep Movie’: Designing the Characters
Hollywood as Biographer
The 25 Greatest Actors of the 21st Century (So Far)
Film Series Listings
The 25 Best Films of the 21st Century So Far.
10 Sundance Movies With Heat
2018 Academy Awards
The 25 Best Films of the 21st Century So Far.
2018 Golden Globe Awards
Is Dead at 80
Golden Globes: The Projectionist’s Takeaways
What the Movies Taught Me About Being a Woman
2018 Academy Awards
How Will Movies Survive the Next 10 Years?
28 Films for Black History Month
New Books Look at the ‘Peanuts’ Gang
‘Shaun the Sheep Movie’: Designing the Characters
’ in Two Minutes
You Know These 20 Movies. Now Meet the Women Behind Them
28 Films for Black History Month
David Bowie in the Movies
How Will Movies Survive the Next 10 Years?
The 25 Best Films of the 21st Century So Far.
What the Movies Taught Me About Being a Woman
The 25 Best Films of the 21st Century So Far.
The 25 G

In [20]:
# Create a new dictionary to store the structured articles
structured_articles = {}

for article_id, article_content in entity_dict.items():
    tag_info = article_content.split(', ')
    article_number = int(tag_info[0])
    tag_type = tag_info[1]
    value = tag_info[2]
    title = tag_info[3]

    # If the article number already exists in the structured_articles dictionary, update the 'Movies' key
    if article_number in structured_articles:
        if tag_type == 'Movie':
            structured_articles[article_number]['Movies'].append(value)
    else:
        # If the article number doesn't exist, create a new entry with the 'Movies' key
        structured_articles[article_number] = {
            'Article': article_number,
            'Title': title,
            'Movies': [] if tag_type == 'Movie' else [],
        }

# Now, structured_articles is a dictionary with article numbers as keys and their associated information, including movie tags in the 'Movies' key
#print(structured_articles)


In [21]:
def search_movie_in_articles(articles_dict, movie_name):
    found_articles = []
    for article_info in articles_dict.values():
        if 'Movies' in article_info and movie_name in article_info['Movies']:
            found_articles.append(article_info)
    return found_articles

In [22]:
def print_tags(entity_dict, article_number):
    tags_by_article = {}
    for key, value in entity_dict.items():
        tag_info = value.split(', ')
        current_article_number = int(tag_info[0])
        if current_article_number == article_number:
            tag_type = tag_info[1]
            tag_value = tag_info[2]
            tags_by_article.setdefault(tag_type, []).append(tag_value)

    # Print the tags for the specified article number
    tag_number = f"Tags for Article Number {article_number}:"
    for tag_type, tag_values in tags_by_article.items():
        tag_type_value = f"{tag_type}s: {', '.join(tag_values)}"

In [23]:
pandas_df = df.toPandas()

                                                                                

In [24]:
for index, row in pandas_df.iterrows():
    movie_name = row['Title']  # Assuming 'title' is the column name for movie titles

    movie_information = []

    for article_info in search_movie_in_articles(structured_articles, movie_name):
        article_number = f"Article Number: {article_info['Article']}"
        title_info = f"Title: {article_info['Title']}"
        if 'Movies' in article_info:
            movie_info = f"Movies: {', '.join(article_info['Movies'])}" if 'Movies' in article_info else ""
        division = '-' * 50
        tags = print_tags(entity_dict, article_info['Article']) if print_tags(entity_dict, article_info['Article']) else""
        
        movie_information.append([article_number, title_info, movie_info, division, tags])

    entity_info = movie_information# Combine movie information into a single string
    print(entity_info)
    pandas_df.at[index, 'Entity'] = entity_info  # Update 'entity' column for the current row











































































































































































































































































































































































































Article Number: 351
Title: Movies Seek Laughs With All Manner of Sex Scenes
Movies: Self/less, Beauty and the Beast, Being John Malkovich, Child's Play, Source Code
--------------------------------------------------




















Article Number: 131
Title: A Blockbuster Series That Has Legs
Movies: Samsara, Below, Baraka
--------------------------------------------------




































































































































































Article Number: 22
Title: From the BBC
Movies: Black and Tan, 

## Section 3: Sentiment Analysis

In [80]:
with open(path_to_reviews_dataset, 'r') as f:
    json_data = f.read()

# Split the file contents into individual JSON objects
json_objects = json_data.strip().split('\n')

# Load each JSON object and store them in a list
loaded_data = []
for obj in json_objects:
    data = json.loads(obj)
    loaded_data.append(data)



In [26]:
# Convert loaded_data to a DataFrame
loaded_data_df = pd.DataFrame(loaded_data)

In [27]:
import nltk

def split_into_sentences(text):
    # Use the punkt tokenizer to split the text into sentences
    sentences = nltk.sent_tokenize(text)
    return sentences

In [28]:
# Initialize the Emotion Classification
from nrclex import NRCLex

# Initialize the Sentiment Intensity Analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
sent_analyzer = SentimentIntensityAnalyzer()

In [29]:
# Combine reviews for the same movie
reviews_plot_list = []

# Iterate through the merged_df
for index, row in loaded_data_df[:1000].iterrows():
    id = row['movie_id']
    review_text = row['review_text']

    # Split review_text into sentences
    review_sentences = split_into_sentences(review_text)
    
    # Check if id or title exists in the reviews_plot_list
    existing_movie = next((movie for movie in reviews_plot_list if movie['id'] == id), None)
    if existing_movie:
        existing_movie['reviews'].extend(review_sentences)
    else:
        reviews_plot_list.append({'id': id, 'reviews': review_sentences})

In [30]:
reviews_plot_df = pd.DataFrame(reviews_plot_list)

In [31]:
merged_data = pd.merge(pandas_df, reviews_plot_df, left_on='id', right_on='id', how='left')
merged_data.fillna('', inplace=True)

In [32]:
pandas_df = merged_data

In [33]:
pandas_df['split_plot'] = pandas_df['Plot'].apply(split_into_sentences)

In [34]:
import math

def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value
    """
    norm_score = score/math.sqrt((score*score) + alpha)
    return norm_score

In [35]:
# Create an empty dictionary to store all the updated movie data
updated_movies = {}

# Iterate through the movies in reviews_plot_list
for index, movie in pandas_df[:100].iterrows():
    id = movie['id']
    title = movie['Title']
    reviews = movie.get('reviews', [])
    plots = movie.get('split_plot', [])

    # Combine all reviews and plots into one string
    all_reviews_text = ' '.join(reviews)
    all_plots_text = ' '.join(plots)

    # Calculate sentiment scores using sent_analyzer for reviews and normalize
    if reviews:
        review_sentiment_scores = [normalize(sent_analyzer.polarity_scores(sentence)['compound']) for sentence in reviews]
        combined_review_sentiment = sum(review_sentiment_scores) / len(review_sentiment_scores)
        # Perform emotion analysis using NRCLex for reviews
        review_emotion_scores = NRCLex(all_reviews_text).affect_frequencies
    else:
        combined_review_sentiment = None
        review_emotion_scores = None
    
    # Calculate sentiment scores using sent_analyzer for plots and normalize
    if plots:
        plot_sentiment_scores = [normalize(sent_analyzer.polarity_scores(sentence)['compound']) for sentence in plots]
        combined_plot_sentiment = sum(plot_sentiment_scores) / len(plot_sentiment_scores)
        # Perform emotion analysis using NRCLex for plots
        plot_emotion_scores = NRCLex(all_plots_text).affect_frequencies
    else:
        combined_plot_sentiment = None
        plot_emotion_scores = None

    # Sort emotion scores in descending order of value for reviews and plots
    sorted_review_emotion_scores = sorted(review_emotion_scores.items(), key=lambda x: x[1], reverse=True) if review_emotion_scores else None
    sorted_plot_emotion_scores = sorted(plot_emotion_scores.items(), key=lambda x: x[1], reverse=True) if plot_emotion_scores else None

    # Create a new dictionary with additional information
    updated_movie_one = {
        'id': id,
        'title': title,
        'reviews_emotion_scores': sorted_review_emotion_scores,
        'reviews_sentiment': combined_review_sentiment,
        'plots_emotion_scores': sorted_plot_emotion_scores,
        'plots_sentiment': combined_plot_sentiment
    }
    
    # Store the updated movie data in the all_updated_movies dictionary
    updated_movies[id] = updated_movie_one

In [36]:
def search_movie_by_title(updated_movie, search_title):
    matching_movies = []
    for movie_id, movie_data in updated_movie.items():
        if search_title.lower() in movie_data['title'].lower():
            matching_movies.append(movie_data)
    return matching_movies

In [75]:
for index, row in pandas_df.iterrows():
    search_title = row['Title']  # Assuming 'title' is the column name for movie titles

    movie_sentiment = []
    
    for movie in search_movie_by_title(updated_movies, search_title):
        
        for key, value in movie.items():
            if value is None: 
                continue
            if key == 'reviews_emotion_scores':
                review_title = "=====Reviews Emotion Scores:====="
                for emotion, score in value:
                    review_emotion_score = f"{emotion}: {score}"
                    movie_sentiment.append([review_title, review_emotion_score])
            elif key == 'plots_emotion_scores':
                plot_title = "=====Plots Emotion Scores:====="
                for emotion, score in value:
                    plots_emotion_score = f"{emotion}: {score}"
                    movie_sentiment.append([plot_title, plots_emotion_score])
            elif key != 'id' and key != 'title':
                unknown = f"{key}: {value}"
                movie_sentiment.append("")
            
        movie_sentiment_info = movie_sentiment # Combine movie sentiment into a single string
        print(movie_sentiment_info)
        pandas_df.at[index, 'Sentiment'] = movie_sentiment_info  # Update 'Sentiment' column for the current row


[['=====Plots Emotion Scores:=====', 'positive: 0.2'], ['=====Plots Emotion Scores:=====', 'joy: 0.2'], ['=====Plots Emotion Scores:=====', 'fear: 0.13333333333333333'], ['=====Plots Emotion Scores:=====', 'trust: 0.13333333333333333'], ['=====Plots Emotion Scores:=====', 'anticipation: 0.13333333333333333'], ['=====Plots Emotion Scores:=====', 'surprise: 0.06666666666666667'], ['=====Plots Emotion Scores:=====', 'negative: 0.06666666666666667'], ['=====Plots Emotion Scores:=====', 'sadness: 0.06666666666666667'], ['=====Plots Emotion Scores:=====', 'anger: 0.0'], ['=====Plots Emotion Scores:=====', 'anticip: 0.0'], ['=====Plots Emotion Scores:=====', 'disgust: 0.0'], '']
[['=====Plots Emotion Scores:=====', 'negative: 0.19158878504672897'], ['=====Plots Emotion Scores:=====', 'fear: 0.1542056074766355'], ['=====Plots Emotion Scores:=====', 'positive: 0.14485981308411214'], ['=====Plots Emotion Scores:=====', 'anger: 0.1261682242990654'], ['=====Plots Emotion Scores:=====', 'sadness: 0

## Section 4. Word2vec Movie Recommender Model Training
The first part of this notebook is dedicated to data cleaning and trainning the word2vec model to create the following three tools for the studio writers and executives: 
1. Basic movie recommender: The user input one movie; and the system recommends 10 other movies with similar plotlines. 
2. Advance movie recommender: The user input two movies; and the system recommends 10 other movies with plotlines that are similar to the combination of these two movies. 
3. Duplicate plot checker: The user input his/her script for a new movie idea, and the system checks if his/her idea has already been produced in a previous movie. 

### 4.1 Training Word2vec Model 
This Word2Vec model is trained using the cleaned data above, the model is fed with around 28,000 entries of texts that describe movie plots. The resulting model will be useful in finding similarities in movie plotlines. 

In [38]:
# tokenize and remove stop words in this cell
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

# create a new field by copying Plot
df = df.withColumn('inputText', F.col('Plot')) 

# regular expression tokenizer to tokenize inputText into individual tokens (words)
regextok = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'inputText', outputCol = 'tokens')

# StopWordsRemover to remove stopwords in the list of tokens
stopwrmv = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')
df = regextok.transform(df)
df = stopwrmv.transform(df)

In [43]:
# train word2vec model, the parameters here can be changed to optimize the model
word2vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
model = word2vec.fit(df)

                                                                                

In [44]:
# using transform to add wordvectors column to dataframe
df = model.transform(df)

In [70]:
ner_sentiment_df = pandas_df[['id', 'Entity','Sentiment']]
ner_sentiment_df = spark.createDataFrame(ner_sentiment_df) # move this to later cells

chunks = df.select('id', 'Title','wordvectors', 'Plot', 'Source')
chunks = chunks.join(ner_sentiment_df,on = 'id', how = 'outer').limit(30000).collect()

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
                                                                                

In [46]:
# define function to calculate cosine similarity for later
import numpy as np
def cossim(v1, v2): 
  '''
      cossim(v1, v2) calculates the cosine similarity between v1 and v1.
      If v1 or v2 is a zero vector, it will return 0
  '''
  if np.dot(v1, v1) == 0 or np.dot(v2, v2) == 0:
      return 0.0
  return float(np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / (np.sqrt(np.dot(v2, v2))))

### 4.2 Create Basic, Advanced Recommender and Duplicate Plot Checker
** __note that the most of the code in this section is blocked because the final implementation is inside flask__ **

These three tools use the same model at their core. Therefore, to optimize performance, the implementation will create only a single session when running. 

In [47]:
# writing a function to obtain the plot string from the plot dataset
def acquire_plot(base_movie: str): 
  # input: a movie name (precise) or a movie id 
  # output: the movie's plot

  if base_movie.startswith("tt"):   # search by movie name
    base_movie_row = df.filter(df.id == base_movie).collect()
  else:                             # search by movie id
    base_movie_row = df.filter(df.Title == base_movie).collect()

  if base_movie_row: 
    movie_plot = base_movie_row[0]['Plot']
    return movie_plot
  else: 
    print("Sorry, ", base_movie, " is not found in the database. Please type in exact movie names")

In [48]:
def query_preprocessing(plot: str): 
  plot_df = spark.createDataFrame([(1, plot)]).toDF('index','inputText')
  plot_tok = regextok.transform(plot_df)
  plot_swr = stopwrmv.transform(plot_tok)
  plot_vec = model.transform(plot_swr)
  plot_vec = plot_vec.select('wordvectors').collect()[0][0]
  return plot_vec

### 4.2.1 Basic recommender
The Basic Recommender only takes in one movie as the only parameter: 

In [49]:
""" input_user_1 = '2012'    # User input The exact movie name or movie id(e.g. 'tt1023003')
basic_movie_plot = acquire_plot(input_user_1)
basic_vec = query_preprocessing(basic_movie_plot) """

" input_user_1 = '2012'    # User input The exact movie name or movie id(e.g. 'tt1023003')\nbasic_movie_plot = acquire_plot(input_user_1)\nbasic_vec = query_preprocessing(basic_movie_plot) "

### 4.2.2 Advanced Recommender
The Advanced Recommender takes in one extra movie as the second parameter, then our word2vec model will be able to recommend a third movie that has similar plot as the combination of the first two. : 

In [50]:
""" input_user_2 = '' #'tt0468569'    # User input The exact movie name or movie id(e.g. 'tt1023003')
if input_user_2: 
  second_movie_plot = acquire_plot(input_user_2)
  second_vec = query_preprocessing(second_movie_plot)
  combined_vec = basic_vec + second_vec """

" input_user_2 = '' #'tt0468569'    # User input The exact movie name or movie id(e.g. 'tt1023003')\nif input_user_2: \n  second_movie_plot = acquire_plot(input_user_2)\n  second_vec = query_preprocessing(second_movie_plot)\n  combined_vec = basic_vec + second_vec "

### 4.2.3 Duplicate Plot Checker
The duplicate Plot checker is similar to the basic recommender where it finds an existing movie with similar plot as the user's input plot description. 

In [51]:
""" input_user_3 = 'Once upon a time in a crime ridden Gotham City, a member of the rich Wayne family decided to put on a mask and protect the people of Gotham'
check_vec = query_preprocessing(input_user_3) """

" input_user_3 = 'Once upon a time in a crime ridden Gotham City, a member of the rich Wayne family decided to put on a mask and protect the people of Gotham'\ncheck_vec = query_preprocessing(input_user_3) "

### 4.3 implementation of the above three tools
The parameters of the three tools have been created. To optimize performance, the implementation will create only one single session when running. 

In [52]:
""" if input_user_3: 
  
  print("Running Duplicate Plot Checker")
  data = [(i[0], float(cossim(check_vec, i[2])), i[1], i[4], i[3]) for i in chunks]
  sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot')
  sim_df = (sim_df.dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
  sim_df.show(10, truncate=False)

elif not input_user_2:  # if input_user_2 is empty, then run the basic recommender
  
  print("Only one movie is input, running Basic Recommender")
  data = [(i[0], float(cossim(basic_vec, i[2])), i[1], i[4], i[3]) for i in chunks]
  sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot')
  sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
  sim_df.show(10, truncate=False)
  
elif input_user_2: 
  
  print("Only two movies are input, running Advanced Recommender")
  data = [(i[0], float(cossim(combined_vec, i[2])), i[1], i[4], i[3]) for i in chunks]
  sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot')
  sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1)
                          & (sim_df.Title != input_user_2) & (sim_df.movie_id != input_user_2))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
  sim_df.show(10, truncate=False) """

' if input_user_3: \n  \n  print("Running Duplicate Plot Checker")\n  data = [(i[0], float(cossim(check_vec, i[2])), i[1], i[4], i[3]) for i in chunks]\n  sim_df = spark.createDataFrame(data).toDF(\'movie_id\', \'similarity\', \'Title\', \'Source\', \'Plot\')\n  sim_df = (sim_df.dropDuplicates([\'movie_id\'])\n            .orderBy(\'similarity\', ascending=False)\n            .limit(30))\n  sim_df.show(10, truncate=False)\n\nelif not input_user_2:  # if input_user_2 is empty, then run the basic recommender\n  \n  print("Only one movie is input, running Basic Recommender")\n  data = [(i[0], float(cossim(basic_vec, i[2])), i[1], i[4], i[3]) for i in chunks]\n  sim_df = spark.createDataFrame(data).toDF(\'movie_id\', \'similarity\', \'Title\', \'Source\', \'Plot\')\n  sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1))\n            .dropDuplicates([\'movie_id\'])\n            .orderBy(\'similarity\', ascending=False)\n            .limit(30))\n  sim_d

## Section 5: LDA Modeling

In [53]:
from nltk.corpus import stopwords 
stopwords = list(set(stopwords.words('english')))

print('The first 10 stopwords in the list:', stopwords[:10])

The first 10 stopwords in the list: ['ma', 'ours', 'itself', 'of', 'hers', 'or', 've', "isn't", 'ain', 'now']


In [54]:
import string
punctuation = list(set(string.punctuation))

print('The punctuations are:', punctuation)

The punctuations are: ['(', '#', '+', '%', '<', '{', '}', '|', '\\', '?', '!', ']', '-', ',', '"', '[', ':', '$', '@', '`', '.', '~', '>', ';', ')', '^', '*', '/', '_', "'", '=', '&']


In [55]:
import re
from nltk.stem.wordnet import WordNetLemmatizer

def cleanup(text):
    '''
        input: raw text
        output: a list of words
    '''
    
    text = " ".join([i for i in text.lower().split() if i not in stopwords])          
    text = "".join(re.sub(r'[^a-zA-Z ]', '', i) for i in text if i not in punctuation) 
    text = [WordNetLemmatizer().lemmatize(i) for i in text.split()]                    
    return text

In [56]:
from gensim.corpora.dictionary import Dictionary

In [57]:
plot_text = pandas_df['Plot'].apply(cleanup)
print('List of tokens:')
plot_text[:10]

List of tokens:


0    [film, mainly, focus, yearold, dave, peck, une...
1    [gary, hook, new, recruit, british, army, take...
2    [total, stranger, dan, hardesty, george, brent...
3    [jessie, middleaged, woman, living, widowed, m...
4    [film, begin, john, christie, murdering, neigh...
5    [balan, k, nair, dulquer, salman, call, bkn, r...
6    [maria, paul, couple, forty, travel, spain, ne...
7    [film, start, dream, sequence, depicting, sara...
8    [exactly, noon, dadar, railway, station, bomba...
9    [yearold, mike, odonnell, zac, efron, learns, ...
Name: Plot, dtype: object

In [58]:
dictionary = Dictionary(plot_text)

In [59]:
print('Count of raw tokens: ' + str(len(dictionary.items())))

Count of raw tokens: 165773


In [60]:
## can change the filter extreme values

print('Filter out tokens that appear less than 80 times and more than 80% of the titles')
dictionary.filter_extremes(no_below=80, no_above=0.8)
print('Count of tokens: '+ str(len(dictionary.items())))

Filter out tokens that appear less than 80 times and more than 80% of the titles
Count of tokens: 7300


In [61]:
corpora = [dictionary.doc2bow(doc) for doc in plot_text]

In [62]:
# can change the number of topics  and passes
from gensim.models import ldamodel
lda_model = ldamodel.LdaModel(corpora, num_topics=5, id2word = dictionary, passes=10)

In [63]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpora, dictionary)
vis

In [64]:
pyLDAvis.save_html(vis, '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/5430-Project/static/lda.html')

## Section 6: Flask

In [89]:
from flask import Flask, request, render_template
import time

app = Flask("JSON_OUTPUT")

@app.route('/')
def form():
    return render_template('FlaskOutputTemplate.html')

@app.route('/topic visualisation')
def lda_vis():
    return render_template('lda_topic.html')
        
@app.route('/submit', methods=['GET','POST'])
def submit():
    start_time = time.time()
  
    if request.method == 'POST':
        input_1 = str(request.form['q1'])
        input_2 = str(request.form['q2'])
        input_3 = str(request.form['q3'])


        if input_1: 
          print('input_1 is not empty')
          input_user_1 = input_1    # User input The exact movie name or movie id(e.g. 'tt1023003')
          basic_movie_plot = acquire_plot(input_user_1)
          basic_vec = query_preprocessing(basic_movie_plot)
          
        if input_2: 
          print('input_2 is not empty')
          input_user_2 = input_2 #'tt0468569'    # User input The exact movie name or movie id(e.g. 'tt1023003')
          if input_user_2: 
            second_movie_plot = acquire_plot(input_user_2)
            second_vec = query_preprocessing(second_movie_plot)
            combined_vec = basic_vec + second_vec
        
        if input_3: 
          print('input_3 is not empty')
          input_user_3 = input_3
          check_vec = query_preprocessing(input_user_3)
        
        if input_3: 
          print("Running Duplicate Plot Checker")
          data = [(i[0], float(cossim(check_vec, i[2])), i[1], i[4], i[3], i[5],i[6]) for i in chunks]
          sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot','Entity','Sentiment')
          sim_df = (sim_df.dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))

        elif not input_2:  # if input_user_2 is empty, then run the basic recommender
          print("Only one movie is input, running Basic Recommender")
          data = [(i[0], float(cossim(basic_vec, i[2])), i[1], i[4], i[3],i[5],i[6]) for i in chunks]
          sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot','Entity','Sentiment')
          sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
      

        elif input_2:
          print("Only two movies are input, running Advanced Recommender")
          data = [(i[0], float(cossim(combined_vec, i[2])), i[1], i[4], i[3],i[5],i[6]) for i in chunks]
          sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot','Entity','Sentiment')
          sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1)
                          & (sim_df.Title != input_user_2) & (sim_df.movie_id != input_user_2))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))

    elapsed_time = round(time.time() - start_time, 2)
    html_table = sim_df.toPandas().to_html(classes='table')
    return render_template('TableOutput.html', table=html_table, time_taken=elapsed_time)

app.run(host='localhost', port=7039)

 * Serving Flask app 'JSON_OUTPUT'
 * Debug mode: off


 * Running on http://localhost:7039
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [10/Aug/2023 14:41:44] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Aug/2023 14:41:44] "[36mGET /static/logo.png HTTP/1.1[0m" 304 -

input_1 is not empty


                                                                                

input_2 is not empty


                                                                                

Only two movies are input, running Advanced Recommender


  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
23/08/10 14:42:32 WARN TaskSetManager: Stage 464 contains a task of very large size (8981 KiB). The maximum recommended task size is 1000 KiB.
INFO:werkzeug:127.0.0.1 - - [10/Aug/2023 14:42:37] "POST /submit HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Aug/2023 14:43:03] "GET /topic%20visualisation HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/Aug/2023 14:43:03] "[36mGET /static/logo.png HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [10/Aug/2023 14:43:03] "GET /static/lda.html HTTP/1.1" 200 -
23/08/10 14:56:00 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 166952 ms exceeds timeout 120000 ms
23/08/10 14:56:00 WARN SparkContext: Killing executors is not supported by current scheduler.
23/08/10 14:56:03 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: E

23/08/10 14:56:12 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:641)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1111)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:244)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2088)
	at org.apache.spark.Heartbeater$$anon$1.run(Heartbeater.scala:46)
	at java.util.c