In [1]:
#!pip install -U pyspark
from pyspark.sql import SparkSession
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

## Section 1: Data Cleaning

There are three data sources used in this section: 
1. IMDB: used for matching movie title & ID
2. Details: contains plots&movie ID, used for trainning
3. Wiki_Plot: contains plots& movie name, used for trainning

In [2]:
#path_to_imdb_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/title.basics.tsv.gz'
#path_to_reviews_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/IMDB_reviews.json'
#path_to_plots_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/wiki_movie_plots_deduped.csv'
#path_to_details_dataset = '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/data/IMDB_movie_details.json'

In [3]:
path_to_imdb_dataset = 'title.basics.tsv.gz'
path_to_reviews_dataset = 'IMDB_reviews.json'
path_to_plots_dataset = 'wiki_movie_plots_deduped.csv'
path_to_details_dataset = 'IMDB_movie_details.json'

In [4]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
print("Using Apache Spark Version", spark.version)

Using Apache Spark Version 3.4.1


In [5]:
# clean & combine the IMDB dataset with details dataset 
# reading the IMDB dataset
imdb = spark.read.options(header = True, inferSchema = True, delimiter = "\t")\
  .csv(path_to_imdb_dataset)
# filter the imdb dataset so that only movies are included
imdb = imdb.filter("titleType = 'movie'")\
  .select('tconst', 'primaryTitle', 'startYear')\
    .withColumnRenamed('startYear', 'Year')\
      .withColumnRenamed('primaryTitle', 'Title')\
        .dropDuplicates(['Title', 'Year'])
print('there is a total of ', imdb.count(), ' movies in the imdb dataset')


# reading the details dataset, preserving only three important variables
details_summary = spark.read.json(path_to_details_dataset)
details_summary = details_summary\
  .select('movie_id','plot_summary')\
    .withColumnRenamed('plot_summary','Plot')

# reading the details dataset, preserving only three important variables
details_synopsis = spark.read.json(path_to_details_dataset)
details_synopsis = details_synopsis.select('movie_id','plot_synopsis')\
  .filter("plot_synopsis != ''")\
    .withColumnRenamed('plot_synopsis', 'Plot')

details = details_summary.union(details_synopsis)
print('there is a total of ', details.count(), ' plot descriptions in the details dataset')


from pyspark.sql.functions import lit
# join the imdb with details by matching the unique identifier(e.g. tt0000000)
imdb_join_details = imdb.join(details, imdb.tconst == details.movie_id, 'inner')\
  .withColumnRenamed('tconst', 'id')\
    .select('id', 'Title', 'Plot')\
      .withColumn("Source", lit("imdb_details"))

print("The joined dataset has ", imdb_join_details.count(), " entries")
# inspect the joined dataset
imdb_join_details.show(3)

there is a total of  639780  movies in the imdb dataset
there is a total of  2911  plot descriptions in the details dataset
The joined dataset has  2857  entries
+---------+--------------+--------------------+------------+
|       id|         Title|                Plot|      Source|
+---------+--------------+--------------------+------------+
|tt2294449|22 Jump Street|Following their s...|imdb_details|
|tt2294449|22 Jump Street|After making thei...|imdb_details|
|tt0120623|  A Bug's Life|On a small island...|imdb_details|
+---------+--------------+--------------------+------------+
only showing top 3 rows



In [7]:
# clean and combine wiki plot dataset with imdb dataset
from pyspark.sql.functions import length
# reading the plot dataset, preserving only three important variables
wiki_plot = spark.read.options(header = True, inferSchema = True, quote = '"', escape = '"', multiLine = True).csv(path_to_plots_dataset)
wiki_plot = wiki_plot.select('Title', 'Release Year','Plot')\
  .withColumnRenamed('Release Year', 'Year')\
    .filter(length(wiki_plot['Plot']) >= 200) # filter out the very short plot descriptions


# join the imdb with the plot dataset by matching movie titles and release year
imdb_join_plot = imdb.join(wiki_plot, ["Title", "Year"], 'inner')\
  .withColumnRenamed('tconst', 'id')\
    .select('id', 'Title', 'Plot')\
      .withColumn("Source", lit("wiki_plot"))

print("The joined dataset has ", imdb_join_plot.count(), " entries")
# inspect the joined dataset
imdb_join_plot.show(1)

The joined dataset has  25364  entries
+---------+-----+--------------------+---------+
|       id|Title|                Plot|   Source|
+---------+-----+--------------------+---------+
|tt0790799|$9.99|The film mainly f...|wiki_plot|
+---------+-----+--------------------+---------+
only showing top 1 row



In [8]:
# combine the above two dataset to get the dataset that we will train the model on
df = imdb_join_plot.union(imdb_join_details)

print('after merging & cleaning, there is a total of ', df.count(), ' movie plot entries left in the merged dataset')
# inspect the combined new dataset
df.show(1)

after merging & cleaning, there is a total of  28221  movie plot entries left in the merged dataset
+---------+-----+--------------------+---------+
|       id|Title|                Plot|   Source|
+---------+-----+--------------------+---------+
|tt0790799|$9.99|The film mainly f...|wiki_plot|
+---------+-----+--------------------+---------+
only showing top 1 row



## Section 2: NER

In [9]:
import json

In [10]:
with open("nyt_articles2.json", 'r') as f:
    newsfeeds = json.load(f)

len(newsfeeds)

1158

In [11]:
import pandas as pd
df_news = pd.DataFrame(newsfeeds)

In [12]:
import requests

In [13]:
df_news.keys()

Index(['Headline', 'Abstract', 'Lead_Paragraph', 'Snippet', 'Published_date',
       'Author', 'News_desk', 'URL', 'Source'],
      dtype='object')

In [14]:
all_URL = []
for index, row in df_news.iterrows():
    all_URL.append(row["URL"])

In [15]:
all_headlines = []
for index, row in df_news.iterrows():
    all_headlines.append(row["Headline"])

In [16]:
all_snippets = []
for index, row in df_news.iterrows():
    all_snippets.append(row["Snippet"])

In [17]:
with open("NER_Tagging.json", "r") as json_file:
    all_JsonResponses = json.load(json_file)

In [18]:
#Get Entities
entity_dict = {}

print('====Entities====')
print('Article_No, Type, Name, Article Title')
art_no = 0
dict_key = 0
for TRITJsonResponse in all_JsonResponses:
    art_no += 1
    for key in TRITJsonResponse:
        dict_key += 1
        if ('_typeGroup' in TRITJsonResponse[key]):
            if TRITJsonResponse[key]['_typeGroup'] == 'entities':
                entities = str(art_no) + ", " + TRITJsonResponse[key]['_type'] + ", " + TRITJsonResponse[key]['name'] + ", " + all_headlines[art_no]
                entity_dict[dict_key] = entities

====Entities====
Article_No, Type, Name, Article Title


In [19]:
def print_output (dictionary, search_value):
    keys_with_search_value = []
    for key, value in dictionary.items():
        if search_value in value:
            keys_with_search_value.append(key)

    output_list = []
    for key in keys_with_search_value:
        output_list.append(f"Dict_Key: {key}, Article: {dictionary[key]}")    

    print('=====Article Titles=====')
    for values in output_list:
        result = values.split(',')[-1].strip()
        print(result)

In [20]:
value = 'warner'

print_output(entity_dict, value)

=====Article Titles=====
The 25 Greatest Actors of the 21st Century (So Far)
‘Shaun the Sheep Movie’: Designing the Characters
Hollywood as Biographer
The 25 Greatest Actors of the 21st Century (So Far)
Film Series Listings
The 25 Best Films of the 21st Century So Far.
10 Sundance Movies With Heat
2018 Academy Awards
The 25 Best Films of the 21st Century So Far.
2018 Golden Globe Awards
Is Dead at 80
Golden Globes: The Projectionist’s Takeaways
What the Movies Taught Me About Being a Woman
2018 Academy Awards
How Will Movies Survive the Next 10 Years?
28 Films for Black History Month
New Books Look at the ‘Peanuts’ Gang
‘Shaun the Sheep Movie’: Designing the Characters
’ in Two Minutes
You Know These 20 Movies. Now Meet the Women Behind Them
28 Films for Black History Month
David Bowie in the Movies
How Will Movies Survive the Next 10 Years?
The 25 Best Films of the 21st Century So Far.
What the Movies Taught Me About Being a Woman
The 25 Best Films of the 21st Century So Far.
The 25 G

In [21]:
# Create a new dictionary to store the structured articles
structured_articles = {}

for article_id, article_content in entity_dict.items():
    tag_info = article_content.split(', ')
    article_number = int(tag_info[0])
    tag_type = tag_info[1]
    value = tag_info[2]
    title = tag_info[3]

    # If the article number already exists in the structured_articles dictionary, update the 'Movies' key
    if article_number in structured_articles:
        if tag_type == 'Movie':
            structured_articles[article_number]['Movies'].append(value)
    else:
        # If the article number doesn't exist, create a new entry with the 'Movies' key
        structured_articles[article_number] = {
            'Article': article_number,
            'Title': title,
            'Movies': [] if tag_type == 'Movie' else [],
        }

# Now, structured_articles is a dictionary with article numbers as keys and their associated information, including movie tags in the 'Movies' key
#print(structured_articles)

In [22]:
def search_movie_in_articles(articles_dict, movie_name):
    found_articles = []
    for article_info in articles_dict.values():
        if 'Movies' in article_info and movie_name in article_info['Movies']:
            found_articles.append(article_info)
    return found_articles

In [23]:
def print_tags(entity_dict, article_number):
    tags_by_article = {}
    for key, value in entity_dict.items():
        tag_info = value.split(', ')
        current_article_number = int(tag_info[0])
        if current_article_number == article_number:
            tag_type = tag_info[1]
            tag_value = tag_info[2]
            tags_by_article.setdefault(tag_type, []).append(tag_value)

    # Print the tags for the specified article number
    tag_number = f"Tags for Article Number {article_number}:"
    for tag_type, tag_values in tags_by_article.items():
        tag_type_value = f"{tag_type}s: {', '.join(tag_values)}"

In [24]:
pandas_df = df.toPandas()

In [26]:
for index, row in pandas_df.iterrows():
    movie_name = row['Title']  # Assuming 'title' is the column name for movie titles

    movie_information = []

    for article_info in search_movie_in_articles(structured_articles, movie_name):
        article_number = f"Article Number: {article_info['Article']}"
        title_info = f"Title: {article_info['Title']}"
        if 'Movies' in article_info:
            movie_info = f"Movies: {', '.join(article_info['Movies'])}" if 'Movies' in article_info else ""
        division = '-' * 50
        tags = print_tags(entity_dict, article_info['Article']) if print_tags(entity_dict, article_info['Article']) else""
        
        movie_information.append('\n'.join([article_number, title_info, movie_info, division, tags]))

    entity_info = '\n'.join(movie_information) # Combine movie information into a single string
    print(entity_info)
    pandas_df.at[index, 'Entity'] = entity_info  # Update 'entity' column for the current row











































































































































































































































































































































































































Article Number: 351
Title: Movies Seek Laughs With All Manner of Sex Scenes
Movies: Self/less, Beauty and the Beast, Being John Malkovich, Child's Play, Source Code
--------------------------------------------------




















Article Number: 131
Title: A Blockbuster Series That Has Legs
Movies: Samsara, Below, Baraka
--------------------------------------------------




































































































































































Article Number: 22
Title: From the BBC
Movies: Black and Tan, 

Article Number: 127
Title: A Closer Look at ‘Metallica Through the Never’
Movies: Boogie Nights, Fast Times at Ridgemont High, Team America: World Police, Casual Sex?, The Bronze, The D Train
--------------------------------------------------

Article Number: 186
Title: Building the Battle Room for ‘Ender’s Game’
Movies: Boogie Nights, Fast Times at Ridgemont High, Team America: World Police, Casual Sex?, The Bronze, The D Train
--------------------------------------------------

Article Number: 281
Title: Unconventional Perspectives on American History and the Irish Troubles
Movies: Boogie Nights, Fast Times at Ridgemont High, Team America: World Police, Casual Sex?, The Bronze, The D Train
--------------------------------------------------

Article Number: 315
Title: Marlon Brando
Movies: Boogie Nights, Fast Times at Ridgemont High, Team America: World Police, Casual Sex?, The Bronze, The D Train
--------------------------------------------------

Article Number: 352
Title: In 'Backc

Article Number: 36
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article Number: 178
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article N

Article Number: 36
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article Number: 178
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article N

Article Number: 36
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article Number: 178
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article N

Article Number: 22
Title: From the BBC
Movies: Black and Tan, Within Our Gates, The Learning Tree, St. Louis Blues, Cabin in the Sky, A Romance of Happy Valley, The Birth of a Nation, Imitation of Life, A Wrinkle in Time
--------------------------------------------------

Article Number: 84
Title: The 25 Best Films of the 21st Century So Far.
Movies: Black and Tan, Within Our Gates, The Learning Tree, St. Louis Blues, Cabin in the Sky, A Romance of Happy Valley, The Birth of a Nation, Imitation of Life, A Wrinkle in Time
--------------------------------------------------

Article Number: 113
Title: 2018 Golden Globe Awards
Movies: Black and Tan, Within Our Gates, The Learning Tree, St. Louis Blues, Cabin in the Sky, A Romance of Happy Valley, The Birth of a Nation, Imitation of Life, A Wrinkle in Time
--------------------------------------------------

Article Number: 221
Title: The 25 Best Films of the 21st Century So Far.
Movies: Black and Tan, Within Our Gates, The Learning Tree, St

Article Number: 177
Title: You Know These 20 Movies. Now Meet the Women Behind Them
Movies: Lovers, Kramer vs. Kramer, Valentine, Blue Valentine, My Bloody Valentine, Friday the 13th
--------------------------------------------------

Article Number: 336
Title: Why Are There So Few Black Directors in the Criterion Collection?
Movies: Lovers, Kramer vs. Kramer, Valentine, Blue Valentine, My Bloody Valentine, Friday the 13th
--------------------------------------------------

Article Number: 594
Title: The 25 Greatest Actors of the 21st Century (So Far)
Movies: Lovers, Kramer vs. Kramer, Valentine, Blue Valentine, My Bloody Valentine, Friday the 13th
--------------------------------------------------







































































































































































































































































Article Number: 337
Title: Ho

Article Number: 36
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article Number: 178
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article N

Article Number: 337
Title: How Will Movies Survive the Next 10 Years?
Movies: Dead Presidents, Medicine for Melancholy, Menace II Society, The Learning Tree, Middle of Nowhere, A Dry White Season, Do the Right Thing, Symbiopsychotaxiplasm: Take One
--------------------------------------------------

Article Number: 596
Title: How Will Movies Survive the Next 10 Years?
Movies: Dead Presidents, Medicine for Melancholy, Menace II Society, The Learning Tree, Middle of Nowhere, A Dry White Season, Do the Right Thing, Symbiopsychotaxiplasm: Take One
--------------------------------------------------

Article Number: 628
Title: What the Movies Taught Me About Being a Woman
Movies: Dead Presidents, Medicine for Melancholy, Menace II Society, The Learning Tree, Middle of Nowhere, A Dry White Season, Do the Right Thing, Symbiopsychotaxiplasm: Take One
--------------------------------------------------

Article Number: 648
Title: 2020 Oscar Nominations: Full Ballot
Movies: Dead Presidents, Medici









































































































































































Article Number: 344
Title: New Books Look at the ‘Peanuts’ Gang
Movies: Rocky
--------------------------------------------------

Article Number: 501
Title: David Bowie in the Movies
Movies: The Merchant of Venice, Rocky, Atlantic
--------------------------------------------------

Article Number: 344
Title: New Books Look at the ‘Peanuts’ Gang
Movies: Rocky
--------------------------------------------------

Article Number: 501
Title: David Bowie in the Movies
Movies: The Merchant of Venice, Rocky, Atlantic
--------------------------------------------------



























































































































































Article Number: 5
Title: Movie Guide and Film Series
Movies: Shaun the Sheep, Shaun the Sheep Movie
------------










Article Number: 602
Title: And the Nominees Should Be...
Movies: Doctor Zhivago, The Great Dictator, Charlie Chaplin, Lawrence of Arabia
--------------------------------------------------

Article Number: 662
Title: Movies Seek Laughs With All Manner of Sex Scenes
Movies: Doctor Zhivago, The Great Dictator, Charlie Chaplin, Lawrence of Arabia
--------------------------------------------------




















































































Article Number: 122
Title: ‘Star Wars’: Their First Time
Movies: Heathers, Concussion, Trainwreck, Credit
--------------------------------------------------

Article Number: 343
Title: ‘Rocky’ and ‘Creed’: a Critical Comparison
Movies: Heathers, Concussion, Trainwreck, Credit
--------------------------------------------------

Article Number: 658
Title: New Books Look at the ‘Peanuts’ Gang
Movies: Heathers, Concussion, Trainwreck, Credit
--------------------------------------------------




















Article Number: 290
Title: 2018 Academy Awards
Movies: Thelma & Louise, Pretty in Pink, The Thin Man Goes Home, The Lost City, Pickup on South Street, Out of the Past, First Man, The Poseidon Adventure, Gun Crazy
--------------------------------------------------

Article Number: 490
Title: You Know These 20 Movies. Now Meet the Women Behind Them
Movies: Thelma & Louise, Pretty in Pink, The Thin Man Goes Home, The Lost City, Pickup on South Street, Out of the Past, First Man, The Poseidon Adventure, Gun Crazy
--------------------------------------------------

Article Number: 629
Title: The 25 Best Films of the 21st Century So Far.
Movies: Thelma & Louise, Pretty in Pink, The Thin Man Goes Home, The Lost City, Pickup on South Street, Out of the Past, First Man, The Poseidon Adventure, Gun Crazy
--------------------------------------------------

Article Number: 652
Title: 28 Days
Movies: Thelma & Louise, Pretty in Pink, The Thin Man Goes Home, The Lost City, Pickup on South Street, Out


























































































































































































































































Article Number: 119
Title: The Best and Worst of the Oscars
Movies: The Wave, Top of the World, Independence Day
--------------------------------------------------

Article Number: 498
Title: The Best and Worst of the Oscars
Movies: The Wave, Top of the World, Independence Day
--------------------------------------------------

































































































































































Article Number: 43
Title: A ‘Star Wars’ Refresher
Movies: Finders Keepers, Lovesong, Wall Street, Gleason, Welcome to the Dollhouse
--------------------------------------------------

Article Number: 121
Title: Golden Globe Nominee Reactions
Movies: Finders Kee

Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 37
Title: Goodbye
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 85
Title: Feeling the Pain and Sweat of the Ring in ‘Southpaw’
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 90
Title: The 50's Gangs of New York
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neig

Article Number: 36
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article Number: 178
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article N

Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 37
Title: Goodbye
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 85
Title: Feeling the Pain and Sweat of the Ring in ‘Southpaw’
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 90
Title: The 50's Gangs of New York
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neig

Article Number: 32
Title: The 25 Greatest Actors of the 21st Century (So Far)
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 111
Title: 2018 Academy Awards
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 289
Title: What the Movies Taught Me About Being a Woman
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 338
Title: 28 Days
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 488
Title: Everything You Need to Know Before ‘Avengers: Endgame,’ in Two Minutes
Movies: Old M
















































































































Article Number: 335
Title: Movies for the Lovers and Haters of Valentine’s Day
Movies: Halloween, It Happened One Night, Manhunter, The Silence of the Lambs, New Jersey The movie, Henry: Portrait of a Serial Killer
--------------------------------------------------
















































































































































































































Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 37
Title: Goodbye
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sun

Article Number: 177
Title: You Know These 20 Movies. Now Meet the Women Behind Them
Movies: Lovers, Kramer vs. Kramer, Valentine, Blue Valentine, My Bloody Valentine, Friday the 13th
--------------------------------------------------

Article Number: 336
Title: Why Are There So Few Black Directors in the Criterion Collection?
Movies: Lovers, Kramer vs. Kramer, Valentine, Blue Valentine, My Bloody Valentine, Friday the 13th
--------------------------------------------------

Article Number: 594
Title: The 25 Greatest Actors of the 21st Century (So Far)
Movies: Lovers, Kramer vs. Kramer, Valentine, Blue Valentine, My Bloody Valentine, Friday the 13th
--------------------------------------------------

Article Number: 177
Title: You Know These 20 Movies. Now Meet the Women Behind Them
Movies: Lovers, Kramer vs. Kramer, Valentine, Blue Valentine, My Bloody Valentine, Friday the 13th
--------------------------------------------------

Article Number: 336
Title: Why Are There So Few Black Di

Article Number: 337
Title: How Will Movies Survive the Next 10 Years?
Movies: Dead Presidents, Medicine for Melancholy, Menace II Society, The Learning Tree, Middle of Nowhere, A Dry White Season, Do the Right Thing, Symbiopsychotaxiplasm: Take One
--------------------------------------------------

Article Number: 596
Title: How Will Movies Survive the Next 10 Years?
Movies: Dead Presidents, Medicine for Melancholy, Menace II Society, The Learning Tree, Middle of Nowhere, A Dry White Season, Do the Right Thing, Symbiopsychotaxiplasm: Take One
--------------------------------------------------

Article Number: 628
Title: What the Movies Taught Me About Being a Woman
Movies: Dead Presidents, Medicine for Melancholy, Menace II Society, The Learning Tree, Middle of Nowhere, A Dry White Season, Do the Right Thing, Symbiopsychotaxiplasm: Take One
--------------------------------------------------

Article Number: 648
Title: 2020 Oscar Nominations: Full Ballot
Movies: Dead Presidents, Medici

Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 36
Title: The 25 Best Films of the 21st Century So Far.
Movies: Hansel and Gretel, Peter Pan, Mutiny on the Bounty, Moana, Bonnie and Clyde, Cinderella, Meshes of the Afternoon, Claudine, Puss in Boots, Stir Crazy, Hollywood Chinese, The Big House, San Quentin, The Big Sleep, Salome of the Tenements, The Champ, The Curse of Quon Gwon, Daisies, State Fair, Nanook of the North, Splendor in the Grass, Gentlemen Prefer Blondes
--------------------------------------------------

Article Number: 37
Title: Goodbye
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------

Article Number: 32
Title: The 25 Greatest Actors of the 21st Century (So Far)
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 111
Title: 2018 Academy Awards
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 289
Title: What the Movies Taught Me About Being a Woman
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 338
Title: 28 Days
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 488
Title: Everything You Need to Know Before ‘Avengers: Endgame,’ in Two Minutes
Movies: Old M




































































































































































Article Number: 335
Title: Movies for the Lovers and Haters of Valentine’s Day
Movies: Halloween, It Happened One Night, Manhunter, The Silence of the Lambs, New Jersey The movie, Henry: Portrait of a Serial Killer
--------------------------------------------------

Article Number: 335
Title: Movies for the Lovers and Haters of Valentine’s Day
Movies: Halloween, It Happened One Night, Manhunter, The Silence of the Lambs, New Jersey The movie, Henry: Portrait of a Serial Killer
--------------------------------------------------














































Article Number: 290
Title: 2018 Academy Awards
Movies: Thelma & Louise, Pretty in Pink, The Thin Man Goes Home, The Lost City, Pickup on South Street, Out of the Past, First Man, The Poseidon Adventure, Gun Crazy
---------------------------------------------

Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 37
Title: Goodbye
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 85
Title: Feeling the Pain and Sweat of the Ring in ‘Southpaw’
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 90
Title: The 50's Gangs of New York
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neig

Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 37
Title: Goodbye
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 85
Title: Feeling the Pain and Sweat of the Ring in ‘Southpaw’
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 90
Title: The 50's Gangs of New York
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neig

Article Number: 32
Title: The 25 Greatest Actors of the 21st Century (So Far)
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 111
Title: 2018 Academy Awards
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 289
Title: What the Movies Taught Me About Being a Woman
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 338
Title: 28 Days
Movies: Old Man Yells at Cloud, Far From Home, Lawrence of Arabia, Slim, Spider-Man, Endgame, Crazy Rich Asians
--------------------------------------------------

Article Number: 488
Title: Everything You Need to Know Before ‘Avengers: Endgame,’ in Two Minutes
Movies: Old M

Article Number: 184
Title: Paul Rudd’s Scientific-Sounding Guide to Ants
Movies: Freaks, Hellboy, Pan, Labyrinth, Hellboy II
--------------------------------------------------

Article Number: 269
Title: Frankly
Movies: Freaks, Hellboy, Pan, Labyrinth, Hellboy II
--------------------------------------------------

Article Number: 507
Title: In Making ‘Everest,’ Filmmakers Embrace Towering Ambitions
Movies: Freaks, Hellboy, Pan, Labyrinth, Hellboy II
--------------------------------------------------

Article Number: 709
Title: Watch the Trailers: 14 Movies Coming This Summer
Movies: Freaks, Hellboy, Pan, Labyrinth, Hellboy II
--------------------------------------------------

Article Number: 184
Title: Paul Rudd’s Scientific-Sounding Guide to Ants
Movies: Freaks, Hellboy, Pan, Labyrinth, Hellboy II
--------------------------------------------------

Article Number: 269
Title: Frankly
Movies: Freaks, Hellboy, Pan, Labyrinth, Hellboy II
--------------------------------------------------

Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 37
Title: Goodbye
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 85
Title: Feeling the Pain and Sweat of the Ring in ‘Southpaw’
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------

Article Number: 90
Title: The 50's Gangs of New York
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neig

## Section 3: Sentiment Analysis

In [36]:
with open(path_to_reviews_dataset, 'r') as f:
    json_data = f.read()

# Split the file contents into individual JSON objects
json_objects = json_data.strip().split('\n')

# Load each JSON object and store them in a list
loaded_data = []
for obj in json_objects:
    data = json.loads(obj)
    loaded_data.append(data)

In [60]:
# Convert loaded_data to a DataFrame
loaded_data_df = pd.DataFrame(loaded_data)

In [27]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jessi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
def split_into_sentences(text):
    # Use the punkt tokenizer to split the text into sentences
    sentences = nltk.sent_tokenize(text)
    return sentences

In [34]:
# pip install nrclex

In [31]:
# Initialize the Emotion Classification
from nrclex import NRCLex

# Initialize the Sentiment Intensity Analyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
sent_analyzer = SentimentIntensityAnalyzer()

In [63]:
# Combine reviews for the same movie
reviews_plot_list = []

# Iterate through the merged_df
for index, row in loaded_data_df[:10000].iterrows():
    id = row['movie_id']
    review_text = row['review_text']

    # Split review_text into sentences
    review_sentences = split_into_sentences(review_text)
    
    # Check if id or title exists in the reviews_plot_list
    existing_movie = next((movie for movie in reviews_plot_list if movie['id'] == id), None)
    if existing_movie:
        existing_movie['reviews'].extend(review_sentences)
    else:
        reviews_plot_list.append({'id': id, 'reviews': review_sentences})

In [64]:
reviews_plot_df = pd.DataFrame(reviews_plot_list)

In [77]:
merged_data = pd.merge(pandas_df, reviews_plot_df, left_on='id', right_on='id', how='left')
merged_data.fillna('', inplace=True)

In [79]:
pandas_df = merged_data

In [83]:
pandas_df['split_plot'] = pandas_df['Plot'].apply(split_into_sentences)

In [29]:
import math

def normalize(score, alpha=15):
    import math
    
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value
    """
    norm_score = score/math.sqrt((score*score) + alpha)
    return norm_score

In [89]:
pandas_df

Unnamed: 0,id,Title,Plot,Source,Entity,reviews,split_plot
0,tt0790799,$9.99,The film mainly focuses on 28-year-old Dave Pe...,wiki_plot,,,[The film mainly focuses on 28-year-old Dave P...
1,tt2614684,'71,"Gary Hook, a new recruit to the British Army, ...",wiki_plot,,,"[Gary Hook, a new recruit to the British Army,..."
2,tt0032176,'Til We Meet Again,Total strangers Dan Hardesty (George Brent) an...,wiki_plot,,,[Total strangers Dan Hardesty (George Brent) a...
3,tt0090556,"'night, Mother",Jessie is a middle-aged woman living with her ...,wiki_plot,,,[Jessie is a middle-aged woman living with her...
4,tt0066730,10 Rillington Place,The film begins in 1944 with John Christie mur...,wiki_plot,,,[The film begins in 1944 with John Christie mu...
...,...,...,...,...,...,...,...
28216,tt1790885,Zero Dark Thirty,Maya is a CIA operative whose first experience...,imdb_details,,,[Maya is a CIA operative whose first experienc...
28217,tt0443706,Zodiac,"The film starts on July 4, 1969, with the Zodi...",imdb_details,,,"[The film starts on July 4, 1969, with the Zod..."
28218,tt0443706,Zodiac,A serial killer in the San Francisco Bay Area ...,imdb_details,,,[A serial killer in the San Francisco Bay Area...
28219,tt2948356,Zootopia,"In the town of Bunnyburrow, 9 year old bunny, ...",imdb_details,,,"[In the town of Bunnyburrow, 9 year old bunny,..."


In [None]:
# Create an empty dictionary to store all the updated movie data
updated_movies = {}

# Iterate through the movies in reviews_plot_list
for index, movie in pandas_df.iterrows():
    id = movie['id']
    title = movie['Title']
    reviews = movie.get('reviews', [])
    plots = movie.get('split_plot', [])

    # Combine all reviews and plots into one string
    all_reviews_text = ' '.join(reviews)
    all_plots_text = ' '.join(plots)

    # Calculate sentiment scores using sent_analyzer for reviews and normalize
    if reviews:
        review_sentiment_scores = [normalize(sent_analyzer.polarity_scores(sentence)['compound']) for sentence in reviews]
        combined_review_sentiment = sum(review_sentiment_scores) / len(review_sentiment_scores)
        # Perform emotion analysis using NRCLex for reviews
        review_emotion_scores = NRCLex(all_reviews_text).affect_frequencies
    else:
        combined_review_sentiment = None
        review_emotion_scores = None
    
    # Calculate sentiment scores using sent_analyzer for plots and normalize
    if plots:
        plot_sentiment_scores = [normalize(sent_analyzer.polarity_scores(sentence)['compound']) for sentence in plots]
        combined_plot_sentiment = sum(plot_sentiment_scores) / len(plot_sentiment_scores)
        # Perform emotion analysis using NRCLex for plots
        plot_emotion_scores = NRCLex(all_plots_text).affect_frequencies
    else:
        combined_plot_sentiment = None
        plot_emotion_scores = None

    # Sort emotion scores in descending order of value for reviews and plots
    sorted_review_emotion_scores = sorted(review_emotion_scores.items(), key=lambda x: x[1], reverse=True) if review_emotion_scores else None
    sorted_plot_emotion_scores = sorted(plot_emotion_scores.items(), key=lambda x: x[1], reverse=True) if plot_emotion_scores else None

    # Create a new dictionary with additional information
    updated_movie_one = {
          'id': id,
        'title': title,
        'reviews_emotion_scores': sorted_review_emotion_scores,
        'reviews_sentiment': combined_review_sentiment,
        'plots_emotion_scores': sorted_plot_emotion_scores,
        'plots_sentiment': combined_plot_sentiment
    }
    
    # Store the updated movie data in the all_updated_movies dictionary
    updated_movies[id] = updated_movie_one

In [None]:
def search_movie_by_title(updated_movie, search_title):
    matching_movies = []
    for movie_id, movie_data in updated_movie.items():
        if search_title.lower() in movie_data['title'].lower():
            matching_movies.append(movie_data)
    return matching_movies

In [None]:
search_title = 'Gold rush'  # Replace with the desired movie title (partial or full)

movies_by_title = search_movie_by_title(updated_movies, search_title)

if movies_by_title:
    for movie in movies_by_title:
#         print(f"Movie ID: {movie_by_id['id']}")
#         print(f"Movie Title: {movie_by_id['title']}")
        for key, value in movie.items():
            if key == 'reviews_emotion_scores':
                print("=====Reviews Emotion Scores:=====")
                for emotion, score in value:
                    print(f"{emotion}: {score}")
            elif key == 'plots_emotion_scores':
                print("\n=====Plots Emotion Scores:=====")
                for emotion, score in value:
                    print(f"{emotion}: {score}")
            elif key != 'id' and key != 'title':
                print(f"{key}: {value}")
    print("\n")
else:
    print("Movie with title not found")


In [None]:
for index, row in pandas_df.iterrows():
    search_title = row['Title']  # Assuming 'title' is the column name for movie titles

    movie_sentiment = []
    
    for movie in search_movie_by_title(updated_movies, search_title):
        
        for key, value in movie.items():
            if key == 'reviews_emotion_scores':
                review_title = "=====Reviews Emotion Scores:====="
                for emotion, score in value:
                    review_emotion_score = f"{emotion}: {score}"
                    movie_sentiment.appened('\n'.join([review_title, review_emotion_score]))
            elif key -= 'plots_emotion_scores':
                plot_title = "\n=====Plots Emotion Scores:=====
                for emotion, score in value:
                    plots_emotion_score = f"{emotion}: {score}"
                    movie_sentiment.appened('\n'.join([plot_title, plots_emotion_score]))
            elif key != 'id' and key != 'title':
                unknown = f"{key}: {value}"
                movie_sentiment.append("")
            
        movie_sentiment_info = '\n'.join(movie_sentiment) # Combine movie sentiment into a single string
        print(movie_sentiment_info)
        pandas_df.at[index, 'Sentiment'] = movie_sentiment_info  # Update 'Sentiment' column for the current row
                    

## Section 4. Word2vec Movie Recommender Model Training
The first part of this notebook is dedicated to data cleaning and trainning the word2vec model to create the following three tools for the studio writers and executives: 
1. Basic movie recommender: The user input one movie; and the system recommends 10 other movies with similar plotlines. 
2. Advance movie recommender: The user input two movies; and the system recommends 10 other movies with plotlines that are similar to the combination of these two movies. 
3. Duplicate plot checker: The user input his/her script for a new movie idea, and the system checks if his/her idea has already been produced in a previous movie. 

### 4.1 Training Word2vec Model 
This Word2Vec model is trained using the cleaned data above, the model is fed with around 28,000 entries of texts that describe movie plots. The resulting model will be useful in finding similarities in movie plotlines. 

In [None]:
df = spark.createDataFrame(pandas_df)

In [49]:
# tokenize and remove stop words in this cell
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

# create a new field by copying Plot
df = df.withColumn('inputText', F.col('Plot')) 

# regular expression tokenizer to tokenize inputText into individual tokens (words)
regextok = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'inputText', outputCol = 'tokens')

# StopWordsRemover to remove stopwords in the list of tokens
stopwrmv = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')
df = regextok.transform(df)
df = stopwrmv.transform(df)
df.show(1)



+---------+-----+--------------------+---------+--------------------+--------------------+--------------------+
|       id|Title|                Plot|   Source|           inputText|              tokens|   tokens_sw_removed|
+---------+-----+--------------------+---------+--------------------+--------------------+--------------------+
|tt0790799|$9.99|The film mainly f...|wiki_plot|The film mainly f...|[the, film, mainl...|[film, mainly, fo...|
+---------+-----+--------------------+---------+--------------------+--------------------+--------------------+
only showing top 1 row



                                                                                

In [50]:
# train word2vec model, the parameters here can be changed to optimize the model
word2vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
model = word2vec.fit(df)

# using transform to add wordvectors column to dataframe
df = model.transform(df)
chunks = df.select('id', 'Title','wordvectors', 'Plot', 'Source', 'Entity','Sentiment').limit(30000).collect()

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 1) / 1]
Traceback (most recent call last):
  File "/Users/meenuselvakesari/opt/anaconda3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/meenuselvakesari/opt/anaconda3/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/meenuselvakesari/opt/anaconda3/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# define function to calculate cosine similarity for later
import numpy as np
def cossim(v1, v2): 
  '''
      cossim(v1, v2) calculates the cosine similarity between v1 and v1.
      If v1 or v2 is a zero vector, it will return 0
  '''
  if np.dot(v1, v1) == 0 or np.dot(v2, v2) == 0:
      return 0.0
  return float(np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / (np.sqrt(np.dot(v2, v2))))

### 4.2 Create Basic, Advanced Recommender and Duplicate Plot Checker
These three tools use the same model at their core. Therefore, to optimize performance, the implementation will create only a single session when running. 

In [10]:
# writing a function to obtain the plot string from the plot dataset
def acquire_plot(base_movie: str): 
  # input: a movie name (precise) or a movie id 
  # output: the movie's plot

  if base_movie.startswith("tt"):   # search by movie name
    base_movie_row = df.filter(df.id == base_movie).collect()
  else:                             # search by movie id
    base_movie_row = df.filter(df.Title == base_movie).collect()

  if base_movie_row: 
    movie_plot = base_movie_row[0]['Plot']
    return movie_plot
  else: 
    print("Sorry, ", base_movie, " is not found in the database. Please type in exact movie names")

In [11]:
def query_preprocessing(plot: str): 
  plot_df = spark.createDataFrame([(1, plot)]).toDF('index','inputText')
  plot_tok = regextok.transform(plot_df)
  plot_swr = stopwrmv.transform(plot_tok)
  plot_vec = model.transform(plot_swr)
  plot_vec = plot_vec.select('wordvectors').collect()[0][0]
  return plot_vec

### 4.2.1 Basic recommender
The Basic Recommender only takes in one movie as the only parameter: 

In [12]:
input_user_1 = '2012'    # User input The exact movie name or movie id(e.g. 'tt1023003')
basic_movie_plot = acquire_plot(input_user_1)
basic_vec = query_preprocessing(basic_movie_plot)

                                                                                

### 4.2.2 Advanced Recommender
The Advanced Recommender takes in one extra movie as the second parameter, then our word2vec model will be able to recommend a third movie that has similar plot as the combination of the first two. : 

In [13]:
input_user_2 = '' #'tt0468569'    # User input The exact movie name or movie id(e.g. 'tt1023003')
if input_user_2: 
  second_movie_plot = acquire_plot(input_user_2)
  second_vec = query_preprocessing(second_movie_plot)
  combined_vec = basic_vec + second_vec

### 4.2.3 Duplicate Plot Checker
The duplicate Plot checker is similar to the basic recommender where it finds an existing movie with similar plot as the user's input plot description. 

In [14]:
input_user_3 = 'Once upon a time in a crime ridden Gotham City, a member of the rich Wayne family decided to put on a mask and protect the people of Gotham'
check_vec = query_preprocessing(input_user_3)

### 4.3 implementation of the above three tools
The parameters of the three tools have been created. To optimize performance, the implementation will create only one single session when running. 

__Note for Meenu__ : the order of the IF clauses in this following cell is important, the user can only run one of the three tools at once. 

In [15]:
if input_user_3: 
  
  print("Running Duplicate Plot Checker")
  data = [(i[0], float(cossim(check_vec, i[2])), i[1], i[4], i[3]) for i in chunks]
  sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot')
  sim_df = (sim_df.dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
  sim_df.show(10, truncate=False)

elif not input_user_2:  # if input_user_2 is empty, then run the basic recommender
  
  print("Only one movie is input, running Basic Recommender")
  data = [(i[0], float(cossim(basic_vec, i[2])), i[1], i[4], i[3]) for i in chunks]
  sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot')
  sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
  sim_df.show(10, truncate=False)
  
elif input_user_2: 
  
  print("Only two movies are input, running Advanced Recommender")
  data = [(i[0], float(cossim(combined_vec, i[2])), i[1], i[4], i[3]) for i in chunks]
  sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot')
  sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1)
                          & (sim_df.Title != input_user_2) & (sim_df.movie_id != input_user_2))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
  sim_df.show(10, truncate=False)

Running Duplicate Plot Checker


23/08/09 14:50:04 WARN TaskSetManager: Stage 129 contains a task of very large size (7098 KiB). The maximum recommended task size is 1000 KiB.
[Stage 131:>                                                        (0 + 8) / 9]

+---------+------------------+---------------------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [None]:
## Search Engine Template -> uses spark df to query based on user input
### need to different html -> one is the home page (user input) then the second one is the output screen
#### there is a way to only use one html for simplicity 

## Section 5: LDA Modeling

In [None]:
!pip install -U gensim

In [None]:
from nltk.corpus import stopwords 
stopwords = list(set(stopwords.words('english')))

print('The first 10 stopwords in the list:', stopwords[:10])

In [None]:
import string
punctuation = list(set(string.punctuation))

print('The punctuations are:', punctuation)

In [None]:
import re
from nltk.stem.wordnet import WordNetLemmatizer

def cleanup(text):
    '''
        input: raw text
        output: a list of words
    '''
    
    text = " ".join([i for i in text.lower().split() if i not in stopwords])          
    text = "".join(re.sub(r'[^a-zA-Z ]', '', i) for i in text if i not in punctuation) 
    text = [WordNetLemmatizer().lemmatize(i) for i in text.split()]                    
    return text

In [None]:
from gensim.corpora.dictionary import Dictionary

In [None]:
plot_text = pandasdf['Plot'].apply(cleanup)
print('List of tokens:')
plot_text[:10]

In [None]:
dictionary = Dictionary(plot_text)

In [None]:
print('Count of raw tokens: ' + str(len(dictionary.items())))

In [None]:
## can change the filter extreme values

print('Filter out tokens that appear less than 80 times and more than 80% of the titles')
dictionary.filter_extremes(no_below=80, no_above=0.8)
print('Count of tokens: '+ str(len(dictionary.items())))

In [None]:
corpora = [dictionary.doc2bow(doc) for doc in plot_text]

In [None]:
# can change the number of topics  and passes
from gensim.models import ldamodel
lda_model = ldamodel.LdaModel(corpora, num_topics=5, id2word = dictionary, passes=10)

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpora, dictionary)
vis

In [None]:
pyLDAvis.save_html(vis, '/Users/yupan/Library/CloudStorage/OneDrive-Personal/Academic/5430/static/lda.html')

## Section 6: Flask

In [54]:
from flask import Flask, request, jsonify, redirect, url_for, render_template
import numpy as np

app = Flask("JSON_OUTPUT")

@app.route('/')
def form():
    return render_template('FlaskOutputTemplate.html')

@app.route('/topic visualisation')
def lda_vis():
    return render_template('lda_topic.html')
        
@app.route('/submit', methods=['GET','POST'])
def submit():
    if request.method == 'POST':
        input_1 = str(request.form['q1'])
        input_2 = str(request.form['q2'])
        input_3 = str(request.form['q3'])
        
        # tokenize and remove stop words in this cell
        from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

        # create a new field by copying Plot
        df = df.withColumn('inputText', F.col('Plot')) 

        # regular expression tokenizer to tokenize inputText into individual tokens (words)
        regextok = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'inputText', outputCol = 'tokens')

        # StopWordsRemover to remove stopwords in the list of tokens
        stopwrmv = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')
        df = regextok.transform(df)
        df = stopwrmv.transform(df)
        
        # train word2vec model, the parameters here can be changed to optimize the model
        word2vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
        model = word2vec.fit(df)

        # using transform to add wordvectors column to dataframe
        df = model.transform(df)
        chunks = df.select('id', 'Title','wordvectors', 'Plot', 'Source').limit(30000).collect()
        
        def cossim(v1, v2):
            dot_product = np.sum(v1 * v2)
            mag_v1 = np.sqrt(np.sum(np.power(v1, 2)))
            mag_v2 = np.sqrt(np.sum(np.power(v2, 2)))
            return dot_product / (mag_v1 * mag_v2 + 0.1)
        
        
        def acquire_plot(base_movie: str): 
          # input: a movie name (precise) or a movie id 
          # output: the movie's plot

          if base_movie.startswith("tt"):   # search by movie name
            base_movie_row = df.filter(df.id == base_movie).collect()
          else:                             # search by movie id
            base_movie_row = df.filter(df.Title == base_movie).collect()

          if base_movie_row: 
            movie_plot = base_movie_row[0]['Plot']
            return movie_plot
          else: 
            print("Sorry, ", base_movie, " is not found in the database. Please type in exact movie names")
        
        
        def query_preprocessing(plot: str): 
          plot_df = spark.createDataFrame([(1, plot)]).toDF('index','inputText')
          plot_tok = regextok.transform(plot_df)
          plot_swr = stopwrmv.transform(plot_tok)
          plot_vec = model.transform(plot_swr)
          plot_vec = plot_vec.select('wordvectors').collect()[0][0]
          return plot_vec
        
        input_user_1 = input_1    # User input The exact movie name or movie id(e.g. 'tt1023003')
        basic_movie_plot = acquire_plot(input_user_1)
        basic_vec = query_preprocessing(basic_movie_plot)
        
        input_user_2 = input_2 #'tt0468569'    # User input The exact movie name or movie id(e.g. 'tt1023003')
        if input_user_2: 
          second_movie_plot = acquire_plot(input_user_2)
          second_vec = query_preprocessing(second_movie_plot)
          combined_vec = basic_vec + second_vec
        
        input_user_3 = input_3
        check_vec = query_preprocessing(input_user_3)
       
        if input_user_3: 
  
          print("Running Duplicate Plot Checker")
          data = [(i[0], float(cossim(check_vec, i[2])), i[1], i[4], i[3], i[5],i[6]) for i in chunks]
          sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot','Entity','Sentiment')
          sim_df = (sim_df.dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
          sim_df.show(10, truncate=False)

        elif not input_user_2:  # if input_user_2 is empty, then run the basic recommender
  
          print("Only one movie is input, running Basic Recommender")
          data = [(i[0], float(cossim(basic_vec, i[2])), i[1], i[4], i[3],i[5],i[6]) for i in chunks]
          sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot','Entity','Sentiment')
          sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))
      
  
        elif input_user_2: 
  
          print("Only two movies are input, running Advanced Recommender")
          data = [(i[0], float(cossim(combined_vec, i[2])), i[1], i[4], i[3],i[5],i[6]) for i in chunks]
          sim_df = spark.createDataFrame(data).toDF('movie_id', 'similarity', 'Title', 'Source', 'Plot','Entity','Sentiment')
          sim_df = (sim_df.filter((sim_df.Title != input_user_1) & (sim_df.movie_id != input_user_1)
                          & (sim_df.Title != input_user_2) & (sim_df.movie_id != input_user_2))
            .dropDuplicates(['movie_id'])
            .orderBy('similarity', ascending=False)
            .limit(30))


    pandas_df = sim_df.toPandas()
    html_table = pandas_df.head(10).to_html(classes='table')
    return render_template('TableOutput.html', table=html_table)

app.run(host='localhost', port=7039)

 * Serving Flask app 'JSON_OUTPUT'
 * Debug mode: off


 * Running on http://localhost:7039
INFO:werkzeug:[33mPress CTRL+C to quit[0m
