# Imports

In [422]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

# Create Data Frames

In [423]:
imdb_ratings_df = pd.read_csv('zippedData/imdb.title.ratings.csv.gz')

In [424]:
title_basics_df = pd.read_csv('zippedData/imdb.title.basics.csv.gz')

In [425]:
movie_gross_df = pd.read_csv('zippedData/bom.movie_gross.csv.gz')

In [426]:
movie_budgets_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

In [427]:
name_basics_df = pd.read_csv('zippedData/imdb.name.basics.csv.gz')

## Modify Names dataframe


In [428]:
name_basics_df.head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [429]:
name_basics_df = name_basics_df.drop(columns = 'death_year')

In [430]:
name_basics_df = name_basics_df.dropna()

In [431]:
name_basics_df.isna().sum()

nconst                0
primary_name          0
birth_year            0
primary_profession    0
known_for_titles      0
dtype: int64

In [432]:
name_basics_df = (name_basics_df.set_index(['nconst', 'primary_name','birth_year'])
   .stack()
   .str.split(',', expand=True)
   .stack()
   .unstack(-2)
   .reset_index(-1, drop=True)
   .reset_index()
)

In [433]:
name_basics_df.head()

Unnamed: 0,nconst,primary_name,birth_year,primary_profession,known_for_titles
0,nm0000002,Lauren Bacall,1924.0,actress,tt0038355
1,nm0000002,Lauren Bacall,1924.0,soundtrack,tt0117057
2,nm0000002,Lauren Bacall,1924.0,,tt0071877
3,nm0000002,Lauren Bacall,1924.0,,tt0037382
4,nm0000003,Brigitte Bardot,1934.0,actress,tt0049189


In [434]:
name_basics_df = name_basics_df.rename({'known_for_titles': 'tconst'}, axis='columns')

# Merge Data Sets

In [435]:
budget_tconst = pd.merge(title_basics_df, movie_budgets_df , left_on= 'primary_title', right_on= 'movie', how = 'right')

In [436]:
df = pd.merge(budget_tconst,  imdb_ratings_df, on=['tconst'])

In [437]:
df.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes
0,tt1775309,Avatar,Abatâ,2011.0,93.0,Horror,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",6.1,43
1,tt1298650,Pirates of the Caribbean: On Stranger Tides,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",6.6,447624
2,tt6565702,Dark Phoenix,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",6.0,24451
3,tt2395427,Avengers: Age of Ultron,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",7.3,665594
4,tt4154756,Avengers: Infinity War,Avengers: Infinity War,2018.0,149.0,"Action,Adventure,Sci-Fi",7,"Apr 27, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200",8.5,670926


In [438]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2875 entries, 0 to 2874
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tconst             2875 non-null   object 
 1   primary_title      2875 non-null   object 
 2   original_title     2875 non-null   object 
 3   start_year         2875 non-null   float64
 4   runtime_minutes    2757 non-null   float64
 5   genres             2867 non-null   object 
 6   id                 2875 non-null   int64  
 7   release_date       2875 non-null   object 
 8   movie              2875 non-null   object 
 9   production_budget  2875 non-null   object 
 10  domestic_gross     2875 non-null   object 
 11  worldwide_gross    2875 non-null   object 
 12  averagerating      2875 non-null   float64
 13  numvotes           2875 non-null   int64  
dtypes: float64(3), int64(2), object(9)
memory usage: 336.9+ KB


In [439]:
def column_conversion(df, col):
    df[col] = df[col].str.replace("$","").str.replace(",","").astype('int64')
    return df

In [440]:
column_to_convert = ['production_budget','domestic_gross','worldwide_gross']

for col in column_to_convert:
    df = column_conversion(df, col)

In [441]:
df.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes
0,tt1775309,Avatar,Abatâ,2011.0,93.0,Horror,1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279,6.1,43
1,tt1298650,Pirates of the Caribbean: On Stranger Tides,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,6.6,447624
2,tt6565702,Dark Phoenix,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350,6.0,24451
3,tt2395427,Avengers: Age of Ultron,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963,7.3,665594
4,tt4154756,Avengers: Infinity War,Avengers: Infinity War,2018.0,149.0,"Action,Adventure,Sci-Fi",7,"Apr 27, 2018",Avengers: Infinity War,300000000,678815482,2048134200,8.5,670926


In [442]:
df = pd.merge(df, name_basics_df, on=['tconst'])

# Clean Data

In [443]:
df.drop(['original_title', 'movie', 'tconst', 'release_date'], axis='columns', inplace = True)

In [444]:
df

Unnamed: 0,primary_title,start_year,runtime_minutes,genres,id,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes,nconst,primary_name,birth_year,primary_profession
0,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0004851,Penélope Cruz,1974.0,actress
1,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0019885,Roger Allam,1953.0,
2,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0046087,Jennifer Badger,1976.0,stunts
3,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0263740,Steve Evets,1960.0,actor
4,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0273797,Sebastian Armesto,1982.0,assistant_director
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17431,Krisha,2015.0,83.0,Drama,38,30000,144822,144822,7.2,5917,nm5703842,Olivia Grace Applegate,1991.0,
17432,Exeter,2015.0,91.0,"Horror,Mystery,Thriller",49,25000,0,489792,4.5,5156,nm1885051,Nick Nicotera,1981.0,editor
17433,Ten,2013.0,118.0,Drama,51,25000,0,0,5.5,31,nm2793783,Colin Burt Vidler,1970.0,actor
17434,Ten,2017.0,82.0,"Horror,Mystery,Thriller",51,25000,0,0,4.7,191,nm1211485,Raquel Castro,1994.0,


In [445]:
df.isna().sum()

primary_title            0
start_year               0
runtime_minutes         68
genres                   5
id                       0
production_budget        0
domestic_gross           0
worldwide_gross          0
averagerating            0
numvotes                 0
nconst                   0
primary_name             0
birth_year               0
primary_profession    6444
dtype: int64

In [446]:
df = df.dropna()

In [447]:
df.isna().sum()

primary_title         0
start_year            0
runtime_minutes       0
genres                0
id                    0
production_budget     0
domestic_gross        0
worldwide_gross       0
averagerating         0
numvotes              0
nconst                0
primary_name          0
birth_year            0
primary_profession    0
dtype: int64

In [448]:
df.shape

(10958, 14)

In [449]:
df = df.drop_duplicates(subset = 'primary_title')

In [450]:
df['primary_title'].nunique

<bound method IndexOpsMixin.nunique of 0        Pirates of the Caribbean: On Stranger Tides
33                                      Dark Phoenix
42                           Avengers: Age of Ultron
63                            Avengers: Infinity War
108                                   Justice League
                            ...                     
17427                                 Raymond Did It
17429                                Foreign Letters
17432                                         Exeter
17433                                            Ten
17435                                         Red 11
Name: primary_title, Length: 1723, dtype: object>

In [451]:
df.shape

(1723, 14)

# Feature Engineering

In [452]:
df['genres_split'] = df['genres'].map(lambda x: x.split(",") if x else x)

In [453]:
all_genres = set()
for genres in df['genres_split']:
    if genres:
        all_genres.update(genres)

In [454]:
df['1_genre'] = df['genres_split'].map(lambda x: x[0])

In [455]:
df.head()

Unnamed: 0,primary_title,start_year,runtime_minutes,genres,id,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes,nconst,primary_name,birth_year,primary_profession,genres_split,1_genre
0,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0004851,Penélope Cruz,1974.0,actress,"[Action, Adventure, Fantasy]",Action
33,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",3,350000000,42762350,149762350,6.0,24451,nm0126208,John Byrne,1950.0,art_department,"[Action, Adventure, Sci-Fi]",Action
42,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",4,330600000,459005868,1403013963,7.3,665594,nm0006568,Maria Callas,1923.0,actress,"[Action, Adventure, Sci-Fi]",Action
63,Avengers: Infinity War,2018.0,149.0,"Action,Adventure,Sci-Fi",7,300000000,678815482,2048134200,8.5,670926,nm0000982,Josh Brolin,1968.0,actor,"[Action, Adventure, Sci-Fi]",Action
108,Justice League,2017.0,120.0,"Action,Adventure,Fantasy",9,300000000,229024295,655945209,6.5,329135,nm0006516,Chris Terrio,1976.0,director,"[Action, Adventure, Fantasy]",Action


# Data Preparation

In [456]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [457]:
# Apply clean_data function to your features.
features = ['1_genre', 'primary_name', 'primary_profession']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [458]:
df.head()

Unnamed: 0,primary_title,start_year,runtime_minutes,genres,id,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes,nconst,primary_name,birth_year,primary_profession,genres_split,1_genre
0,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0004851,penélopecruz,1974.0,actress,"[Action, Adventure, Fantasy]",action
33,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",3,350000000,42762350,149762350,6.0,24451,nm0126208,johnbyrne,1950.0,art_department,"[Action, Adventure, Sci-Fi]",action
42,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",4,330600000,459005868,1403013963,7.3,665594,nm0006568,mariacallas,1923.0,actress,"[Action, Adventure, Sci-Fi]",action
63,Avengers: Infinity War,2018.0,149.0,"Action,Adventure,Sci-Fi",7,300000000,678815482,2048134200,8.5,670926,nm0000982,joshbrolin,1968.0,actor,"[Action, Adventure, Sci-Fi]",action
108,Justice League,2017.0,120.0,"Action,Adventure,Fantasy",9,300000000,229024295,655945209,6.5,329135,nm0006516,christerrio,1976.0,director,"[Action, Adventure, Fantasy]",action


In [459]:
df['soup'] = df['1_genre'] + ' ' + df['primary_name'] + ' ' + df['primary_profession']


In [460]:
df.head(3)

Unnamed: 0,primary_title,start_year,runtime_minutes,genres,id,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes,nconst,primary_name,birth_year,primary_profession,genres_split,1_genre,soup
0,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0004851,penélopecruz,1974.0,actress,"[Action, Adventure, Fantasy]",action,action penélopecruz actress
33,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",3,350000000,42762350,149762350,6.0,24451,nm0126208,johnbyrne,1950.0,art_department,"[Action, Adventure, Sci-Fi]",action,action johnbyrne art_department
42,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",4,330600000,459005868,1403013963,7.3,665594,nm0006568,mariacallas,1923.0,actress,"[Action, Adventure, Sci-Fi]",action,action mariacallas actress


# Algorithmic Modeling 

In [461]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['soup'])

In [462]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [463]:
print(cosine_sim)

[[1.         0.33333333 0.66666667 ... 0.         0.         0.        ]
 [0.33333333 1.         0.33333333 ... 0.         0.         0.        ]
 [0.66666667 0.33333333 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.33333333]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.33333333 0.         1.        ]]


In [464]:
cosine_sim.shape

(1723, 1723)

In [465]:
df = df.reset_index()
titles = df['primary_title']
indices = pd.Series(df.index, index=df['primary_title'])

In [466]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [467]:
get_recommendations('The Amazing Spider-Man')

3                                Avengers: Infinity War
6                                 The Dark Knight Rises
7                               Solo: A Star Wars Story
11                           Captain America: Civil War
17     Pirates of the Caribbean: Dead Men Tell No Tales
18                                         The Avengers
19                                         Man of Steel
23                                       Jurassic World
43                                            The Mummy
49                                          Pacific Rim
59                                 The Legend of Tarzan
69                               Spider-Man: Homecoming
77                                           Iron Man 2
78                  Captain America: The Winter Soldier
83                                  Alita: Battle Angel
97                                             Godzilla
99                            The Sorcerer's Apprentice
101                                   Deepwater 

In [468]:
get_recommendations('Tangled')

15       The Hobbit: The Battle of the Five Armies
50                               The Good Dinosaur
58         Fantastic Beasts and Where to Find Them
291                              Christopher Robin
367                       Smurfs: The Lost Village
403                                   Annihilation
443                                  Your Highness
450                                      Early Man
591                                 Identity Thief
727                           A Bad Moms Christmas
13               The Hobbit: An Unexpected Journey
14             The Hobbit: The Desolation of Smaug
30                                    Finding Dory
31                                     Toy Story 3
33                             Alice in Wonderland
35                             Monsters University
36                       Oz the Great and Powerful
39                                          Cars 2
40     Fantastic Beasts: The Crimes of Grindelwald
51                             