In [106]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()
%matplotlib inline

In [107]:
mf = pd.read_csv(r'./dataparsed/metadata.csv')

In [108]:
mf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45014 entries, 0 to 45013
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      45014 non-null  int64  
 1   title         45014 non-null  object 
 2   release_date  44936 non-null  object 
 3   popularity    45014 non-null  float64
 4   budget        45014 non-null  int64  
 5   revenue       45014 non-null  float64
 6   vote_average  45014 non-null  float64
 7   vote_count    45014 non-null  float64
dtypes: float64(4), int64(2), object(2)
memory usage: 2.7+ MB


In [109]:
mf.popularity.describe()

count    45014.000000
mean         2.939093
std          6.023441
min          0.000000
25%          0.391015
50%          1.135734
75%          3.730199
max        547.488298
Name: popularity, dtype: float64

In [110]:
mf.release_date = pd.to_datetime(mf.release_date)
mf['release_year'] = mf.release_date.dt.year
mf = mf[mf.popularity > 5]

In [111]:
mf.shape 

(9126, 9)

In [112]:
directors = pd.read_csv(r'./dataparsed/directors.csv')
directors_gp = directors.groupby('movie_id').name.apply(list).reset_index(name='directors')

In [113]:
cast = pd.read_csv(r'./dataparsed/cast.csv')
cast = cast[cast.order < 11]
cast_gp = cast.groupby('movie_id').name.apply(list).reset_index(name='cast')

In [114]:
keywords = pd.read_csv(r'./dataparsed/keywords.csv')
keywords_gp = keywords.groupby('movie_id').keyword.apply(list).reset_index(name='keywords')

In [115]:
genres = pd.read_csv(r'./dataparsed/genres.csv')
genres_gp = genres.groupby('movie_id').genre.apply(list).reset_index(name= 'genres')

In [116]:
movies_gp = pd.merge(cast_gp, directors_gp, on= 'movie_id', how= 'outer')
temp = pd.merge(keywords_gp, genres_gp, on= 'movie_id', how= 'left')
temp = pd.merge(temp, mf[['movie_id', 'title', 'vote_average']], on='movie_id', how='outer')
movies_gp = pd.merge(movies_gp, temp, on= 'movie_id', how= 'outer')
movies_gp.head(1)

Unnamed: 0,movie_id,cast,directors,keywords,genres,title,vote_average
0,2,"[Turo Pajala, Susanna Haavisto, Matti Pellonpä...",[Aki Kaurismäki],"[underdog, prison, factory worker, prisoner, h...","[Drama, Crime]",,


In [117]:
movies_gp.isna().sum()

movie_id            0
cast             2065
directors         518
keywords        13972
genres          14518
title           35946
vote_average    35946
dtype: int64

In [118]:
# following steps in this notebook
# https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system/notebook
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    # should leave nulls alone
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]

In [119]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'directors', 'genres']

for feature in features:
    movies_gp[feature] = movies_gp[feature].apply(clean_data)

In [120]:
# Eliminate NA values for soup to work
features = ['cast', 'keywords', 'directors', 'genres']
for att in features:
    movies_gp[att] = movies_gp[att].fillna('')


In [121]:
movies_gp.isna().sum()

movie_id            0
cast                0
directors           0
keywords            0
genres              0
title           35946
vote_average    35946
dtype: int64

In [122]:
movies_gp.dropna(inplace= True)

In [123]:
movies_gp.reset_index(drop= True, inplace= True)

In [124]:
movies_gp.head(2)

Unnamed: 0,movie_id,cast,directors,keywords,genres,title,vote_average
0,5,"[timroth, antoniobanderas, jenniferbeals, mado...","[allisonanders, alexandrerockwell, robertrodri...","[hotel, newyear'seve, witch, bet, hotelroom, s...","[crime, comedy]",Four Rooms,6.5
1,6,"[emilioestevez, cubagoodingjr., denisleary, je...",[stephenhopkins],"[chicago, drugdealer, boxingmatch, escape, one...","[action, thriller, crime]",Judgment Night,6.4


In [125]:
# Function joins the words in each list 
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['directors']) + ' ' + ' '.join(x['genres'])

In [126]:
movies_gp['soup'] = movies_gp.apply(create_soup, axis= 1)

Counting the frequency of each word in each movie

In [127]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
count_matrix = count.fit_transform(movies_gp['soup'])

In [128]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [129]:
cosine_sim.shape

(9126, 9126)

In [130]:
# Construct a reverse map of indices and movie titles
movie_bytitle = pd.Series(movies_gp.index, index=movies_gp['title'])

In [131]:
# refrence https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system#kln-56
def get_similar_movies(title, n= 10):
    idx = movie_bytitle[title]

    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_gp['title'].iloc[movie_indices]

In [132]:
get_similar_movies('Batman Begins')

97                 The Dark Knight
6288         The Dark Knight Rises
265                 Batman & Robin
8921      Batman: The Killing Joke
6640              Batman: Year One
5274                         Ninja
5761                      Defendor
7931     Batman: Assault on Arkham
6002    Batman: Under the Red Hood
3730       Rise of the Footsoldier
Name: title, dtype: object

In [133]:
get_similar_movies('Inception')

6288                    The Dark Knight Rises
3382                                  Peacock
4893                      L: change the WorLd
8360                         Now You See Me 2
9112                      Yu-Gi-Oh! The Movie
2218                                 Paycheck
2604                                    Congo
4010                               Nancy Drew
1454    Sky Captain and the World of Tomorrow
4227              G.I. Joe: The Rise of Cobra
Name: title, dtype: object

In [138]:
get_similar_movies('Avengers: Age of Ultron')

5182                             The Avengers
8191               Captain America: Civil War
2507                               Iron Man 2
7064      Captain America: The Winter Soldier
880        Captain America: The First Avenger
7081                                  Ant-Man
8106    Marvel Studios: Assembling a Universe
6769                     Thor: The Dark World
6623                               Iron Man 3
9040                                Team Thor
Name: title, dtype: object