# **Movie Recommender System**


In [1]:
# imports
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
df_movies = pd.read_csv('data/ml-25m/movies.csv')
df_ratings = pd.read_csv('data/ml-25m/ratings.csv')
df_tags = pd.read_csv('data/ml-25m/tags.csv')
df_links = pd.read_csv('data/ml-25m/links.csv')
df_movies_metadata = pd.read_csv('data/movies-metadata.csv')
df_credits = pd.read_csv('data/credits.csv')
df_keywords = pd.read_csv('data/keywords.csv')


In [3]:
df_movies_metadata.head()


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/cXQH2u7wUIX1eoIdEj51kHXoWhX.jpg,,1350000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",http://www.universalstudiosentertainment.com/l...,100,tt0120735,en,"Lock, Stock and Two Smoking Barrels",...,Released,A Disgrace to Criminals Everywhere.,"Lock, Stock and Two Smoking Barrels",False,8.1,5798,,,,
1,False,/cbTvuGya7E1PnL8t95AWzumjqHg.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,100017,tt0488903,de,Verfolgt,...,Released,,Punish Me,False,4.6,16,,,,
2,False,,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,100032,tt0099137,en,The Great Los Angeles Earthquake,...,Released,"There is no safe harbor, there is no escape......",The Great Los Angeles Earthquake,False,6.9,13,,,,
3,False,,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,100034,tt0462634,en,The Worst Horror Movie Ever Made,...,Released,,The Worst Horror Movie Ever Made,False,3.0,8,,,,
4,False,,,0,"[{'id': 10402, 'name': 'Music'}]",,100038,,en,Meshuggah - Nothing,...,Released,,Meshuggah - Nothing,False,4.0,2,,,,


In [4]:
df_credits.head()


Unnamed: 0,id,cast,crew
0,100,"[{'adult': False, 'gender': 2, 'id': 973, 'kno...","[{'adult': False, 'gender': 2, 'id': 960, 'kno..."
1,100017,"[{'adult': False, 'gender': 2, 'id': 5202, 'kn...","[{'adult': False, 'gender': 1, 'id': 2338, 'kn..."
2,100032,"[{'adult': False, 'gender': 1, 'id': 87038, 'k...","[{'adult': False, 'gender': 2, 'id': 36116, 'k..."
3,100034,"[{'adult': False, 'gender': 0, 'id': 1022808, ...","[{'adult': False, 'gender': 2, 'id': 99005, 'k..."
4,100038,[],[]


In [5]:
df_keywords.head()


Unnamed: 0,id,keywords
0,100,"[{'id': 502, 'name': 'ambush'}, {'id': 567, 'n..."
1,100017,"[{'id': 2843, 'name': 'fetishism'}, {'id': 326..."
2,100032,"[{'id': 2708, 'name': 'hitman'}, {'id': 3521, ..."
3,100034,[]
4,100038,[]


### Three types of recommender systems

1. Demographic Filtering
2. Content Based Filtering
3. Collaborative Filtering


In [6]:
# join movies metadata and credits
df_movies_info = df_movies_metadata.merge(df_credits, on='id')
df_movies_info = df_movies_info.merge(df_keywords, on='id')


In [7]:
df_movies_info.head()


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,cast,crew,keywords
0,False,/cXQH2u7wUIX1eoIdEj51kHXoWhX.jpg,,1350000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",http://www.universalstudiosentertainment.com/l...,100,tt0120735,en,"Lock, Stock and Two Smoking Barrels",...,False,8.1,5798,,,,,"[{'adult': False, 'gender': 2, 'id': 973, 'kno...","[{'adult': False, 'gender': 2, 'id': 960, 'kno...","[{'id': 502, 'name': 'ambush'}, {'id': 567, 'n..."
1,False,/cbTvuGya7E1PnL8t95AWzumjqHg.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,100017,tt0488903,de,Verfolgt,...,False,4.6,16,,,,,"[{'adult': False, 'gender': 2, 'id': 5202, 'kn...","[{'adult': False, 'gender': 1, 'id': 2338, 'kn...","[{'id': 2843, 'name': 'fetishism'}, {'id': 326..."
2,False,,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,100032,tt0099137,en,The Great Los Angeles Earthquake,...,False,6.9,13,,,,,"[{'adult': False, 'gender': 1, 'id': 87038, 'k...","[{'adult': False, 'gender': 2, 'id': 36116, 'k...","[{'id': 2708, 'name': 'hitman'}, {'id': 3521, ..."
3,False,,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,100034,tt0462634,en,The Worst Horror Movie Ever Made,...,False,3.0,8,,,,,"[{'adult': False, 'gender': 0, 'id': 1022808, ...","[{'adult': False, 'gender': 2, 'id': 99005, 'k...",[]
4,False,,,0,"[{'id': 10402, 'name': 'Music'}]",,100038,,en,Meshuggah - Nothing,...,False,4.0,2,,,,,[],[],[]


## Demographic Filtering


Steps:

1. We require a metric to score or rate a movie.
2. Calculate the score for every movie.
3. Sort the scores and recommend the best rated movie to the users.


The average rating would be an obvious choice but this is not a fair score to use since a movie with a 9.3 average rating with 3 voters should not be considered better than a a movie with 8.8 average rating with 40 voters. I will use IMDB's weighted rating which is given as:


Weighted Rating (WR) = $(\frac{v}{v + m} \cdot R) + (\frac{m}{v + m} \cdot C)$ where,

- v is the number of votes for the movie,
- m is the minimum votes required to be considered,
- R is the average rating of the movie, and
- C is the average rating across the whole report


In [8]:
df_movies_info.head()


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,cast,crew,keywords
0,False,/cXQH2u7wUIX1eoIdEj51kHXoWhX.jpg,,1350000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",http://www.universalstudiosentertainment.com/l...,100,tt0120735,en,"Lock, Stock and Two Smoking Barrels",...,False,8.1,5798,,,,,"[{'adult': False, 'gender': 2, 'id': 973, 'kno...","[{'adult': False, 'gender': 2, 'id': 960, 'kno...","[{'id': 502, 'name': 'ambush'}, {'id': 567, 'n..."
1,False,/cbTvuGya7E1PnL8t95AWzumjqHg.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,100017,tt0488903,de,Verfolgt,...,False,4.6,16,,,,,"[{'adult': False, 'gender': 2, 'id': 5202, 'kn...","[{'adult': False, 'gender': 1, 'id': 2338, 'kn...","[{'id': 2843, 'name': 'fetishism'}, {'id': 326..."
2,False,,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,100032,tt0099137,en,The Great Los Angeles Earthquake,...,False,6.9,13,,,,,"[{'adult': False, 'gender': 1, 'id': 87038, 'k...","[{'adult': False, 'gender': 2, 'id': 36116, 'k...","[{'id': 2708, 'name': 'hitman'}, {'id': 3521, ..."
3,False,,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,100034,tt0462634,en,The Worst Horror Movie Ever Made,...,False,3.0,8,,,,,"[{'adult': False, 'gender': 0, 'id': 1022808, ...","[{'adult': False, 'gender': 2, 'id': 99005, 'k...",[]
4,False,,,0,"[{'id': 10402, 'name': 'Music'}]",,100038,,en,Meshuggah - Nothing,...,False,4.0,2,,,,,[],[],[]


In [9]:
# determine min number of votes required to be considered
min_votes = df_movies_info['vote_count'].quantile(0.9)
min_votes


75.0

In [10]:
# calculate average rating across the whole report
overall_average_rating = df_movies_info['vote_average'].mean()
overall_average_rating


4.7209208757213394

In [11]:
# filter dataset to exclude movies with less than m votes
df_movies_info_filtered = df_movies_info[df_movies_info['vote_count'] >= min_votes]


In [12]:
len(df_movies_info_filtered) / len(df_movies_info)


0.10054137664346481

In [13]:
# function to calculate weighted rating
def calculate_weigted_rating(x, m=min_votes, C=overall_average_rating):
    R = x['vote_average']
    v = x['vote_count']
    result = ((v/(v+m)*R)) + ((m/(v+m))*C)
    return result


In [14]:
# create new column for weighted rating
df_movies_info_filtered['weighted_rating'] = df_movies_info_filtered.apply(
    lambda x: calculate_weigted_rating(x, min_votes, overall_average_rating), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_info_filtered['weighted_rating'] = df_movies_info_filtered.apply(


In [15]:
# manually check some values to ensure function worked correctly
df_movies_info_filtered[['vote_average',
                         'vote_count', 'weighted_rating']].head()


Unnamed: 0,vote_average,vote_count,weighted_rating
0,8.1,5798,8.056848
6,5.538,2795,5.516648
8,6.6,124,5.891804
28,5.1,90,4.927691
38,6.6,123,5.888228


In [16]:
# sort dataframe based on weighted rating
df_movies_info_filtered = df_movies_info_filtered.sort_values(
    'weighted_rating', ascending=False)


In [17]:
# look at top 25 movies
df_movies_info_filtered[['title', 'vote_average',
                         'vote_count', 'weighted_rating']].head(25)


Unnamed: 0,title,vote_average,vote_count,weighted_rating
22666,The Godfather,8.715,17427,8.697884
23672,The Shawshank Redemption,8.702,23237,8.689192
22687,The Godfather Part II,8.601,10552,8.573616
25229,Schindler's List,8.565,13729,8.544114
9944,The Dark Knight,8.506,29141,8.496283
24896,12 Angry Men,8.534,7021,8.493698
26305,The Green Mile,8.5,15013,8.481215
4496,Spirited Away,8.5,13917,8.479743
28868,Pulp Fiction,8.491,24615,8.479548
4671,Forrest Gump,8.48,24102,8.468339


Lets try generating Top 25 lists for specific genres of movies


In [18]:
# convert from json to list for genre column
df_movies_info['genres'] = df_movies_info['genres'].fillna('[]').apply(
    literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
print(df_movies_info['genres'].head())


0              [Comedy, Crime]
1                      [Drama]
2    [Drama, Action, TV Movie]
3             [Horror, Comedy]
4                      [Music]
Name: genres, dtype: object


In [19]:
# split genres columns into individual rows
genre_col = df_movies_info.apply(lambda x: pd.Series(
    x['genres'], dtype='object'), axis=1).stack().reset_index(level=1, drop=True)
genre_col.name = 'genre'
df_movies_info_genre_split = df_movies_info.drop(
    'genres', axis=1).join(genre_col)


In [20]:
# function to return dataframe of top N movies for a particular genre
def top_N_movies(genre, N, percentile=0.9):
    df_movies_info_specific_genre = df_movies_info_genre_split[
        df_movies_info_genre_split['genre'] == genre]

    min_votes = df_movies_info_specific_genre['vote_count'].quantile(
        percentile)

    overall_average_rating = df_movies_info_specific_genre['vote_average'].mean(
    )

    df_movies_info_specific_genre_filtered = df_movies_info_specific_genre[
        df_movies_info_specific_genre['vote_count'] >= min_votes]

    df_movies_info_specific_genre_filtered['weighted_rating'] = df_movies_info_specific_genre_filtered.apply(
        lambda x: calculate_weigted_rating(x, min_votes, overall_average_rating), axis=1)

    df_movies_info_specific_genre_filtered = df_movies_info_specific_genre_filtered.sort_values(
        'weighted_rating', ascending=False)

    return df_movies_info_specific_genre_filtered[:N]


In [21]:
genre_col.unique()


array(['Comedy', 'Crime', 'Drama', 'Action', 'TV Movie', 'Horror',
       'Music', 'Romance', 'Documentary', 'Family', 'Western',
       'Animation', 'Thriller', 'War', 'Science Fiction', 'Fantasy',
       'Adventure', 'Mystery', 'History'], dtype=object)

In [22]:
top_N_movies('Romance', 25)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_info_specific_genre_filtered['weighted_rating'] = df_movies_info_specific_genre_filtered.apply(


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,...,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,cast,crew,keywords,genre,weighted_rating
4671,False,/3h1JZGDhZ8nzxdgvkxha0qBqi05.jpg,,55000000,https://www.paramountmovies.com/movies/forrest...,13,tt0109830,en,Forrest Gump,A man with a low IQ has accomplished great thi...,...,24102,,,,,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 37, 'know...","[{'id': 422, 'name': 'vietnam veteran'}, {'id'...",Romance,8.431216
24853,False,/W1ffLQGHoxfAOq0ZYdPtJlvAdb.jpg,,20000000,http://www.eternalsunshine.com,38,tt0338013,en,Eternal Sunshine of the Spotless Mind,"Joel Barish, heartbroken that his girlfriend u...",...,13043,,,,,"[{'adult': False, 'gender': 2, 'id': 206, 'kno...","[{'adult': False, 'gender': 2, 'id': 201, 'kno...","[{'id': 242, 'name': 'new york city'}, {'id': ...",Romance,8.022287
25260,False,/pGWVZT13HBz70L30ytRMXpNkTgU.jpg,,2479000,,426,tt0052357,en,Vertigo,A retired San Francisco detective suffering fr...,...,4927,,,,,"[{'adult': False, 'gender': 2, 'id': 854, 'kno...","[{'adult': False, 'gender': 2, 'id': 1045, 'kn...","[{'id': 582, 'name': 'san francisco, californi...",Romance,7.99266
23789,False,/wOfzdzC0QZyhUIlyjeMuUYSb8Ax.jpg,,878000,,289,tt0034583,en,Casablanca,"In Casablanca, Morocco in December 1941, a cyn...",...,4584,,,,,"[{'adult': False, 'gender': 2, 'id': 4110, 'kn...","[{'adult': False, 'gender': 2, 'id': 2005, 'kn...","[{'id': 128, 'name': 'love triangle'}, {'id': ...",Romance,7.982338
25371,False,/pgEWI7yGYF0mM5Uf1YxSOHsBkJR.jpg,,28000000,http://www.prideandprejudicemovie.net/splash.html,4348,tt0414387,en,Pride & Prejudice,A story of love and life among the landed Engl...,...,6966,,,,,"[{'adult': False, 'gender': 1, 'id': 116, 'kno...","[{'adult': False, 'gender': 1, 'id': 474, 'kno...","[{'id': 392, 'name': 'england'}, {'id': 818, '...",Romance,7.958144
32008,False,/g7KmJaGFvST5cyvA71V6epKAqIC.jpg,,2540800,,872,tt0045152,en,Singin' in the Rain,"In 1927 Hollywood, a silent film production co...",...,2663,,,,,"[{'adult': False, 'gender': 2, 'id': 13294, 'k...","[{'adult': False, 'gender': 2, 'id': 14681, 'k...","[{'id': 3177, 'name': 'fan'}, {'id': 3748, 'na...",Romance,7.845676
27775,False,/6VmFqApQRyZZzmiGOQq2C92jyvH.jpg,,200000000,,597,tt0120338,en,Titanic,101-year-old Rose DeWitt Bukater tells the sto...,...,22225,,,,,"[{'adult': False, 'gender': 1, 'id': 204, 'kno...","[{'adult': False, 'gender': 2, 'id': 900, 'kno...","[{'id': 793, 'name': 'drowning'}, {'id': 2227,...",Romance,7.841617
18997,False,/9Y9K6LeLrMeofOvX7hZW36Aj3OG.jpg,,10000000,https://www.miramax.com/movie/amelie/,194,tt0211915,fr,Le Fabuleux Destin d'Amélie Poulain,"At a tiny Parisian café, the adorable yet pain...",...,10270,,,,,"[{'adult': False, 'gender': 1, 'id': 2405, 'kn...","[{'adult': False, 'gender': 2, 'id': 2401, 'kn...","[{'id': 90, 'name': 'paris, france'}, {'id': 1...",Romance,7.812409
22676,False,/5zTwAcnI3zCJ0mucPPnoBtkSWEV.jpg,,2883848,,239,tt0053291,en,Some Like It Hot,Two musicians witness a mob hit and struggle t...,...,2950,,,,,"[{'adult': False, 'gender': 2, 'id': 3150, 'kn...","[{'adult': False, 'gender': 2, 'id': 3146, 'kn...","[{'id': 520, 'name': 'chicago, illinois'}, {'i...",Romance,7.80378
25660,False,/vVBcIN68kFq681b4lObiNJhEVro.jpg,,60000000,http://www.abeautifulmind.com,453,tt0268978,en,A Beautiful Mind,John Nash is a brilliant but asocial mathemati...,...,9043,,,,,"[{'adult': False, 'gender': 2, 'id': 934, 'kno...","[{'adult': False, 'gender': 2, 'id': 151, 'kno...","[{'id': 30, 'name': 'individual'}, {'id': 222,...",Romance,7.79776


In [23]:
top_N_movies('Science Fiction', 25).head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_info_specific_genre_filtered['weighted_rating'] = df_movies_info_specific_genre_filtered.apply(


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,...,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,cast,crew,keywords,genre,weighted_rating
10446,False,/rAiYTfKGqDCRIIqo664sY9XZIvQ.jpg,,165000000,http://www.interstellarmovie.net/,157336,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,...,30464,,,,,"[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 947, 'kno...","[{'id': 310, 'name': 'artificial intelligence'...",Science Fiction,8.190308
23433,False,/s3TBrRGB1iav7gFOCNx3H31MoES.jpg,,160000000,https://www.warnerbros.com/movies/inception,27205,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",...,33132,,,,,"[{'adult': False, 'gender': 2, 'id': 6193, 'kn...","[{'adult': False, 'gender': 2, 'id': 947, 'kno...","[{'id': 90, 'name': 'paris, france'}, {'id': 4...",Science Fiction,8.179947
17759,False,/dMZxEdrWIzUmUoOz2zvmFuutbj7.jpg,,18000000,http://www.starwars.com/films/star-wars-episod...,1891,tt0080684,en,The Empire Strikes Back,"The epic saga continues as Luke Skywalker, in ...",...,15026,10.0,Star Wars Collection,/gq5Wi7i4SF3lo4HHkJasDV95xI9.jpg,/d8duYyyC9J5T825Hg7grmaabfxQ.jpg,"[{'adult': False, 'gender': 2, 'id': 2, 'known...","[{'adult': False, 'gender': 2, 'id': 1, 'known...","[{'id': 526, 'name': 'rebel'}, {'id': 803, 'na...",Science Fiction,8.020315
572,False,/5bzPWQ2dFUl2aZKkp7ILJVVkRed.jpg,,19000000,http://www.backtothefuture.com/movies/backtoth...,105,tt0088763,en,Back to the Future,Eighties teenager Marty McFly is accidentally ...,...,17522,264.0,Back to the Future Collection,/5Xsu2o5IsZRuuxCEVZ9nVve21FP.jpg,/AqQotqj7XOI6GjB28nhMMa8YzOT.jpg,"[{'adult': False, 'gender': 2, 'id': 521, 'kno...","[{'adult': False, 'gender': 2, 'id': 37, 'know...","[{'id': 389, 'name': 'clock tower'}, {'id': 39...",Science Fiction,7.988157
27899,False,/waCRuAW5ocONRehP556vPexVXA9.jpg,,63000000,http://www.warnerbros.com/matrix,603,tt0133093,en,The Matrix,"Set in the 22nd century, The Matrix tells the ...",...,22774,2344.0,The Matrix Collection,/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg,/bRm2DEgUiYciDw3myHuYFInD7la.jpg,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 123, 'kno...","[{'id': 83, 'name': 'saving the world'}, {'id'...",Science Fiction,7.951657


### Content Based Filtering


In this recommender system, the contents of the movie (overview, genre, cast, crew, keywords, etc) is used to determine its similarity with other movies. The movies which are most similar are recommended.


#### 1. Plot Description Based Recommender


In [24]:
df_movies_info['overview']


0        A card shark and his unwillingly-enlisted frie...
1        Angelina Maccarone's intense drama deals with ...
2        After a series of small tremors in Los Angeles...
3        A group of friends get together for a card gam...
4                             Live content from bonus DVD.
                               ...                        
33613    Nicholas, a young man with limited resources, ...
33614    Everything you've ever wanted to know about Sa...
33615    Donald is washing windows on a high-rise; Plut...
33616    Johann Bach teams up with the mysterious Josh ...
33617    The earliest Nazi propaganda film by Leni Rief...
Name: overview, Length: 33618, dtype: object

In [25]:
# drop those with empty overview
df_movies_info_no_empty_overview = df_movies_info.dropna(
    subset=['overview']).reset_index()


In [26]:
# define tfidf vectorizer and remove all english stop words
tfidf = TfidfVectorizer(stop_words='english', use_idf=True)
tfidf_matrix = tfidf.fit_transform(
    df_movies_info_no_empty_overview['overview'])
tfidf_matrix.shape


(30864, 68854)

In [27]:
df_tfidf = pd.DataFrame(tfidf_matrix[0].T.todense(
), index=tfidf.get_feature_names_out(), columns=['tfidf'])


There are almost 70,000 different words used to describe the 30,000 movies in the dataaset.
Using this matrix, we will calculate a similarity score. We will use the cosine similarity score.


The cosine similarity score computes a numeric quantity that describes the similarity between two movies. Mathematically, it is defined as:
$cosine(x,y) = \frac{x. y^\intercal}{||x||.||y||} $

As we already have the tf-idf vectorizer, we can directlyy get the cosine similarity score by computing the dot product.


In [28]:
# compute cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [29]:
# create map of index and movie titles
indices = pd.Series(df_movies_info_no_empty_overview.index,
                    index=df_movies_info_no_empty_overview['title'])


Steps:

1. Get index of the input movie title.
2. Get list of cosine similarity scores for that particular movies with all other movies. Convert it into a list. of tuples where the first element is its index and the second is the cosine similarity score.
3. Sort the list of tuples based on the cosine similarity score.
4. Get the top 10 elements of this list, ignoring the first element as it refers to the input movie.
5. Return the titles corresponding to the indices of the top elements


In [30]:
# function to take in a movie title as input and output the N most similar movies
def get_recommendations(title, N, indices, cosine_sim, df):
    # get index of the input movie
    index = indices[title]

    # get the similarity scores for all movies with the input movie
    scores = list(enumerate(cosine_sim[index]))

    # sort the movies based on the similarity scores
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # get the scores of the top N most similar movies
    scores = scores[1:N+1]

    # get the movie indices
    movie_indices = [i[0] for i in scores]

    # return the top N most similar movies
    result = df['title'].iloc[movie_indices]
    return result


In [31]:
get_recommendations('The Dark Knight', 10, indices, cosine_sim, df_movies_info_no_empty_overview)


21349                                               Batman
23026                                       Batman Forever
6098                                  Fight Batman, Fight!
12329    Batman and Me: A Devotion to Destiny, the Bob ...
21458                                        Batman Begins
23037                                       Batman & Robin
13827       Lego Batman: The Movie - DC Super Heroes Unite
15658                                          Batman Tech
26894                              The Mechanical Monsters
3600                                                Batman
Name: title, dtype: object

In [32]:
get_recommendations('Toy Story', 10, indices, cosine_sim, df_movies_info_no_empty_overview)


16539             The Bees' Buzz
19400            Show Biz Beagle
26636     The 40 Year Old Virgin
19434                 Lotsa Luck
19420           Ship a-Hoy Woody
13200         The Rasslin' Match
19411           Wild Bill Hiccup
14616                 Hot Splash
19435    Hook, Line, and Stinker
750            The Violent Breed
Name: title, dtype: object

#### 2. Credits, Genres and Keywords Based Recommender


From the cast, crew and keywords features, we will extract the three most important actors, the director and the keywords associated with that movie.


In [33]:
df_movies_info['cast']


0        [{'adult': False, 'gender': 2, 'id': 973, 'kno...
1        [{'adult': False, 'gender': 2, 'id': 5202, 'kn...
2        [{'adult': False, 'gender': 1, 'id': 87038, 'k...
3        [{'adult': False, 'gender': 0, 'id': 1022808, ...
4                                                       []
                               ...                        
33613    [{'adult': False, 'gender': 2, 'id': 222484, '...
33614    [{'adult': False, 'gender': 2, 'id': 120576, '...
33615    [{'adult': False, 'gender': 2, 'id': 78077, 'k...
33616    [{'adult': False, 'gender': 0, 'id': 1022748, ...
33617    [{'adult': False, 'gender': 2, 'id': 10280, 'k...
Name: cast, Length: 33618, dtype: object

In [34]:
# parse each column into their corresponding python objects
# skip genres columns because it has already been parsed
features = ['cast', 'crew', 'keywords']

for feature in features:
    df_movies_info[feature] = df_movies_info[feature].apply(literal_eval)
    

In [35]:
# function to get the director name from the crew feature
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


In [36]:
# function to get a list of the first 3 elements or entire list, whichever is more
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names

    return []


In [37]:
df_movies_info['director'] = df_movies_info['crew'].apply(get_director)

features = ['cast', 'keywords']
for feature in features:
    df_movies_info[feature] = df_movies_info[feature].apply(get_list)


In [38]:
df_movies_info[['title', 'director', 'cast', 'keywords', 'genres']].head()


Unnamed: 0,title,director,cast,keywords,genres
0,"Lock, Stock and Two Smoking Barrels",Guy Ritchie,"[Jason Flemyng, Dexter Fletcher, Nick Moran]","[ambush, alcohol, shotgun]","[Comedy, Crime]"
1,Punish Me,Angelina Maccarone,"[Kostja Ullmann, Maren Kroymann, Moritz Grove]","[fetishism, masochism, submissive]",[Drama]
2,The Great Los Angeles Earthquake,Larry Elikann,"[Joanna Kerns, Dan Lauria, Bonnie Bartlett]","[hitman, earthquake, los angeles, california]","[Drama, Action, TV Movie]"
3,The Worst Horror Movie Ever Made,Bill Zebub,"[Rocco Martone, Jeanne Potter, Elaine Tuttle]",[],"[Horror, Comedy]"
4,Meshuggah - Nothing,,[],[],[Music]


In [39]:
df_movies_info[df_movies_info['title'] == 'Batman'][['title', 'director', 'cast', 'keywords', 'genres']].head()


Unnamed: 0,title,director,cast,keywords,genres
3802,Batman,Lambert Hillyer,"[Lewis Wilson, Douglas Croft, J. Carrol Naish]","[superhero, based on comic, espionage]","[Action, Adventure, Crime, Science Fiction, Th..."
23232,Batman,Leslie H. Martinson,"[Adam West, Burt Ward, Lee Meriwether]","[submarine, missile, shark attack]","[Action, Comedy, Crime]"
23319,Batman,Tim Burton,"[Michael Keaton, Jack Nicholson, Kim Basinger]","[double life, dual identity, chemical]","[Fantasy, Action, Crime]"


In [40]:
# function to convert all strings to lower case and strip name of spaces
def lower_and_strip_spaces(x):
    if isinstance(x, list):
        # for lists
        return [str.lower(i.replace(' ','')) for i in x]
    elif isinstance(x, str):
        # for strings
        return str.lower(x.replace(' ', ''))
    else:
        return ''

In [41]:
# apply function on features
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df_movies_info[feature] = df_movies_info[feature].apply(lower_and_strip_spaces)

In [42]:
df_movies_info[df_movies_info['title'] == 'Batman'][['title', 'director', 'cast', 'keywords', 'genres']].head()


Unnamed: 0,title,director,cast,keywords,genres
3802,Batman,lamberthillyer,"[lewiswilson, douglascroft, j.carrolnaish]","[superhero, basedoncomic, espionage]","[action, adventure, crime, sciencefiction, thr..."
23232,Batman,leslieh.martinson,"[adamwest, burtward, leemeriwether]","[submarine, missile, sharkattack]","[action, comedy, crime]"
23319,Batman,timburton,"[michaelkeaton, jacknicholson, kimbasinger]","[doublelife, dualidentity, chemical]","[fantasy, action, crime]"


In [43]:
# function to create metadata soup from the columns
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast'])+ ' ' + x['director'] + ' ' + ' '.join(x['genres'])


In [44]:
# create metadata soup
df_movies_info['soup'] = df_movies_info.apply(create_soup, axis=1)

In [45]:
df_movies_info[df_movies_info['title'] == 'Batman'][['soup']].head()


Unnamed: 0,soup
3802,superhero basedoncomic espionage lewiswilson d...
23232,submarine missile sharkattack adamwest burtwar...
23319,doublelife dualidentity chemical michaelkeaton...


In [46]:
df_movies_info['soup'][3802]

'superhero basedoncomic espionage lewiswilson douglascroft j.carrolnaish lamberthillyer action adventure crime sciencefiction thriller war'

We use CountVectorizer() instead of tf-idf here so we do not down-weight the presence of an actor/director if they have acted or directed in more movies compared to others

In [47]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_movies_info['soup'])
count_matrix.shape

(33618, 68764)

In [48]:
# compute cosine similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


In [49]:
indices = pd.Series(df_movies_info.index, index=df_movies_info['title'])

In [50]:
get_recommendations('The Godfather', 10, indices, cosine_sim2, df_movies_info)

22687                        The Godfather Part II
22700                       The Godfather Part III
19468                                A Sneaky Boer
9695     Killing the Chickens to Scare the Monkeys
29649                          The Local Stigmatic
2893                       The Talented Mr. Ripley
1016                              The Price of Air
1693                                        Fetish
5047                             Dope Case Pending
5235                     C.C.Catch - Pretty Voices
Name: title, dtype: object

In [51]:
get_recommendations('The Dark Knight', 10, indices, cosine_sim2, df_movies_info)

23431                   Batman Begins
29271    Janice M Vidal Fairy Concert
1016                 The Price of Air
22895                     Harry Brown
1121                       Malevolent
2223                          Musafir
2503              How to Kill a Judge
3939                       A Red Bear
4018      Bad Azz Muthaz: Black Ninja
6013            Family: Ties of Blood
Name: title, dtype: object

In [52]:
get_recommendations('Toy Story', 10, indices, cosine_sim2, df_movies_info)

5705     Willy Fog in Journey to the Center of the Earth
31261                                         Alles Helt
33267                                    Superstar Goofy
11587                                             Pyrats
26188                                             Cars 2
32806                                               Cars
715                               Little Red Riding Hood
13097     Buck Denver Asks...Why Do We Call It Christmas
13332                     Donald Duck and his Companions
21140                                     When I Yoo Hoo
Name: title, dtype: object