In [1]:
# Importing the required libraries

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

### Simple Recommender system 
##### A Recommender system that recommends movies based on movie ratings and other basic criteria.

In [3]:
meta = pd.read_csv('Data/movies_metadata.csv')
meta['genres'] = meta['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
vote_counts = meta[meta['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = meta[meta['vote_average'].notnull()]['vote_average'].astype('int')
avg_rating = round(vote_averages.mean(),3)
min_votes = vote_counts.quantile(0.95)
print("Average rating (out of 10) required by the movie to qualify is:",avg_rating)
print("Minimum number of votes required by the movie to qualify is:",min_votes)

Average rating (out of 10) required by the movie to qualify is: 5.245
Minimum number of votes required by the movie to qualify is: 434.0


In [5]:
meta['year'] = pd.to_datetime(meta['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [6]:
# Creating a new df called qualified_movies with the following criteria:
# vote_count >= min_votes and is not null
# vote_average >= avg_rating and is not null

qualified_movies = meta[(meta['vote_count'] >= min_votes) & (meta['vote_count'].notnull()) & (meta['vote_average'].notnull())& (meta['vote_average']>= avg_rating)][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')
qualified_movies.shape

(2181, 6)

##### Creating a new attribute called 'weighted_rating' to show top movies. 
$\\$
Weighted_Rating = $(\frac{votes}{votes + min\_votes} . rating) + (\frac{min\_votes}{votes + min\_votes} . avg\_rating)$
$\\$
$\\$
where,
* *votes* is the number of votes for the movie
* *min_votes* is the minimum votes required to be listed in the chart
* *rating* is the average rating of the movie
* *avg_rating* is the mean vote across the whole report

In [43]:
def weighted_rating(df):
    votes = df['vote_count']
    rating = df['vote_average']
    return (votes/(votes+min_votes)*rating) + (min_votes/(min_votes+votes)*avg_rating)


In [8]:
qualified_movies['Weighted_Rating'] = qualified_movies.apply(weighted_rating,axis = 1)
qualified_movies = qualified_movies.sort_values('Weighted_Rating', ascending= False)

In [9]:
qualified_movies.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,Weighted_Rating
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917591
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905875
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897111
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881757
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871792
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.868665
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864005
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861932
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860661
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851929


In [10]:
genres = meta.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
genres.name = 'genre'
gen_meta = meta.drop('genres', axis=1).join(genres)

In [11]:
def top_movies(genre,percentile = 0.80):
    genres = meta.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
    genres.name = 'genre'
    gen_meta = meta.drop('genres', axis=1).join(genres)

    df = gen_meta[gen_meta['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    avg_rating = round(vote_averages.mean(),3)
    min_votes = vote_counts.quantile(percentile)

    qualified_movies = df[(df['vote_count'] >= min_votes) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())& (df['vote_average']>= avg_rating)][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genre']]
    qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
    qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')
    
    qualified_movies['Weighted_Rating'] = qualified_movies.apply(weighted_rating,axis = 1)
    qualified_movies = qualified_movies.sort_values('Weighted_Rating', ascending= False)

    return qualified_movies
    

In [13]:
top_movies('Thriller')

Unnamed: 0,title,year,vote_count,vote_average,popularity,genre,Weighted_Rating
15480,Inception,2010,14075,8,29.108149,Thriller,7.917591
12481,The Dark Knight,2008,12269,8,123.167259,Thriller,7.905875
292,Pulp Fiction,1994,8670,8,140.950236,Thriller,7.868665
46,Se7en,1995,5915,8,18.45743,Thriller,7.811676
24860,The Imitation Game,2014,5895,8,31.59594,Thriller,7.811081
...,...,...,...,...,...,...,...
42309,Ghost in the Shell,2017,2547,5,68.726676,Thriller,5.035669
11864,Fantastic 4: Rise of the Silver Surfer,2007,2648,5,12.924254,Thriller,5.034500
42902,Alien: Covenant,2017,2677,5,72.884078,Thriller,5.034179
20536,G.I. Joe: Retaliation,2013,3045,5,10.560608,Thriller,5.030563


### Content Based Recommender (Content Based Filtering)
##### A recommender system that computes the similarity between movies based on certain criteria and metrics and recommends movies that are extremely similar to a movie that a user liked.

### Two types of systems can be created:

##### 1) Movie description and taglines based recommender
##### 2) Movie cast, crew, keywords and genre based recommender

In [2]:
links_small = pd.read_csv('Data/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

meta = pd.read_csv('Data/movies_metadata.csv')
meta['genres'] = meta['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
meta['year'] = pd.to_datetime(meta['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

meta = meta.drop([19730, 29503, 35587]) #Incorrect data so remove it.
meta['id'] = meta['id'].astype('int')

In [3]:
sub_meta = meta[meta['id'].isin(links_small)]
sub_meta

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40224,False,,15000000,"[Action, Adventure, Drama, Horror, Science Fic...",,315011,tt4262980,ja,シン・ゴジラ,From the mind behind Evangelion comes a hit la...,...,77000000.0,120.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,A god incarnate. A city doomed.,Shin Godzilla,False,6.6,152.0,2016
40503,False,,0,"[Documentary, Music]",http://www.thebeatlesliveproject.com/,391698,tt2531318,en,The Beatles: Eight Days a Week - The Touring Y...,"The band stormed Europe in 1963, and, in 1964,...",...,0.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The band you know. The story you don't.,The Beatles: Eight Days a Week - The Touring Y...,False,7.6,92.0,2016
44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000,"[Adventure, Fantasy, Animation, Action, Family]",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,...,68411275.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0,2000
44826,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"[Adventure, Fantasy, Animation, Science Fictio...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,"All your favorite Pokémon characters are back,...",...,28023563.0,75.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,2001


#### Movie description based recommender

In [4]:
sub_meta['tagline'] = sub_meta['tagline'].fillna('')
sub_meta['description'] = sub_meta['overview'] + sub_meta['tagline']
sub_meta['description'] = sub_meta['description'].fillna('')

In [5]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(sub_meta['description'])

In [6]:
tfidf_matrix.shape

(9099, 268124)

#### Here, I will be using cosine similarity to find the similarity between two movies. Cosine similarity calculates the numeric quantity that denoted the similarity between two movies.
#### $cosine(x,y) = \frac{x. y^\intercal}{||x||.||y||} $

#### Since we have used the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score. Therefore, we will use sklearn's **linear_kernel** instead of cosine_similarities since it is much faster.

In [8]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
cosine_sim.shape

# We now have a pairwise cosine similarity matrix for all the movies in our dataset.

(9099, 9099)

In [13]:
sub_meta = sub_meta.reset_index()
titles = sub_meta['title']
indices = pd.Series(sub_meta.index,index = sub_meta['title'])

In [16]:
# Recommending top 30 movies
def get_recommendations(movie_title):
    idx = indices[movie_title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]    

In [18]:
get_recommendations('The Dark Knight').head(10)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

From the above results for **The Dark Knight** we can see that our system is able to identlfy it as a series of Batman movies and recommends other batman movies as its top recommendations. This system is not upto the mark as it is not considering key factors like director, crew, cast and genre, which determines the rating and popularity of a movie.

We can improve our system by using metadata than **Overview** and **Tagline**. In the next subsection, we will build a more sophisticated recommender that takes **genre**, **keywords**, **cast** and **crew** into consideration.

#### Metadata based Recommender

To build our standard metadata based content recommender, we will need to merge our current dataset with the crew and the keyword datasets.

In [21]:
credits = pd.read_csv('Data/credits.csv')
keywords = pd.read_csv('Data/keywords.csv')

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
meta['id'] = meta['id'].astype('int')

In [22]:
meta = meta.merge(credits, on='id')
meta = meta.merge(keywords, on='id')

sub_meta = meta[meta['id'].isin(links_small)]
sub_meta.shape

(9219, 28)

We now have our cast, crew, genres and credits, all in one dataframe. Let us wrangle this a little more using the following intuitions:

1. **Crew:** From the crew, we will only pick the director as our feature since the others don't contribute that much to the *feel* of the movie.
2. **Cast:** Choosing Cast is a little more tricky. Lesser known actors and minor roles do not really affect people's opinion of a movie. Therefore, we must only select the major characters and their respective actors. Arbitrarily we will choose the top 3 actors that appear in the credits list. 

In [23]:
sub_meta['cast'] = sub_meta['cast'].apply(literal_eval)
sub_meta['crew'] = sub_meta['crew'].apply(literal_eval)
sub_meta['keywords'] = sub_meta['keywords'].apply(literal_eval)
sub_meta['cast_size'] = sub_meta['cast'].apply(lambda x: len(x))
sub_meta['crew_size'] = sub_meta['crew'].apply(lambda x: len(x))

In [25]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [26]:
sub_meta['director'] = sub_meta['crew'].apply(get_director)

sub_meta['cast'] = sub_meta['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
sub_meta['cast'] = sub_meta['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [28]:
sub_meta['keywords'] = sub_meta['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [30]:
sub_meta['cast'] = sub_meta['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

sub_meta['director'] = sub_meta['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
sub_meta['director'] = sub_meta['director'].apply(lambda x: [x,x, x])

In [31]:
# Preprocessing of keywords

key = sub_meta.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
key.name = 'keyword'
key = key.value_counts()

# Only taking keywords into considerations whose count is more than 1.
key = key[key > 1]
key

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
                       ... 
boarder                   2
social climbing           2
covert operation          2
prisoners                 2
crystal                   2
Name: keyword, Length: 6709, dtype: int64

In [32]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in key:
            words.append(i)
    return words

In [33]:
stemmer = SnowballStemmer('english')

sub_meta['keywords'] = sub_meta['keywords'].apply(filter_keywords)
sub_meta['keywords'] = sub_meta['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
sub_meta['keywords'] = sub_meta['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [35]:
sub_meta['soup'] = sub_meta['keywords'] + sub_meta['cast'] + sub_meta['director'] + sub_meta['genres']
sub_meta['soup'] = sub_meta['soup'].apply(lambda x: ' '.join(x))

In [38]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(sub_meta['soup'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [39]:
sub_meta = sub_meta.reset_index()
titles = sub_meta['title']
indices = pd.Series(sub_meta.index, index=sub_meta['title'])

In [40]:
get_recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

The recommendations seem to have recognized other Christopher Nolan movies (due to the high weightage given to director) and put them as top recommendations. I enjoyed watching **The Dark Knight** as well as some of the other ones in the list including **Batman Begins**, **The Prestige** and **The Dark Knight Rises**. 

We can of course experiment on this engine by trying out different weights for our features (directors, actors, genres), limiting the number of keywords that can be used in the soup, weighing genres based on their frequency, only showing movies with the same languages, etc.

#### Popularity and Ratings

One thing that we notice about our recommendation system is that it recommends movies regardless of ratings and popularity. It is true that **Batman and Robin** has a lot of similar characters as compared to **The Dark Knight** but it was a terrible movie that shouldn't be recommended to anyone.

Therefore, we will add a mechanism to remove bad movies and return movies which are popular and have had a good critical response.

I will take the top 25 movies based on similarity scores and calculate the vote of the 60th percentile movie. Then, using this as the value of $m$, we will calculate the weighted rating of each movie using IMDB's formula like we did in the Simple Recommender section.

In [44]:
def weighted_rating(df,min_votes,avg_rating):
    votes = df['vote_count']
    rating = df['vote_average']
    return (votes/(votes+min_votes)*rating) + (min_votes/(min_votes+votes)*avg_rating)

def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = sub_meta.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    avg_rating = vote_averages.mean()
    min_votes = vote_counts.quantile(0.60)
    
    qualified = movies[(movies['vote_count'] >= min_votes) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating,args=(min_votes,avg_rating), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [45]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.872621
8613,Interstellar,11187,8,2014,7.843131
6623,The Prestige,4510,8,2006,7.662461
3381,Memento,4168,8,2000,7.641301
8031,The Dark Knight Rises,9263,7,2012,6.936548
6218,Batman Begins,7511,7,2005,6.923913
1134,Batman Returns,1706,6,1992,6.206438
132,Batman Forever,1529,5,1995,5.67652
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.224783
1260,Batman & Robin,1447,4,1997,5.167624
