# [Content based movie recommender](https://www.datacamp.com/community/tutorials/recommender-systems-python)

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from os import path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval


## Download dataset
Subset of Movielens dataset - [kaggle download link](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/download)



Extract the files into a `kaggle_dataset` folder in the project root directory

In [2]:
dataset_dir = "kaggle_dataset"

## Visualize metadata

In [3]:
metadata_dir = path.join(dataset_dir, 'movies_metadata.csv')
assert path.exists(metadata_dir)

In [4]:
metadata = pd.read_csv(metadata_dir, low_memory=False)

In [5]:
metadata.shape

(45466, 24)

In [6]:
metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Creating a top 250 charts


Create new dataframe with weighted ranking as a new feature

### Weighted ranking
The formula used by IMDB


$ Weighted rating = (\frac{v}{v+m}R) + (\frac{m}{m+v}C)$

where 
- v = number of votes
- m = minimum votes requirement
- R = avg rating
- C = mean vote across dataset

In [8]:
C = metadata['vote_average'].mean()

print("Mean vote of dataset: {} ".format(C))

Mean vote of dataset: 5.618207215133889 


In [8]:
ptile=0.9 # percentile
m = metadata['vote_count'].quantile(ptile)

print('Min vote count at {}th percentile : {}'.format(int(ptile*100), m))

Min vote count at 90th percentile : 160.0


### Filter out movies vote counts less than `m`

In [9]:
q_dataFrame = metadata.copy().loc[metadata['vote_count'] >= m]

In [10]:
q_dataFrame.shape

(4555, 24)

### Creating `score`

Define the weighted rating function

In [11]:
def weighted_rating(X, m=m, C=C):
	v = X['vote_count']
	R = X['vote_average']
	return v*R/(v+m) + v*C/(v+m)


In [12]:
q_dataFrame['score'] = q_dataFrame.apply(weighted_rating, axis=1)

In [13]:
q_dataFrame = q_dataFrame.sort_values('score', ascending=False)

In [14]:
q_dataFrame[['title', 'vote_count', 'vote_average', 'score']].head(10)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,13.853014
834,The Godfather,6024.0,8.5,13.752924
12481,The Dark Knight,12269.0,8.3,13.739036
2843,Fight Club,9678.0,8.3,13.691849
292,Pulp Fiction,8670.0,8.3,13.666009
15480,Inception,14075.0,8.1,13.564016
351,Forrest Gump,8147.0,8.2,13.552057
22879,Interstellar,11187.0,8.1,13.524772
1154,The Empire Strikes Back,5998.0,8.2,13.459176
7000,The Lord of the Rings: The Return of the King,8226.0,8.1,13.456472


## Plot description based recommender

- use pairwise cosine similarity scores 
- NLP problem

### Visualize the `overview` feature

In [15]:
metadata[['title', 'overview']].head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


### Compute TF-IDF matrix

In [16]:
tfidf = TfidfVectorizer(stop_words='english')

Replace `NaN` with `''`

In [17]:
metadata['overview'] = metadata['overview'].fillna('')

In [18]:
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

In [19]:
tfidf_matrix.shape

(45466, 75827)

Sample array mappings (feature indices to feature name)

In [20]:
tfidf.get_feature_names_out()[5000:5010]

array(['avails', 'avaks', 'avalanche', 'avalanches', 'avallone', 'avalon',
       'avant', 'avanthika', 'avanti', 'avaracious'], dtype=object)

#### Cosine similarity formula
Mathematically



$cos(x,y) = \frac{x.y^T}{||x||.||y||} = \frac{\Sigma^n_i x_i . y^T_i}{\sqrt\Sigma(x_i)^2 \sqrt\Sigma(y_i)^2}$



So, we will just use dot product

We convert down to `np.float32` due to paging issues

In [21]:
reduced_matrix = tfidf_matrix.copy().astype(np.float32)

In [22]:
cosine_sim = linear_kernel(reduced_matrix, reduced_matrix)

In [23]:
cosine_sim.shape

(45466, 45466)

Sample cosine simularity

In [24]:
cosine_sim[0]

array([1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
       0.        ], dtype=float32)

### Constructing a recommender 

Define a function that takes in a movie title as input and outputs the 10 closest movies based on the calculated cosine similarity

Define a reverse map of indices to features

In [62]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [65]:
indices["The Dark Knight"]

title
The Dark Knight    12481
The Dark Knight    28700
dtype: int64

In [8]:
def get_recs_from_title(title, cosine_sim, n_recs=10):
	idx = indices[title]
	movie_indices = np.argpartition(cosine_sim[idx], -(n_recs+1))[-(n_recs+1):]
	movie_indices = sorted(movie_indices, key=lambda x: cosine_sim[idx][x], reverse=True)[1:]
	return metadata['title'].iloc[movie_indices]
	


In [28]:
get_recs_from_title("The Dark Knight Rises")

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [29]:
get_recs_from_title("The Godfather")

1178               The Godfather: Part II
44030    The Godfather Trilogy: 1972-1990
1914              The Godfather: Part III
23126                          Blood Ties
11297                    Household Saints
34717                   Start Liquidation
10821                            Election
38030            A Mother Should Be Loved
17729                   Short Sharp Shock
26293                  Beck 28 - Familjen
Name: title, dtype: object

As can be seen, the title alone may not be enough for recommendations (eg. Dark Knight lovers could be fans of Nolan, and not necessarily just the batman)

## Credits, genres and keywords based

### Load the required datasets

In [7]:
credits_dir = path.join(dataset_dir, 'credits.csv')
keywords_dir = path.join(dataset_dir, 'keywords.csv')

In [8]:
credits_data = pd.read_csv(credits_dir)
keywords_data = pd.read_csv(keywords_dir)

In [9]:
credits_data.shape

(45476, 3)

In [10]:
keywords_data.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### Preprocess data

First, remove rows with bad `id` formats, which are the following

In [11]:
metadata.loc[metadata['id'].str.contains('-')]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [12]:
q_dataFrame = metadata[metadata['id'].str.contains('-')==False].copy().drop_duplicates()

In [13]:
q_dataFrame.shape

(45446, 24)

Convert `id`s to `int` before merging the 3 dataframes

In [14]:
keywords_data['id'] = keywords_data['id'].astype('int')
credits_data['id'] = credits_data['id'].astype('int')
q_dataFrame['id'] = q_dataFrame['id'].astype('int')

In [15]:
keywords_data.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [16]:
q_dataFrame = q_dataFrame.merge(credits_data, on='id')

In [17]:
q_dataFrame.shape

(45502, 26)

In [18]:
q_dataFrame = q_dataFrame.merge(keywords_data, on='id')

In [19]:
q_dataFrame.shape

(46548, 27)

In [20]:
q_dataFrame.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [55]:
q_dataFrame = q_dataFrame.drop_duplicates()
print(q_dataFrame.shape)

TypeError: unhashable type: 'list'

Parse stringified features into python objects

In [22]:
features = ['cast', 'crew', 'keywords', 'genres']

In [23]:
for feature in features:
	q_dataFrame[feature] = q_dataFrame[feature].apply(literal_eval)

#### Helper functions

In [24]:
def get_director(x):
	for i in x:
		if i['job'] == 'Director':
			return i['name']
	return np.nan

In [25]:
def get_list(x):
	if isinstance(x, list):
		names = [i['name'] for i in x]
		if len(names) > 3:
			names = names[:3]
		return names

	return []

In [26]:
def clean_data(x):
	if isinstance(x, list):
		return [str.lower(i.replace(" ", "")) for i in x]
	else:
		if isinstance(x, str):
			return str.lower(x.replace(" ", ""))
		else:
			return ''

In [27]:
q_dataFrame['director'] = q_dataFrame['crew'].apply(get_director)

In [28]:
q_dataFrame['director'].loc[:5]

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
5       Michael Mann
Name: director, dtype: object

In [29]:
features = ['cast', 'keywords', 'genres']
for feature in features:
	q_dataFrame[feature] = q_dataFrame[feature].apply(get_list)

In [30]:
q_dataFrame[features].head()

Unnamed: 0,cast,keywords,genres
0,"[Tom Hanks, Tim Allen, Don Rickles]","[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,"[Whitney Houston, Angela Bassett, Loretta Devine]","[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,"[Steve Martin, Diane Keaton, Martin Short]","[baby, midlife crisis, confidence]",[Comedy]


In [31]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
	q_dataFrame[feature] = q_dataFrame[feature].apply(clean_data)

In [32]:
q_dataFrame[features].head()

Unnamed: 0,cast,keywords,director,genres
0,"[tomhanks, timallen, donrickles]","[jealousy, toy, boy]",johnlasseter,"[animation, comedy, family]"
1,"[robinwilliams, jonathanhyde, kirstendunst]","[boardgame, disappearance, basedonchildren'sbook]",joejohnston,"[adventure, fantasy, family]"
2,"[waltermatthau, jacklemmon, ann-margret]","[fishing, bestfriend, duringcreditsstinger]",howarddeutch,"[romance, comedy]"
3,"[whitneyhouston, angelabassett, lorettadevine]","[basedonnovel, interracialrelationship, single...",forestwhitaker,"[comedy, drama, romance]"
4,"[stevemartin, dianekeaton, martinshort]","[baby, midlifecrisis, confidence]",charlesshyer,[comedy]


In [33]:
def create_soup(x):
	return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['director']) + ' ' + ' '.join(x['genres']) 

#### Creating data soup

In [34]:
q_dataFrame['soup'] = q_dataFrame.apply(create_soup, axis=1)

In [35]:
q_dataFrame['soup'].head(3)

0    jealousy toy boy tomhanks timallen donrickles ...
1    boardgame disappearance basedonchildren'sbook ...
2    fishing bestfriend duringcreditsstinger walter...
Name: soup, dtype: object

## Creating a recommender
Use a count vectorizer here(basically TF-IDF without the IDF)

In [36]:
def get_recs_from_features(title, cosine_sim, n_recs=10):
	idx = indices[title]
	movie_indices = np.argpartition(cosine_sim[idx], -(n_recs+1))[-(n_recs+1):]
	movie_indices = sorted(movie_indices, key=lambda x: cosine_sim[idx][x], reverse=True)[1:]
	return q_dataFrame['title'].iloc[movie_indices]
	


In [37]:
count = CountVectorizer(stop_words='english')


In [38]:
count_matrix = count.fit_transform(q_dataFrame['soup'])

In [39]:
count_matrix.shape

(45456, 58204)

In [40]:
count_matrix[0]

<1x58204 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [42]:
reduced_count_matrix = count_matrix.copy().astype(np.float32)

In [43]:
reduced_count_matrix[0]

<1x58204 sparse matrix of type '<class 'numpy.float32'>'
	with 9 stored elements in Compressed Sparse Row format>

In [44]:
cosine_sim_2 = cosine_similarity(reduced_count_matrix, reduced_count_matrix)

In [45]:
cosine_sim_2[0]

array([1.        , 0.10540926, 0.11111112, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [49]:
q_dataFrame = q_dataFrame.reset_index()

In [50]:
indices = pd.Series(q_dataFrame.index, index=q_dataFrame['title']).drop_duplicates()

In [51]:
indices['The Dark Knight Rises']

18267

In [56]:
get_recs_from_features("The Dark Knight Rises", cosine_sim_2[:])

12498           The Dark Knight
10137             Batman Begins
9242                     Shiner
9801            Amongst Friends
7710                   Mitchell
35486    Manuscripts Don't Burn
39891                      Sara
516           Romeo Is Bleeding
24842                  Deadfall
23901                 Quicksand
Name: title, dtype: object

In [61]:
get_recs_from_features("Jumanji", cosine_sim_2[:])

37541                          Jack and the Beanstalk
14300                       Where the Wild Things Are
26507                                  Mostly Ghostly
24675               Tinker Bell and the Lost Treasure
40869                                  You Are Umasou
40426       Mostly Ghostly 3: One Night in Doom House
30441    Mostly Ghostly: Have You Met My Ghoulfriend?
14625          Playmobil: The Secret of Pirate Island
22012                        The Sword and the Dragon
45306                       Friend of the Jolly Devil
Name: title, dtype: object

### Saving the model

In [72]:
np.save('keywords_rec', cosine_sim_2)

In [73]:
test_save = np.load('keywords_rec.npy')

In [74]:
test_save

array([[1.        , 0.10540926, 0.11111112, ..., 0.        , 0.        ,
        0.        ],
       [0.10540926, 1.0000001 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.11111112, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.99999994,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)