Two MovieLens datasets.

The Full Dataset: Consists of 26,000,000 ratings and 750,000 tag applications applied to 45,000 movies by 270,000 users. Includes tag genome data with 12 million relevance scores across 1,100 tags.
The Small Dataset: Comprises of 100,000 ratings and 1,300 tag applications applied to 9,000 movies by 700 users.

In [49]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold
from surprise.model_selection.validation import cross_validate
import warnings; warnings.simplefilter('ignore')

In [32]:
md = pd. read_csv('movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [33]:
md = md.drop([19730, 29503, 35587])
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
md['overview'] = md['overview'].fillna('')
md['tagline'] = md['tagline'].fillna('')
md['description'] = md['overview'] + md['tagline']
md['id'] = md['id'].fillna('').astype('int')

In [79]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')


#Check EDA Notebook for how and why I got these indices.

smd = md[md['id'].isin(links_small)]
#smd.shape

In [66]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

def word_vec(sentence):
    docres = model.encode(sentence)
    return docres

overview_embedding = smd["description"].apply(word_vec)
overview_embedding = overview_embedding.to_list()
overview_embedding_df = pd.DataFrame(overview_embedding)

In [67]:
overview_embedding_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.063439,0.001027,0.093210,-0.014942,-0.006400,0.015289,0.123085,-0.030446,-0.037180,0.021572,...,0.025831,-0.101288,-0.070295,-0.001306,-0.022086,-0.007595,0.038765,0.015355,0.044632,0.022023
1,0.078425,0.047064,-0.035055,-0.057462,0.007203,0.076272,0.054492,-0.060593,0.005528,0.055816,...,0.081946,-0.039362,-0.053424,0.063753,-0.020088,0.041161,0.022090,-0.016544,-0.028449,-0.021939
2,-0.108588,0.040781,0.013575,-0.033336,-0.107850,0.040752,0.014376,-0.018910,-0.002793,-0.100902,...,-0.034944,-0.042342,0.073390,0.088056,0.054444,0.017003,0.041833,0.053519,-0.015743,0.015136
3,-0.064281,-0.027703,0.043969,0.048130,0.068590,0.008186,0.061041,-0.061553,-0.017819,-0.032262,...,0.019171,-0.041394,-0.056389,0.086823,0.029052,-0.057417,-0.022852,-0.035540,-0.026187,-0.055277
4,-0.012485,-0.062078,0.064716,0.017995,0.029822,-0.014653,0.058125,-0.001420,-0.010802,-0.010093,...,0.119831,-0.025066,-0.071494,-0.046497,0.015632,0.029994,0.023736,-0.005992,-0.003104,-0.053614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9094,-0.049507,0.012062,0.032526,0.039825,0.049340,0.049653,-0.013143,0.030100,0.052773,-0.004601,...,0.030382,-0.040603,-0.011870,0.059492,-0.062189,0.032584,0.048773,-0.066097,0.028390,0.042934
9095,0.090541,0.005454,0.000183,0.044051,0.049436,0.060292,0.012631,-0.005757,0.022504,-0.010987,...,0.048484,-0.037295,0.052352,0.140643,0.032545,0.024296,0.046054,-0.044899,-0.012620,0.010985
9096,-0.114949,0.025848,0.001485,-0.003160,0.010447,-0.019959,0.086733,-0.076076,-0.030192,-0.042817,...,0.050925,-0.016225,-0.013226,-0.005050,-0.015376,0.031389,0.105086,0.002048,0.019390,-0.004215
9097,-0.045378,0.071012,0.017228,-0.017682,0.024349,0.052665,0.020917,0.067747,0.036772,0.004524,...,0.044147,0.040792,-0.059186,-0.010472,-0.014112,-0.040231,0.056798,-0.030055,-0.004013,-0.009382


In [68]:
#compute pair-wise similarity between movies
matrix_sim = overview_embedding_df.dot(overview_embedding_df.T) 
matrix_sim[matrix_sim > 0.99] = 0 

item_similarity_df = pd.concat([
    smd, 
    matrix_sim],
    axis=1)

In [40]:
item_similarity_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,9089,9090,9091,9092,9093,9094,9095,9096,9097,9098
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0.153453,0.041898,0.141014,-0.043846,0.148947,0.11263,0.029654,0.204158,0.214748,0.002099
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,0.307478,0.106262,0.311835,-0.000146,0.04629,0.265419,0.053533,0.249519,0.210156,0.224443
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602.0,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.156794,0.119668,0.225342,0.160165,0.244736,0.149899,0.023642,0.136481,0.090952,0.212686
3,False,,16000000,"[Comedy, Drama, Romance]",,31357.0,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0.230069,0.075899,0.312953,0.154146,0.132488,0.158909,0.201036,0.165535,0.101264,0.148251
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862.0,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0.120878,0.226767,0.202079,0.099009,0.163831,0.190801,0.020061,0.122571,0.03875,0.034045


In [71]:
item_similarity_df.shape

(12329, 9125)

In [80]:
smd = smd.reset_index()
indices = pd.Series(item_similarity_df.index, index=item_similarity_df['title'])

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(matrix_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:10]
    movie_indices = [i[0] for i in sim_scores]
    return smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]

### As shown below, content-based recommender performs well since the top recommended movies of Toy Story are it's own sequence movies, followed by other family & anime movies

In [81]:
get_recommendations('Toy Story')

Unnamed: 0,title,vote_count,vote_average,year,id
2502,Toy Story 2,3914.0,7.3,1999,863
7535,Toy Story 3,4710.0,7.6,2010,10193
1552,Child's Play 3,274.0,5.5,1991,11187
8313,The Kings of Summer,427.0,7.0,2013,156700
994,Manhattan,600.0,7.8,1979,696
2199,Radio Days,131.0,7.0,1987,30890
528,Pinocchio,1412.0,6.9,1940,10895
2958,Firestarter,174.0,5.9,1984,11495
7842,A Very Harold & Kumar Christmas,335.0,6.0,2011,55465
6312,Luxo Jr.,148.0,7.1,1986,13925


# Collaborative filtering

In [46]:
reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [51]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2cc7c7d60>

In [53]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [55]:
svd.predict(1, 31, 3) #predict the user 1's rating on movie 31, the estimates is 2.735, which is close to the actual rating 2.5

Prediction(uid=1, iid=1029, r_ui=3, est=2.735302300898442, details={'was_impossible': False})

# Hybrid model

In [56]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [57]:
indices_map = id_map.set_index('id')

In [58]:
indices_map

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5
...,...
159550.0,161944
392572.0,162542
402672.0,162672
315011.0,163056


In [59]:
def hybrid(userId, title):
    idx = indices[title]
    
    sim_scores = list(enumerate(matrix_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  #use similarity score to make use of the content-based information
    sim_scores = sim_scores[0:10]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]

    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est) #further use the collaborative model on top of the content-based predictions to make use of the user information
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

### As the result shows, the hybrid model produced similar prediction to the results from the content-based model, but they are not identical

In [61]:
hybrid(1, 'Toy Story')

Unnamed: 0,title,vote_count,vote_average,year,id,est
15348,Toy Story 3,4710.0,7.6,2010,10193,3.335528
2997,Toy Story 2,3914.0,7.3,1999,863,3.201692
1199,Manhattan,600.0,7.8,1979,696,2.912136
2635,Radio Days,131.0,7.0,1987,30890,2.795806
21070,The Kings of Summer,427.0,7.0,2013,156700,2.723321
589,Pinocchio,1412.0,6.9,1940,10895,2.658003
10659,Luxo Jr.,148.0,7.1,1986,13925,2.64002
17628,A Very Harold & Kumar Christmas,335.0,6.0,2011,55465,2.563796
3586,Firestarter,174.0,5.9,1984,11495,2.51784
1884,Child's Play 3,274.0,5.5,1991,11187,2.370295


# 