# Hybrid Recommender System

A movie recommender system combing content based and collaborative filtering methods, using Full MovieLens DataSet 

Reference: https://www.kaggle.com/rounakbanik/movie-recommender-systems/notebook

In [17]:
#import libraries

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import KFold

In [18]:
#Read the movies and tags data
tags = pd.read_csv('ml-latest-small/tags.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [19]:
#merge both dataset into one
movies_tags = pd.merge(movies,tags,on=['movieId'])
movies_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [20]:
#Replace the special character in genres to space between the words
movies_tags['genres'] = movies_tags['genres'].str.replace('|', ' ')
movies_tags.head()

  


Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure Children Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure Children Fantasy,62,magic board game,1528843932


In [21]:
#combine tags for each movies
movies_tags['tag'] = movies_tags['tag'].astype(str)
movies_tags['tag'] = movies_tags.groupby(['movieId'])['tag'].transform(lambda x: ' '.join(x))
movies_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336,pixar pixar fun,1139045764
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474,pixar pixar fun,1137206825
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567,pixar pixar fun,1525286013
3,2,Jumanji (1995),Adventure Children Fantasy,62,fantasy magic board game Robin Williams game,1528843929
4,2,Jumanji (1995),Adventure Children Fantasy,62,fantasy magic board game Robin Williams game,1528843932


In [22]:
#remove unnecessary columns and drop duplicated rows from combining tags in earlier step
movies_tags = movies_tags.drop(columns=['timestamp','userId'])
movies_tags = movies_tags.drop_duplicates()
movies_tags

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game
7,3,Grumpier Old Men (1995),Comedy Romance,moldy old
9,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
11,7,Sabrina (1995),Comedy Romance,remake
...,...,...,...,...
3668,183611,Game Night (2018),Action Comedy Crime Horror,Comedy funny Rachel McAdams
3671,184471,Tomb Raider (2018),Action Adventure Fantasy,adventure Alicia Vikander video game adaptation
3674,187593,Deadpool 2 (2018),Action Comedy Sci-Fi,Josh Brolin Ryan Reynolds sarcasm
3677,187595,Solo: A Star Wars Story (2018),Action Adventure Children Sci-Fi,Emilia Clarke star wars


In [23]:
#combine genres and tags to one description 'soup'
movies_tags['description'] = movies_tags['genres'] + movies_tags['tag']
movies_tags

Unnamed: 0,movieId,title,genres,tag,description
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,Adventure Animation Children Comedy Fantasypix...
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game,Adventure Children Fantasyfantasy magic board ...
7,3,Grumpier Old Men (1995),Comedy Romance,moldy old,Comedy Romancemoldy old
9,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,Comedypregnancy remake
11,7,Sabrina (1995),Comedy Romance,remake,Comedy Romanceremake
...,...,...,...,...,...
3668,183611,Game Night (2018),Action Comedy Crime Horror,Comedy funny Rachel McAdams,Action Comedy Crime HorrorComedy funny Rachel ...
3671,184471,Tomb Raider (2018),Action Adventure Fantasy,adventure Alicia Vikander video game adaptation,Action Adventure Fantasyadventure Alicia Vikan...
3674,187593,Deadpool 2 (2018),Action Comedy Sci-Fi,Josh Brolin Ryan Reynolds sarcasm,Action Comedy Sci-FiJosh Brolin Ryan Reynolds ...
3677,187595,Solo: A Star Wars Story (2018),Action Adventure Children Sci-Fi,Emilia Clarke star wars,Action Adventure Children Sci-FiEmilia Clarke ...


In [24]:
#Apply CountVectorizer to get pairwise cosine similarity matrix for all the movies in our dataset
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(movies_tags['description'])

In [25]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.12725695, 0.12403473, ..., 0.07161149, 0.14322297,
        0.20180184],
       [0.12725695, 1.        , 0.        , ..., 0.        , 0.17770466,
        0.        ],
       [0.12403473, 0.        , 1.        , ..., 0.11547005, 0.        ,
        0.21693046],
       ...,
       [0.07161149, 0.        , 0.11547005, ..., 1.        , 0.13333333,
        0.31311215],
       [0.14322297, 0.17770466, 0.        , ..., 0.13333333, 1.        ,
        0.12524486],
       [0.20180184, 0.        , 0.21693046, ..., 0.31311215, 0.12524486,
        1.        ]])

In [26]:
movies_tags = movies_tags.reset_index()
titles = movies_tags['title']
indices = pd.Series(movies_tags.index, index=movies_tags['title'])

In [27]:
indices

title
Toy Story (1995)                         0
Jumanji (1995)                           1
Grumpier Old Men (1995)                  2
Father of the Bride Part II (1995)       3
Sabrina (1995)                           4
                                      ... 
Game Night (2018)                     1567
Tomb Raider (2018)                    1568
Deadpool 2 (2018)                     1569
Solo: A Star Wars Story (2018)        1570
Gintama: The Movie (2010)             1571
Length: 1572, dtype: int64

In [28]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [29]:
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a490190c88>

In [30]:
def get_recommendations(userId, title):
    idx = indices['Toy Story (1995)']
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]

    movies = movies_tags.iloc[movie_indices][['title','movieId']]
    movies['est'] = movies['movieId'].apply(lambda x: svd.predict(userId, x).est)
    movies = movies.sort_values('est', ascending=False)
    
    return movies.head(10)

In [31]:
get_recommendations(1, 'Toy Story (1995)')

Unnamed: 0,title,movieId,est
277,Wallace & Gromit: The Wrong Trousers (1993),1148,5.0
163,Wallace & Gromit: The Best of Aardman Animatio...,720,5.0
308,"Grand Day Out with Wallace and Gromit, A (1989)",1223,5.0
812,Shrek (2001),4306,4.860993
613,"Iron Giant, The (1999)",2761,4.806003
1011,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,6350,4.757686
167,Wallace & Gromit: A Close Shave (1995),745,4.756647
1016,Finding Nemo (2003),6377,4.740767
1254,"Incredibles, The (2004)",8961,4.719974
498,"Jungle Book, The (1967)",2078,4.718342


In [32]:
get_recommendations(2, 'Toy Story (1995)')

Unnamed: 0,title,movieId,est
613,"Iron Giant, The (1999)",2761,4.409218
812,Shrek (2001),4306,4.379122
1011,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,6350,4.304038
277,Wallace & Gromit: The Wrong Trousers (1993),1148,4.231179
308,"Grand Day Out with Wallace and Gromit, A (1989)",1223,4.184961
1016,Finding Nemo (2003),6377,4.172327
167,Wallace & Gromit: A Close Shave (1995),745,4.160554
163,Wallace & Gromit: The Best of Aardman Animatio...,720,4.104243
544,"Bug's Life, A (1998)",2355,4.103281
511,Watership Down (1978),2138,4.018223
