# Movies Recommender System

![](http://labs.criteo.com/wp-content/uploads/2017/08/CustomersWhoBought3.jpg)

In [153]:
import pandas as pd
import numpy as np
import ast, json
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, KFold

import warnings; warnings.simplefilter('ignore')

## Simple Recommender


In [154]:
md = pd. read_csv('../input/movies_metadata.csv')
#md

In [155]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [156]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

In [157]:
m = vote_counts.quantile(0.95)

In [158]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [159]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(41, 6)

In [160]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [161]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [162]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

### Top Movies

In [163]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [164]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

## Content Based Recommender


In [165]:
links_small = pd.read_csv('../input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [166]:
#md = md.drop([19730, 29503, 35587])

In [167]:
#Check EDA Notebook for how and why I got these indices.
md['id'] = md['id'].astype('int')

In [168]:
smd = md[md['id'].isin(links_small)]
smd.shape

(818, 26)

### Movie Description Based Recommender


In [169]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [170]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])


In [171]:
tfidf_matrix.shape

(818, 33682)

In [172]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [173]:
cosine_sim[0]

array([1.        , 0.00377865, 0.        , 0.00356997, 0.        ,
       0.        , 0.        , 0.01648727, 0.        , 0.        ,
       0.        , 0.        , 0.00467307, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00551282,
       0.        , 0.        , 0.00224231, 0.00292596, 0.01091109,
       0.01218616, 0.        , 0.01265084, 0.        , 0.        ,
       0.        , 0.        , 0.0249205 , 0.        , 0.        ,
       0.        , 0.0157512 , 0.        , 0.        , 0.0617168 ,
       0.00524183, 0.        , 0.        , 0.        , 0.00529733,
       0.        , 0.00381369, 0.        , 0.        , 0.        ,
       0.02497007, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.01161201, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.02114618,
       0.        , 0.        , 0.00289205, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [174]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [175]:
def content_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'year', 'id' , 'imdb_id']]
    
    return movies[['title','id','imdb_id']].to_json(orient='records')


### Metadata Based Recommender


In [176]:
credits = pd.read_csv('../input/credits.csv')
keywords = pd.read_csv('../input/keywords.csv')

In [177]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [178]:
md.shape

(818, 26)

In [179]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [180]:
#md['id'].value_counts()

In [181]:
#md[md['id']==0]

In [182]:
#drops index with non english movies titles
#md = md.drop([411, 412, 413,414,415,416,417,418,651,652,653,678,679,680,730,56,732,451,43,641,48,712,74])

#drops index with only english movies titles
md = md.drop([556,557,558,538,539,540,69,53,372,40,45,530])

In [183]:
#md['id'].value_counts()

In [184]:
smd = md[md['id'].isin(links_small)]
#smd

In [185]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [186]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [187]:
smd['director'] = smd['crew'].apply(get_director)

In [188]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [189]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [190]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [191]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [192]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [193]:
s = s.value_counts()
s[:5]

woman director      85
independent film    78
murder              36
based on novel      32
sex                 25
Name: keyword, dtype: int64

In [194]:
s = s[s > 1]

In [195]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [196]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [197]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [198]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [199]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [200]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [201]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

## Collaborative Filtering


In [202]:
reader = Reader()
svd = SVD()

In [203]:
ratings = pd.read_csv('../input/ratings_small.csv')
#ratings.head()

In [224]:
get_user_ratings = None
ratings_table = pd.read_csv('../input/ratings_table.csv')

def post_user_ratings(userId,userName,movieId,movieName,rating,review):
    links_small_map = pd.read_csv('../input/links_small.csv')[['movieId', 'imdbId' , 'tmdbId']]
    tagret_movie = links_small_map[links_small_map['tmdbId'] == movieId]
    tagret_id = tagret_movie['movieId'].iloc[0]
    ratings.loc[len(ratings)] = [userId, tagret_id, rating,"Nulled"]
    veiw_ratings = ratings_table.loc[len(ratings_table)] = [userId, userName , tagret_id , movieName , rating, review ]
    def return_ratings():
        return ratings_table
    global get_user_ratings 
    get_user_ratings= return_ratings()
    return ratings_table
    
def post_SVD(userId,userName,movieId,movieName,rating,review):
    links_small_map = pd.read_csv('../input/links_small.csv')[['movieId', 'imdbId' , 'tmdbId']]
    tagret_movie = links_small_map[links_small_map['tmdbId'] == movieId]
    tagret_id = tagret_movie['movieId'].iloc[0]
    ratings.loc[len(ratings)] = [userId, tagret_id, rating,"Nulled"]
    veiw_ratings = ratings_table.loc[len(ratings_table)] = [userId, userName , tagret_id , movieName , rating, review ]
    reader = Reader()
    svd = SVD()
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    def return_ratings():
        #return ratings
        return ratings_table
    global get_user_ratings 
    get_user_ratings= return_ratings()
    return ratings_table

In [205]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [206]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9005  0.8862  0.8915  0.8970  0.9084  0.8967  0.0076  
MAE (testset)     0.6937  0.6825  0.6878  0.6910  0.6998  0.6910  0.0058  
Fit time          0.67    0.79    0.71    0.67    0.68    0.71    0.05    
Test time         0.16    0.08    0.09    0.09    0.08    0.10    0.03    


{'test_rmse': array([0.90049248, 0.88623302, 0.89145487, 0.89696688, 0.90838541]),
 'test_mae': array([0.69367795, 0.68249407, 0.68783358, 0.6910335 , 0.69980544]),
 'fit_time': (0.6712071895599365,
  0.792992353439331,
  0.7148404121398926,
  0.6718175411224365,
  0.6775209903717041),
 'test_time': (0.1604447364807129,
  0.08311867713928223,
  0.09044528007507324,
  0.08918452262878418,
  0.0832071304321289)}

In [207]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x18b5a1006a0>

In [208]:
#ratings[ratings['userId'] == 1]

In [209]:
svd.predict(1, 302 , 3 )

Prediction(uid=1, iid=302, r_ui=3, est=2.746589533068551, details={'was_impossible': False})

## Hybrid Recommender


In [210]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [211]:
id_map = pd.read_csv('../input/links_small.csv')[['movieId', 'imdbId' , 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'imdbId' , 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [212]:
indices_map = id_map.set_index('id')
#print(indices_map)

In [213]:
def hybrid_recommendations(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    imdbId = id_map.loc[title]['imdbId']
    movie_id = id_map.loc[title]['movieId']
    
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'year', 'id' , 'imdb_id']]
    
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId'] , r_ui=None).est)
    movies = movies.sort_values('est', ascending=False)
    
    return movies[['title','id','imdb_id']].to_json(orient='records')



In [214]:
hybrid_recommendations(1, "The League Of Gentlemen'S Apocalypse")

'[{"title":"The Lost Skeleton Of Cadavra","id":18841,"imdb_id":"tt0307109"},{"title":"Deathgasm","id":323373,"imdb_id":"tt3705412"},{"title":"Saint","id":45756,"imdb_id":"tt1167675"},{"title":"Dead Meat","id":15689,"imdb_id":"tt0369359"},{"title":"Bachelor Night","id":292191,"imdb_id":"tt3797142"},{"title":"Don Verdean","id":309298,"imdb_id":"tt3534282"},{"title":"Chris Tucker Live","id":348035,"imdb_id":"tt4835636"},{"title":"Sociopathia","id":382725,"imdb_id":"tt3899706"},{"title":"Jimmy Vestvood: Amerikan Hero","id":299641,"imdb_id":"tt3135556"},{"title":"Hickey","id":434234,"imdb_id":"tt2122355"},{"title":"A Few Less Men","id":413391,"imdb_id":"tt3784652"},{"title":"Visions Of Suffering","id":89722,"imdb_id":"tt0840027"},{"title":"Lewis Black:  Black On Broadway","id":37801,"imdb_id":"tt0414224"},{"title":"Assume The Position With Mr. Wuhl","id":63011,"imdb_id":"tt0788006"},{"title":"Fear Lives Here","id":238358,"imdb_id":"tt2235515"},{"title":"51","id":57812,"imdb_id":"tt1629439"}

In [215]:
content_recommendations("The Frighteners")

'[{"title":"The Lord Of The Rings: The Fellowship Of The Ring","id":120,"imdb_id":"tt0120737"},{"title":"Saint","id":45756,"imdb_id":"tt1167675"},{"title":"The League Of Gentlemen\'S Apocalypse","id":19723,"imdb_id":"tt0435687"},{"title":"Bride Of Chucky","id":11932,"imdb_id":"tt0144120"},{"title":"Stuart Little","id":10137,"imdb_id":"tt0164912"},{"title":"Nesting","id":104232,"imdb_id":"tt1705126"},{"title":"Silent Hill: Revelation 3D","id":61012,"imdb_id":"tt0938330"},{"title":"Jersey Shore Massacre","id":288154,"imdb_id":"tt2713642"},{"title":"Alien Apocalypse","id":14907,"imdb_id":"tt0404756"},{"title":"The Thinning","id":419639,"imdb_id":"tt5254868"},{"title":"Burying The Ex","id":255798,"imdb_id":"tt3339674"},{"title":"R.S.V.P.","id":30081,"imdb_id":"tt0271210"},{"title":"An American Vampire Story","id":65416,"imdb_id":"tt0251582"},{"title":"Hell Baby","id":159092,"imdb_id":"tt2318527"},{"title":"Cockneys Vs Zombies","id":114606,"imdb_id":"tt1362058"},{"title":"Pervert!","id":463

In [240]:
post_user_ratings(1000 ,"karim" , 282035 , "test1" , 4.5 , "good")

Unnamed: 0,userId,userName,movieId,movieTitle,rating,review
0,1000,karim,105204,test1,4.5,good
1,1000,karim,105204,test1,4.5,good
2,1000,karim,105204,test1,4.5,good
3,1000,karim,170827,test1,4.5,good


In [242]:
get_user_ratings

Unnamed: 0,userId,userName,movieId,movieTitle,rating,review
0,1000,karim,105204,test1,4.5,good
1,1000,karim,105204,test1,4.5,good
2,1000,karim,105204,test1,4.5,good
3,1000,karim,170827,test1,4.5,good


In [244]:
pd.read_json(json.dumps(json.loads()))

Unnamed: 0,title,id,imdb_id
0,The Lord Of The Rings: The Fellowship Of The Ring,120,tt0120737
1,Saint,45756,tt1167675
2,The League Of Gentlemen'S Apocalypse,19723,tt0435687
3,Bride Of Chucky,11932,tt0144120
4,Stuart Little,10137,tt0164912
5,Nesting,104232,tt1705126
6,Silent Hill: Revelation 3D,61012,tt0938330
7,Jersey Shore Massacre,288154,tt2713642
8,Alien Apocalypse,14907,tt0404756
9,The Thinning,419639,tt5254868
