# Movies Recommender System

![](http://labs.criteo.com/wp-content/uploads/2017/08/CustomersWhoBought3.jpg)

In [1]:
import pandas as pd
import numpy as np
import ast, json
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, KFold

import warnings; warnings.simplefilter('ignore')

## Simple Recommender


In [2]:
md = pd. read_csv('../input/movies_metadata.csv')
#md

In [3]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

In [5]:
m = vote_counts.quantile(0.95)

In [6]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [7]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(41, 6)

In [8]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [9]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [10]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

### Top Movies

In [11]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [12]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

## Content Based Recommender


In [13]:
links_small = pd.read_csv('../input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [14]:
#md = md.drop([19730, 29503, 35587])

In [15]:
#Check EDA Notebook for how and why I got these indices.
md['id'] = md['id'].astype('int')

In [16]:
smd = md[md['id'].isin(links_small)]
smd.shape

(818, 26)

### Movie Description Based Recommender


In [17]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [18]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])


In [19]:
tfidf_matrix.shape

(818, 33682)

In [20]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [21]:
cosine_sim[0]

array([1.        , 0.00377865, 0.        , 0.00356997, 0.        ,
       0.        , 0.        , 0.01648727, 0.        , 0.        ,
       0.        , 0.        , 0.00467307, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00551282,
       0.        , 0.        , 0.00224231, 0.00292596, 0.01091109,
       0.01218616, 0.        , 0.01265084, 0.        , 0.        ,
       0.        , 0.        , 0.0249205 , 0.        , 0.        ,
       0.        , 0.0157512 , 0.        , 0.        , 0.0617168 ,
       0.00524183, 0.        , 0.        , 0.        , 0.00529733,
       0.        , 0.00381369, 0.        , 0.        , 0.        ,
       0.02497007, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.01161201, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.02114618,
       0.        , 0.        , 0.00289205, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [22]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [23]:
def content_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'year', 'id' , 'imdb_id']]
    
    return movies[['title','id','imdb_id']]


### Metadata Based Recommender


In [24]:
credits = pd.read_csv('../input/credits.csv')
credits=credits[["cast" ,"crew" , "id" ]]
keywords = pd.read_csv('../input/keywords.csv')

In [25]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [26]:
md.shape

(818, 26)

In [27]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [28]:
#md['id'].value_counts()

In [29]:
#md[md['id']==0]

In [30]:
#drops index with non english movies titles and fully credits
#md = md.drop([411, 412, 413,414,415,416,417,418,651,652,653,678,679,680,730,56,732,451,43,641,48,712,74])

#drops index with only english movies titles and fully credits
#md = md.drop([556,557,558,538,539,540,69,53,372,40,45,530])

#drops index with only english movies titles and 800 credits
md = md.drop([69,53,372,530,40,554,538,45])

In [31]:
#md['id'].value_counts()

In [32]:
smd = md[md['id'].isin(links_small)]
#smd

In [33]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [34]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [35]:
smd['director'] = smd['crew'].apply(get_director)

In [36]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [37]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [38]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [39]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [40]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [41]:
s = s.value_counts()
s[:5]

woman director      85
independent film    78
murder              36
based on novel      32
sex                 25
Name: keyword, dtype: int64

In [42]:
s = s[s > 1]

In [43]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [44]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [45]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [46]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [47]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [48]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [49]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

## Collaborative Filtering


In [50]:
reader = Reader()
svd = SVD()

In [51]:
ratings = pd.read_csv('../input/ratings_small.csv')
#ratings.head()

In [52]:
get_user_ratings = None
ratings_table = pd.read_csv('../input/ratings_table.csv')

def post_user_ratings(userId,userName,movieId,movieName,rating,review):
    links_small_map = pd.read_csv('../input/links_small.csv')[['movieId', 'imdbId' , 'tmdbId']]
    tagret_movie = links_small_map[links_small_map['tmdbId'] == movieId]
    tagret_id = tagret_movie['movieId'].iloc[0]
    ratings.loc[len(ratings)] = [userId, tagret_id, rating,"Nulled"]
    veiw_ratings = ratings_table.loc[len(ratings_table)] = [userId, userName , tagret_id , movieName , rating, review ]
    def return_ratings():
        return ratings_table
    global get_user_ratings 
    get_user_ratings= return_ratings()
    return ratings_table
    
def post_SVD(userId,userName,movieId,movieName,rating,review):
    links_small_map = pd.read_csv('../input/links_small.csv')[['movieId', 'imdbId' , 'tmdbId']]
    tagret_movie = links_small_map[links_small_map['tmdbId'] == movieId]
    tagret_id = tagret_movie['movieId'].iloc[0]
    ratings.loc[len(ratings)] = [userId, tagret_id, rating,"Nulled"]
    veiw_ratings = ratings_table.loc[len(ratings_table)] = [userId, userName , tagret_id , movieName , rating, review ]
    reader = Reader()
    svd = SVD()
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    def return_ratings():
        #return ratings
        return ratings_table
    global get_user_ratings 
    get_user_ratings= return_ratings()
    return ratings_table

In [53]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [54]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9004  0.8903  0.8987  0.8969  0.9004  0.8973  0.0038  
MAE (testset)     0.6968  0.6850  0.6902  0.6904  0.6902  0.6905  0.0037  
Fit time          0.72    0.73    0.73    0.73    0.74    0.73    0.00    
Test time         0.14    0.09    0.10    0.14    0.09    0.11    0.02    


{'test_rmse': array([0.90040963, 0.89028834, 0.89873353, 0.89692182, 0.90038948]),
 'test_mae': array([0.6967962 , 0.68504343, 0.6901772 , 0.6904215 , 0.69015277]),
 'fit_time': (0.7237086296081543,
  0.7265925407409668,
  0.726449728012085,
  0.7253353595733643,
  0.7355144023895264),
 'test_time': (0.14103150367736816,
  0.09045767784118652,
  0.09931731224060059,
  0.14087700843811035,
  0.09063267707824707)}

In [55]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1999cf008b0>

In [56]:
#ratings[ratings['userId'] == 1]

In [57]:
svd.predict(1, 302 , 3 )

Prediction(uid=1, iid=302, r_ui=3, est=2.7597536588807277, details={'was_impossible': False})

## Hybrid Recommender


In [58]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [59]:
id_map = pd.read_csv('../input/links_small.csv')[['movieId', 'imdbId' , 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'imdbId' , 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [67]:
indices_map = id_map.set_index('id')
#print(indices_map)

In [61]:
def hybrid_recommendations(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    imdbId = id_map.loc[title]['imdbId']
    movie_id = id_map.loc[title]['movieId']
    
    #cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'year', 'id' , 'imdb_id']]
    
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId'] , r_ui=None).est)
    movies = movies.sort_values('est', ascending=False)
    
    return movies[['title','id','imdb_id']]



In [62]:
hybrid_recommendations(1, "The Frighteners")

Unnamed: 0,title,id,imdb_id
566,The Lord Of The Rings: The Fellowship Of The Ring,120,tt0120737
176,Hell Baby,159092,tt2318527
377,Assume The Position With Mr. Wuhl,63011,tt0788006
184,Fear Lives Here,238358,tt2235515
5,The Axe Murders Of Villisca,401065,tt3320500
488,Dead Meat,15689,tt0369359
254,51,57812,tt1629439
524,Ride Or Die,72790,tt0313532
128,Call Girl Of Cthulhu,285598,tt2689354
112,Darling,352372,tt4126394


In [63]:
content_recommendations("The Frighteners")

Unnamed: 0,title,id,imdb_id
566,The Lord Of The Rings: The Fellowship Of The Ring,120,tt0120737
277,Saint,45756,tt1167675
426,The League Of Gentlemen'S Apocalypse,19723,tt0435687
697,Bride Of Chucky,11932,tt0144120
654,Stuart Little,10137,tt0164912
222,Nesting,104232,tt1705126
206,Silent Hill: Revelation 3D,61012,tt0938330
145,Jersey Shore Massacre,288154,tt2713642
458,Alien Apocalypse,14907,tt0404756
61,The Thinning,419639,tt5254868


In [64]:
post_user_ratings(1000 ,"karim" , 282035 , "test1" , 4.5 , "good")

Unnamed: 0,userId,userName,movieId,movieTitle,rating,review
0,1000,karim,170827,test1,4.5,good


In [65]:
get_user_ratings

Unnamed: 0,userId,userName,movieId,movieTitle,rating,review
0,1000,karim,170827,test1,4.5,good
