# Movie Recommendation System Project
Based on data from https://drive.google.com/file/d/1Dn1BZD3YxgBQJSIjbfNnmCFlDW2jdQGD/view

In [224]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.describe()

Unnamed: 0,movieId
count,10329.0
mean,31924.282893
std,37734.741149
min,1.0
25%,3240.0
50%,7088.0
75%,59900.0
max,149532.0


In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


## Preprocessing

In [7]:
movies['genres'] = [x.replace('Film-Noir', 'filmnoir') for x in movies['genres']]
movies['genres'] = [x.replace('Sci-Fi', 'scifi') for x in movies['genres']]

ratings.drop('timestamp', axis=1, inplace=True)

In [8]:
# create a movie title dictionary
movie_dict = movies.drop('genres',axis=1).set_index('movieId').to_dict()

## Content-based Recommendation System

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfv = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1)
x = tfv.fit_transform(movies['genres'])

In [10]:
# compute the cosine similarities of genres
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(x, x)

In [201]:
# define function to get recommendations based on cosine similarites of genre
def genre_recommendations(title):
    idx = movies[movies['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices].drop(idx)

In [214]:
genre_recommendations('Toy Story 2 (1999)').head(10)

0                                        Toy Story (1995)
1815                                          Antz (1998)
2967       Adventures of Rocky and Bullwinkle, The (2000)
3166                     Emperor's New Groove, The (2000)
3811                                Monsters, Inc. (2001)
6617    DuckTales: The Movie - Treasure of the Lost La...
6997                                     Wild, The (2006)
7382                               Shrek the Third (2007)
7987                       Tale of Despereaux, The (2008)
9215    Asterix and the Vikings (Astérix et les Viking...
Name: title, dtype: object

## Collaborative Filtering

### Memory Based

Memory based models use past behavior and not context (or attributes) to make recommendations. These types of models are susceptible to "cold-start" problems in which there is no historical data to make a recommendation. There are two flavors of memory based models: user-based and item-based.

In [14]:
# let's merge the movie titles with the ratings
data = pd.merge(movies,ratings)
data.drop(['genres'], axis=1, inplace=True)
data.shape

(105339, 4)

In [106]:
data.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),2,5.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),8,5.0
3,1,Toy Story (1995),11,4.0
4,1,Toy Story (1995),14,4.0


#### Item-Based

In [15]:
# lets make a pivot table in order to make rows are users and columns are movies. And values are rating
item_pivot_table = data.pivot_table(index = ["userId"],columns = ["title"],values = "rating",fill_value=0)
item_pivot_table.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [168]:
def get_item_recommendation(input, k=5):
    movie_watched = item_pivot_table[input]
    sim_scores = item_pivot_table.corrwith(movie_watched)
    sim_scores = sim_scores.sort_values(ascending=False)
    return sim_scores.drop(input).head(k)

In [170]:
get_item_recommendation('Toy Story (1995)', 3)

title
Star Wars: Episode VI - Return of the Jedi (1983)    0.451954
Toy Story 2 (1999)                                   0.443862
Star Wars: Episode IV - A New Hope (1977)            0.420390
dtype: float64

#### User-Based

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
user_pivot_table = item_pivot_table

In [20]:
def remove_rated_titles(liked_items, user_recs, n_recs):
#     liked_items = list(data[data['userId'] == 55].sort_values('rating', ascending=False)['title'])
    liked_removed = [x for x in list(user_recs.index) if x not in liked_items]
    user_recs = user_recs.loc[liked_removed].sort_values(ascending=False).index[:n_recs].to_list()
    return user_recs

def user_based_recs(user, n_recs=5, n_users=5):
    """
    Args:
    user - the userId you want recommendations for
    n_recs - the number of recommendations you want to return
    n_users - the number of users you want to compare your user to (i.e. the number of user similarity scores to use)
    """
    # find top 3 movies user liked for reference
    liked_items = list(data[data['userId'] == user].sort_values('rating', ascending=False)['title'])
    
    # calculate the cosine similarity scores for the given user
    user_scores = pd.Series(cosine_similarity(user_pivot_table)[user-1]) #subtract one due to indexing mismatch
    user_scores.index = range(1,len(user_scores)+1) # match indexing (increase by one to match pivot table)
    sim_users_scores = user_scores.sort_values(ascending=False)[1:n_users+1] # take top 5 similar users
    sim_users = list(sim_users_scores.index) # save top n userId's as list
    sim_users_df = user_pivot_table.T[sim_users].T # create user-item matrix based on top n users
    
    # take the weighted sum of similar user's rankings
    user_recs =  sim_users_scores.dot(sim_users_df) / np.array([np.abs(sim_users_scores).sum(axis=0)]) 
    # remove the titles the user has already seen
    user_recs = remove_rated_titles(liked_items, user_recs, n_recs)

    # print the user, some of their liked movies, and top n recommendations
    print('User: {}'.format(user))
    print('Liked item: ', liked_items[:3])
    print('Recommendations:', list(user_recs))

In [21]:
user_based_recs(55,3,3)

User: 55
Liked item:  ['Coming to America (1988)', 'Mallrats (1995)', 'Ghostbusters II (1989)']
Recommendations: ['American Beauty (1999)', 'Fight Club (1999)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)']


#### Alternate Memory Based: Non-Parametric KNN

Here I will use a python library called surprise to implement a non-parametric memory based model. It uses k-nearest neighbors to develop its recommendations. This model can be either user or item based. I am choosed to use user-based in this demonstration.

In [23]:
from surprise import KNNWithMeans, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [95]:
model_data = data.drop('title', axis=1)
model_data = model_data[['userId','movieId','rating']] # need to re-arrange columns
model_data.head()

Unnamed: 0,userId,movieId,rating
0,2,1,5.0
1,5,1,4.0
2,8,1,5.0
3,11,1,4.0
4,14,1,4.0


In [96]:
# Reading the dataset
reader = Reader(rating_scale=(1, 5))
knn_data = Dataset.load_from_df(model_data,reader)

# Create trainset 
trainset = knn_data.build_full_trainset()

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

# create test set and run the trained model
testset = trainset.build_anti_testset() # use anti_test set to get user-item pairs that do not have ratings
test_pred = algo.test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [220]:
from collections import defaultdict
# define the return top n function
def get_top_n(predictions):
    
    # first map the predictions to each user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid,est))
    
    # next sort the predictions by user and retrieve the k highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings
        
    return top_n

In [49]:
# call top_n function to retrieve top movies 
top_n = get_top_n(test_pred)

In [50]:
def get_knn_recommendation(user, n=3):

    # find 3 movies user liked for reference
    liked_item = list(data[data['userId'] == user].sort_values('rating', ascending=False).iloc[:3]['title'])

    print('User: {}'.format(user))
    print('Sample liked movies: ', liked_item)

    # examine top 3 products for user
    temp = top_n.get(user)[:n]
    recs = pd.Series([x[0] for x in temp]).map(movie_dict['title'])
    
    print('Recommendations:', list(recs))

In [57]:
get_knn_recommendation(55, 3)

User: 55
Sample liked movies:  ['Coming to America (1988)', 'Mallrats (1995)', 'Ghostbusters II (1989)']
Recommendations: ['Misérables, Les (1995)', "You Can't Take It with You (1938)", 'Trekkies (1997)']


## Model-Based

In this approach, CF models are developed using machine learning algorithms to predict user’s rating of unrated items
Matrix Factorization: attitudes or preferences of a user can be determined by a small number of hidden factors. We can call these factors "Embeddings".

#### Singular Value Decomposition (SVD) Model

In [216]:
# import SVD from Surprise library
from surprise import SVD

In [217]:
# we can train the model using the user based model data set since it's the correct format
svd_model = SVD(n_factors=150, n_epochs=20, lr_all=0.008, 
                    reg_all=0.1, random_state=24)
svd_model.fit(trainset)

# predict ratings for all user-item pairs that are not in the trainset
predictions = svd_model.test(testset)

In [225]:
# build top picks for each user

# first map the predictions to each user
top_picks = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_picks[uid].append((iid,est))

# next sort the predictions by user and retrieve the k highest ones
for uid, user_ratings in top_picks.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_picks[uid] = user_ratings

In [61]:
def get_svd_recs(user, n=3):

    # find 3 movies user liked for reference
    liked_item = list(data[data['userId'] == user].sort_values('rating', ascending=False).iloc[:3]['title'])

    print('User: {}'.format(user))
    print('Sample liked movies: ', liked_item)

    # examine top 3 products for user
    temp = top_picks.get(user)[:n]
    recs = pd.Series([x[0] for x in temp]).map(movie_dict['title'])
    
    print('Recommendations:', list(recs))

In [63]:
get_svd_recs(55, 3)

User: 55
Sample liked movies:  ['Coming to America (1988)', 'Mallrats (1995)', 'Ghostbusters II (1989)']
Recommendations: ['Misérables, Les (1995)', "You Can't Take It with You (1938)", 'Trekkies (1997)']


## Ensemble Model

In [None]:
# To do
# create ensemble model
# measure performance of all models
# grid search params for SVD
# create a way to input user likes/dislikes
# create an output function

## Performance

In [55]:
def my_train_test_split(data):
    n_users = data.userId.nunique()
    test = pd.DataFrame(columns=['userId','movieId','rating'])
    train = data.copy()

    for user in range(1,n_users+1):
        temp = data[(data['userId'] == user) & (data['rating'] > 0)].sample(10, random_state=24)
        test = pd.concat([test, temp])
    train = pd.merge(train, test, left_index=True, right_index=True, how="outer", indicator=True).query('_merge=="left_only"')
    train = train[['userId_x','movieId_x','rating_x', 'title_x']]
    train = train.rename(columns={'userId_x': 'userId', 'movieId_x':'movieId', 'rating_x':'rating', 'title_x':'title'})
    return train, test

In [56]:
train, test = my_train_test_split(data)

In [129]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore zeros
    pred = pred[actual.nonzero()]
    actual = actual[actual.nonzero()]
    return mean_squared_error(pred, actual)

def test_memory_based(train, test, kind='item'):
    temp = train.pivot_table(index = ["userId"],columns = ["title"],values = "rating",fill_value=0).to_numpy()
    actual = test.pivot_table(index = ["userId"],columns = ["title"],values = "rating",fill_value=0).to_numpy()
    if kind == 'item':
        sim = temp.T.dot(temp) + 1e-9 # add small value to fix divide by zero
        pred = temp.dot(sim) / np.array([np.abs(sim).sum(axis=1)])
    elif kind == 'user':
        sim = temp.dot(temp.T) + 1e-9 # add small value to fix divide by zero
        pred = sim.dot(temp) / np.array([np.abs(sim).sum(axis=1)]).T
    else:
        print('Please choose an approriate input for kind.')
        return
    mse = get_mse(pred, actual)
    print('{}-based MSE: {}'.format(kind, mse))

In [100]:
def test_knn(data):
    trainset, testset = train_test_split(data, test_size=.10, random_state=24)
    algo = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline', 'user_based': True})
    algo.fit(trainset)
    predictions = algo.test(testset)
    mse = accuracy.mse(predictions, verbose=False)
    print('KNN Model MSE: {}'.format(mse))

In [102]:
def test_svd(data):
    trainset, testset = train_test_split(data, test_size=.10, random_state=24)
    svd_model = SVD()
    svd_model.fit(trainset)
    predictions = svd_model.test(testset)
    mse = accuracy.mse(predictions, verbose=False)
    print('SVD Model MSE: {}'.format(mse))

In [130]:
test_memory_based(train, test, kind='item')
test_memory_based(train, test, kind='user')

item-based MSE: 13.232804141815977
user-based MSE: 13.544850175634158


In [101]:
test_knn(knn_data)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
KNN Model MSE: 0.8367164051160263


In [105]:
test_svd(knn_data)

SVD Model MSE: 0.04275652330105412
