# Collaborative Filtering

### Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Load the Data

In [2]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Transforming the data

In [4]:
movie_ratings = pd.merge(movies,ratings).drop(['genres','timestamp'],axis=1)
print(movie_ratings.shape)
movie_ratings.head()

(100836, 4)


Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


## 1. Implementation of Item-Item Filtering

In [5]:
# Create a matrix using pivot table for the users and movies they've rated
userRatings = movie_ratings.pivot_table(index=['userId'],columns=['title'],values='rating')
userRatings.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [6]:
print(userRatings.shape)

(610, 9719)


In [7]:
# Remove movies which have less than 10 users' ratings
userRatings = userRatings.dropna(thresh=10, axis=1).fillna(0,axis=1)
print(userRatings.shape)

(610, 2269)


In [8]:
# Use the Pearson correlation score with movies to determine the correlation scores
movie_correlations = userRatings.corr(method = 'pearson')
np.fill_diagonal( movie_correlations.values, 0 )
movie_correlations.head(10)

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.063117,-0.023768,0.143482,0.011998,0.087931,0.224052,0.034223,0.009277,0.008331,...,0.017477,0.03247,0.134701,0.153158,0.101301,0.049897,0.003233,0.187953,0.062174,0.353194
(500) Days of Summer (2009),0.063117,0.0,0.142471,0.273989,0.19396,0.148903,0.142141,0.159756,0.135486,0.200135,...,0.374515,0.178655,0.068407,0.414585,0.355723,0.252226,0.216007,0.053614,0.241092,0.125905
10 Cloverfield Lane (2016),-0.023768,0.142471,0.0,-0.005799,0.112396,0.006139,-0.016835,0.031704,-0.024275,0.272943,...,0.242663,0.099059,-0.023477,0.272347,0.241751,0.195054,0.319371,0.177846,0.096638,0.002733
10 Things I Hate About You (1999),0.143482,0.273989,-0.005799,0.0,0.24467,0.223481,0.211473,0.011784,0.091964,0.043383,...,0.243118,0.104858,0.13246,0.091853,0.158637,0.281934,0.050031,0.121029,0.130813,0.110612
"10,000 BC (2008)",0.011998,0.19396,0.112396,0.24467,0.0,0.234459,0.119132,0.059187,-0.025882,0.089328,...,0.260261,0.087592,0.094913,0.184521,0.242299,0.240231,0.094773,0.088045,0.203002,0.083518
101 Dalmatians (1996),0.087931,0.148903,0.006139,0.223481,0.234459,0.0,0.285112,0.119843,0.072399,0.029967,...,0.114968,0.077232,0.096294,0.067134,0.113224,0.184324,0.054024,0.047804,0.156932,0.078734
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.224052,0.142141,-0.016835,0.211473,0.119132,0.285112,0.0,0.134037,0.017264,-0.046277,...,0.120302,0.125816,0.049818,0.08365,0.171654,0.27426,0.077594,0.085606,0.24882,0.171118
12 Angry Men (1957),0.034223,0.159756,0.031704,0.011784,0.059187,0.119843,0.134037,0.0,0.132979,0.058862,...,0.104518,0.028415,0.079905,0.241435,0.144652,0.122107,0.056742,-0.001708,0.074306,0.102744
12 Years a Slave (2013),0.009277,0.135486,-0.024275,0.091964,-0.025882,0.072399,0.017264,0.132979,0.0,0.249931,...,0.024045,0.038127,0.013786,0.190366,0.10415,0.017351,0.063325,0.002528,0.037469,0.004213
127 Hours (2010),0.008331,0.200135,0.272943,0.043383,0.089328,0.029967,-0.046277,0.058862,0.249931,0.0,...,0.223135,0.154299,0.012907,0.364841,0.198926,0.091416,0.225747,0.128638,0.153335,0.002912


In [9]:
movie_correlations.shape

(2269, 2269)

In [10]:
# Create a dataframe for a user ratings!
userRatings = pd.DataFrame([['Liar Liar (1997)', 5], ['Fargo (1996)', 1]], columns = ['title', 'rating']) 
userRatings 

Unnamed: 0,title,rating
0,Liar Liar (1997),5
1,Fargo (1996),1


In [11]:
def get_recommended_movies_1(userRatings, n_recommendations=5):
    
    similar_movies_list = pd.Series([], dtype='float')
    for i in range(0, len(userRatings.index)):
        similar_movie = movie_correlations[userRatings['title'][i]].dropna() 
        similar_movie = similar_movie.map(lambda x: x * userRatings['rating'][i]) 
        similar_movies_list = similar_movies_list.append(similar_movie)
    
    similar_movies_list.sort_values(inplace = True, ascending = False)
   
    return similar_movies_list.head(n_recommendations).index.values

In [12]:
recommended_movies = get_recommended_movies_1(userRatings, 10)
print("Movies you should watch are:")
print("---" * 13)
for i in range(len(recommended_movies)): 
    print (recommended_movies[i])

Movies you should watch are:
---------------------------------------
Men in Black (a.k.a. MIB) (1997)
Zoolander (2001)
Bruce Almighty (2003)
Meet the Parents (2000)
Mr. Deeds (2002)
Wedding Crashers (2005)
Austin Powers: The Spy Who Shagged Me (1999)
Indiana Jones and the Temple of Doom (1984)
Wayne's World (1992)
Dodgeball: A True Underdog Story (2004)


## 2. Implementation of User-Item Filtering 

In [13]:
from sklearn.metrics import pairwise_distances

In [14]:
users_ratings = ratings.pivot_table(index=['userId'],columns=['movieId'],values='rating').reset_index(drop=True)
users_ratings.fillna( 0, inplace = True )
movie_similarity = 1 - pairwise_distances( users_ratings, metric="cosine" )
np.fill_diagonal( movie_similarity, 0 )
users_ratings = pd.DataFrame( movie_similarity )
users_ratings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,0.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
1,0.027283,0.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
2,0.059720,0.000000,0.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
3,0.194395,0.003726,0.002251,0.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
4,0.129080,0.016614,0.005020,0.128659,0.000000,0.300349,0.108342,0.429075,0.000000,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,...,0.178084,0.116534,0.300669,0.066032,0.148141,0.000000,0.153063,0.262558,0.069622,0.201104
606,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,0.000000,0.283081,0.149190,0.139114
607,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,0.000000,0.121993,0.322055
608,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,0.000000,0.053225


In [15]:
similar_users =  pd.DataFrame(users_ratings.idxmax(axis=1), columns = ['similar_user']) 
similar_users

Unnamed: 0,similar_user
0,265
1,365
2,312
3,390
4,469
...,...
605,473
606,569
607,479
608,339


In [16]:
def movieId2Title(movieIDs):
    movie_titles= list()
    for id in movieIDs:
        movie_titles.append(movies[movies['movieId']==id]['title'].values[0])
    return movie_titles

In [17]:
def getRecommendedMovies_2(userId, n_recommendations=5):

    user_to_movies = ratings[ratings['userId']== userId]['movieId']
    sim_user = similar_users.iloc[userId,0]
    recommended_df = pd.DataFrame(columns=['movieId','title','userId','rating'])
    for movieId in ratings[ratings['userId']== sim_user]['movieId']:
        if movieId not in user_to_movies:
            new_df = movie_ratings[(movie_ratings.userId==sim_user) & (movie_ratings.movieId==movieId)]
            recommended_df = pd.concat([recommended_df, new_df])
        
    recommended_df = recommended_df.sort_values(['rating'], ascending = False )[1:(n_recommendations+1)]  
    
    return recommended_df['movieId']

In [18]:
user_id = 50
recommended_movies = movieId2Title(getRecommendedMovies_2(user_id, 10))
print("Movies you should watch are:")
print("---" * 13)
for j in range(len(recommended_movies)): 
    print (recommended_movies[j])

Movies you should watch are:
---------------------------------------
Mighty Wind, A (2003)
Requiem for a Dream (2000)
Royal Tenenbaums, The (2001)
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Hedwig and the Angry Inch (2000)
Lost in Translation (2003)
Heavenly Creatures (1994)
Punch-Drunk Love (2002)
Sideways (2004)
American Splendor (2003)
