In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [2]:
# Load the movie catalogue and the ratings table from the working directory.
movies, rate = (pd.read_csv(csv_path) for csv_path in ('movies.csv', 'ratings.csv'))

In [3]:
# Movies the input user has already rated, as (title, rating) pairs.
rated_titles = [
    ('Breakfast Club, The', 5),
    ('Toy Story', 3.5),
    ('Jumanji', 2),
    ("Pulp Fiction", 5),
    ('Akira', 4.5),
]
user_input = [{'title': title, 'rating': score} for title, score in rated_titles]

# One row per rated movie, columns: title, rating.
user_df = pd.DataFrame(user_input)

In [4]:
# Display the input user's ratings table.
user_df

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [5]:
# Peek at the raw catalogue; note that the release year is embedded in the title.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Clean the movies dataset: move the release year out of the title,
# e.g. "Toy Story (1995)" -> title "Toy Story", years "1995".
year_from_title = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

# Re-running this cell finds no year in the already-cleaned titles; keep the
# values from the earlier run instead of overwriting them with NaN.
if 'years' in movies.columns:
    year_from_title = year_from_title.fillna(movies['years'])
movies['years'] = year_from_title

# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave "(1995)" in every title.
# .str.strip() (unlike apply(lambda x: x.strip())) tolerates missing titles.
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

In [8]:
# Confirm the cleaning: titles no longer carry the year and a 'years' column exists.
movies.head(3)

Unnamed: 0,movieId,title,genres,years
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


In [9]:
# Look up the catalogue rows whose title matches one of the input user's movies.
wanted_titles = user_df['title'].tolist()
user_movie_id = movies.loc[movies['title'].isin(wanted_titles)]

In [10]:
# Display the catalogue rows matched to the input user's titles.
user_movie_id

Unnamed: 0,movieId,title,genres,years
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
257,296,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
973,1274,Akira,Action|Adventure|Animation|Sci-Fi,1988
1445,1968,"Breakfast Club, The",Comedy|Drama,1985


In [11]:
# Attach catalogue details (movieId, genres, years) to each of the input user's ratings.
# 'title' is the only column the two frames share, so it is the join key.
user_mov = user_df.merge(user_movie_id, how='inner', on='title')

In [12]:
# Display the input user's ratings joined with the catalogue columns.
user_mov

Unnamed: 0,title,rating,movieId,genres,years
0,"Breakfast Club, The",5.0,1968,Comedy|Drama,1985
1,Toy Story,3.5,1,Adventure|Animation|Children|Comedy|Fantasy,1995
2,Jumanji,2.0,2,Adventure|Children|Fantasy,1995
3,Pulp Fiction,5.0,296,Comedy|Crime|Drama|Thriller,1994
4,Akira,4.5,1274,Action|Adventure|Animation|Sci-Fi,1988


In [13]:
# Only title, rating and movieId are needed from here on.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell
# safe to re-run: the in-place version raises KeyError the second time.
user_mov = user_mov.drop(columns=['genres', 'years'], errors='ignore')

In [14]:
# Display the trimmed table after dropping the 'genres' and 'years' columns.
user_mov

Unnamed: 0,title,rating,movieId
0,"Breakfast Club, The",5.0,1968
1,Toy Story,3.5,1
2,Jumanji,2.0,2
3,Pulp Fiction,5.0,296
4,Akira,4.5,1274


In [16]:
# Peek at the first two rows of the ratings table.
rate.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [17]:
# Every rating, by any user, of a movie the input user has also rated.
watched_ids = user_mov['movieId'].tolist()
similar_user = rate.loc[rate['movieId'].isin(watched_ids)]

In [18]:
# Peek at the ratings that overlap with the input user's movies.
similar_user.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
16,1,296,3.0,964982967
320,4,296,1.0,945173350
422,4,1968,4.0,986934786
516,5,1,4.0,847434962


In [19]:
# Bucket the overlapping ratings by user so each user's rows can be handled together.
similar_user_group = similar_user.groupby(by='userId')

similar_user_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe89c0e4730>

In [20]:
# Spot-check one bucket: the overlapping ratings made by userId 6.
# NOTE: only valid while similar_user_group is still the GroupBy object; the
# sorting cell further down rebinds this name to a plain list.
similar_user_group.get_group(6)

Unnamed: 0,userId,movieId,rating,timestamp
560,6,2,4.0,845553522
692,6,296,2.0,845553110


In [21]:
# Rank users by how many of the input user's movies they have rated (most overlap first).
# Iterating the GroupBy yields (userId, ratings DataFrame) pairs.
ranked_groups = list(similar_user_group)
ranked_groups.sort(key=lambda pair: len(pair[1]), reverse=True)
similar_user_group = ranked_groups

In [22]:
# Number of users who rated at least one of the input user's movies.
len(similar_user_group)

419

In [23]:
# Preview only the three users with the most overlap; echoing the whole list
# prints every (userId, DataFrame) pair and floods the notebook output.
similar_user_group[:3]

[(91,
         userId  movieId  rating   timestamp
  14121      91        1     4.0  1112713037
  14122      91        2     3.0  1112713392
  14173      91      296     4.5  1112711264
  14316      91     1274     5.0  1112713057
  14383      91     1968     3.0  1112713409),
 (177,
         userId  movieId  rating   timestamp
  24900     177        1     5.0  1435533535
  24901     177        2     3.5  1435534109
  24930     177      296     5.0  1435530409
  25069     177     1274     2.0  1435535036
  25129     177     1968     3.5  1435534080),
 (219,
         userId  movieId  rating   timestamp
  31524     219        1     3.5  1194681084
  31525     219        2     2.5  1194740185
  31554     219      296     4.0  1198522553
  31628     219     1274     2.5  1194686351
  31680     219     1968     3.0  1194931899),
 (274,
         userId  movieId  rating   timestamp
  39229     274        1     4.0  1171410158
  39230     274        2     3.5  1171934785
  39288     274      2

In [24]:
# Keep only the 100 users with the largest overlap to bound the correlation work below.
similar_user_group = similar_user_group[:100]

In [25]:
from scipy.stats import pearsonr

# userId -> Pearson correlation between that user's ratings and the input
# user's ratings, computed over the movies both have rated.
pcf_corr = {}

# Loop-invariant: the input user's ratings only need to be prepared once.
user_sorted = user_mov.sort_values(by='movieId')[['movieId', 'rating']]

for user_id, movie in similar_user_group:
    movie = movie.sort_values(by='movieId')

    # Pair the two users' ratings by movieId rather than by list position, so a
    # missing or duplicated row cannot silently misalign the two vectors.
    paired = movie.merge(user_sorted, on='movieId', suffixes=('_peer', '_user'))

    if len(paired) < 2:
        # pearsonr raises ValueError for fewer than two points; record the
        # correlation as undefined instead of aborting the whole loop.
        pcf_corr[user_id] = float('nan')
        continue

    user_form = paired['rating_user'].tolist()
    group_form = paired['rating_peer'].tolist()

    # pearsonr returns (correlation, p-value); only the correlation is kept.
    # It is NaN when either rating vector is constant.
    corr = pearsonr(user_form, group_form)
    pcf_corr[user_id] = corr[0]
    



In [26]:
# Inspect the userId -> Pearson correlation mapping built by the loop above.
pcf_corr

{91: 0.4385290096535147,
 177: 6.938893903907228e-18,
 219: 0.4512426281971403,
 274: 0.7161148740394331,
 298: 0.9592712306918569,
 414: 0.9376144618769908,
 474: 0.11720180773462385,
 477: 0.4385290096535146,
 480: 0.7844645405527362,
 483: 0.08006407690254358,
 599: 0.7666866491579839,
 608: 0.9207368843792512,
 50: 0.15713484026367724,
 57: -0.7385489458759964,
 68: 0.0,
 103: 0.5222329678670936,
 135: 0.8703882797784894,
 182: 0.9428090415820636,
 202: 0.5222329678670936,
 217: 0.30151134457776363,
 226: 0.9438798074485389,
 288: 0.6005325641789633,
 307: 0.9655810287305762,
 318: 0.44486512077567236,
 322: 0.5057805388588732,
 330: 0.9035942578600878,
 357: 0.560611910581388,
 434: 0.9864036607532465,
 448: 0.30151134457776363,
 469: 0.8164965809277261,
 561: 0.5222329678670936,
 600: 0.18442777839082944,
 606: 0.9146591207600473,
 610: -0.4714045207910318,
 18: 1.0,
 19: -0.5,
 21: nan,
 45: 0.5,
 63: -0.5,
 64: 0.0,
 66: 0.5,
 107: -1.0,
 122: 0.8660254037844388,
 132: 1.0,
 14

In [27]:
# Tabulate the correlations, most similar users first.
pearson_df = pd.DataFrame(list(pcf_corr.items()), columns=['userId', 'similarity_index'])
pearson_df = pearson_df.sort_values(by='similarity_index', ascending=False)

# Keep only positively correlated users before taking the top 50. NaN
# (undefined) or non-positive similarities would otherwise be used as weights
# and could cancel out the denominator of the weighted average computed later.
# The comparison is False for NaN, so undefined correlations are dropped too.
pearson_df = pearson_df[pearson_df['similarity_index'] > 0].head(50)
pearson_df

Unnamed: 0,userId,similarity_index
34,18,1.0
90,562,1.0
82,489,1.0
63,305,1.0
43,132,1.0
86,525,1.0
46,144,1.0
27,434,0.986404
89,560,0.981981
22,307,0.965581


In [28]:
# Pull in every rating made by the selected similar users ...
users_rating = pd.merge(pearson_df, rate, how='inner', on='userId')

# ... and weight each rating by how similar its author is to the input user.
users_rating = users_rating.assign(
    weighted_rating=users_rating['rating'] * users_rating['similarity_index']
)
users_rating

Unnamed: 0,userId,similarity_index,movieId,rating,timestamp,weighted_rating
0,18,1.0,1,3.5,1455209816,3.5
1,18,1.0,2,3.0,1455617462,3.0
2,18,1.0,6,4.0,1460138360,4.0
3,18,1.0,16,4.5,1461311583,4.5
4,18,1.0,32,4.0,1455209840,4.0
...,...,...,...,...,...,...
27870,555,0.5,4081,2.0,980125364,1.0
27871,555,0.5,4084,3.0,980125838,1.5
27872,555,0.5,4085,3.0,980125838,1.5
27873,555,0.5,4086,4.0,980125895,2.0


In [29]:
# Per movie: total similarity of the users who rated it and the total of their
# similarity-weighted ratings. Selecting the two columns before .sum() avoids
# summing userId, rating and timestamp only to throw the results away.
grouped_ratings = users_rating.groupby('movieId')[['similarity_index', 'weighted_rating']].sum()

In [30]:
# Display the per-movie sums of similarity_index and weighted_rating.
grouped_ratings

Unnamed: 0_level_0,similarity_index,weighted_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,36.354096,131.667946
2,31.505292,96.654257
3,8.783859,26.381456
4,0.866025,1.732051
5,6.165336,16.275255
...,...,...
187593,1.000000,5.000000
187595,0.937614,3.281651
187717,0.802955,4.014775
188189,0.802955,3.613298


In [31]:
# Similarity-weighted average rating per movie:
#   sum(similarity * rating) / sum(similarity)
avg_score = grouped_ratings['weighted_rating'] / grouped_ratings['similarity_index']

# The (misspelt) column name 'avg_reccomend_score' and 'movieID' are kept as-is
# so anything reading this frame later keeps working.
recommend_movies = pd.DataFrame({
    'avg_reccomend_score': avg_score.to_numpy(),
    'movieID': grouped_ratings.index.to_numpy(),
})

# Keep only the movies with the highest possible score (5).
# The score is a float quotient, so compare with a tight absolute tolerance
# instead of ==, which can miss a true 5 because of rounding error.
is_top_score = np.isclose(recommend_movies['avg_reccomend_score'], 5.0, rtol=0.0, atol=1e-9)
recommend_movies = recommend_movies[is_top_score]
recommend_movies

Unnamed: 0,avg_reccomend_score,movieID
67,5.0,85
128,5.0,187
201,5.0,290
316,5.0,456
436,5.0,633
...,...,...
6040,5.0,162414
6061,5.0,167064
6081,5.0,170705
6136,5.0,187593
