### Libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt

### Collaborative

In [18]:
# Collaborative filtering only needs ids and titles, so the item features
# (the genres column) are discarded right after loading.
movies_df = pd.read_csv('movies.csv').drop(columns='genres')
movies_df

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [19]:
# Load the ratings and discard the timestamp column, which is not used below.
ratings_df = pd.read_csv('ratings.csv').drop(columns='timestamp')
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [20]:
# Ratings given by the user we want recommendations for.
inputMovies = pd.DataFrame({'movieId':[1968,1,2,296,1274],'rating':[5.0,3.5,2.0,5.0,4.5]})
inputMovies = inputMovies.sort_values(by='movieId')
# Attach the title of each rated movie. The join key is spelled out so the merge
# cannot silently start matching on any other column the two frames might share.
inputMovies = pd.merge(inputMovies, movies_df, on='movieId')
inputMovies

Unnamed: 0,movieId,rating,title
0,1,3.5,Toy Story (1995)
1,2,2.0,Jumanji (1995)
2,296,5.0,Pulp Fiction (1994)
3,1274,4.5,Akira (1988)
4,1968,5.0,"Breakfast Club, The (1985)"


In [21]:
# Keep every rating (from any user) that concerns a movie our user has rated too.
rated_by_input_user = ratings_df['movieId'].isin(inputMovies['movieId'].tolist())
subsetratings = ratings_df.loc[rated_by_input_user]
subsetratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
...,...,...,...
99510,609,296,4.0
99534,610,1,5.0
99552,610,296,5.0
99636,610,1274,5.0


In [22]:
# The subset above is still large, so we narrow it to the users that matter most:
# those who share the most rated movies with our user.
# First step: group the ratings per user (the top 100 are selected afterwards).
ratingGroups = subsetratings.groupby(by='userId')

In [23]:
# Order the (userId, ratings) pairs by how many ratings each user shares with
# our user, largest first, and keep the first 100 pairs.
by_overlap = sorted(ratingGroups, key=lambda pair: len(pair[1]), reverse=True)
ratingGroups = by_overlap[:100]
ratingGroups

[(91,
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 (177,
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 (219,
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0),
 (274,
         userId  movieId  rating
  39229     274        1     4.0
  39230     274        2     3.5
  39288     274      296     5.0
  39448     274     1274     4.0
  39549     274     1968     4.0),
 (298,
         userId  movieId  rating
  44535     298        1     2.0
  44536     298        2     0.5
  44555     298      296     4.5
  44620     298     1274     4.0
 

In [24]:
# Similarity between our user and each of the selected users, measured with the
# Pearson correlation of the ratings they gave to the same movies.

def _pearson(xs, ys):
    """Return the Pearson correlation of two equal-length lists of ratings.

    Returns 0.0 when the correlation is undefined, i.e. when either list is
    empty or has no variance.
    """
    n = len(xs)
    if n == 0:
        return 0.0
    sum_x, sum_y = sum(xs), sum(ys)
    Sxx = sum(x**2 for x in xs) - sum_x**2 / float(n)
    Syy = sum(y**2 for y in ys) - sum_y**2 / float(n)
    Sxy = sum(x * y for x, y in zip(xs, ys)) - sum_x * sum_y / float(n)
    # Floating-point cancellation can leave a variance term at a tiny negative
    # value instead of exactly zero; treat anything <= 0 as "no variance" so we
    # never take the square root of a negative number.
    if Sxx <= 0 or Syy <= 0:
        return 0.0
    # Rounding can push the ratio marginally outside [-1, 1]; clamp it back.
    return max(-1.0, min(1.0, Sxy / sqrt(Sxx * Syy)))


# Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

# Only these two columns of the input user's ratings are needed for the pairing.
input_ratings = inputMovies[['movieId', 'rating']]

# For every user group in our subset
for name, group in ratingGroups:
    # Pair both users' ratings by movieId explicitly, so each pair is guaranteed
    # to refer to the same movie instead of relying on two lists sharing an order.
    paired = group.sort_values(by='movieId').merge(
        input_ratings, on='movieId', suffixes=('_other', '_input')
    )
    pearsonCorrelationDict[name] = _pearson(
        paired['rating_input'].tolist(), paired['rating_other'].tolist()
    )

In [25]:
# Show the mapping userId -> Pearson coefficient built in the previous cell.
pearsonCorrelationDict

{91: 0.43852900965351443,
 177: 0.0,
 219: 0.45124262819713973,
 274: 0.716114874039432,
 298: 0.9592712306918567,
 414: 0.9376144618769914,
 474: 0.11720180773462392,
 477: 0.4385290096535153,
 480: 0.7844645405527362,
 483: 0.08006407690254357,
 599: 0.7666866491579839,
 608: 0.920736884379251,
 50: 0.15713484026367722,
 57: -0.7385489458759964,
 68: 0.0,
 103: 0.5222329678670935,
 135: 0.8703882797784892,
 182: 0.9428090415820635,
 202: 0.5222329678670935,
 217: 0.30151134457776363,
 226: 0.9438798074485389,
 288: 0.6005325641789633,
 307: 0.9655810287305759,
 318: 0.44486512077567225,
 322: 0.5057805388588731,
 330: 0.9035942578600878,
 357: 0.5606119105813882,
 434: 0.9864036607532465,
 448: 0.30151134457776363,
 469: 0.8164965809277261,
 561: 0.5222329678670935,
 600: 0.18442777839082938,
 606: 0.9146591207600472,
 610: -0.47140452079103173,
 18: 1.0,
 19: -0.5,
 21: 0,
 45: 0.5000000000000009,
 63: -0.4999999999999982,
 64: 0.0,
 66: 0.5000000000000009,
 107: -1.0,
 122: 0.86602

In [26]:
# Turn the similarity dictionary into a table with one row per user, order it
# from most to least similar, and keep only the first 20 rows because the full
# table is too large.
tmp = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
tmp = (
    tmp.rename(columns={0: 'similarity'})
       .assign(userId=lambda frame: frame.index)
       .sort_values(by='similarity', ascending=False)
       .reset_index(drop=True)
       .head(20)
)
tmp

Unnamed: 0,similarity,userId
0,1.0,132
1,1.0,18
2,1.0,305
3,1.0,489
4,1.0,525
5,1.0,144
6,1.0,562
7,0.986404,434
8,0.981981,560
9,0.965581,307


In [27]:
# sr: every rating made by one of the selected (most similar) users.
sr = ratings_df[ratings_df['userId'].isin(tmp['userId'].tolist())]
# Put each rating next to the similarity of the user who gave it.
tmp = pd.merge(sr, tmp, on='userId', how='inner')
# Weighted rating = rating * similarity of the rating's author.
tmp['w_rating'] = tmp['rating']*tmp['similarity']
# Per movie: sum of similarities and sum of weighted ratings.
# The columns are selected with a list; selecting several columns with a bare
# tuple is deprecated and is rejected by newer pandas releases.
tmp = tmp.groupby('movieId')[['similarity','w_rating']].sum()
# Score of a movie = similarity-weighted average of its ratings.
tmp['score'] = tmp['w_rating']/tmp['similarity']
tmp

  tmp = tmp.groupby('movieId')['similarity','w_rating'].sum() #calculating sum of rating and weightedRating for each movie


Unnamed: 0_level_0,similarity,w_rating,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,17.153713,59.989294,3.497161
2,14.314264,37.745121,2.636889
3,5.632175,17.868134,3.172510
5,2.898383,5.796767,2.000000
6,5.827596,21.863790,3.751768
...,...,...,...
184791,0.937614,2.344036,2.500000
185135,1.000000,4.500000,4.500000
185585,1.000000,2.500000,2.500000
187593,1.000000,5.000000,5.000000


In [28]:
# Rank the movies by score and keep those scoring above 3.
ranked_scores = tmp.sort_values(by='score', ascending=False)
liked_scores = ranked_scores.loc[ranked_scores['score'] > 3]
# Bring in the titles, then drop the two helper columns so only score, movieId
# and title remain.
costum_movies = pd.merge(liked_scores, movies_df, left_index=True, right_on='movieId')
costum_movies = costum_movies.drop(columns=['similarity', 'w_rating'])
costum_movies

Unnamed: 0,score,movieId,title
5013,5.0,7767,"Best of Youth, The (La meglio gioventù) (2003)"
712,5.0,931,Spellbound (1945)
912,5.0,1211,"Wings of Desire (Himmel über Berlin, Der) (1987)"
4969,5.0,7579,Pride and Prejudice (1940)
687,5.0,905,It Happened One Night (1934)
...,...,...,...
518,3.0,605,One Fine Day (1996)
3524,3.0,4815,Hearts in Atlantis (2001)
3411,3.0,4640,Brother (2000)
2089,3.0,2779,Heaven Can Wait (1978)


### Content-based

In [29]:
# Load the movie catalogue again, this time keeping every column.
mov = pd.read_csv(filepath_or_buffer='movies.csv')
mov

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [30]:
# Generate a separate indicator column for each genre: 1.0 when the movie has
# the genre, 0.0 otherwise. `str.get_dummies` builds all columns in one
# vectorised pass instead of a Python loop over every (row, genre) pair.
genre_lists = mov['genres'].str.split('|')
# get_dummies returns its columns sorted alphabetically; re-order them by first
# appearance so the column layout matches what the row-by-row loop produced.
genre_order = list(dict.fromkeys(g for genres in genre_lists for g in genres))
genre_flags = mov['genres'].str.get_dummies(sep='|')[genre_order].astype(float)

# Replace the raw genres string with the indicator columns.
movg = pd.concat([mov.drop(columns='genres'), genre_flags], axis=1)
movg

Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero (2017),0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,193585,Flint (2017),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Movies rated by our user, ordered by movieId.
# NOTE: the name `input` shadows the Python builtin of the same name.
input = pd.DataFrame(
    {
        'movieId': [1968, 1, 2, 296, 1274],
        'rating': [5.0, 3.5, 2.0, 5.0, 4.5],
    }
).sort_values(by='movieId')
input

Unnamed: 0,movieId,rating
1,1,3.5
2,2,2.0
3,296,5.0
4,1274,4.5
0,1968,5.0


In [32]:
# The user's ratings as a Series renumbered from 0.
inputr = input.reset_index(drop=True)['rating']
inputr

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
Name: rating, dtype: float64

In [33]:
# Genre indicators of the movies our user rated, one row per rated movie.
# Merging on movieId (instead of filtering `movg` with isin) makes the rows
# follow the order of `input`, rather than depending on `movg` happening to be
# ordered the same way.
usermg = (
    input[['movieId']]
    .merge(movg, on='movieId', how='inner')
    .drop(columns=['movieId', 'title'])
)
usermg

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# User profile: for every genre, the sum of the ratings the user gave to the
# movies carrying that genre (ratings vector dotted with the genre matrix).
user_profile = inputr.dot(usermg)
user_profile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
War                    0.0
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
Name: rating, dtype: float64

In [35]:
# Score every movie for our user: the genre weights of the user profile that the
# movie matches, divided by the total weight of the profile.
# Only the genre columns enter the product; movieId and title are not features
# and are left out explicitly instead of relying on them turning into NaN.
genre_cols = user_profile.index
mov_score = movg[genre_cols].mul(user_profile, axis=1).sum(axis=1) / user_profile.sum()
mov_score = mov_score.to_frame(name='score')
mov_score


Unnamed: 0,score
0,0.594406
1,0.293706
2,0.188811
3,0.328671
4,0.188811
...,...
9737,0.440559
9738,0.377622
9739,0.139860
9740,0.174825


In [36]:
# Attach movieId and title to every score through the shared row index.
scored_titles = pd.merge(mov_score, mov, left_index=True, right_index=True)
scored_titles = scored_titles.drop(columns='genres')
ranked_titles = scored_titles.sort_values(by='score', ascending=False)

# A score above 0.5 is treated as a 'good movie'.
costum_movie2 = ranked_titles.loc[ranked_titles['score'] > 0.5]
costum_movie2

Unnamed: 0,score,movieId,title
8900,0.734266,134853,Inside Out (2015)
9169,0.685315,148775,Wizards of Waverly Place: The Movie (2009)
8597,0.678322,117646,Dragonheart 2: A New Beginning (2000)
4631,0.678322,6902,Interstate 60 (2002)
7441,0.671329,81132,Rubber (2010)
...,...,...,...
9542,0.503497,172637,Priklyucheniya Kapitana Vrungelya (1979)
7547,0.503497,85179,Summer Wars (Samâ wôzu) (2009)
5350,0.503497,8917,Team America: World Police (2004)
8109,0.503497,100611,Escape from Planet Earth (2013)


## Merge both results to make a decision

In [37]:
from sklearn.preprocessing import MinMaxScaler

# Movies recommended by both systems (collaborative score_x, content-based score_y).
both = pd.merge(costum_movies, costum_movie2, how='inner', on='movieId')
both = both.drop(columns='title_x')

# The two scores live on different ranges, so rescale each one to [0, 1].
for target, source in (('collaborative', 'score_x'), ('content_based', 'score_y')):
    raw_scores = both[source].values.reshape(-1, 1)
    both[target] = MinMaxScaler().fit_transform(raw_scores)

# Final score: plain average of the two normalised scores.
both['final_score'] = (both['collaborative'] + both['content_based']) / 2

# Drop the raw scores, order by final score, tidy the title column and the index.
recommended = (
    both.drop(columns=['score_x', 'score_y'])
        .sort_values(by='final_score', ascending=False)
        .rename(columns={'title_y': 'title'})
        .reset_index(drop=True)
)
recommended

Unnamed: 0,movieId,title,collaborative,content_based,final_score
0,1223,"Grand Day Out with Wallace and Gromit, A (1989)",1.0,0.333333,0.666667
1,6902,Interstate 60 (2002),0.572208,0.757576,0.664892
2,134853,Inside Out (2015),0.276608,1.0,0.638304
3,78499,Toy Story 3 (2010),0.709501,0.393939,0.55172
4,3000,Princess Mononoke (Mononoke-hime) (1997),0.979824,0.121212,0.550518
5,108932,The Lego Movie (2014),0.392381,0.666667,0.529524
6,1907,Mulan (1998),0.354305,0.666667,0.510486
7,2987,Who Framed Roger Rabbit? (1988),0.281331,0.69697,0.48915
8,4886,"Monsters, Inc. (2001)",0.507825,0.393939,0.450882
9,4719,Osmosis Jones (2001),0.28392,0.606061,0.44499
