# Movie Recommendation System Using Collaborative Filtering 

In [1]:
import pandas as pd

### Building the user-user collaborative filtering for toy-dataset

In [547]:
toy_dataset = pd.read_csv("collab_filtering_dataset/toy_dataset.csv", index_col=0)
toy_dataset

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


In [146]:
# Applying standardization:
def standardize(row):
    return (row - row.mean())/(row.max()-row.min())

toy_dataset_std = toy_dataset.apply(standardize, axis=1).fillna(0)
toy_dataset_std

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,0.25,0.5,0.0,0.0,-0.25,-0.5
user 2,0.666667,0.0,0.0,-0.333333,-0.333333,0.0
user 3,-0.625,0.0,0.0,0.125,0.375,0.125
user 4,0.0,-0.166667,-0.5,0.5,0.0,0.166667
user 5,-0.533333,0.0,-0.2,0.133333,0.133333,0.466667


In [137]:
# similarity matrix for users 
from sklearn.metrics.pairwise import cosine_similarity
toy_similarity_matrix = cosine_similarity(toy_dataset_std)
toy_similarity_matrix_df = pd.DataFrame(toy_similarity_matrix, index=toy_dataset.index, columns=toy_dataset.index)
toy_similarity_matrix_df

Unnamed: 0,user 1,user 2,user 3,user 4,user 5
user 1,1.0,0.387298,-0.527046,-0.282843,-0.66564
user 2,0.387298,1.0,-0.952579,-0.273861,-0.716115
user 3,-0.527046,-0.952579,1.0,0.149071,0.80397
user 4,-0.282843,-0.273861,0.149071,1.0,0.431455
user 5,-0.66564,-0.716115,0.80397,0.431455,1.0


In [587]:
x = toy_dataset_std.T.corr(method="spearman")
corr_df = pd.DataFrame(x, index= toy_dataset.index, columns= toy_dataset.index)
corr_df

Unnamed: 0,user 1,user 2,user 3,user 4,user 5
user 1,1.0,0.422701,-0.746352,-0.441176,-0.75
user 2,0.422701,1.0,-0.874007,-0.39139,-0.626224
user 3,-0.746352,-0.874007,1.0,0.477665,0.850841
user 4,-0.441176,-0.39139,0.477665,1.0,0.632353
user 5,-0.75,-0.626224,0.850841,0.632353,1.0


In [221]:
toy_dataset["action1"].mean()

2.75

In [588]:
import numpy as np

def recommend_movies(userids):
    result_df = pd.DataFrame()
    for userid in userids:
        result = None
        similar_users = get_similar_users(userid)
        toy_dataset2 = toy_dataset.fillna(2.5).astype('float')
        #print(similar_users)
        user_ratings = pd.Series(dtype=float)
        for index, value in similar_users.iteritems():
            #print("user:", index, "similarity score: ", value)
            if value >= 0:
                user_ratings = user_ratings.append(toy_dataset2.loc[index].multiply(toy_dataset2.loc[index]) * value)
                #user_ratings = user_ratings.append(toy_dataset2.loc[index] * value)
                #print(user_ratings)
            else:
                continue
                #user_ratings = user_ratings.append(5 - (toy_dataset2.loc[index]  * (-value) * toy_dataset2.loc[userid]))
        user_ratings = user_ratings.astype('float')
        result = user_ratings.groupby(level=0).mean()
        result = (result / result.max()) * 5
        for index, value in result.iteritems():
            if value > 2.5:
                result.loc[index] = np.floor(value)
            else:
                result.loc[index] = np.ceil(value)
        result = result.to_frame().rename(columns={0:userid}).T
        result_df = result_df.append(result)
    return result_df


def get_similar_users(userid):
    return corr_df[userid]
    
result = recommend_movies(toy_dataset.index)
result

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,2.0,1.0,1.0
user 2,5.0,3.0,3.0,2.0,1.0,2.0
user 3,1.0,2.0,2.0,4.0,5.0,4.0
user 4,2.0,2.0,2.0,5.0,4.0,4.0
user 5,1.0,2.0,2.0,4.0,4.0,5.0


In [589]:
# Validating the Recommendation System using the mean square error method.
import math
def MSE(predicted_values, actual_values):
    total = 0
    null_count = 0
    predicted_values = predicted_values.values.tolist()
    predicted_values = sum(predicted_values, [])
    
    actual_values = actual_values.values.tolist()
    actual_values = sum(actual_values, [])

    for i in range(0,len(predicted_values)):
        if math.isnan(float(actual_values[i])):
            null_count = null_count + 1 
        else:
            total = total + (predicted_values[i] - float(actual_values[i])) ** 2
  
    mean = total / (len(actual_values) - null_count)
    return mean


In [590]:
Mean_sq_error = MSE(result, toy_dataset)
Mean_sq_error


0.34782608695652173

In [592]:
# Applying the same Algorithm for the actual dataset
movies = pd.read_csv("collab_filtering_dataset/movies.csv")
ratings = pd.read_csv("collab_filtering_dataset/ratings.csv")

In [621]:
ratings_matrix = pd.pivot_table(ratings, index="userId", columns="movieId", values="rating")
ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [595]:
# Applying standardization:
def standardize(row):
    return (row - row.mean())/(row.max()-row.min())

ratings_matrix_std = ratings_matrix.apply(standardize, axis=1).fillna(0)
ratings_matrix_std

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-9.159483e-02,-1.096066e-18,-9.159483e-02,-1.096066e-18,-1.096066e-18,-9.159483e-02,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18,...,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18,-1.096066e-18
2,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,...,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19,-2.269198e-19
3,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,...,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19,-2.283470e-19
4,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,...,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18,-1.141735e-18
5,9.090909e-02,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,...,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19,-1.255908e-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-2.571998e-01,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,-2.571998e-01,4.018907e-18,4.018907e-18,4.018907e-18,...,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18,4.018907e-18
607,5.347594e-02,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,...,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20,9.133879e-20
608,-1.409279e-01,-2.520390e-01,-2.520390e-01,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,1.924054e-01,...,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18,-2.922841e-18
609,-2.702703e-01,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,7.297297e-01,...,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19,-3.653552e-19


In [597]:
# Similarity Matrix for users:
x = ratings_matrix_std.T.corr(method="spearman")
corr_df = pd.DataFrame(x, index= ratings_matrix.index, columns= ratings_matrix.index)
corr_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.024206,0.010474,0.058180,0.009991,-0.033358,0.031598,0.047908,0.009771,-0.011249,...,0.019529,-0.027899,-0.012505,-0.000091,0.004339,0.008851,0.057397,0.069039,0.010987,0.009532
2,0.024206,1.000000,0.000083,-0.013177,0.027921,0.000293,-0.045610,-0.027033,0.000051,0.015397,...,0.055011,-0.015607,-0.000967,0.000208,0.000340,0.005712,-0.014128,0.012999,-0.030012,0.052091
3,0.010474,0.000083,1.000000,-0.010857,-0.024166,0.008996,0.000097,-0.023410,-0.000015,0.000084,...,0.000003,-0.000149,0.025667,-0.000060,-0.010821,-0.019922,-0.000007,0.004697,-0.000139,0.022156
4,0.058180,-0.013177,-0.010857,1.000000,-0.020422,0.004488,0.043922,-0.009655,-0.009890,0.016846,...,-0.007418,0.018751,0.055803,-0.006349,0.028078,0.073200,0.024162,-0.032686,-0.010306,0.014044
5,0.009991,0.027921,-0.024166,-0.020422,1.000000,0.067662,0.036560,0.022030,0.000009,-0.012779,...,0.000053,-0.038356,0.034470,-0.044997,-0.020465,-0.003418,0.033063,0.030520,0.049737,0.008830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.008851,0.005712,-0.019922,0.073200,-0.003418,-0.006099,0.046441,0.004930,0.040321,-0.001651,...,0.048511,-0.018756,0.051883,0.033937,0.018956,1.000000,-0.015837,0.050026,0.045360,0.073008
607,0.057397,-0.014128,-0.000007,0.024162,0.033063,0.033847,0.035563,0.043023,-0.010684,-0.006742,...,0.021818,0.007396,0.025519,-0.021571,-0.034006,-0.015837,1.000000,0.031427,-0.047230,0.031799
608,0.069039,0.012999,0.004697,-0.032686,0.030520,-0.001514,0.054015,0.061535,0.010789,-0.009407,...,0.070253,0.015168,0.030153,0.023369,-0.025834,0.050026,0.031427,1.000000,0.073859,0.012877
609,0.010987,-0.030012,-0.000139,-0.010306,0.049737,-0.000042,0.000283,0.023898,-0.000085,0.000395,...,0.033247,0.013880,-0.002457,0.032701,-0.033885,0.045360,-0.047230,0.073859,1.000000,-0.028433


In [602]:
import numpy as np

def recommend_movies(userids):
    result_df = pd.DataFrame()
    for userid in userids:
        print("Processing for ", userid)
        result = None
        similar_users = get_similar_users(userid)
        ratings_matrix2 = ratings_matrix.fillna(2.5).astype('float')
        #print(similar_users)
        user_ratings = pd.Series(dtype=float)
        for index, value in similar_users.iteritems():
            #print("user:", index, "similarity score: ", value)
            if value >= 0:
                user_ratings = user_ratings.append(ratings_matrix2.loc[index].multiply(ratings_matrix.loc[index]) * value)
                #user_ratings = user_ratings.append(toy_dataset2.loc[index] * value)
                #print(user_ratings)
            else:
                continue
                #user_ratings = user_ratings.append(5 - (toy_dataset2.loc[index]  * (-value) * toy_dataset2.loc[userid]))
        user_ratings = user_ratings.astype('float')
        result = user_ratings.groupby(level=0).mean()
        result = (result / result.max()) * 5
        for index, value in result.iteritems():
            if value > 2.5:
                result.loc[index] = np.floor(value)
            else:
                result.loc[index] = np.ceil(value)
        result = result.to_frame().rename(columns={0:userid}).T
        result_df = result_df.append(result)
    return result_df


def get_similar_users(userid):
    return corr_df[userid]
    
result = recommend_movies(ratings_matrix.index[:20])
result

Processing for  1
Processing for  2
Processing for  3
Processing for  4
Processing for  5
Processing for  6
Processing for  7
Processing for  8
Processing for  9
Processing for  10
Processing for  11
Processing for  12
Processing for  13
Processing for  14
Processing for  15
Processing for  16
Processing for  17
Processing for  18
Processing for  19
Processing for  20


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
10,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [603]:
# Find the Mean square Error
Mean_sq_error = MSE(result, ratings_matrix)
Mean_sq_error

0.030600664679031164

In [624]:
reco_movies = result.loc[1][result.loc[1].values < 3].sort_values(ascending=False)
reco_movies

ratings_matrix.loc[1][ratings_matrix.loc[1].values < 2].sort_values(ascending=False)

ratings_matrix.loc[1]


  after removing the cwd from sys.path.


movieId
1         4.0
2         NaN
3         4.0
4         NaN
5         NaN
         ... 
193581    NaN
193583    NaN
193585    NaN
193587    NaN
193609    NaN
Name: 1, Length: 9724, dtype: float64

In [539]:
len(toy_dataset2)

30

In [546]:
toy_dataset

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,2.5,2.0,1.0,4.0,2.5,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


In [326]:
a = [1,2,3]
b = [3,4,5]
# a * b
a * 1.5


TypeError: can't multiply sequence by non-int of type 'float'

In [599]:
toy_dataset

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


In [324]:
def recommend_movies(userid):
    similar_users = get_similar_users(userid)
    #print(similar_users)
    user_ratings = pd.Series()
    for index, value in similar_users.iteritems():
        #print("user:", index, "similarity score: ", value)
        if value >= 0:
            user_ratings = user_ratings.append(ratings_matrix.loc[index]  * value )
        else:
            continue
            #user_ratings = user_ratings.append(5 - (toy_dataset.loc[index]   * -value))
        #print(toy_dataset.loc[index])
        #print(user_ratings)
    #print(user_ratings)
    print(user_ratings.groupby(level=0).mean().sort_values(ascending=False))
    
def get_similar_users(userid):
    return toy_similarity_matrix_df[userid]
    
recommend_movies("user 4")


  after removing the cwd from sys.path.


TypeError: can't multiply sequence by non-int of type 'float'

In [179]:
toy_dataset

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


In [189]:
# item-item collaboration
item_similarity_matrix = cosine_similarity(toy_dataset_std.T)
item_similarity_matrix_df = pd.DataFrame(item_similarity_matrix, index=toy_dataset.columns, columns=toy_dataset.columns)
item_similarity_matrix_df

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.218147,0.182187,-0.543961,-0.942141,-0.581505
action2,0.218147,1.0,0.29361,-0.251732,-0.411607,-0.73716
action3,0.182187,0.29361,1.0,-0.81795,-0.085939,-0.458848
romantic1,-0.543961,-0.251732,-0.81795,1.0,0.485647,0.358917
romantic2,-0.942141,-0.411607,-0.085939,0.485647,1.0,0.568239
romantic3,-0.581505,-0.73716,-0.458848,0.358917,0.568239,1.0


In [281]:
def recommend_movies_item_collab(userid):
    user_ratings = toy_dataset.loc[userid]
    movie_rating = pd.Series()
    for index, value in user_ratings.iteritems():
        x = get_similar_movies(index , value)
        movie_rating = movie_rating.append(x)
        
    movie_rating = movie_rating.groupby(level=0).mean().sort_values(ascending=False)
    return movie_rating
    
    
def get_similar_movies(movie_name, rating):
    similar_score = item_similarity_matrix_df[movie_name] * (rating - 2.5)
    similar_score = similar_score
    return similar_score
   

print(recommend_movies_item_collab("user 1"))

action2      0.857114
action1      0.695958
action3      0.447709
romantic1   -0.527089
romantic2   -0.767512
romantic3   -0.945740
dtype: float64


  This is separate from the ipykernel package so we can avoid doing imports until


In [207]:
toy_dataset

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


In [None]:
def recommend_movies_item_collab(userid):
    user_ratings = toy_dataset.loc[userid]
    movie_rating = pd.Series()
    for index, value in user_ratings.iteritems():
        x = get_similar_movies(index , value)
        movie_rating = movie_rating.append(x)
        
    movie_rating = movie_rating.groupby(level=0).mean().sort_values(ascending=False)
    return movie_rating
    
    
def get_similar_users(movie_name, rating):
    similar_score = item_similarity_matrix_df[movie_name] * (rating - 2.5)
    similar_score = similar_score
    return similar_score
   

print(recommend_movies_item_collab("user 4"))

In [2]:
movies = pd.read_csv("collab_filtering_dataset/movies.csv")
ratings = pd.read_csv("collab_filtering_dataset/ratings.csv")

In [3]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [4]:
movies.shape

(9742, 3)

In [9]:
import re
movies['year'] = movies["title"].str.extract("\((\d+)\)").fillna(0).astype(int)
movies.head(3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995


In [15]:
def get_average_rating(movies):
    avg = []
    for index, row in movies.iterrows():
        a = ratings[ratings['movieId'] == row["movieId"]]['rating'].mean()
        avg.append(a)
    return avg

movies["average_rating"] = get_average_rating(movies)
movies.head(3)

Unnamed: 0,movieId,title,genres,year,average_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,3.259615


In [5]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [6]:
ratings.shape

(100836, 4)

In [20]:
# Building a ratings matrix with userId vs Movieid 
ratings_matrix = pd.pivot_table(ratings, index="userId", columns="movieId", values="rating")
ratings_matrix.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


### Normalizing the ratings across every row - i.e. for every user

In [87]:
# Normalize the values:
# 1. Normalize the values:
def standardize(row):
    return (row - row.mean())/(row.max()-row.min())

ratings_matrix2 = ratings_matrix.apply(standardize, axis=1).fillna(0)
ratings_matrix2.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.091595,0.0,-0.091595,0.0,0.0,-0.091595,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1. Building User-User Collaborative Filtering:

In [18]:
# Building User-User Collaborative Filtering:
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(ratings_matrix2)
user_similarity_df = pd.DataFrame(user_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)
user_similarity_df.head(3)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.0062,0.047013,0.01951,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.0,0.0,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.0,0.003012,...,-0.050551,-0.031581,-0.001688,0.0,0.0,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.0,1.0,-0.01126,-0.031539,0.0048,0.0,-0.032471,0.0,0.0,...,-0.004904,-0.016117,0.017749,0.0,-0.001431,-0.037289,-0.007789,-0.013001,0.0,0.01955


In [79]:
def get_similar_users(userid, num):
    return user_similarity_df[userid].sort_values(ascending = False)[:num]


def get_movies(movie_ids):
    output = pd.DataFrame()
    for movieid in movie_ids:
        output = output.append(movies[movies['movieId'] == movieid])
    return output


def get_movies_watched(userid):
    ids = ratings[(ratings['userId'] == userid)]['movieId'].tolist()
    return get_movies(ids)


def get_movies_liked(userid):
    ratings = ratings_matrix.iloc[ratings_matrix.index.get_loc(userid)]
    mean_rating_by_user = ratings.mean()
    movies_liked_ids = ratings[ratings.values > mean_rating_by_user].index
    return get_movies(movies_liked_ids)


def get_movie_recommendation_ids(userid, similar_users_count):
    movies_watched = movies_watched_by_user(userid)
    r = pd.Series()
    similar_users = get_similar_users(userid, similar_users_count)
    for userid, score in similar_users.iteritems():
        x = ratings_matrix2.iloc[ratings_matrix2.index.get_loc(userid)]
        for movie_id in movies_watched:
            x = x.drop(index=movie_id)
        r = r.append(x)

    r = r.sort_values(ascending=False)[:50]
    return r

get_movies_liked(1)



Unnamed: 0,movieId,title,genres,year,average_rating
43,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995,3.975369
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1995,4.237745
89,101,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,1996,3.782609
124,151,Rob Roy (1995),Action|Drama|Romance|War,1995,3.545455
130,157,Canadian Bacon (1995),Comedy|War,1995,2.863636
...,...,...,...,...,...
2764,3702,Mad Max (1979),Action|Adventure|Sci-Fi,1979,3.486842
2765,3703,"Road Warrior, The (Mad Max 2) (1981)",Action|Adventure|Sci-Fi|Thriller,1981,4.037500
2788,3729,Shaft (1971),Action|Crime|Drama|Thriller,1971,4.000000
2836,3793,X-Men (2000),Action|Adventure|Sci-Fi,2000,3.699248


### 2. Building item-item collaborative filtering:

In [22]:
## Building a item-item collaborative filtering:

from sklearn.metrics.pairwise import cosine_similarity
movie_similarity = cosine_similarity(ratings_matrix2.T)
movie_similarity_df = pd.DataFrame(movie_similarity, index=ratings_matrix.columns, columns=ratings_matrix.columns)
movie_similarity_df.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.00511,0.055251,-0.030907,-0.129645,0.073183,-0.023355,0.020432,-0.152252,-0.065419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.00511,1.0,0.005243,-0.073136,0.057203,-0.011372,0.028101,0.037774,-0.00693,-0.10697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.055251,0.005243,1.0,-0.028179,0.166869,-0.001486,0.095035,-0.024656,-0.057074,-0.075184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
def recommend_movies_item_to_item_collab(userid):
    movies_rated = ratings[ratings['userId'] == userid][['movieId','rating']]
    similar_movies = pd.Series()
    
    for i, row in movies_rated.iterrows():
        s = movie_similarity_df[row["movieId"]]
        s = s.drop(index=row["movieId"])
        #print(s)
        similar_movies = similar_movies.append(s * row["rating"])
        #print(movie_similarity_df[row["movieId"]])
        
    similar_movies = similar_movies.sort_values(ascending=False)[:50]
    return similar_movies

### 3. Building a content-based Recommendation:

In [46]:
# Building a content-based Recommendation:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
word_count = cv.fit_transform(movies["genres"])
word_count.toarray().shape

from sklearn.metrics.pairwise import cosine_similarity
content_similarity = cosine_similarity(word_count)
content_similarity_df = pd.DataFrame(content_similarity, index=movies["movieId"], columns= movies["movieId"])
content_similarity_df

def recommend_movies_content_based(userid):
    movies_rated = ratings[ratings['userId'] == userid][['movieId','rating']]
    similar_movies = pd.Series()
    
    for i, row in movies_rated.iterrows():
        s = content_similarity_df[row["movieId"]]
        s = s.drop(index=row["movieId"])
        #print(s)
        similar_movies = similar_movies.append(s * row["rating"])
        #print(movie_similarity_df[row["movieId"]])
        
    similar_movies = similar_movies.sort_values(ascending=False)[:50]
    return similar_movies

# Recommending Movies based on different approaches for Userid : 13

In [84]:
userid = 13

### Movies watched by user

In [85]:
movies_watched = get_movies_watched(userid)
print("Movies watched by userid ",userid, ":",len(movies_watched))
movies_watched[:10]

Movies watched by userid  13 : 31


Unnamed: 0,movieId,title,genres,year,average_rating
43,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995,3.975369
265,305,Ready to Wear (Pret-A-Porter) (1994),Comedy,1994,2.833333
514,597,Pretty Woman (1990),Comedy|Romance,1990,3.485185
879,1173,"Cook the Thief His Wife & Her Lover, The (1989)",Comedy|Drama,1989,3.136364
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,1981,4.2075
1193,1590,Event Horizon (1997),Horror|Sci-Fi|Thriller,1997,2.94
1219,1619,Seven Years in Tibet (1997),Adventure|Drama|War,1997,3.428571
1231,1639,Chasing Amy (1997),Comedy|Drama|Romance,1997,3.576923
1291,1721,Titanic (1997),Drama|Romance,1997,3.414286
1603,2145,Pretty in Pink (1986),Comedy|Drama|Romance,1986,3.242424


### Movies liked by user

In [86]:
movies_liked = get_movies_liked(userid)
print("Movies liked by userid ",userid, ":",len(movies_liked))
movies_liked[:10]

Movies liked by userid  13 : 18




Unnamed: 0,movieId,title,genres,year,average_rating
43,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995,3.975369
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,1981,4.2075
1193,1590,Event Horizon (1997),Horror|Sci-Fi|Thriller,1997,2.94
1231,1639,Chasing Amy (1997),Comedy|Drama|Romance,1997,3.576923
1291,1721,Titanic (1997),Drama|Romance,1997,3.414286
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,1999,4.192446
2477,3300,Pitch Black (2000),Horror|Sci-Fi|Thriller,2000,3.564103
2674,3578,Gladiator (2000),Action|Adventure|Drama,2000,3.938235
2701,3624,Shanghai Noon (2000),Action|Adventure|Comedy|Western,2000,3.151163
2808,3753,"Patriot, The (2000)",Action|Drama|War,2000,3.448529


### Recommendation based upon Popularity, ie, Movies with highest rating

In [64]:
# Buildng a recommendation purely based on popularity:
print("Recommendation based upon Popularity, ie, Movies with highest rating")
reco_movies_popularity = movies.sort_values("average_rating", ascending=False)
reco_movies_popularity[:10]

Recommendation based upon Popularity, ie, Movies with highest rating


Unnamed: 0,movieId,title,genres,year,average_rating
7656,88448,Paper Birds (Pájaros de papel) (2010),Comedy|Drama,2010,5.0
8107,100556,"Act of Killing, The (2012)",Documentary,2012,5.0
9083,143031,Jump In! (2007),Comedy|Drama|Romance,2007,5.0
9094,143511,Human (2015),Documentary,2015,5.0
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy,2015,5.0
4251,6201,Lady Jane (1986),Drama|Romance,1986,5.0
8154,102217,Bill Hicks: Revelations (1993),Comedy,1993,5.0
8148,102084,Justice League: Doom (2012),Action|Animation|Fantasy,2012,5.0
4246,6192,Open Hearts (Elsker dig for evigt) (2002),Romance,2002,5.0
9122,145994,Formula of Love (1984),Comedy,1984,5.0


### Recommendation based on popularity for only the latest movies:

In [65]:
# Recommendation based on popularity for only the latest movies:
print("Recommendation based on popularity for only the latest movies:")
reco_movies_popularity_latest = movies.sort_values("average_rating", ascending=False)[:50].sort_values("year",ascending=False)
reco_movies_popularity_latest[:10]

Recommendation based on popularity for only the latest movies:


Unnamed: 0,movieId,title,genres,year,average_rating
9068,142444,The Editor (2015),Comedy|Horror|Mystery,2015,5.0
9094,143511,Human (2015),Documentary,2015,5.0
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy,2015,5.0
9055,141928,Bloodsucking Bastards (2015),Comedy|Horror,2015,5.0
9042,141718,Deathgasm (2015),Comedy|Horror,2015,5.0
9022,140627,Battle For Sevastopol (2015),Drama|Romance|War,2015,5.0
9131,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy,2015,5.0
8212,103602,Craig Ferguson: I'm Here To Help (2013),Comedy|Documentary,2013,5.0
8148,102084,Justice League: Doom (2012),Action|Animation|Fantasy,2012,5.0
8107,100556,"Act of Killing, The (2012)",Documentary,2012,5.0


### Recommendedation based on content-based System

In [66]:
# Recommendation based upon content
movie_ids = recommend_movies_content_based(userid)
reco_movies_content_based = get_movies(movie_ids.index)
print("Movies recommended to user ",userid, " by content-based Engine")
reco_movies_content_based[:10]

  


Movies recommended to user  1  by content-based Engine


Unnamed: 0,movieId,title,genres,year,average_rating
5809,31923,"Three Musketeers, The (1973)",Action|Adventure|Comedy,1973,4.0
7766,91488,"Snowman, The (1982)",Animation|Children|Musical,1982,3.75
2573,3441,Red Dawn (1984),Action|Drama|War,1984,3.428571
6804,60803,"Little Drummer Boy, The (1968)",Animation|Children|Musical,1968,2.5
7849,93610,Space Battleship Yamato (2010),Action|Adventure|Drama,2010,3.0
6645,56715,Wristcutters: A Love Story (2006),Drama|Fantasy|Romance,2006,4.0
6932,64997,War of the Worlds (2005),Action|Sci-Fi,2005,3.0
1236,1645,The Devil's Advocate (1997),Drama|Mystery|Thriller,1997,3.411765
1118,1458,Touch (1997),Drama|Fantasy|Romance,1997,4.0
5766,31193,"Many Adventures of Winnie the Pooh, The (1977)",Animation|Children|Musical,1977,3.5


### Recommendedations based on user-user-collaborative filtering 

In [67]:
# Recommendation based upon user-user collaboration
print("\nMovies recommended to user ",userid, " by user-user-recommendation")
movie_ids = get_movie_recommendation_ids(userid,20)
reco_movies_user_collab = get_movies(movie_ids.index)
reco_movies_user_collab[:10]


Movies recommended to user  1  by user-user-recommendation




Unnamed: 0,movieId,title,genres,year,average_rating
483,551,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical,1993,3.553763
695,913,"Maltese Falcon, The (1941)",Film-Noir|Mystery,1941,4.170455
2316,3070,Adventures of Buckaroo Banzai Across the 8th D...,Adventure|Comedy|Sci-Fi,1984,3.547619
1055,1372,Star Trek VI: The Undiscovered Country (1991),Action|Mystery|Sci-Fi,1991,3.345238
2195,2918,Ferris Bueller's Day Off (1986),Comedy,1986,3.83945
1057,1374,Star Trek II: The Wrath of Khan (1982),Action|Adventure|Sci-Fi|Thriller,1982,3.766129
2510,3358,Defending Your Life (1991),Comedy|Drama|Fantasy|Romance,1991,3.85
2536,3396,"Muppet Movie, The (1979)",Adventure|Children|Comedy|Musical,1979,3.590909
2044,2723,Mystery Men (1999),Action|Comedy|Fantasy,1999,3.089744
1158,1527,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi,1997,3.74569


### Recommendedations based upon item-item-collaborative filtering

In [68]:
# Recommendation based upon item-item collaboration
movie_ids = recommend_movies_item_to_item_collab(userid)
reco_movies_item_collab = get_movies(movie_ids.index)
print("Movies recommended to userid",userid," by item-item-recommendation")
reco_movies_item_collab[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


Movies recommended to userid 1  by item-item-recommendation


Unnamed: 0,movieId,title,genres,year,average_rating
2296,3043,Meatballs 4 (1992),Comedy,1992,1.0
2730,3667,Rent-A-Cop (1988),Action|Comedy|Crime,1988,1.0
408,470,House Party 3 (1994),Comedy,1994,1.0
2119,2816,Iron Eagle II (1988),Action|War,1988,1.666667
1561,2098,Son of Flubber (1963),Children|Comedy,1963,2.5
1200,1598,Desperate Measures (1998),Crime|Drama|Thriller,1998,2.5
2284,3031,Repossessed (1990),Comedy,1990,2.375
3909,5493,In Like Flint (1967),Action|Adventure|Comedy,1967,4.0
2773,3711,Sarafina! (1992),Drama,1992,4.0
3912,5499,Robin and Marian (1976),Adventure|Drama|Romance,1976,3.25
