In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw ratings table from a CSV in the working directory; the
# displayed frame has one row per rating with the columns
# userId, movieId, rating and timestamp.
ratings = pd.read_csv('ratings_small.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [3]:
# Largest user id that occurs in the ratings table.
ratings['userId'].max()

671

In [4]:
# Largest movie id that occurs in the ratings table.
ratings['movieId'].max()

163949

In [5]:
# X is an independent copy of the whole ratings frame (it still contains the
# 'rating' column); y is the userId column, passed to train_test_split below.
X = ratings.copy()
y = ratings['userId']

In [6]:
# Hold out 20% of the rating rows for evaluation; random_state is fixed so the
# split is reproducible. y_train / y_test are not used by the cells shown
# in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=44)

In [7]:
# Reshape the training ratings into a user x movie matrix: rows are userId,
# columns are movieId, cells hold the rating. User/movie pairs without a
# rating in the training split show up as NaN.
ratings_pivot = X_train.pivot(index='userId', columns='movieId', values='rating')
ratings_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,4.0,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,4.0,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Work on a copy so that ratings_pivot keeps its NaN markers for unrated movies.
df_rating = ratings_pivot.copy()

In [9]:
# Step 1: replace missing ratings with 0 to obtain a dense numeric matrix;
# this zero-filled frame is the input to cosine_similarity in the next cell.
df_ratings_ = df_rating.copy().fillna(0)
df_ratings_.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Pairwise cosine similarity between the user rows of the zero-filled rating
# matrix. The result is a plain NumPy array with one row and one column per user.
similarity_matrix = cosine_similarity(df_ratings_, df_ratings_)
similarity_matrix

array([[1.        , 0.        , 0.        , ..., 0.07102523, 0.        ,
        0.02052053],
       [0.        , 1.        , 0.08564574, ..., 0.02153136, 0.13666838,
        0.0728066 ],
       [0.        , 0.08564574, 1.        , ..., 0.09091739, 0.12607173,
        0.12068082],
       ...,
       [0.07102523, 0.02153136, 0.09091739, ..., 1.        , 0.        ,
        0.05498428],
       [0.        , 0.13666838, 0.12607173, ..., 0.        , 1.        ,
        0.20041821],
       [0.02052053, 0.0728066 , 0.12068082, ..., 0.05498428, 0.20041821,
        1.        ]])

In [11]:
# Wrap the similarity array in a DataFrame whose rows and columns are both
# labelled with the userId index of ratings_pivot, so similarities can be
# looked up by user id.
df_similarity_matrix = pd.DataFrame(similarity_matrix, index=ratings_pivot.index, columns=ratings_pivot.index)
df_similarity_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.000000,0.000000,0.088173,0.020059,0.000000,0.059167,0.000000,0.014504,0.000000,...,0.000000,0.000000,0.016804,0.022533,0.000000,0.000000,0.000000,0.071025,0.000000,0.020521
2,0.000000,1.000000,0.085646,0.100041,0.043072,0.000000,0.186658,0.056036,0.076214,0.028092,...,0.337801,0.026008,0.050997,0.133579,0.381434,0.296783,0.032541,0.021531,0.136668,0.072807
3,0.000000,0.085646,1.000000,0.085435,0.115688,0.052971,0.104942,0.208796,0.151035,0.091164,...,0.120638,0.029614,0.148216,0.148331,0.125955,0.126045,0.086458,0.090917,0.126072,0.120681
4,0.088173,0.100041,0.085435,1.000000,0.071691,0.023108,0.256741,0.171413,0.026012,0.106837,...,0.084868,0.000000,0.125122,0.188652,0.091874,0.064931,0.040702,0.090985,0.023686,0.164266
5,0.020059,0.043072,0.115688,0.071691,1.000000,0.043698,0.074791,0.127686,0.068063,0.000000,...,0.078246,0.000000,0.123185,0.155613,0.088566,0.038360,0.000000,0.000000,0.065408,0.187288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.000000,0.296783,0.126045,0.064931,0.038360,0.000000,0.178892,0.080029,0.075391,0.035443,...,0.272708,0.000000,0.061382,0.102857,0.231371,1.000000,0.052637,0.000000,0.151110,0.093916
668,0.000000,0.032541,0.086458,0.040702,0.000000,0.026726,0.000000,0.067328,0.229500,0.022241,...,0.054029,0.076263,0.082671,0.067009,0.067109,0.052637,1.000000,0.000000,0.213058,0.133770
669,0.071025,0.021531,0.090917,0.090985,0.000000,0.000000,0.045835,0.026517,0.034124,0.067449,...,0.020109,0.000000,0.074209,0.100402,0.033303,0.000000,0.000000,1.000000,0.000000,0.054984
670,0.000000,0.136668,0.126072,0.023686,0.065408,0.000000,0.050740,0.173217,0.314620,0.108386,...,0.120639,0.158121,0.118729,0.109970,0.147168,0.151110,0.213058,0.000000,1.000000,0.200418


In [12]:
def predict_ratings(movie_id, user_id, ratings=None, similarity=None, default_rating=2.5):
    """Predict the rating ``user_id`` would give ``movie_id``.

    The prediction is the similarity-weighted average of the ratings that
    other users gave the movie, where the weight of each rater is that
    rater's similarity to ``user_id``.

    Parameters
    ----------
    movie_id : hashable
        Column label of the movie in ``ratings``.
    user_id : hashable
        Column label of the target user in ``similarity``.
    ratings : pandas.DataFrame, optional
        User x movie rating matrix with NaN for missing ratings.
        Defaults to the notebook-level ``ratings_pivot``.
    similarity : pandas.DataFrame, optional
        User x user similarity matrix whose index contains the index of
        ``ratings``. Defaults to the notebook-level ``df_similarity_matrix``.
    default_rating : float, optional
        Value returned when no similarity-based prediction can be made.

    Returns
    -------
    float
        The predicted rating rounded to two decimals, or ``default_rating``
        when the movie is not a column of ``ratings``, the user is not a
        column of ``similarity``, or the similarities of all raters to the
        user sum to zero.
    """
    # Resolve the notebook globals lazily so existing two-argument calls
    # keep working unchanged.
    if ratings is None:
        ratings = ratings_pivot
    if similarity is None:
        similarity = df_similarity_matrix

    # Cold start: nothing is known about this movie or this user.
    if movie_id not in ratings.columns or user_id not in similarity.columns:
        return default_rating

    # Keep only the users who actually rated the movie ...
    movie_ratings = ratings[movie_id].dropna()
    # ... and pick their similarity to the target user, in the same order.
    weights = similarity.loc[movie_ratings.index, user_id]

    total_weight = weights.sum()
    # Without this guard the weighted average below would be 0/0 = NaN,
    # which in turn breaks the error metric computed on the predictions.
    if total_weight == 0:
        return default_rating

    # Series multiplication aligns on the user index before summing.
    weighted_sum = (movie_ratings * weights).sum()
    return round(float(weighted_sum / total_weight), 2)

In [13]:
# Sanity check: predicted rating of movie 3 for user 150.
predict_ratings(3, 150)

3.06

In [14]:
def score_test_set(X):
    """Return the root-mean-squared error of predict_ratings on ``X``.

    ``X`` must provide the columns 'movieId', 'userId' and 'rating'. A
    prediction is made for every row and compared with the recorded rating.
    """
    predicted = np.array([
        predict_ratings(movie_id, user_id)
        for movie_id, user_id in zip(X['movieId'], X['userId'])
    ])
    actual = np.array(X['rating'])

    # RMSE is the square root of the mean squared error.
    return np.sqrt(mean_squared_error(actual, predicted))

In [15]:
# Number of held-out rating rows.
len(X_test)

20001

In [16]:
# RMSE on the first 500 rows of the held-out split only, not on the full test set.
score_test_set(X_test[:500])

0.9496455128099117