In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# Data preparation (loading and preprocessing)

First, we need to load the dataset (see `README.md` for details on where to obtain it).

In [2]:
# The .dat files are '::'-delimited and are read without a header row, so the
# column names are supplied explicitly. A multi-character separator needs the
# (slower) python parsing engine.
movies = pd.read_csv(
    '../../data/ml-1m/movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'title', 'genres'],
    engine='python',
    encoding='latin-1',
)
ratings = pd.read_csv(
    '../../data/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
    encoding='latin-1',
)

In [4]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## User-to-user collaborative filtering

Sources: The weighted average algorithm was inspired by [this](https://medium.com/analytics-vidhya/recommendation-system-using-collaborative-filtering-cc310e641fde) Medium article

In [5]:
# Hold out 20% of the ratings for evaluation. Stratifying on user_id keeps each
# user's ratings split in roughly the same 80/20 proportion between the sets.
# The "y" outputs are just the user_id column of each split.
X_ratings_train, X_ratings_test, y_ratings_train, y_ratings_test = train_test_split(
    ratings,
    ratings['user_id'],
    stratify=ratings['user_id'],
    test_size=0.2,
    random_state=5,
)

In [9]:
# Reshape the long ratings table into a user x movie matrix: one row per
# user_id, one column per movie_id, NaN where the user has no training rating.
X_ratings_train_by_user = X_ratings_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)

In [10]:
# Preview the user x movie rating matrix (NaN = not rated in the training split).
X_ratings_train_by_user.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [13]:
# Cosine similarity needs a dense numeric matrix, so unrated movies are encoded
# as 0. fillna() already returns a new frame, so the pivot table is left
# untouched without an extra .copy() of the whole matrix.
df_ratings_dummy = X_ratings_train_by_user.fillna(0)

# With a single argument, cosine_similarity compares every row (user) against
# every other row, giving a square user x user matrix.
similarity_matrix = cosine_similarity(df_ratings_dummy)

# Label rows and columns with user_id so similarities can be looked up by id.
similarity_matrix_df = pd.DataFrame(
    similarity_matrix,
    index=df_ratings_dummy.index,
    columns=df_ratings_dummy.index,
)

In [24]:
def weighed_avg_rating_for_movie(id_movie, id_user, ratings_by_user=None, similarity=None, default_rating=2.5):
    """Predict a user's rating of a movie as a similarity-weighted average.

    The prediction is the mean of the ratings given to ``id_movie`` by the
    users who rated it, each weighted by that user's similarity to ``id_user``.

    Parameters
    ----------
    id_movie : hashable
        Column label of the movie in ``ratings_by_user``.
    id_user : hashable
        Label of the target user in ``similarity``.
    ratings_by_user : pandas.DataFrame, optional
        User x movie rating matrix with NaN for missing ratings. Defaults to
        the notebook-level ``X_ratings_train_by_user``.
    similarity : pandas.DataFrame, optional
        Square user x user similarity matrix labelled by user id on both
        axes. Defaults to the notebook-level ``similarity_matrix_df``.
    default_rating : float, optional
        Value returned when no prediction can be computed. The default of 2.5
        is kept for backward compatibility; NOTE(review): it is not the
        midpoint of a 1-5 scale, consider passing the training mean instead.

    Returns
    -------
    float
        The weighted-average rating, or ``default_rating`` when the movie or
        the user is unknown, nobody rated the movie, or the similarities of
        all raters sum to zero (which would otherwise produce NaN).
    """
    # Fall back to the notebook-level training matrices when none are passed in.
    if ratings_by_user is None:
        ratings_by_user = X_ratings_train_by_user
    if similarity is None:
        similarity = similarity_matrix_df

    # Cold start: the movie or the user never appears in the training data.
    if id_movie not in ratings_by_user or id_user not in similarity:
        return default_rating

    # Keep only users who actually rated the movie, and line their similarity
    # scores up with their ratings (same users, same order).
    movie_ratings = ratings_by_user[id_movie].dropna()
    weights = similarity[id_user].loc[movie_ratings.index]

    # No raters, or no rater shares anything with the target user: dividing by
    # the weight sum would give NaN/inf, so fall back to the default instead.
    total_weight = weights.sum()
    if not total_weight > 0:
        return default_rating

    return np.dot(movie_ratings, weights) / total_weight

Predicted (similarity-weighted average) rating of movie #150 for user #350:

In [20]:
# Spot-check one prediction: arguments are (movie_id, user_id) in that order.
weighed_avg_rating_for_movie(150, 350) 

4.0860582963886865

In [22]:
def score_on_test_set(X_test):
    """Return the RMSE of the model's predictions on a set of known ratings.

    ``X_test`` must provide ``movie_id``, ``user_id`` and ``rating`` columns.
    One prediction is made per row with ``weighed_avg_rating_for_movie`` and
    compared against the row's actual rating.
    """
    predictions = []
    # The predictor takes the movie first and the user second.
    for movie, user in zip(X_test['movie_id'], X_test['user_id']):
        predictions.append(weighed_avg_rating_for_movie(movie, user))

    predicted = np.array(predictions)
    actual = np.array(X_test['rating'])

    # Root of the mean squared error, i.e. RMSE in rating units.
    return np.sqrt(mean_squared_error(actual, predicted))

In [25]:
# Evaluate on the held-out split: RMSE between predicted and true ratings.
# One prediction is computed per test row, so this cell can be slow.
score_on_test_set(X_ratings_test)