In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the ratings table; the first CSV column is used as the row index.
ratings_df = pd.read_csv(
    filepath_or_buffer='small_movie_ratings.csv',
    index_col=0,
)

In [3]:
# Preview the first rows of the ratings table (rows are movies, columns are user ids).
ratings_df.head()

Unnamed: 0_level_0,15,30,311,452,468,509,547,564,624,73
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
88 Minutes,2.0,4.0,2.0,,2.0,2.0,1.5,2.0,3.0,3.5
A Time to Kill,3.0,5.0,2.5,2.0,2.0,3.0,,4.0,3.0,3.0
Barry Lyndon,4.0,,3.0,3.0,3.0,3.5,4.0,5.0,4.0,2.5
Gleaming the Cube,2.5,4.0,4.0,,3.0,3.0,3.0,4.0,3.0,4.0
Judgment Night,4.0,4.0,4.0,4.0,3.5,3.0,2.5,1.0,,4.5


There are some missing values in this data.

In [4]:
# Show per-column dtypes and non-null counts to see how many ratings each user is missing.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 88 Minutes to The Terminal
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   15      15 non-null     float64
 1   30      14 non-null     float64
 2   311     13 non-null     float64
 3   452     10 non-null     float64
 4   468     14 non-null     float64
 5   509     15 non-null     float64
 6   547     12 non-null     float64
 7   564     15 non-null     float64
 8   624     13 non-null     float64
 9   73      14 non-null     float64
dtypes: float64(10)
memory usage: 1.3+ KB


## Find Similarities Between Users
We'll use the Pearson correlation coefficient to find the correlation between users based on their rated movies

In [5]:
def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):
    """Return the Pearson correlation between two users over the movies both have rated.

    Parameters
    ----------
    ratings_df : pd.DataFrame
        Ratings table with one row per movie and one column per user;
        missing ratings are NaN.
    user1, user2 : str
        Column labels of the two users to compare.

    Returns
    -------
    float
        Pearson correlation coefficient computed on the co-rated movies, or
        ``np.nan`` when the coefficient is undefined: fewer than two co-rated
        movies, or one of the users gave the same rating to every co-rated
        movie (zero variance).
    """
    # Keep only the movies (rows) for which both users have a rating.
    rated_movies_by_both = ratings_df[[user1, user2]].dropna(axis=0).values
    user1_ratings = rated_movies_by_both[:, 0]
    user2_ratings = rated_movies_by_both[:, 1]

    # Pearson correlation needs at least two paired observations; np.corrcoef
    # would only produce NaN together with RuntimeWarnings here.
    if len(rated_movies_by_both) < 2:
        return np.nan

    # A constant rating vector has zero variance, which makes the coefficient
    # undefined (division by zero inside np.corrcoef).
    if np.ptp(user1_ratings) == 0 or np.ptp(user2_ratings) == 0:
        return np.nan

    return np.corrcoef(user1_ratings, user2_ratings)[0, 1]
      

## Create a matrix that shows the similarities between all pairs of users

In [6]:
users = list(ratings_df.columns)
movies = list(ratings_df.index)

# Entry (row_user, col_user) holds the Pearson correlation between the two users,
# computed only over the movies both of them have rated.
similarity_matrix = np.array([
    [
        find_correlation_between_two_users(ratings_df, col_user, row_user)
        for col_user in users
    ]
    for row_user in users
])
similarity_df = pd.DataFrame(similarity_matrix, index=users, columns=users)
similarity_df

Unnamed: 0,15,30,311,452,468,509,547,564,624,73
15,1.0,0.395367,0.305552,0.230556,0.43494,0.469956,0.123855,0.006502,0.267311,0.462184
30,0.395367,1.0,-0.186997,0.140313,0.102723,0.535891,0.330386,-0.154949,-0.122837,0.122264
311,0.305552,-0.186997,1.0,0.746033,0.344309,0.238744,-0.013878,-0.011111,-0.016278,0.513114
452,0.230556,0.140313,0.746033,1.0,0.807781,0.453188,0.145556,-0.534522,0.537484,0.449013
468,0.43494,0.102723,0.344309,0.807781,1.0,0.595241,0.606714,-0.090911,0.676868,0.500932
509,0.469956,0.535891,0.238744,0.453188,0.595241,1.0,0.734303,-0.204034,0.554024,0.511659
547,0.123855,0.330386,-0.013878,0.145556,0.606714,0.734303,1.0,0.344611,0.436309,-0.072267
564,0.006502,-0.154949,-0.011111,-0.534522,-0.090911,-0.204034,0.344611,1.0,-0.42361,-0.440686
624,0.267311,-0.122837,-0.016278,0.537484,0.676868,0.554024,0.436309,-0.42361,1.0,0.501961
73,0.462184,0.122264,0.513114,0.449013,0.500932,0.511659,-0.072267,-0.440686,0.501961,1.0


# Get Similar Users
Imagine we want to predict the rating of user 3 for movie 1 based on the ratings of other users. We first want to select only the users who have rated movie 1.

In [7]:
def get_rated_for_a_movie(ratings_df: pd.DataFrame, movie: str):
    """Return the labels of the users (columns) that have a rating for `movie`.

    The row labelled `movie` is selected, users whose rating is missing (NaN)
    are dropped, and the remaining column labels are returned as a NumPy array.
    """
    movie_ratings = ratings_df.loc[movie, :]
    existing_ratings = movie_ratings.dropna()
    return existing_ratings.index.values

Next, we pick only the k users who are the most similar to user 3.

In [8]:
def get_top_neighbors(similarity_df: pd.DataFrame, user: str, rated_users: np.ndarray, n_neighbors: int):
    """Return the `n_neighbors` users most similar to `user` among `rated_users`.

    Parameters
    ----------
    similarity_df : pd.DataFrame
        Square user-by-user similarity table (same labels on rows and columns).
    user : str
        Label of the user we are finding neighbors for.
    rated_users : array-like of str
        Labels of the candidate users (any list-like of labels works, e.g. the
        array of users who rated the movie of interest).
    n_neighbors : int
        Maximum number of neighbors to return.

    Returns
    -------
    dict
        Maps neighbor label -> similarity to `user`, ordered from most to
        least similar. Contains fewer than `n_neighbors` entries when there
        are not enough valid candidates.
    """
    # A user is always perfectly similar to themselves; never let them be
    # their own neighbor, otherwise their own rating would drive the prediction.
    candidates = [other for other in rated_users if other != user]

    # Undefined similarities (NaN) carry no information and would poison any
    # similarity-weighted average downstream, so drop them before ranking.
    similarities = similarity_df.loc[candidates, user].dropna()

    return similarities.nlargest(n_neighbors).to_dict()

## Getting Ratings of the similar Users on a Movie
Since different users may use different rating scales (some rate generously, others harshly), we adjust for this bias by subtracting each user's mean rating from that user's rating for the movie.

In [None]:
def substract_bias(rating: float, mean_rating: float):
    """Return `rating` centered on `mean_rating`, i.e. ``rating - mean_rating``."""
    return rating - mean_rating


# Correctly spelled alias; the original (misspelled) name is kept so that any
# existing reference to it keeps working.
subtract_bias = substract_bias


def get_neighbor_rating_without_bias_per_movie(ratings_df: pd.DataFrame, user: str, movie: str):
    """Return a user's rating for `movie` minus that user's mean rating.

    Parameters
    ----------
    ratings_df : pd.DataFrame
        Ratings table with one row per movie and one column per user.
    user : str
        Column label of the user.
    movie : str
        Row label of the movie.

    Returns
    -------
    float
        ``rating - mean_rating``; positive when the user rated the movie above
        their own average. The result is NaN if the user has no rating for
        `movie`.
    """
    # Series.mean() skips NaN, so the mean covers only the movies the user rated.
    mean_rating = ratings_df[user].mean()
    rating = ratings_df.loc[movie, user]
    return substract_bias(rating, mean_rating)


def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, movie: str):
    """Return the bias-adjusted ratings of all `neighbors` for `movie`.

    `neighbors` may be any iterable of user labels (iterating a dict keyed by
    user label yields the labels). The result is a list in the same order as
    `neighbors`.
    """
    return [
        get_neighbor_rating_without_bias_per_movie(ratings_df, neighbor, movie)
        for neighbor in neighbors
    ]
    