In [2]:
import numpy as np
import pandas as pd

In [4]:
# Load the ratings table; index_col=0 makes the first CSV column the row index.
ratings_df = pd.read_csv('small_movie_ratings.csv', index_col=0)

In [5]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0_level_0,15,30,311,452,468,509,547,564,624,73
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
88 Minutes,2.0,4.0,2.0,,2.0,2.0,1.5,2.0,3.0,3.5
A Time to Kill,3.0,5.0,2.5,2.0,2.0,3.0,,4.0,3.0,3.0
Barry Lyndon,4.0,,3.0,3.0,3.0,3.5,4.0,5.0,4.0,2.5
Gleaming the Cube,2.5,4.0,4.0,,3.0,3.0,3.0,4.0,3.0,4.0
Judgment Night,4.0,4.0,4.0,4.0,3.5,3.0,2.5,1.0,,4.5


There are some missing values in this data.

In [6]:
# Per-column non-null counts show how many ratings each column is missing.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 88 Minutes to The Terminal
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   15      15 non-null     float64
 1   30      14 non-null     float64
 2   311     13 non-null     float64
 3   452     10 non-null     float64
 4   468     14 non-null     float64
 5   509     15 non-null     float64
 6   547     12 non-null     float64
 7   564     15 non-null     float64
 8   624     13 non-null     float64
 9   73      14 non-null     float64
dtypes: float64(10)
memory usage: 1.3+ KB


## Find Similarities Between Users
We'll use the Pearson correlation coefficient, computed over the movies that both users have rated, to measure how similar two users are.

In [8]:
def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):
    """Find correlation between two users based on their rated movies using Pearson correlation.

    Only the rows where both users have a non-missing rating are used.

    Parameters
    ----------
    ratings_df : pd.DataFrame
        Ratings table with one column per user; missing ratings are NaN.
    user1, user2 : str
        Column labels of the two users to compare.

    Returns
    -------
    float
        Pearson correlation coefficient of the two users' common ratings, or
        NaN when the coefficient is undefined (fewer than two common ratings,
        or one of the users gave the same rating to every common row).
    """
    rated_movies_by_both = ratings_df[[user1, user2]].dropna(axis=0).values
    user1_ratings = rated_movies_by_both[:, 0]
    user2_ratings = rated_movies_by_both[:, 1]

    # Pearson correlation needs at least two points and non-zero spread on both
    # sides. np.corrcoef would only emit RuntimeWarnings and produce NaN (or
    # round-off noise) in these cases, so return NaN explicitly instead.
    if (
        len(user1_ratings) < 2
        or np.ptp(user1_ratings) == 0
        or np.ptp(user2_ratings) == 0
    ):
        return np.float64(np.nan)

    return np.corrcoef(user1_ratings, user2_ratings)[0, 1]
      

## Create a matrix that shows the similarities between all pairs of users

In [None]:
users = list(ratings_df.columns)
movies = list(ratings_df.index)

# Build the pairwise similarity matrix one row at a time:
# each row corresponds to one user, each column to the user it is compared with.
similarity_rows = []
for row_user in users:
    row_scores = [
        find_correlation_between_two_users(ratings_df, col_user, row_user)
        for col_user in users
    ]
    similarity_rows.append(row_scores)

similarity_matrix = np.array(similarity_rows)
similarity_df = pd.DataFrame(similarity_matrix, index=users, columns=users)
similarity_df

# Get Similar Users
Imagine we want to predict the rating of user 3 for movie 1 based on the ratings of other users. We first want to select only the users who have rated movie 1.

In [11]:
def get_rated_for_a_movie(ratings_df: pd.DataFrame, movie: str):
    """Return the column labels (users) that have a non-missing rating for ``movie``.

    ``movie`` is looked up as a row label of ``ratings_df``; the result is a
    NumPy array of the labels whose value in that row is not NaN.
    """
    movie_row = ratings_df.loc[movie, :]
    rated_only = movie_row.dropna()
    return rated_only.index.values

Next, we pick only the k users that are most similar to user 3.