In [1]:
import numpy as np
import pandas as pd

In [2]:
# Mount Google Drive at /content/drive (Google Colab only); the ratings CSV
# loaded further down is read from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the ratings table; `index_col=0` makes the first CSV column the row index,
# leaving one column per user.
movie_df = pd.read_csv('/content/drive/MyDrive/MovieRatings/MovieRatings.csv', index_col=0)
movie_df

Unnamed: 0_level_0,15,30,311,452,468,509,547,564,624,73
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
88 Minutes,2.0,4.0,2.0,,2.0,2.0,1.5,2,3.0,3.5
A Time to Kill,3.0,5.0,2.5,2.0,2.0,3.0,,4,3.0,3.0
Barry Lyndon,4.0,,3.0,3.0,3.0,3.5,4.0,5,4.0,2.5
Gleaming the Cube,2.5,4.0,4.0,,3.0,3.0,3.0,4,3.0,4.0
Judgment Night,4.0,4.0,4.0,4.0,3.5,3.0,2.5,1,,4.5
License to Wed,5.0,4.0,,2.0,3.0,4.0,3.5,5,4.0,5.0
Lonely Hearts,5.0,5.0,3.0,4.0,4.0,4.0,5.0,5,,4.0
Mr. Holland's Opus,2.0,1.0,3.5,3.0,2.5,2.0,,4,4.0,3.5
Psycho,2.0,3.0,4.0,4.0,3.0,3.0,3.0,5,3.0,
Rain Man,4.0,4.0,3.5,3.0,,2.0,3.0,5,3.0,3.0


In [4]:
# Sanity check: two identical rating vectors have a Pearson correlation of 1.
user1 = [0, 1, 2]
user2 = [0, 1, 2]
# np.corrcoef returns a 2x2 matrix; [0, 1] is the user1-vs-user2 entry.
np.corrcoef(user1, user2)[0, 1]

1.0

In [5]:
# user2 is user1 scaled by 2: the correlation is still 1, because Pearson
# correlation is insensitive to scale.
user1 = [0, 1, 2]
user2 = [0, 2, 4]
# Displayed here as the full 2x2 correlation matrix rather than a single entry.
np.corrcoef(user1, user2)

array([[1., 1.],
       [1., 1.]])

In [6]:
# user2 is user1 in reverse order: perfectly anti-correlated, so the result is -1.
user1 = [0, 1, 2, 3]
user2 = [3, 2, 1, 0]
np.corrcoef(user1, user2)[0, 1]

-1.0

# Creating a function that finds the correlation between two users

In [7]:
def find_correlation_between_two_users(movie_df: pd.DataFrame, user1: str, user2: str):
    """Pearson correlation between two users over the movies both have rated.

    Args:
        movie_df: Ratings table with one row per movie and one column per user;
            missing ratings are NaN.
        user1: Column label of the first user.
        user2: Column label of the second user.

    Returns:
        The off-diagonal entry of ``np.corrcoef`` computed on the co-rated movies.
    """
    # Keep only the rows (movies) where neither user has a missing rating.
    co_rated = movie_df[[user1, user2]].dropna(axis=0)
    # Transposing gives one row per user, which unpacks into the two rating vectors.
    first_ratings, second_ratings = co_rated.to_numpy().T
    correlation_matrix = np.corrcoef(first_ratings, second_ratings)
    return correlation_matrix[0, 1]

# Creating a matrix that shows the similarities between all pairs of users

In [8]:
# Column labels of the ratings table are the users; row labels are the movies.
users = list(movie_df.columns)
movies = list(movie_df.index)

# Pairwise similarity: each inner list is one row of the matrix, i.e. the
# correlation of `row_user` with every user in turn.
similarity_matrix = np.array(
    [
        [
            find_correlation_between_two_users(movie_df, column_user, row_user)
            for column_user in users
        ]
        for row_user in users
    ]
)

# Label both axes with the users so similarities can be looked up by user.
similarity_df = pd.DataFrame(similarity_matrix, index=users, columns=users)
similarity_df

Unnamed: 0,15,30,311,452,468,509,547,564,624,73
15,1.0,0.395367,0.305552,0.230556,0.43494,0.469956,0.123855,0.006502,0.267311,0.462184
30,0.395367,1.0,-0.186997,0.140313,0.102723,0.535891,0.330386,-0.154949,-0.122837,0.122264
311,0.305552,-0.186997,1.0,0.746033,0.344309,0.238744,-0.013878,-0.011111,-0.016278,0.513114
452,0.230556,0.140313,0.746033,1.0,0.807781,0.453188,0.145556,-0.534522,0.537484,0.449013
468,0.43494,0.102723,0.344309,0.807781,1.0,0.595241,0.606714,-0.090911,0.676868,0.500932
509,0.469956,0.535891,0.238744,0.453188,0.595241,1.0,0.734303,-0.204034,0.554024,0.511659
547,0.123855,0.330386,-0.013878,0.145556,0.606714,0.734303,1.0,0.344611,0.436309,-0.072267
564,0.006502,-0.154949,-0.011111,-0.534522,-0.090911,-0.204034,0.344611,1.0,-0.42361,-0.440686
624,0.267311,-0.122837,-0.016278,0.537484,0.676868,0.554024,0.436309,-0.42361,1.0,0.501961
73,0.462184,0.122264,0.513114,0.449013,0.500932,0.511659,-0.072267,-0.440686,0.501961,1.0


# Finding similar users

In [9]:
def get_rated_user_for_a_movie(movie_df: pd.DataFrame, movie: str):
    """Return the labels of the users who have a rating for ``movie``.

    Args:
        movie_df: Ratings table with movies as rows and users as columns.
        movie: Row label of the movie to inspect.

    Returns:
        A NumPy array of the column labels whose rating for ``movie`` is not NaN.
    """
    ratings_for_movie = movie_df.loc[movie]
    present_ratings = ratings_for_movie.dropna()
    return present_ratings.index.to_numpy()

In [10]:

def get_top_neighbors(
    similarity_df: pd.DataFrame, user: str, rated_users: np.ndarray, n_neighbors: int
) -> dict:
    """Pick the users most similar to ``user`` among a set of candidates.

    Args:
        similarity_df: Square user-by-user similarity table, labelled with the
            users on both axes.
        user: Label of the user whose neighbors are wanted.
        rated_users: Labels of the candidate users (an array or list of
            labels, not a single string).
        n_neighbors: Maximum number of neighbors to return.

    Returns:
        A dict mapping neighbor label to similarity, ordered from most to least
        similar. It holds fewer than ``n_neighbors`` entries when there are
        fewer candidates with a non-NaN similarity.
    """
    # One label-based lookup instead of chained `[user][rated_users]` indexing,
    # so labels are never confused with positions.
    candidate_similarities = similarity_df.loc[rated_users, user]
    return candidate_similarities.nlargest(n_neighbors).to_dict()

# Getting the ratings of the similar users for a movie

In [11]:
def subtract_bias(rating: float, mean_rating: float):
    """Return ``rating`` expressed as a deviation from ``mean_rating``."""
    return rating - mean_rating


def get_neighbor_rating_without_bias_per_movie(
    movie_df_df: pd.DataFrame, user: str, movie: str
):
    """Return a user's rating for a movie, centered on that user's mean rating.

    Args:
        movie_df_df: Ratings table with movies as rows and users as columns.
            (The parameter name is kept as-is for backward compatibility.)
        user: Column label of the user.
        movie: Row label of the movie.

    Returns:
        ``rating - mean``, where ``mean`` is the user's average over all the
        movies they rated (pandas skips NaN when averaging).
    """
    # Read from the frame that was passed in. Previously the body referenced the
    # notebook-level `movie_df`, silently ignoring the argument.
    mean_rating = movie_df_df[user].mean()
    rating = movie_df_df.loc[movie, user]
    return subtract_bias(rating, mean_rating)

def get_ratings_of_neighbors(movie_df: pd.DataFrame, neighbors: list, movie: str):
    """Collect each neighbor's mean-centered rating for ``movie``.

    Args:
        movie_df: Ratings table with movies as rows and users as columns.
        neighbors: Iterable of neighbor user labels.
        movie: Row label of the movie.

    Returns:
        A list with one mean-centered rating per neighbor, in iteration order.
    """
    centered_ratings = []
    for neighbor in neighbors:
        centered = get_neighbor_rating_without_bias_per_movie(movie_df, neighbor, movie)
        centered_ratings.append(centered)
    return centered_ratings

In [12]:
def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):
    """Average the neighbors' ratings, weighted by their similarity to the user.

    Args:
        ratings: One (mean-centered) rating per neighbor.
        neighbor_distance: One similarity weight per neighbor, aligned with
            ``ratings``.

    Returns:
        ``sum(rating * weight) / sum(|weight|)``.
    """
    weights = np.asarray(neighbor_distance)
    numerator = np.dot(np.asarray(ratings), weights)
    # Normalize by the absolute weights, so negative similarities still count
    # towards the total weight.
    denominator = np.abs(weights).sum()
    return numerator / denominator

In [13]:
def ger_user_rating(movie_df: pd.DataFrame, user: str, avg_neighbor_rating: float):
    """Turn a neighbor-based deviation into a predicted rating for ``user``.

    Args:
        movie_df: Ratings table with movies as rows and users as columns.
        user: Column label of the user.
        avg_neighbor_rating: Weighted average of the neighbors' mean-centered
            ratings.

    Returns:
        The user's own mean rating plus ``avg_neighbor_rating``, rounded to two
        decimals.
    """
    # Add the user's own baseline back onto the mean-centered neighbor estimate.
    baseline = movie_df[user].mean()
    prediction = baseline + avg_neighbor_rating
    return round(prediction, 2)

# Getting the missing ratings of all users

In [14]:
def predict_rating(
    df: pd.DataFrame,
    similarity_df: pd.DataFrame,
    user: str,
    movie: str,
    n_neighbors: int = 2,
):
    """Predict ``user``'s rating for ``movie`` from the most similar raters.

    Steps: find the users who rated the movie, keep the ``n_neighbors`` most
    similar to ``user``, take the similarity-weighted average of their
    mean-centered ratings, and add it to the user's own mean rating.

    Args:
        df: Ratings table with movies as rows and users as columns.
        similarity_df: User-by-user similarity table.
        user: Column label of the user to predict for.
        movie: Row label of the movie to predict.
        n_neighbors: Number of most-similar raters to use.

    Returns:
        The predicted rating, as returned by ``ger_user_rating``.

    Side effects:
        Prints the neighbors chosen for this prediction.
    """
    # Work on a private copy of the ratings table.
    ratings_table = df.copy()

    candidates = get_rated_user_for_a_movie(ratings_table, movie)
    similarity_by_neighbor = get_top_neighbors(
        similarity_df, user, candidates, n_neighbors
    )
    neighbors = similarity_by_neighbor.keys()
    distance = similarity_by_neighbor.values()

    print(f"Top {n_neighbors} neighbors of user {user}, {movie}: {list(neighbors)}")

    centered_ratings = get_ratings_of_neighbors(ratings_table, neighbors, movie)
    weighted_deviation = get_weighted_average_rating_of_neighbors(
        centered_ratings, list(distance)
    )

    return ger_user_rating(ratings_table, user, weighted_deviation)

# Predicting all of the missing ratings

In [15]:
# Fill a copy so the observed ratings in `movie_df` stay untouched and every
# prediction is computed from the original ratings only, never from values
# filled in earlier in this loop.
full_ratings = movie_df.copy()

# `DataFrame.iteritems()` was deprecated in pandas 1.5 and removed in 2.0;
# `items()` yields the same (column label, column Series) pairs.
# Iterating the untouched `movie_df` avoids walking a frame while writing to it,
# and the loop variable no longer overwrites the notebook-level `movies` list.
for user, user_ratings in movie_df.items():
    # Visit only the movies this user has not rated, in index order.
    for movie in user_ratings.index[user_ratings.isna()]:
        full_ratings.loc[movie, user] = predict_rating(
            movie_df, similarity_df, user, movie
        )

Top 2 neighbors of user 30, Barry Lyndon: ['509', '15']
Top 2 neighbors of user 311, License to Wed: ['452', '73']
Top 2 neighbors of user 311, Sleepless in Seattle: ['452', '73']
Top 2 neighbors of user 452, 88 Minutes: ['468', '311']
Top 2 neighbors of user 452, Gleaming the Cube: ['468', '311']
Top 2 neighbors of user 452, The Endless Summer: ['468', '311']
Top 2 neighbors of user 452, The Forbidden Kingdom: ['468', '311']
Top 2 neighbors of user 452, The Terminal: ['468', '311']
Top 2 neighbors of user 468, Rain Man: ['452', '624']
Top 2 neighbors of user 547, A Time to Kill: ['509', '468']
Top 2 neighbors of user 547, Mr. Holland's Opus: ['509', '468']
Top 2 neighbors of user 547, The 39 Steps: ['509', '468']
Top 2 neighbors of user 624, Judgment Night: ['468', '509']
Top 2 neighbors of user 624, Lonely Hearts: ['468', '509']
Top 2 neighbors of user 73, Psycho: ['311', '509']


  for user, movies in full_ratings.iteritems():


In [16]:
full_ratings

Unnamed: 0_level_0,15,30,311,452,468,509,547,564,624,73
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
88 Minutes,2.0,4.0,2.0,2.2,2.0,2.0,1.5,2,3.0,3.5
A Time to Kill,3.0,5.0,2.5,2.0,2.0,3.0,2.9,4,3.0,3.0
Barry Lyndon,4.0,4.28,3.0,3.0,3.0,3.5,4.0,5,4.0,2.5
Gleaming the Cube,2.5,4.0,4.0,3.68,3.0,3.0,3.0,4,3.0,4.0
Judgment Night,4.0,4.0,4.0,4.0,3.5,3.0,2.5,1,3.66,4.5
License to Wed,5.0,4.0,3.01,2.0,3.0,4.0,3.5,5,4.0,5.0
Lonely Hearts,5.0,5.0,3.0,4.0,4.0,4.0,5.0,5,4.38,4.0
Mr. Holland's Opus,2.0,1.0,3.5,3.0,2.5,2.0,2.58,4,4.0,3.5
Psycho,2.0,3.0,4.0,4.0,3.0,3.0,3.0,5,3.0,3.9
Rain Man,4.0,4.0,3.5,3.0,2.57,2.0,3.0,5,3.0,3.0
