In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
# Generate some toy user and movie data

# Number of users
n_users = 100

# Number of movies
n_movies = 10

# Number of ratings
n_ratings = 1000

# Generate random user ids
user_ids = np.random.randint(0, n_users, n_ratings)

# Generate random movie ids
movie_ids = np.random.randint(0, n_movies, n_ratings)

# Generate random ratings
ratings = np.random.randint(1, 6, n_ratings)

# Create a dataframe with the data
df = pd.DataFrame({'user_id': user_ids, 'movie_id': movie_ids, 'rating': ratings})

# We should not have any duplicate ratings for the same user and movie
# Drop any rows that have duplicate user_id and movie_id pairs
df = df.drop_duplicates(['user_id', 'movie_id'])


In [4]:
df

Unnamed: 0,user_id,movie_id,rating
0,25,5,3
1,28,2,2
2,51,6,3
3,84,3,2
4,38,9,4
...,...,...,...
978,69,3,3
990,95,8,2
991,20,9,2
992,3,5,3


In [5]:
# Create a user-item matrix

A = df.pivot(index='user_id', columns='movie_id', values='rating')
A

movie_id,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.0,5.0,,2.0,2.0,2.0,5.0,5.0,5.0,1.0
1,,,1.0,1.0,4.0,3.0,2.0,,4.0,
2,,,,,2.0,,,3.0,,1.0
3,5.0,5.0,5.0,2.0,3.0,3.0,2.0,5.0,,
4,2.0,,1.0,3.0,,,5.0,,,2.0
...,...,...,...,...,...,...,...,...,...,...
95,4.0,4.0,,2.0,,3.0,,,2.0,1.0
96,,4.0,2.0,,,,5.0,,3.0,4.0
97,5.0,2.0,,,,2.0,1.0,4.0,,1.0
98,4.0,4.0,2.0,5.0,,1.0,3.0,3.0,,1.0


In [6]:
# Fill in the missing values with zeros
A = A.fillna(0)

A

movie_id,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.0,5.0,0.0,2.0,2.0,2.0,5.0,5.0,5.0,1.0
1,0.0,0.0,1.0,1.0,4.0,3.0,2.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,1.0
3,5.0,5.0,5.0,2.0,3.0,3.0,2.0,5.0,0.0,0.0
4,2.0,0.0,1.0,3.0,0.0,0.0,5.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...
95,4.0,4.0,0.0,2.0,0.0,3.0,0.0,0.0,2.0,1.0
96,0.0,4.0,2.0,0.0,0.0,0.0,5.0,0.0,3.0,4.0
97,5.0,2.0,0.0,0.0,0.0,2.0,1.0,4.0,0.0,1.0
98,4.0,4.0,2.0,5.0,0.0,1.0,3.0,3.0,0.0,1.0


In [7]:
# Cosine similarity between U1 and U2

# User 1
u1 = A.loc[0]

# User 2
u2 = A.loc[1]

# Compute the dot product
dot = np.dot(u1, u2)

# Compute the L2 norm
norm_u1 = np.linalg.norm(u1)
norm_u2 = np.linalg.norm(u2)

# Compute the cosine similarity
cos_sim = dot / (norm_u1 * norm_u2)
cos_sim

0.6074758080303854

In [8]:
# Calculate the cosine similarity between users
from sklearn.metrics.pairwise import cosine_similarity

sim_matrix = cosine_similarity(A)

pd.DataFrame(sim_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.607476,0.483934,0.733966,0.538456,0.681371,0.553347,0.606397,0.736212,0.757769,...,0.374522,0.514039,0.634808,0.533855,0.637699,0.678595,0.692550,0.697265,0.754465,0.244853
1,0.607476,1.000000,0.311872,0.415830,0.311419,0.633928,0.217443,0.434216,0.464140,0.389999,...,0.391397,0.331274,0.767069,0.422236,0.587095,0.391940,0.418421,0.163401,0.259316,0.468458
2,0.483934,0.311872,1.000000,0.500000,0.081514,0.481604,0.000000,0.099449,0.629941,0.124274,...,0.328688,0.910465,0.216225,0.286534,0.537853,0.037796,0.127775,0.486513,0.296957,0.180702
3,0.733966,0.415830,0.500000,1.000000,0.421155,0.812116,0.743698,0.850842,0.839921,0.724931,...,0.727097,0.549170,0.336350,0.744989,0.796819,0.667737,0.425918,0.785905,0.831479,0.602339
4,0.538456,0.311419,0.081514,0.421155,1.000000,0.404120,0.522862,0.586369,0.233638,0.780014,...,0.323947,0.148431,0.616887,0.376040,0.511496,0.345065,0.637947,0.363019,0.711660,0.335101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.678595,0.391940,0.037796,0.667737,0.345065,0.524672,0.505964,0.771811,0.700000,0.805555,...,0.537587,0.091766,0.343247,0.818746,0.537587,1.000000,0.439480,0.693103,0.722820,0.239046
96,0.692550,0.418421,0.127775,0.425918,0.637947,0.570123,0.694879,0.563349,0.211289,0.680818,...,0.293987,0.000000,0.676891,0.461311,0.494433,0.439480,1.000000,0.284521,0.517932,0.262640
97,0.697265,0.163401,0.486513,0.785905,0.363019,0.371073,0.438357,0.555787,0.841625,0.667395,...,0.641880,0.454311,0.094407,0.675566,0.704502,0.693103,0.284521,1.000000,0.715699,0.118345
98,0.754465,0.259316,0.296957,0.831479,0.711660,0.683110,0.679102,0.840680,0.667823,0.878315,...,0.496904,0.450615,0.449467,0.631355,0.596285,0.722820,0.517932,0.715699,1.000000,0.450749


In [24]:
# Find the most similar users to user u 

def k_nearest_neighbors(A, u, k):
    """Find the k nearest neighbors for user u"""
    # Find the index of the user in the matrix
    u_index = A.index.get_loc(u)
    
    # Compute the similarity between the user and all other users
    sim_matrix = cosine_similarity(A)

    # Find the k most similar users
    k_nearest = np.argsort(sim_matrix[u_index])[::-1][1:k+1]
    
    # Return the user ids
    return A.index[k_nearest]

In [25]:
k_nearest_neighbors(A, 0, 5)

Int64Index([22, 43, 36, 53, 83], dtype='int64', name='user_id')

In [26]:
# Show matrix of movie ratings for u and k nearest neighbors

def show_neighbors(A, u, k):
    """Show the movie ratings for user u and k nearest neighbors"""
    # Get the user ids of the k nearest neighbors
    neighbors = k_nearest_neighbors(A, u, k)
    
    # Get the movie ratings for user u and the k nearest neighbors
    df = A.loc[[u] + list(neighbors)]
    
    # Return the dataframe
    return df

In [27]:
show_neighbors(A, 0, 5)

movie_id,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.0,5.0,0.0,2.0,2.0,2.0,5.0,5.0,5.0,1.0
22,4.0,3.0,0.0,5.0,0.0,0.0,2.0,5.0,5.0,0.0
43,1.0,5.0,1.0,2.0,2.0,5.0,3.0,3.0,2.0,2.0
36,5.0,5.0,0.0,2.0,0.0,5.0,4.0,1.0,4.0,2.0
53,3.0,2.0,2.0,1.0,1.0,3.0,3.0,1.0,3.0,0.0
83,0.0,3.0,0.0,3.0,0.0,0.0,4.0,1.0,3.0,0.0


In [28]:
# Rating for user u for movie 0 is: (4.0 + 3.0) / 2 = 3.5 (Discard 0s)

def predict_rating(A, u, m, k=5):
    """Predict the rating for user u for movie m"""
    # Get the user ids of the k nearest neighbors
    neighbors = k_nearest_neighbors(A, u, k)
    
    # Get the movie ratings for user u and the k nearest neighbors
    df = A.loc[[u] + list(neighbors)]
    
    # Get the ratings for movie m
    ratings = df[m]
    
    # Calculate the mean of the ratings
    mean = ratings[ratings != 0].mean()
    
    # Return the mean
    return mean

In [29]:
predict_rating(A, 0, 0)

3.2

In [30]:
# Now working with real data

# Load the data

df = pd.read_excel("mov-rec.xlsx")
df.head()

Unnamed: 0,Timestamp,Your name,Sholay,Swades (We The People),The Matrix (I),Interstellar,Dangal,Taare Zameen Par,Shawshank Redemption,The Dark Knight,Notting Hill,Uri: The Surgical Strike
0,2023-04-11 10:58:44.990,Nipun,4.0,5.0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,5.0
1,2023-04-11 10:59:49.617,Gautam Vashishtha,3.0,4.0,4.0,5.0,3.0,1.0,5.0,5.0,4.0,3.0
2,2023-04-11 11:12:44.033,Eshan Gujarathi,4.0,,5.0,5.0,4.0,5.0,5.0,5.0,,4.0
3,2023-04-11 11:13:48.674,Sai Krishna Avula,5.0,3.0,3.0,4.0,4.0,5.0,5.0,3.0,3.0,4.0
4,2023-04-11 11:13:55.658,Ankit Yadav,3.0,3.0,2.0,5.0,2.0,5.0,5.0,3.0,3.0,4.0


In [31]:
# Discard the timestamp column

df = df.drop('Timestamp', axis=1)

# Make the "Your Name" column the index

df = df.set_index('Your name')
df

Unnamed: 0_level_0,Sholay,Swades (We The People),The Matrix (I),Interstellar,Dangal,Taare Zameen Par,Shawshank Redemption,The Dark Knight,Notting Hill,Uri: The Surgical Strike
Your name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Nipun,4.0,5.0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,5.0
Gautam Vashishtha,3.0,4.0,4.0,5.0,3.0,1.0,5.0,5.0,4.0,3.0
Eshan Gujarathi,4.0,,5.0,5.0,4.0,5.0,5.0,5.0,,4.0
Sai Krishna Avula,5.0,3.0,3.0,4.0,4.0,5.0,5.0,3.0,3.0,4.0
Ankit Yadav,3.0,3.0,2.0,5.0,2.0,5.0,5.0,3.0,3.0,4.0
Dhruv,,,5.0,5.0,3.0,,5.0,5.0,4.0,5.0
Saatvik Rao,4.0,3.0,4.0,5.0,2.0,2.0,4.0,5.0,3.0,5.0
Zeel B Patel,5.0,4.0,5.0,4.0,4.0,4.0,,2.0,,5.0
Neel,4.0,,5.0,5.0,3.0,3.0,5.0,5.0,,4.0
Sachin Jalan,4.0,,5.0,5.0,3.0,4.0,4.0,5.0,,3.0


In [36]:
df.index

Index(['Nipun', 'Gautam Vashishtha', 'Eshan Gujarathi', 'Sai Krishna Avula',
       'Ankit Yadav ', 'Dhruv', 'Saatvik Rao ', 'Zeel B Patel', 'Neel ',
       'Sachin Jalan ', 'Ayush Shrivastava', '....', 'Hari Hara Sudhan',
       'Etikikota Hrushikesh', 'Chirag'],
      dtype='object', name='Your name')

In [56]:
# Get index for user and movie
user = 'Ayush Shrivastava'

# Get the movie ratings for user
user_ratings = df.loc[user]
user_ratings

Sholay                      5.0
Swades (We The People)      4.0
The Matrix (I)              5.0
Interstellar                5.0
Dangal                      3.0
Taare Zameen Par            3.0
Shawshank Redemption        4.0
The Dark Knight             4.0
Notting Hill                NaN
Uri: The Surgical Strike    4.0
Name: Ayush Shrivastava, dtype: float64

In [57]:
df_copy = df.copy()
df_copy.fillna(0, inplace=True)

pd.Series(np.nanmean(show_neighbors(df_copy, user, 5)[1:], axis=0),
            index=df.columns)


Sholay                      4.2
Swades (We The People)      1.8
The Matrix (I)              4.6
Interstellar                4.8
Dangal                      3.4
Taare Zameen Par            3.2
Shawshank Redemption        4.0
The Dark Knight             4.8
Notting Hill                1.6
Uri: The Surgical Strike    4.4
dtype: float64

In [55]:
df.loc[user]

Sholay                      3.0
Swades (We The People)      3.0
The Matrix (I)              2.0
Interstellar                5.0
Dangal                      2.0
Taare Zameen Par            5.0
Shawshank Redemption        5.0
The Dark Knight             3.0
Notting Hill                3.0
Uri: The Surgical Strike    4.0
Name: Ankit Yadav , dtype: float64