In [179]:
import pandas as pd
import numpy as np
import scipy as sp
import os
import matplotlib.pyplot as plt
import seaborn as sns

<h2> Load User Interaction Matrix </h2>

In [180]:
train_x = pd.read_csv("./data/MovieLens/ua.base", sep='\t', encoding='latin-1', header=None)
test_x = pd.read_csv("./data/MovieLens/ua.test", sep='\t', encoding='latin-1', header=None)
for df in [train_x, test_x]:
    df.columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

In [181]:
num_users, num_movie = train_x.user_id.max(), train_x.movie_id.max()
intmatrix = np.zeros((num_users, num_movie), dtype=np.float64)
for row in train_x.itertuples():
    intmatrix[row[1]-1, row[2]-1] = row[3]
print(intmatrix.shape)

(943, 1682)


<h2> Vanilla User-User and Item-Item Similarity </h2>

In [182]:
def optimized_row_cosine_sim(matrix): #nxm matrix
    dot = np.matmul(matrix, matrix.T) #nxn dot products
    lth = np.sqrt((matrix**2).sum(axis=1)) #nx1 length vector
    rowlth = np.repeat(lth.reshape(-1,1), matrix.shape[0], axis=1)
    collth = np.repeat(lth.reshape(1,-1), matrix.shape[0], axis=0)
    return dot/(rowlth*collth+1e-8) #1e-8 avoids div0 errors for unknown entries

In [183]:
user_similarity = optimized_row_cosine_sim(intmatrix)
item_similarity = optimized_row_cosine_sim(intmatrix.T)
user_similarity.shape, item_similarity.shape

((943, 943), (1682, 1682))

In [184]:
def user_recommender_help(ratematrix, simmatrix):
    mean_rating = np.nanmean(ratematrix, axis=1)
    diff = (ratematrix-mean_rating.reshape(-1,1))
    preddiff = np.matmul(simmatrix, diff)/(np.repeat(np.sum(np.abs(
        simmatrix), axis=1).reshape(-1,1), ratematrix.shape[1], axis=1)+1e-8) #Div0 error avoidance
    return np.repeat(mean_rating.reshape(-1,1), ratematrix.shape[1], axis=1)+preddiff
def top_k_unseen_user_recommender(ratematrix, simmatrix, user, k):
    predrating = user_recommender_help(ratematrix, simmatrix)[user]
    predrating = pd.Series(predrating)[np.where(ratematrix[user]==0)[0]].sort_values(ascending=False)
    return predrating.iloc[:k]

In [185]:
#Top items to be recommended to chosen user
top_k_unseen_user_recommender(intmatrix, user_similarity, 0, 5)

317    2.171805
422    2.072825
356    1.987419
116    1.965397
201    1.919865
dtype: float64

In [186]:
def item_recommender_help(ratematrix, simmatrix):
    mean_rating = np.nanmean(ratematrix, axis=1)
    diff = (ratematrix-mean_rating.reshape(-1,1)).T
    predfit = np.matmul(simmatrix, diff)/(np.repeat(np.sum(np.abs(
    simmatrix), axis=1).reshape(-1,1), diff.shape[1], axis=1)+1e-8) #Div0 error avoidance
    return predfit
def top_k_users_for_item_recommender(ratematrix, simmatrix, item, k):
    predfit = item_recommender_help(ratematrix, simmatrix)[item]
    predfit = pd.Series(predfit)[np.where(ratematrix[:,item]==0)[0]].sort_values(ascending=False)
    return predfit.iloc[:k]

In [187]:
#Top users who might really like chosen item
top_k_users_for_item_recommender(intmatrix, item_similarity, 0, 5)

6      0.748582
307    0.740049
550    0.728063
473    0.703333
845    0.691157
dtype: float64

<h2> Improving Vanilla Similarity with Targetted Comparisons </h2>