In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

In [2]:
import os
if not os.path.exists('user2movie.json') or \
   not os.path.exists('movie2user.json') or \
   not os.path.exists('usermovie2rating.json') or \
   not os.path.exists('usermovie2rating_test.json'):
   import preprocess2dict 

In [3]:
with open('user2movie.json','rb') as f:
    user2movie=pickle.load(f)
with open('movie2user.json','rb') as f:
    movie2user=pickle.load(f)
with open('usermovie2rating.json','rb') as f:
    usermovie2rating=pickle.load(f)
with open('usermovie2rating_test.json','rb') as f:
    usermovie2rating_test=pickle.load(f)

In [5]:
N = np.max(list(user2movie.keys()))+1
m1=np.max(list(movie2user.keys()))
m2=np.max([m for (u,m),r in usermovie2rating_test.items()])

M = max(m1,m2)+1

In [27]:
K=25
limit=5
neighbors=[]
averages=[]
deviations=[]

for i in range(M):
    
    # to find 25 closest moview to movie i
    users_i = movie2user[i]
    ratings_i = {user: usermovie2rating[(user,i)] for user in users_i}
    avg_i = np.mean(list(ratings_i.values()))
    dev_i = {user: (rating - avg_i) for user, rating in ratings_i.items()}
    dev_i_values = np.array(list(dev_i.values()))
    sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))
    
    averages.append(avg_i)
    deviations.append(dev_i)
    
    sl = SortedList()
    
    for j in range(M):
        if j!=i:
            user_j = movie2user[j]
            # & operation do works on a list, that is why using set to find the  common users
            common_users = (set(user_j) & set(users_i))
            if len(common_users)>limit:
                ratings_j={user:usermovie2rating[(user,j)] for user in user_j}
                avg_j = np.mean(list(ratings_j.values()))
                dev_j={user:(rating-avg_j) for user,rating in ratings_j.items()}
                dev_j_values= np.array(list(dev_j.values()))
                sigma_j = np.sqrt(dev_j_values.dot(dev_j_values 
                
                # calculate numerator for only common users only
                numerator = sum(dev_i[u]*dev_j[u] for u in common_users)
                w_ij = numerator/(sigma_i*sigma_j)
                # insert into sorted list and truncate
                # negate weight, because list is sorted ascending.
                # maximum value (1) is "closest"
                sl.add((-w_ij,j))
                if len(sl)>K:
                    del sl[-1]
    
    # we can simply get the neighbors by just using indexing.
    neighbors.append(sl)
    


In [28]:
def predict(i,u):
    numerator=0
    denominator=0
    for neg_w,j in neighbors[i]:
        try:
            numerator+=-neg_w*deviations[j][u]
            denominator+=abs(neg_w)
        except KeyError:
            #neighbour may not have been rated by the same user
            # don;t want to do dictionary lookup twice
            # so just throw exception
            pass
        if denominator==0:
            prediction=averages[i]
        else:
            prediction = numerator/denominator + averages[i]
        predcition = min(5,prediction)
        prediction = max(0.5, prediction)
        return prediction
        

In [29]:
train_predictions=[]
train_targets=[]
for (u,m), target in usermovie2rating.items():
    # calculate the prediction for this movie
    prediction =predict(m,u)

    train_predictions.append(prediction)
    train_targets.append(target)

In [30]:
test_predictions=[]
test_targets=[]
for (u,m), target in usermovie2rating_test.items():
    # calculate the prediction for this movie
    prediction =predict(m,u)

    test_predictions.append(prediction)
    test_targets.append(target)

In [31]:
def mse(p,t):
    p=np.array(p)
    t=np.array(t)
    return np.mean((p-t)**2)

In [32]:
print('train mse: ',mse(train_predictions,train_targets))
print('test mse: ',mse(test_predictions,test_targets))

train mse:  0.8208719956212318
test mse:  0.8671789742777469
