In [1]:
import json
import time

In [2]:
# Parse both rating splits from disk. Later cells iterate each split as
# user -> list of [item, rating] pairs.
with open("train.json") as train_file:
    training = json.load(train_file)

with open("test.json") as test_file:
    testing = json.load(test_file)

In [3]:
# Reshape each split from {user: [[item, rating], ...]} into a nested
# {user: {item: rating}} mapping. A user only gets an entry once their first
# rating is seen, so users with an empty rating list are left out entirely.
training_dict = {}
testing_dict = {}

for user, pairs in training.items():
    for item, rating in pairs:
        training_dict.setdefault(user, {})[item] = rating

for user, pairs in testing.items():
    for item, rating in pairs:
        testing_dict.setdefault(user, {})[item] = rating



In [4]:
def sub_row_mean(ratings_dict):
    """Return a mean-centred copy of ``ratings_dict``.

    Args:
        ratings_dict: mapping of user -> {item: rating}.

    Returns:
        A new mapping with the same users and items in which every rating has
        had that user's own average rating subtracted. The input is not
        modified.

    Raises:
        ZeroDivisionError: if any user maps to an empty rating dict.
    """
    centred = {}
    for user, user_ratings in ratings_dict.items():
        values = list(user_ratings.values())
        mean = sum(values) / len(values)
        centred[user] = {
            item: value - mean for item, value in user_ratings.items()
        }
    return centred
        


In [5]:
import numpy as np
from numpy.linalg import norm
import math

def cosine_sim(ratings_dict, user):
    """Cosine similarity between ``user`` and every user in ``ratings_dict``.

    Items a user has not rated count as 0 in the dot product. Each user's norm
    is taken over all of that user's own ratings, not just the shared items.

    Args:
        ratings_dict: mapping of user -> {item: rating}.
        user: key of the target user; must be present in ``ratings_dict``
            unless ``ratings_dict`` is empty.

    Returns:
        Dict mapping every user in ``ratings_dict`` (including ``user``
        itself) to a similarity clamped to the range [0, 1]. Negative
        similarities, NaN results and zero-norm vectors all map to 0.

    Raises:
        KeyError: if ``ratings_dict`` is non-empty and ``user`` is not in it.
    """
    if not ratings_dict:
        # Nothing to compare against (and no target user to look up).
        return {}

    user_ratings = ratings_dict[user]
    # The target user's norm is identical for every comparison, so compute it
    # once instead of once per other user.
    user_norm = np.linalg.norm(np.array(list(user_ratings.values())))

    cosine_sim_val = {}
    for other_user, other_ratings in ratings_dict.items():
        dot_prod = sum(
            rating * other_ratings.get(item, 0)
            for item, rating in user_ratings.items()
        )
        other_user_norm = np.linalg.norm(np.array(list(other_ratings.values())))
        norm_mul = user_norm * other_user_norm

        # A zero norm means one side has no signal; call that "no similarity".
        cos_sim = 0 if norm_mul == 0 else dot_prod / norm_mul
        if math.isnan(cos_sim):
            cos_sim = 0

        # Clamp: floating-point drift above 1 is capped, and negatively
        # correlated users are treated the same as unrelated ones.
        if cos_sim > 1:
            cos_sim = 1
        elif cos_sim < 0:
            cos_sim = 0

        cosine_sim_val[other_user] = cos_sim

    return cosine_sim_val

In [6]:

def predict_item_rating_avg(ratings_dict, cos_similarities_dict, k, item, user):
    """Predict ``user``'s rating of ``item`` as the plain mean of neighbour ratings.

    The neighbours are the (at most) ``k`` most similar other users who have
    rated ``item``. Similarities are only used to rank neighbours; every
    selected neighbour contributes equally to the mean, including any with a
    similarity of 0 as long as the best neighbour's similarity is non-zero.

    Args:
        ratings_dict: mapping of user -> {item: rating}.
        cos_similarities_dict: mapping of user -> similarity to ``user``.
            It is read only; the caller's dict is not modified.
        k: maximum number of neighbours to average over.
        item: the item to predict a rating for.
        user: the user to predict for. May or may not be a key of
            ``cos_similarities_dict``; it is never its own neighbour.

    Returns:
        The mean neighbour rating, or ``user``'s own mean rating in
        ``ratings_dict`` when there is no neighbour or the best neighbour has
        similarity 0.

    Raises:
        KeyError: if the fallback is needed and ``user`` is not in
            ``ratings_dict``, or a key of ``cos_similarities_dict`` is missing
            from ``ratings_dict``.
        ZeroDivisionError: if the fallback is needed and ``user`` has no ratings.
    """
    # Build the candidate list without touching the caller's dict.
    candidates = [
        (other_user, similarity)
        for other_user, similarity in cos_similarities_dict.items()
        if other_user != user and item in ratings_dict[other_user]
    ]
    top_neighbours = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:k]

    if top_neighbours and top_neighbours[0][1] != 0:
        total = sum(ratings_dict[neighbour][item] for neighbour, _ in top_neighbours)
        return total / len(top_neighbours)

    # No usable neighbour: fall back to the user's own average, taken from the
    # ratings passed in rather than from a notebook-level global.
    user_ratings = ratings_dict[user]
    return sum(user_ratings.values()) / len(user_ratings)


    
    

In [7]:


def predict_item_rating_sim(ratings_dict, cos_similarities_dict, k, item, user):
    """Predict ``user``'s rating of ``item`` as a similarity-weighted mean.

    The neighbours are the (at most) ``k`` most similar other users who have
    rated ``item``; each neighbour's rating is weighted by its similarity.

    Args:
        ratings_dict: mapping of user -> {item: rating}.
        cos_similarities_dict: mapping of user -> similarity to ``user``.
            It is read only; the caller's dict is not modified.
        k: maximum number of neighbours to use.
        item: the item to predict a rating for.
        user: the user to predict for; never used as its own neighbour.

    Returns:
        The weighted mean of neighbour ratings. Falls back to ``user``'s own
        mean rating in ``ratings_dict`` when there is no neighbour, the best
        neighbour has similarity 0, or the selected weights sum to 0. Any
        result above 5.1 is reported on stdout and replaced by 4.99.

    Raises:
        KeyError: if the fallback is needed and ``user`` is not in
            ``ratings_dict``, or a key of ``cos_similarities_dict`` is missing
            from ``ratings_dict``.
        ZeroDivisionError: if the fallback is needed and ``user`` has no ratings.
    """
    # Build the candidate list without touching the caller's dict.
    candidates = [
        (other_user, similarity)
        for other_user, similarity in cos_similarities_dict.items()
        if other_user != user and item in ratings_dict[other_user]
    ]
    top_neighbours = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:k]

    rating = None
    if top_neighbours and top_neighbours[0][1] != 0:
        weighted_sum = 0
        weight_total = 0
        for neighbour, similarity in top_neighbours:
            weighted_sum += ratings_dict[neighbour][item] * similarity
            weight_total += similarity
        # Weights can cancel out if negative similarities are passed in;
        # treat that like "no usable neighbour" instead of dividing by zero.
        if weight_total != 0:
            rating = weighted_sum / weight_total

    if rating is None:
        # Fall back to the user's own average, taken from the ratings passed
        # in rather than from a notebook-level global.
        user_ratings = ratings_dict[user]
        rating = sum(user_ratings.values()) / len(user_ratings)

    # NOTE(review): looks like a leftover sanity check that assumes a 5-point
    # rating scale; behaviour kept as-is - confirm whether it is still wanted.
    if rating > 5.1:
        print("Flag")
        print("")
        rating = 4.99
    return rating

  
    

In [8]:
def predict(ratings_dict, user, items, num_neighbours=None):
    """Predict ``user``'s ratings for ``items`` with both k-NN predictors.

    Args:
        ratings_dict: mapping of user -> {item: rating} used both to compute
            similarities and to look up neighbour ratings.
        user: the user to predict for.
        items: iterable of items to predict.
        num_neighbours: neighbourhood size handed to the predictors. When
            omitted, the notebook-level ``k`` is used, so existing
            three-argument calls behave as before.

    Returns:
        Dict mapping each item to a ``(avg_prediction, sim_prediction)`` tuple
        from ``predict_item_rating_avg`` and ``predict_item_rating_sim``.
    """
    if num_neighbours is None:
        num_neighbours = k

    # NOTE(review): the mean-centring below is redone over the whole of
    # ratings_dict on every call; consider computing it once per dataset.
    sub_mean_dict = sub_row_mean(ratings_dict)
    cos_sim_dict = cosine_sim(sub_mean_dict, user)

    predicted_ratings_dict = {}
    for item in items:
        # Hand each predictor its own copy so neither can affect the
        # similarities seen by the other or by later items.
        avg_rating_prediction = predict_item_rating_avg(
            ratings_dict, cos_sim_dict.copy(), num_neighbours, item, user
        )
        sim_rating_prediction = predict_item_rating_sim(
            ratings_dict, cos_sim_dict.copy(), num_neighbours, item, user
        )
        predicted_ratings_dict[item] = (avg_rating_prediction, sim_rating_prediction)

    return predicted_ratings_dict

In [9]:
# Neighbourhood size; predict() picks this up as a module-level name.
k = 10

start = time.time()

# Error accumulators: summed over every test rating, then normalised into
# MAE / RMSE after the loop.
N = 0
MAE_avg = 0
RMSE_avg = 0
MAE_sim = 0
RMSE_sim = 0

i = 0  # test users processed so far
j = 0  # test ratings processed so far (drives the progress log)

for user, list_ratings in testing_dict.items():
    predicted_vals = predict(training_dict, user, list(list_ratings.keys()))
    orig_vals = list_ratings
    i += 1

    for item in predicted_vals:
        avg_rat, sim_rat = predicted_vals[item]
        j += 1
        corr_rat = orig_vals[item]

        avg_err = avg_rat - corr_rat
        sim_err = sim_rat - corr_rat
        MAE_avg += abs(avg_err)
        MAE_sim += abs(sim_err)
        RMSE_avg += avg_err ** 2
        RMSE_sim += sim_err ** 2
        N += 1

        # Progress report every 1000 predictions.
        if j % 1000 == 0:
            print("Item", j, sep="")
            print("User", i, sep="")
            print(time.time() - start)
            print("predicted vals: ")
            print(predicted_vals)
            print("original vals: ")
            print(orig_vals)

# Turn the running sums into mean absolute / root mean squared error.
MAE_avg /= N
MAE_sim /= N
RMSE_avg = math.sqrt(RMSE_avg / N)
RMSE_sim = math.sqrt(RMSE_sim / N)

print((MAE_avg, RMSE_avg))
print((MAE_sim, RMSE_sim))

end = time.time()

print(end - start)

Item1000
User294
129.25333952903748
predicted vals: 
{'B001VW9UES': (4.3, 3.1009828382532656), 'B0042U8VUA': (2.7628865979381443, 2.7628865979381443), 'B00138FSPU': (2.7628865979381443, 2.7628865979381443), 'B001NTPENA': (2.7628865979381443, 2.7628865979381443), 'B00W4SVHYE': (2.7628865979381443, 2.7628865979381443), 'B000V61B74': (4.6, 4.0), 'B00122ULG8': (4.7, 2.0), 'B00124DTBK': (4.4, 5.0), 'B002WPXHV0': (4.8, 4.795690343361815), 'B00I83XD14': (4.714285714285714, 4.999999999999999), 'B00137MSX6': (4.7, 4.052554730217244), 'B00137OBEA': (4.666666666666667, 5.0), 'B000VRPGS8': (4.7, 4.128877049712486), 'B000TE2NSY': (2.7628865979381443, 2.7628865979381443), 'B001NTZFNY': (4.285714285714286, 3.1849277743464253), 'B000W00IGY': (4.5, 4.152345334652093), 'B00L39GAY2': (2.7628865979381443, 2.7628865979381443), 'B00136NE8A': (4.9, 4.0), 'B001NTWIN4': (2.7628865979381443, 2.7628865979381443), 'B013MBXATG': (2.7628865979381443, 2.7628865979381443), 'B00PMMHEQO': (2.7628865979381443, 2.7628865

Item11000
User3480
1481.0662970542908
predicted vals: 
{'B0016O0MK2': (5.0, 5.0), 'B008HTZDWQ': (5.0, 5.0)}
original vals: 
{'B0016O0MK2': 5.0, 'B008HTZDWQ': 5.0}
Item12000
User3889
1645.37406539917
predicted vals: 
{'B000QQ9694': (5.0, 5.0), 'B001I8C7H2': (5.0, 5.0), 'B005IKZ2W8': (5.0, 5.0), 'B00C5ZMK5Q': (5.0, 5.0), 'B00HO15UFI': (5.0, 5.0)}
original vals: 
{'B000QQ9694': 5.0, 'B001I8C7H2': 5.0, 'B005IKZ2W8': 5.0, 'B00C5ZMK5Q': 5.0, 'B00HO15UFI': 5.0}
Item13000
User4247
1793.1727039813995
predicted vals: 
{'B00110DT4C': (3.4, 3.3148036283170765), 'B00124DTBK': (4.7, 5.0)}
original vals: 
{'B00110DT4C': 3.0, 'B00124DTBK': 5.0}
Item14000
User4703
1972.7684681415558
predicted vals: 
{'B001386D0E': (5.0, 5.0), 'B001NYGM2M': (5.0, 5.0), 'B001NYBZYW': (5.0, 5.0)}
original vals: 
{'B001386D0E': 5.0, 'B001NYGM2M': 5.0, 'B001NYBZYW': 5.0}
Item15000
User5181
2157.8672573566437
predicted vals: 
{'B00138I27G': (5.0, 5.0), 'B001Q1QC4K': (5.0, 5.0), 'B000V68S8O': (5.0, 5.0), 'B001NYRP1Y': (5.0, 5