In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('ratings.csv')
targets = pd.read_csv('targets.csv')
df[['User', 'Item']] = df['UserId:ItemId'].str.split(':', expand=True)
targets[['User', 'Item']] = targets['UserId:ItemId'].str.split(':', expand=True)

In [6]:
users = df['User'].unique()
items = df['Item'].unique()

In [7]:
import time
def log_time(f):
    def wrapper(*args, **kwargs):
        a = time.time()
        ret = f(*args, **kwargs)
        b = time.time()
        print(f'time in {f.__name__}: {b-a}')
        return ret
    return wrapper

In [8]:
def search_ratings_item_id(item_id):
    return np.array(df[df['Item'] == item_id]['Rating'])

In [9]:
def search_ratings_user_id(user_id):
    return np.array(df[df['User'] == user_id]['Rating'])

In [10]:
def get_users_from_item_id(item_id):
    return np.array(df[df['Item'] == item_id]['User'])

In [11]:
def get_items_from_user_id(user_id):
    return list(df[df['User'] == user_id]['Item'])

In [12]:
def check_item_rating_by_user(item_id, user_id) -> int:
    aux_df = df[(df['Item'] == item_id) & (df['User'] == user_id)]
    if len(aux_df) > 0:
        return int(aux_df['Rating'])
    else:
        return search_ratings_user_id(user_id).mean()

In [13]:
def check_item_ratings_by_users(item_id, users):
    ratings = []
    for u in users:
        ratings.append(check_item_rating_by_user(item_id, u))
    return ratings

In [14]:
def get_common_users_from_item_id(item_id, item_id2):
    a1 = get_users_from_item_id(item_id)
    a2 = get_users_from_item_id(item_id2)

    return [e for e in a1 if e in a2]

In [15]:
def get_items_from_users(users:list):
    items = []
    for u in users:
        items.extend(get_items_from_user_id(u))
    return items

In [16]:
def mean_item(item_id):
    return np.mean(search_ratings_item_id(item_id))

In [37]:
def mean_user(user_id):
    return np.mean(search_ratings_user_id(user_id))

In [17]:
def value_similarity(a,b,c):
    x = (a+b-2*c)
    if not x:
        x += 0.5
    return ((a**2 + b**2)/4 - c) /x**2

In [18]:
import math

def gaussian_mapping(x, mean=0, std_deviation=2.4):
    # Calculate the CDF value for x
    cdf_value = 0.5 * (1 + math.erf((x - mean) / (std_deviation * math.sqrt(2))))
    
    # Map the CDF value to the range [-1, 1]
    mapped_value = 2 * cdf_value - 1
    
    # Ensure the result is within the [-1, 1] range
    mapped_value = max(-1, min(1, mapped_value))
    
    if mapped_value > 0.5:
        mapped_value = mapped_value - 0.15
    else:
        mapped_value = mapped_value + 0.15

    return mapped_value

In [19]:
def custom_tanh(z):
    return 2.15 * np.tanh(z- 0.5)


In [20]:
def custom_similarity(a, b, c):
    v = value_similarity(a, b, c)
    g = gaussian_mapping(v)
    return custom_tanh(g)

In [21]:
a = 2
b = 5
c = 3

v = value_similarity(a, b, c)
g = gaussian_mapping(v)
m = custom_tanh(g)
print(v, g, m)

4.25 0.7734115789040786 0.573612370722538


In [22]:
def calc_pearson(item_id1, item_id2):
    # grab common neighbours
    users_common = get_common_users_from_item_id(item_id1, item_id2)

    # check rating of these users for the items
    ratings_item1 = check_item_ratings_by_users(item_id1, users_common)
    ratings_item2 = check_item_ratings_by_users(item_id2, users_common)

    if len(ratings_item1) == 1:
        c = (mean_item(item_id1) + ratings_item1[0] + ratings_item2[0] + mean_item(item_id2))/4
        return custom_similarity(mean_item(item_id1), mean_item(item_id2), c)

    # pearson
    return np.corrcoef(ratings_item1, ratings_item2)[0][1]

In [23]:
item_base = items[0]
users_of_item_base = get_users_from_item_id(item_base)
neighbors_items = pd.Series(get_items_from_users(users_of_item_base)).unique()
sims = {}
for n_i in neighbors_items:
    sims[n_i] = calc_pearson(item_base, n_i)
sims

{'8b05db84f2': 1.0,
 '39f1be0489': 0.7232074203231142,
 '8c50712c93': 0.7232074203231108,
 '1fd7a8b66f': 0.7232074203231142,
 '744cc124cb': -0.14108036900413545,
 '6b6c3e46e3': 0.014375765892817619}

In [32]:
def calc_similarity(item_id) -> dict:
    similarities = dict()

    users_of_item_base = list(set(get_users_from_item_id(item_id)))
    neighbors_items = list(set(get_items_from_users(users_of_item_base)))

    for n_i in neighbors_items:
        similarities[n_i] = calc_pearson(item_id, n_i)

    return similarities

In [25]:
def calc_similarity_plus_new_user(item_id, user_id) -> dict:
    similarities = dict()

    users_of_item_base = list(set(get_users_from_item_id(item_id))) + [user_id]
    neighbors_items = list(set(get_items_from_users(users_of_item_base)))

    for n_i in neighbors_items:
        similarities[n_i] = calc_pearson(item_base, n_i)
    
    return similarities

In [40]:
def get_normalized_score(user_id, item_id):
    
    ratings = search_ratings_item_id(item_id)
    ratings_mean = np.mean(ratings)
    ratings_std = np.std(ratings, ddof=1) if len(ratings) > 1 else 1

    if ratings_std == 0:
        return 1

    rating_user_item = check_item_rating_by_user(item_id, user_id)

    return (rating_user_item - ratings_mean) / ratings_std

In [47]:
def make_prediction(item_id, user_id):
    similarities = calc_similarity(item_id)

    ratings_user = search_ratings_user_id(user_id)
    ratings_user_mean = np.mean(ratings_user)
    ratings_user_std = np.std(ratings_user, ddof=1) if len(ratings_user) > 1 else 0

    upper = 0
    lower = 0
    for i in similarities:
        upper += similarities[i] * get_normalized_score(user_id, i)
        lower += similarities[i]

    return np.clip(ratings_user_mean + ratings_user_std * upper / lower, 0, 5)

In [42]:
print(df.loc[2]['Item'], df.loc[0]['User'])
make_prediction(df.loc[2]['Item'], df.loc[0]['User'])

8b05db84f2 4baf0ac888


3.4810013334396164

In [50]:
counter = 0
for i, row in targets.iterrows():
    counter += 1
    if counter == 20:
        break
    print(f'pred: {make_prediction(row["Item"], row["User"])}, mean_item: {mean_item(row["Item"])}, mean_user: {mean_user(row["User"])} ')

pred: 5.0, mean_item: 4.5, mean_user: 5.0 
pred: 5.0, mean_item: 4.5, mean_user: 5.0 
pred: 4.4603135803355, mean_item: 4.5, mean_user: 4.5 
pred: 5.0, mean_item: 4.5, mean_user: 5.0 
pred: 5.0, mean_item: 4.2, mean_user: 5.0 
pred: 5.0, mean_item: 5.0, mean_user: 5.0 
pred: 5.0, mean_item: 4.0, mean_user: 5.0 
pred: 5.0, mean_item: 4.5, mean_user: 4.428571428571429 
pred: 5.0, mean_item: 4.5, mean_user: 5.0 
pred: 5.0, mean_item: 3.5, mean_user: 5.0 


  c /= stddev[:, None]


pred: nan, mean_item: 5.0, mean_user: 5.0 
pred: nan, mean_item: 5.0, mean_user: 2.0 
pred: nan, mean_item: 5.0, mean_user: 5.0 
pred: 4.512547051324667, mean_item: 5.0, mean_user: 4.5 
pred: 0.0, mean_item: 5.0, mean_user: 3.0 
pred: 0.0, mean_item: 4.857142857142857, mean_user: 3.25 
pred: 1.0, mean_item: 4.857142857142857, mean_user: 1.0 
pred: 4.0, mean_item: 4.857142857142857, mean_user: 4.0 
pred: 5.0, mean_item: 4.666666666666667, mean_user: 5.0 


In [30]:
predictions = []
for i, row in targets.iterrows():
    predictions.append(make_prediction(row['Item'], row['User']))