In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

In [2]:
# Load the pickled user/item factor tables and the cleaned ratings CSV.
# NOTE(review): pickle files can execute code on load; only read trusted artifacts.
p = pd.read_pickle("../../../app/db/users_factors.pkl")
q = pd.read_pickle("../../../app/db/items_factors.pkl")
user_ratings = pd.read_csv("../../../data/cleaned/user_ratings.csv")

# Report (rows, columns) for each loaded table, in load order.
for loaded_table in (p, q, user_ratings):
    print(loaded_table.shape)

(224556, 3)
(21919, 3)
(18340221, 3)


In [3]:
def rmse(actual: np.ndarray, predicted: np.ndarray) -> float:
    """Return the root-mean-squared error between ``actual`` and ``predicted``.

    Implemented with numpy instead of
    ``sklearn.metrics.mean_squared_error(..., squared=False)``: the ``squared``
    keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the old
    call fails on current releases.

    Args:
        actual: Ground-truth values (array-like, e.g. ndarray or pandas Series).
        predicted: Predicted values with the same shape as ``actual``.

    Returns:
        The square root of the mean squared difference over all elements, as a
        built-in ``float``.

    Raises:
        ValueError: If the inputs differ in shape or contain no elements.
    """
    # np.asarray drops any pandas index, so values are compared by position
    # rather than being re-aligned on index labels.
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(predicted, dtype=float)

    if actual_values.shape != predicted_values.shape:
        raise ValueError(
            f"Shape mismatch: actual {actual_values.shape} vs predicted {predicted_values.shape}"
        )
    if actual_values.size == 0:
        raise ValueError("Cannot compute RMSE of empty input")

    squared_errors = np.square(actual_values - predicted_values)
    return float(np.sqrt(np.mean(squared_errors)))

In [4]:
# Keep only ratings whose username is in the index of `p` and whose game id
# is in the index of `q`, so both factor lookups below can succeed.
username_in_p = user_ratings["Username"].isin(p.index)
game_in_q = user_ratings["BGGId"].isin(q.index)
user_ratings = user_ratings[username_in_p & game_in_q]
print(user_ratings.shape)

(18340221, 3)


In [5]:
# Look up one row of `p` per rating (by username) and one row of `q` per
# rating (by game id), then take the row-wise dot product of the two matrices.
user_vectors = p.loc[user_ratings["Username"]].values
game_vectors = q.loc[user_ratings["BGGId"]].values
user_ratings["Latent rating"] = np.einsum("ij, ij->i", user_vectors, game_vectors)

In [6]:
# Baseline predictor: global mean + per-user offset + per-game offset.
# All means are computed from the same ratings they will be evaluated on.
overall_mean = user_ratings["Rating"].mean()
mean_by_user = user_ratings.groupby("Username")["Rating"].transform("mean")
mean_by_game = user_ratings.groupby("BGGId")["Rating"].transform("mean")

user_ratings["Global mean"] = overall_mean
user_ratings["User mean"] = mean_by_user
user_ratings["Game mean"] = mean_by_game
# A bias is how far a user's (or game's) average sits from the global average.
user_ratings["User bias"] = mean_by_user - overall_mean
user_ratings["Game bias"] = mean_by_game - overall_mean
user_ratings["Naive rating"] = overall_mean + user_ratings["User bias"] + user_ratings["Game bias"]
user_ratings.head(100)

Unnamed: 0,BGGId,Rating,Username,Latent rating,Global mean,User mean,Game mean,User bias,Game bias,Naive rating
0,213788,8.0,Tonydorrf,8.020448,7.095664,7.367589,7.990521,0.271925,0.894857,8.262446
1,213788,8.0,tachyon14k,7.355969,7.095664,6.898305,7.990521,-0.197359,0.894857,7.793162
2,213788,8.0,Ungotter,9.865546,7.095664,7.040000,7.990521,-0.055664,0.894857,7.934857
3,213788,8.0,brainlocki3,8.823724,7.095664,7.409091,7.990521,0.313427,0.894857,8.303948
4,213788,8.0,PPMP,8.125651,7.095664,7.641304,7.990521,0.545640,0.894857,8.536162
...,...,...,...,...,...,...,...,...,...,...
95,193500,5.0,Imscar,4.742672,7.095664,6.588174,5.063886,-0.507490,-2.031778,4.556396
96,193500,4.0,Geeken,4.125724,7.095664,5.036650,5.063886,-2.059014,-2.031778,3.004872
97,193500,4.0,prd1982,3.655050,7.095664,5.266648,5.063886,-1.829016,-2.031778,3.234869
98,193500,4.0,kalfa,4.622535,7.095664,7.212121,5.063886,0.116457,-2.031778,5.180343


In [7]:
# Print the RMSE against the true ratings: latent-factor prediction first,
# naive bias baseline second.
for prediction_column in ("Latent rating", "Naive rating"):
    print(rmse(user_ratings["Rating"], user_ratings[prediction_column]))

1.2978798492121542
1.2066103932764292


# Conclusion
The latent-factor model (RMSE ≈ 1.298) performs worse than the naive baseline (RMSE ≈ 1.207). It might outperform the baseline if it also included the baseline terms (global mean + user bias + item bias).