In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Latent-factor tables exported by the app: `p` is indexed by Username and `q`
# by BGGId (that is how they are looked up further down in this notebook).
# NOTE: pd.read_pickle can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
p = pd.read_pickle("../../../app/db/users_factors.pkl")
q = pd.read_pickle("../../../app/db/items_factors.pkl")
# One row per rating; later cells read the BGGId, Rating and Username columns.
user_ratings = pd.read_csv("../../../data/cleaned/user_ratings.csv")
# Quick sanity check of what was loaded: (rows, columns) of each table.
print(p.shape)
print(q.shape)
print(user_ratings.shape)

(224556, 3)
(21919, 3)
(18340221, 3)


In [3]:
def train_val_test_split(
        df: pd.DataFrame,
        stratify_col_name: str,
        ratios: tuple[float, float, float] = (0.8, 0.1, 0.1),
        random_state: int = 0,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into train, validation and test frames, stratified on one column.

    The split is done in two passes: first train vs. the rest, then the rest
    into validation vs. test. Both passes stratify on ``stratify_col_name`` and
    use the same ``random_state``.

    Args:
        df: Frame to split.
        stratify_col_name: Column whose value distribution is preserved in
            every part.
        ratios: ``(train, val, test)`` fractions; each must be positive and
            they must sum to 1 (within floating-point tolerance).
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(train, val, test)`` data frames.

    Raises:
        ValueError: If ``ratios`` does not hold three positive fractions
            summing to 1.
    """
    if len(ratios) != 3:
        raise ValueError(f"ratios must hold exactly three fractions, got {ratios!r}")
    train_ratio, val_ratio, test_ratio = ratios
    # Compare with a tolerance: sums of decimal fractions are rarely exactly 1.0.
    if min(ratios) <= 0 or not np.isclose(sum(ratios), 1.0):
        raise ValueError(f"ratios must be positive and sum to 1, got {ratios!r}")

    # test_size is left unset on purpose: scikit-learn then uses the exact
    # complement of train_size, so a float sum such as 0.7 + 0.2 + 0.1 can never
    # push train_size + test_size marginally above 1 and get rejected.
    train, val_test = train_test_split(
        df,
        train_size=train_ratio,
        stratify=df[[stratify_col_name]],
        random_state=random_state,
    )

    # Share of the held-out part that becomes the validation set.
    val_share = val_ratio / (val_ratio + test_ratio)
    val, test = train_test_split(
        val_test,
        train_size=val_share,
        stratify=val_test[[stratify_col_name]],
        random_state=random_state,
    )

    return train, val, test

def rmse(actual: np.ndarray, predicted: np.ndarray) -> float:
    """Return the root mean squared error between ``actual`` and ``predicted``.

    The square root is taken explicitly instead of passing ``squared=False``:
    that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
    this form works on every version. For 1-D inputs (the only kind used in this
    notebook) the result is the same.

    Args:
        actual: Ground-truth values.
        predicted: Predicted values, same length as ``actual``.

    Returns:
        The RMSE as a plain Python float.
    """
    return float(np.sqrt(mean_squared_error(actual, predicted)))

def prints_errors(
        train: pd.DataFrame,
        test_in: pd.DataFrame,
        user_factors: "pd.DataFrame | None" = None,
        item_factors: "pd.DataFrame | None" = None,
) -> pd.DataFrame:
    """Print the test RMSE of several rating predictors and return their predictions.

    Predictors, each stored as a column of the returned frame:

    * ``Global mean``    -- mean training rating.
    * ``User mean``      -- mean training rating of the user.
    * ``Game mean``      -- mean training rating of the game.
    * ``Naive rating``   -- global mean + (user mean - global mean)
      + (game mean - global mean).
    * ``Naive rating 2`` -- global mean + ``User bias 2`` + ``Game bias 2``,
      where the user bias is the user's average (rating - game mean) and the
      game bias is the game's average (rating - user mean) over ``train``.
    * ``Latent rating``  -- dot product of the user's and the game's factor rows.

    Args:
        train: Ratings used to fit the means and biases; needs the columns
            ``Username``, ``BGGId`` and ``Rating``.
        test_in: Ratings to score, same columns. It is not modified.
        user_factors: Factor table indexed by ``Username``. Defaults to the
            notebook-level ``p``.
        item_factors: Factor table indexed by ``BGGId``. Defaults to the
            notebook-level ``q``.

    Returns:
        The rows of ``test_in`` whose user and game both have factors, with a
        fresh 0..n-1 index and the prediction / bias columns appended. Users or
        games that never occur in ``train`` get NaN in the mean and bias columns
        (left joins).
    """
    # Fall back to the notebook-level factor tables when none are passed in.
    user_factors = p if user_factors is None else user_factors
    item_factors = q if item_factors is None else item_factors

    # Keep only ratings whose user and game both have latent factors. Filtering
    # first and copying once avoids copying the rows that are dropped anyway and
    # avoids SettingWithCopyWarning on the column assignments below.
    has_factors = (
        test_in["Username"].isin(user_factors.index)
        & test_in["BGGId"].isin(item_factors.index)
    )
    test = test_in.loc[has_factors].copy()

    # Row-wise dot product of each rating's user and game factor vectors.
    test["Latent rating"] = np.einsum(
        "ij, ij->i",
        user_factors.loc[test["Username"]].values,
        item_factors.loc[test["BGGId"]].values,
    )

    test["Global mean"] = train["Rating"].mean()

    # Naive with biases:
    #   user bias = user mean - global mean
    #   game bias = game mean - global mean
    # Naming the series up front lets the merge add the column under its final
    # name (no Rating_x / Rating_y suffixes to clean up afterwards).
    user_means = train.groupby("Username")["Rating"].mean().rename("User mean")
    game_means = train.groupby("BGGId")["Rating"].mean().rename("Game mean")
    test = test.merge(user_means, on="Username", how="left")
    test = test.merge(game_means, on="BGGId", how="left")

    test["User bias"] = test["User mean"] - test["Global mean"]
    test["Game bias"] = test["Game mean"] - test["Global mean"]
    test["Naive rating"] = test["Global mean"] + test["User bias"] + test["Game bias"]

    # Naive with biases second option:
    #   user bias = average difference between user rating and game mean
    #   game bias = average difference between game rating and user mean
    # The per-rating residuals are built by looking the means up with .map and
    # then averaged with a plain groupby mean. This yields the same biases as
    # merging the means into `train` and running a Python lambda per group (up
    # to float rounding), without the two full-size merges or the per-group calls.
    residuals = train[["Username", "BGGId"]].assign(
        vs_game_mean=(train["Rating"] - train["BGGId"].map(game_means)).to_numpy(),
        vs_user_mean=(train["Rating"] - train["Username"].map(user_means)).to_numpy(),
    )
    user_biases = residuals.groupby("Username")["vs_game_mean"].mean().rename("User bias 2")
    game_biases = residuals.groupby("BGGId")["vs_user_mean"].mean().rename("Game bias 2")
    test = test.merge(user_biases, on="Username", how="left")
    test = test.merge(game_biases, on="BGGId", how="left")

    test["Naive rating 2"] = test["Global mean"] + test["User bias 2"] + test["Game bias 2"]

    for col in ["Global mean", "User mean", "Game mean", "Naive rating", "Naive rating 2", "Latent rating"]:
        print(f"{col} RMSE: {rmse(test['Rating'], test[col]):.3f}")

    return test

In [4]:
# Split all ratings into train / validation / test, stratified on the game id
# so that each game's ratings are spread proportionally over the three parts.
x_train, x_val, x_test = train_val_test_split(
    df=user_ratings,
    stratify_col_name="BGGId",
)

In [5]:
# Fit the baselines on the training split, score every predictor on the test
# split (RMSE lines are printed by the call), then preview the per-rating
# predictions.
result = prints_errors(train=x_train, test_in=x_test)
result.head(100)

Global mean RMSE: 1.530
User mean RMSE: 1.374
Game mean RMSE: 1.316
Naive rating RMSE: 1.226
Naive rating 2 RMSE: 1.183
Latent rating RMSE: 1.195


Unnamed: 0,BGGId,Rating,Username,Latent rating,Global mean,User mean,Game mean,User bias,Game bias,Naive rating,User bias 2,Game bias 2,Naive rating 2
0,150658,8.0,ValentineS,6.271517,7.095656,5.981395,7.317400,-1.114261,0.221744,6.203139,-0.983919,0.179018,6.290755
1,145186,5.0,Andychives,6.632857,7.095656,7.220930,6.407969,0.125274,-0.687687,6.533243,0.356351,-0.449816,7.002191
2,71,6.0,Hindrick,7.184106,7.095656,6.550000,7.530192,-0.545656,0.434536,6.984536,-0.892261,0.592496,6.795891
3,15511,7.0,Moviebuffs,6.971453,7.095656,6.952899,6.725949,-0.142758,-0.369707,6.583191,0.275200,-0.113396,7.257461
4,171623,8.0,masa5963,8.023049,7.095656,7.721311,7.887216,0.625655,0.791560,8.512871,-0.043014,0.622480,7.675123
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,143741,8.0,Cyantsunami,6.810093,7.095656,7.136194,6.974243,0.040538,-0.121413,7.014781,-0.273314,-0.220253,6.602090
96,207,8.0,koen2602,6.502048,7.095656,7.036026,6.068310,-0.059630,-1.027346,6.008680,0.122149,-0.570126,6.647679
97,12205,5.0,chrisnd,4.844284,7.095656,7.114072,4.718547,0.018416,-2.377109,4.736964,0.208934,-2.067289,5.237301
98,40628,7.5,WRMW,6.859257,7.095656,7.267857,7.079846,0.172201,-0.015810,7.252048,0.086607,0.131602,7.313865


# Conclusion
The second naive baseline (global mean plus user and game biases computed from residuals) slightly outperforms the latent-factor predictions on the test split: RMSE 1.183 vs. 1.195.
