In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVDpp
from surprise import accuracy
from collections import defaultdict
from random import shuffle
from sklearn.metrics import root_mean_squared_error as rmse, mean_absolute_error as mae


In [2]:
df_games = pd.read_csv('../data/df_games_train2.csv')
df_users = pd.read_csv('../data/df_users_train2.csv')
df_groups = pd.read_csv('../data/df_groups.csv')

In [3]:
df_games.head()

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,...,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens,mask,Description_embedding,Image_embedding
0,88,Torres,torre abstract game resource management tactic...,1999,2.8656,7.10972,6.82406,1.27896,2,4,...,1,0,0,0,1,0,0,True,"[-2.683112726255317, -0.42332172554470826, -1....","[1.6416904926300049, 0.06321828812360764, 0.39..."
1,91,Paths of Glory,gmt game websitethey call great war year titan...,1999,3.8421,8.05888,7.33726,1.64059,2,2,...,0,1,0,0,0,0,0,True,"[1.0637854293471487, 0.649491797066982, 0.4205...","[-1.1839122772216797, -0.6947698593139648, 0.3..."
2,150,PitchCar,game entry refer nearly identical game compati...,1995,1.1263,7.26579,6.99936,1.34552,2,8,...,0,0,1,0,0,1,0,True,"[0.820082462099059, -0.0750128060778531, 1.587...","[1.035536527633667, 1.0266635417938232, -0.521..."
3,171,Chess,chess twoplayer abstract strategy board game r...,1475,3.6886,7.15112,6.96557,1.83887,2,2,...,0,0,0,0,1,0,0,True,"[-0.858757313022056, -0.0162148123075708, 0.17...","[0.8292618989944458, -0.3407784700393677, 0.03..."
4,188,Go,appearance player take turn lay stone time sma...,-2200,3.942,7.63525,7.31794,1.91275,2,2,...,0,0,0,0,1,0,0,True,"[0.09952368867430286, 0.5561095815916068, 0.60...","[4.239888668060303, -0.1471511274576187, 0.231..."


In [4]:
df_users.head()

Unnamed: 0,BGGId,Rating,Username,isTest
0,35865,10.0,zebracat,False
1,35865,9.0,Pfahrer,False
2,35865,9.0,gregorus,False
3,35865,9.0,tryytty,False
4,35865,9.0,Tolkana,False


In [5]:
df_groups.head()

Unnamed: 0,members
0,"['Snawk', 'Opal82']"
1,"['David546', 'taragalinas']"
2,"['Vadorojo', 'Travellingmatti', 'DanKill']"
3,"['HeavyAdge', 'Jawaswag']"
4,"['Qelha14', 'MarcusK']"


In [6]:
df_users.Username.value_counts().head()

Username
leffe dubbel      245
oldgoat3769967    223
TomVasel          219
Tolkana           210
Doel              207
Name: count, dtype: int64

# Utils

In [7]:
def games_min_n(n=9):
    aux = df_games[(df_games.MinPlayers <= n) & (df_games.MaxPlayers >= n)]
    return aux.BGGId

# Recomendación individual

## FunkSVD User only

In [8]:
reader = Reader(rating_scale=(0, 10))
X_train = Dataset.load_from_df(df_users.loc[df_users.isTest == False, ['Username', 'BGGId', 'Rating']],
                               reader=reader).build_full_trainset()
X_test = list(df_users.loc[df_users.isTest == True, ['Username', 'BGGId', 'Rating']].itertuples(index=False, name=None))

svdpp = SVDpp(n_factors=10, n_epochs=150)
svdpp.fit(X_train)
yhat_svdpp = svdpp.test(X_test)

print(accuracy.mae(yhat_svdpp),
      accuracy.rmse(yhat_svdpp))

MAE:  0.7807
RMSE: 1.0414
0.7807340953557931 1.0414232243677881


In [9]:
X_recommend = X_train.build_anti_testset()

In [10]:
predictions = svdpp.test(X_recommend)
predictions

[Prediction(uid='zebracat', iid=163412, r_ui=7.3711534995698615, est=7.550726175626433, details={'was_impossible': False}),
 Prediction(uid='zebracat', iid=84876, r_ui=7.3711534995698615, est=7.107102100400906, details={'was_impossible': False}),
 Prediction(uid='zebracat', iid=161936, r_ui=7.3711534995698615, est=6.7062351548270485, details={'was_impossible': False}),
 Prediction(uid='zebracat', iid=12333, r_ui=7.3711534995698615, est=5.150129443148268, details={'was_impossible': False}),
 Prediction(uid='zebracat', iid=54043, r_ui=7.3711534995698615, est=6.899370902940081, details={'was_impossible': False}),
 Prediction(uid='zebracat', iid=170216, r_ui=7.3711534995698615, est=6.798186215596172, details={'was_impossible': False}),
 Prediction(uid='zebracat', iid=41114, r_ui=7.3711534995698615, est=7.913998083711814, details={'was_impossible': False}),
 Prediction(uid='zebracat', iid=193738, r_ui=7.3711534995698615, est=7.374576083011571, details={'was_impossible': False}),
 Prediction

In [11]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # for uid, user_ratings in top_n.items():
    #     user_ratings.sort(key=lambda x: x[1], reverse=True)
    #     top_n[uid] = user_ratings

    return top_n

In [12]:
top_n = get_top_n(predictions)

In [13]:
top_n2 = {k: {k2: v2 for k2, v2 in v} for k, v in top_n.items()}

In [14]:
top_n3 = pd.DataFrame(top_n2.values(), index=top_n2.keys()).fillna(-100)

In [15]:
def get_preds_svdpp(user_list, df_pred):
    df = []
    n = len(user_list)
    filter_list = games_min_n(n)
    df_pred2 = df_pred.loc[:, df_pred.columns.isin(filter_list)]

    for usr in user_list:
        usr_pred = df_pred2.loc[usr].sort_values(ascending=False)
        usr_pred = pd.DataFrame(usr_pred[usr_pred > 0]).T
        if n < usr_pred.size:
            usr_pred = usr_pred.loc[:, usr_pred.columns[:n]].copy()

        df.append(usr_pred)
    df = pd.concat(df, axis=0).reset_index(drop=True).fillna(0)
    return df


In [16]:
df_groups['members2'] = df_groups['members'].apply(lambda x: eval(x))
df_groups['preds_svdpp'] = df_groups['members2'].apply(lambda x: get_preds_svdpp(x, top_n3))

In [17]:
df_groups.to_pickle('df_groups_svdpp')

## Random

In [18]:

def random_rank(user_list):
    games = games_min_n(len(user_list)).to_list()
    df = []
    for _ in range(len(user_list)):
        shuffle(games)
        rcm = pd.DataFrame([{games[i]: 10 - i for i in range(10)}])
        df.append(rcm)
    df = pd.concat(df, axis=0).reset_index(drop=True).fillna(0)
    return df


df_groups['preds_random'] = df_groups['members2'].apply(lambda x: random_rank(x))


In [19]:
df_groups.to_pickle('df_groups_svdpp_random')

In [20]:
yhat_random = np.random.uniform(1, 10, size=(df_users.shape[0], 1))

print('MAE: ', mae(df_users.Rating, yhat_random))
print('RMSE: ', rmse(df_users.Rating, yhat_random))

MAE:  2.8284895681095956
RMSE:  3.460299819901082


## Most polular

In [21]:
from sklearn.linear_model import LinearRegression

ranking = df_games.sort_values(by=['Rank:boardgame']).BGGId.to_numpy().reshape(-1, 1)
x = np.linspace(10, 1, len(ranking)).reshape(-1, 1)

popularmap = LinearRegression()

popularmap.fit(ranking, x)

yhat_popular = popularmap.predict(df_users[df_users.isTest].BGGId.to_numpy().reshape(-1, 1))

print('MAE: ', mae(df_users[df_users.isTest].Rating, yhat_popular))
print('RMSE: ', rmse(df_users[df_users.isTest].Rating, yhat_popular))


MAE:  2.188580056325345
RMSE:  2.4680396869540084


In [22]:

def rank_popular(user_list):
    ranking = df_games.sort_values(by=['Rank:boardgame']).BGGId.to_list()
    df = []
    for _ in range(len(user_list)):
        rcm = pd.DataFrame([{ranking[i]: 10 - i for i in range(10)}])
        df.append(rcm)
    df = pd.concat(df, axis=0).reset_index(drop=True).fillna(0)
    return df


df_groups['preds_popular'] = df_groups['members2'].apply(lambda x: rank_popular(x))


In [23]:
df_groups.to_pickle('df_groups_svdpp_random_popular')
