In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVDpp
from surprise import accuracy
from collections import defaultdict
from random import shuffle
from sklearn.metrics import root_mean_squared_error as rmse, mean_absolute_error as mae


In [2]:
df_games = pd.read_csv('../data/df_games_train.csv')
df_users = pd.read_csv('../data/df_users_train.csv')
df_groups = pd.read_csv('../data/df_groups.csv')

In [3]:
df_games.head()

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,...,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens,mask,Description_embedding,Image_embedding
0,42,Tigris & Euphrates,regard reiner knizia masterpiece tigris amp eu...,1997,3.5064,7.69599,7.52665,1.48238,2,4,...,1,0,0,0,0,0,0,True,"[1.6221164078648176, 0.4162736554681593, 0.411...","[-0.027837831526994705, 0.266801118850708, -0...."
1,45,Perudo,liars dice aka bluff perudo dudo south america...,1800,1.2625,6.90371,6.72095,1.40634,2,6,...,0,0,1,0,0,1,0,True,"[-1.9267694786956318, -0.8144234246559084, 0.5...","[2.309972047805786, -0.1461237072944641, 0.051..."
2,50,Lost Cities,lose city card game kosmos twoplayer series ga...,1999,1.4921,7.18784,7.08937,1.26552,2,2,...,0,0,1,0,0,0,0,True,"[2.7072484611295216, -1.2085546881032145, 0.75...","[-0.10191409289836884, -0.17125050723552704, 0..."
3,84,Rommel in the Desert,rommel desert fastmoving challenge tense playa...,1982,3.4183,7.54678,6.24443,1.43983,2,2,...,0,1,0,0,0,0,0,True,"[2.0997226904405784, 0.28591152242846124, -0.0...","[1.1813172101974487, -0.09463532269001007, 0.3..."
4,88,Torres,torre abstract game resource management tactic...,1999,2.8656,7.10972,6.82406,1.27896,2,4,...,1,0,0,0,1,0,0,True,"[-2.6206971568635415, -0.6790494237723477, -1....","[1.732688546180725, 0.19650126993656158, -0.33..."


In [4]:
df_users.head()

Unnamed: 0,BGGId,Rating,Username,isTest
0,121657,7.0,crash331,False
1,121657,7.0,randywilburn,False
2,121657,7.0,Lord Kalbut,True
3,121657,7.0,Vegas King,False
4,121657,7.0,Chilwd,False


In [5]:
df_groups.head()

Unnamed: 0,members
0,"['Sam Vimaire', 'Jhopper88']"
1,"['Kehmh', 'dracosf2']"
2,"['Drury67', 'sjackson080', 'mosaicu']"
3,"['CallMeTim72', 'calderyn', 'leochab', 'sirei'..."
4,"['alwaystang', 'Candorras', 'jmellby', 'Manpac..."


In [6]:
df_users.Username.value_counts().head()

Username
leffe dubbel      504
oldgoat3769967    426
TomVasel          413
Tolkana           404
Doel              402
Name: count, dtype: int64

# Utils

In [7]:
def games_min_n(n=9):
    aux = df_games[(df_games.MinPlayers <= n) & (df_games.MaxPlayers >= n)]
    return aux.BGGId

# Recomendación individual

## FunkSVD User only

In [8]:
reader = Reader(rating_scale=(0, 10))
X_train = Dataset.load_from_df(df_users.loc[df_users.isTest == False, ['Username', 'BGGId', 'Rating']],
                               reader=reader).build_full_trainset()
X_test = list(df_users.loc[df_users.isTest == True, ['Username', 'BGGId', 'Rating']].itertuples(index=False, name=None))

svdpp = SVDpp(n_factors=10, n_epochs=150)
svdpp.fit(X_train)
yhat_svdpp = svdpp.test(X_test)

print(accuracy.mae(yhat_svdpp),
      accuracy.rmse(yhat_svdpp))

MAE:  0.9118
RMSE: 1.2102
0.9118345690260528 1.2102351574267065


In [9]:
X_recommend = X_train.build_anti_testset()

In [10]:
predictions = svdpp.test(X_recommend)
predictions

[Prediction(uid='crash331', iid=93819, r_ui=7.450641493627015, est=7.2435554102981925, details={'was_impossible': False}),
 Prediction(uid='crash331', iid=259345, r_ui=7.450641493627015, est=6.814362210882075, details={'was_impossible': False}),
 Prediction(uid='crash331', iid=264295, r_ui=7.450641493627015, est=8.795539534485442, details={'was_impossible': False}),
 Prediction(uid='crash331', iid=25537, r_ui=7.450641493627015, est=6.858620265938407, details={'was_impossible': False}),
 Prediction(uid='crash331', iid=26118, r_ui=7.450641493627015, est=7.1449330244714995, details={'was_impossible': False}),
 Prediction(uid='crash331', iid=3836, r_ui=7.450641493627015, est=7.497880993057428, details={'was_impossible': False}),
 Prediction(uid='crash331', iid=35865, r_ui=7.450641493627015, est=7.443787950618804, details={'was_impossible': False}),
 Prediction(uid='crash331', iid=63778, r_ui=7.450641493627015, est=7.273304537896432, details={'was_impossible': False}),
 Prediction(uid='cras

In [11]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # for uid, user_ratings in top_n.items():
    #     user_ratings.sort(key=lambda x: x[1], reverse=True)
    #     top_n[uid] = user_ratings

    return top_n

In [12]:
top_n = get_top_n(predictions)

In [13]:
top_n2 = {k: {k2: v2 for k2, v2 in v} for k, v in top_n.items()}

In [14]:
top_n3 = pd.DataFrame(top_n2.values(), index=top_n2.keys()).fillna(-100)

In [15]:
def get_preds_svdpp(user_list, df_pred):
    df = []
    n = len(user_list)
    filter_list = games_min_n(n)
    df_pred2 = df_pred.loc[:, df_pred.columns.isin(filter_list)]

    for usr in user_list:
        usr_pred = df_pred2.loc[usr].sort_values(ascending=False)
        usr_pred = pd.DataFrame(usr_pred[usr_pred > 0]).T
        if n < usr_pred.size:
            usr_pred = usr_pred.loc[:, usr_pred.columns[:n]].copy()

        df.append(usr_pred)
    df = pd.concat(df, axis=0).reset_index(drop=True).fillna(0)
    return df


In [16]:
df_groups['members2'] = df_groups['members'].apply(lambda x: eval(x))
df_groups['preds_svdpp'] = df_groups['members2'].apply(lambda x: get_preds_svdpp(x, top_n3))

In [19]:
df_groups.to_pickle('df_groups_svdpp')

## Random

In [57]:

def random_rank(user_list):
    games = games_min_n(len(user_list)).to_list()
    df = []
    for _ in range(len(user_list)):
        shuffle(games)
        rcm = pd.DataFrame([{games[i]: 10 - i for i in range(10)}])
        df.append(rcm)
    df = pd.concat(df, axis=0).reset_index(drop=True).fillna(0)
    return df


df_groups['preds_random'] = df_groups['members2'].apply(lambda x: random_rank(x))


In [59]:
df_groups.to_pickle('df_groups_svdpp_random')

In [25]:
yhat_random = np.random.uniform(1, 10, size=(df_users.shape[0], 1))

print('MAE: ', mae(df_users.Rating, yhat_random))
print('RMSE: ', rmse(df_users.Rating, yhat_random))

MAE:  2.867704460921501
RMSE:  3.509065144503993


## Most polular

In [44]:
from sklearn.linear_model import LinearRegression

ranking = df_games.sort_values(by=['Rank:boardgame']).BGGId.to_numpy().reshape(-1, 1)
x = np.linspace(10, 1, len(ranking)).reshape(-1, 1)

popularmap = LinearRegression()

popularmap.fit(ranking, x)

yhat_popular = popularmap.predict(df_users[df_users.isTest].BGGId.to_numpy().reshape(-1, 1))

print('MAE: ', mae(df_users[df_users.isTest].Rating, yhat_popular))
print('RMSE: ', rmse(df_users[df_users.isTest].Rating, yhat_popular))


MAE:  2.1765626030559133
RMSE:  2.4531154702347564


In [52]:

def rank_popular(user_list):
    ranking = df_games.sort_values(by=['Rank:boardgame']).BGGId.to_list()
    df = []
    for _ in range(len(user_list)):
        rcm = pd.DataFrame([{ranking[i]: 10 - i for i in range(10)}])
        df.append(rcm)
    df = pd.concat(df, axis=0).reset_index(drop=True).fillna(0)
    return df


df_groups['preds_popular'] = df_groups['members2'].apply(lambda x: rank_popular(x))


In [60]:
df_groups.to_pickle('df_groups_svdpp_random_popular')
