In [1]:

from itertools import product
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.spatial.distance import cosine
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import root_mean_squared_error as rmse, mean_absolute_error as mae
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import check_X_y, check_array


In [2]:
df_users = pd.read_csv('../data/df_users_train2.csv', low_memory=False)
df_games = pd.read_csv('../data/df_games_train2.csv', low_memory=False)
df_groups = pd.read_pickle('df_groups_svdpp_random_popular')

In [3]:
print('df_games columns: ', df_games.columns)
print('df_users columns: ', df_users.columns)

df_games columns:  Index(['BGGId', 'Name', 'Description', 'YearPublished', 'GameWeight',
       'AvgRating', 'BayesAvgRating', 'StdDev', 'MinPlayers', 'MaxPlayers',
       'ComAgeRec', 'LanguageEase', 'BestPlayers', 'GoodPlayers', 'NumOwned',
       'NumWant', 'NumWish', 'NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings', 'NumComments',
       'NumAlternates', 'NumExpansions', 'NumImplementations',
       'IsReimplementation', 'Family', 'Kickstarted', 'ImagePath',
       'Rank:boardgame', 'Rank:strategygames', 'Rank:abstracts',
       'Rank:familygames', 'Rank:thematic', 'Rank:cgs', 'Rank:wargames',
       'Rank:partygames', 'Rank:childrensgames', 'Cat:Thematic',
       'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract',
       'Cat:Party', 'Cat:Childrens', 'mask', 'Description_embedding',
       'Image_embedding'],
      dtype='object')
df_users columns:  Index(['BGGId', 'Rating', 'Username', 'isTest'], dtype='obje

In [4]:
df_games.loc[0, 'Image_embedding']

'[1.6416904926300049, 0.06321828812360764, 0.3920721709728241, -0.24892094731330872, 0.19002988934516907, -0.42599934339523315, -0.028604289516806602, -0.0818495824933052, 0.14028476178646088, 0.39112919569015503, 0.24689967930316925, -0.2009597271680832, -0.012522021308541298, -0.22951282560825348, 0.08051355928182602]'

In [5]:

df_games['Description_embedding'] = df_games['Description_embedding'].apply(
    lambda x: np.array(eval(x)) if isinstance(x, str) else x)
df_games['Image_embedding'] = df_games['Image_embedding'].apply(
    lambda x: np.array(eval(x)) if isinstance(x, str) else x)

desc_embedding_dim = len(df_games['Description_embedding'].iloc[0])
img_embedding_dim = len(df_games['Image_embedding'].iloc[0])

desc_embedding_cols = [f'desc_embedding_{i}' for i in range(desc_embedding_dim)]
img_embedding_cols = [f'img_embedding_{i}' for i in range(img_embedding_dim)]

game_features_df = df_games[['BGGId']].copy()
game_features_df[desc_embedding_cols] = pd.DataFrame(df_games['Description_embedding'].tolist(), index=df_games.index)
game_features_df[img_embedding_cols] = pd.DataFrame(df_games['Image_embedding'].tolist(), index=df_games.index)

user_game_embeddings = pd.merge(df_users, df_games[['BGGId', 'Description_embedding', 'Image_embedding']], on='BGGId',
                                how='left')


# user_game_embeddings['Description_embedding'] = user_game_embeddings['Description_embedding'].apply(lambda x: np.array(x) if isinstance(x, list) else x)
# user_game_embeddings['Image_embedding'] = user_game_embeddings['Image_embedding'].apply(lambda x: np.array(x) if isinstance(x, list) else x)

def calculate_weighted_embedding_profile(group_df, embedding_col, embedding_dim):
    embeddings = [arr for arr in group_df[embedding_col] if isinstance(arr, np.ndarray)]
    ratings = group_df.loc[[isinstance(arr, np.ndarray) for arr in group_df[embedding_col]], 'Rating'].values

    if len(embeddings) == 0:
        return np.zeros(embedding_dim)

    max_rating = group_df['Rating'].max()
    if max_rating == 0: max_rating = 1
    weights = ratings / max_rating

    weighted_sum = np.sum([embed * weight for embed, weight in zip(embeddings, weights)], axis=0)

    sum_of_weights = np.sum(weights)
    if sum_of_weights > 0:
        return weighted_sum / sum_of_weights
    else:
        return np.zeros(embedding_dim)


print("Calculando perfiles de embedding de usuario ponderados por rating...")
user_desc_embedding_profiles = user_game_embeddings.groupby('Username', group_keys=False).apply(
    lambda x: calculate_weighted_embedding_profile(x, 'Description_embedding', desc_embedding_dim)
).reset_index(name='user_desc_embedding_profile')

user_img_embedding_profiles = user_game_embeddings.groupby('Username', group_keys=False).apply(
    lambda x: calculate_weighted_embedding_profile(x, 'Image_embedding', img_embedding_dim)
).reset_index(name='user_img_embedding_profile')

user_df = pd.merge(user_desc_embedding_profiles, user_img_embedding_profiles, on='Username', how='left')

user_df['user_desc_embedding_profile'] = user_df['user_desc_embedding_profile'].apply(
    lambda x: x if isinstance(x, np.ndarray) else np.zeros(desc_embedding_dim)
)

user_df['user_img_embedding_profile'] = user_df['user_img_embedding_profile'].apply(
    lambda x: x if isinstance(x, np.ndarray) else np.zeros(img_embedding_dim)
)

user_desc_profile_cols = [f'user_desc_profile_{i}' for i in range(desc_embedding_dim)]
user_img_profile_cols = [f'user_img_profile_{i}' for i in range(img_embedding_dim)]

user_df[user_desc_profile_cols] = pd.DataFrame(user_df['user_desc_embedding_profile'].tolist(), index=user_df.index)
user_df[user_img_profile_cols] = pd.DataFrame(user_df['user_img_embedding_profile'].tolist(), index=user_df.index)


Calculando perfiles de embedding de usuario ponderados por rating...


  user_desc_embedding_profiles = user_game_embeddings.groupby('Username', group_keys=False).apply(
  user_img_embedding_profiles = user_game_embeddings.groupby('Username', group_keys=False).apply(


In [6]:


df_merged = pd.merge(df_users, user_df, on='Username', how='left')
df_merged = pd.merge(df_merged, df_games[['BGGId', 'Description_embedding', 'Image_embedding']], on='BGGId', how='left')


def calculate_cosine_similarity(vec1, vec2):
    if isinstance(vec1, np.ndarray) and isinstance(vec2, np.ndarray) and vec1.size > 0 and vec2.size > 0:
        if np.linalg.norm(vec1) == 0 or np.linalg.norm(vec2) == 0:
            return 0
        return 1 - cosine(vec1, vec2)
    return 0


print("Calculando similitud coseno entre perfiles de usuario y embeddings de juego...")
df_merged['cosine_similarity_desc'] = df_merged.apply(
    lambda row: calculate_cosine_similarity(row['user_desc_embedding_profile'], row['Description_embedding']),
    axis=1
)
df_merged['cosine_similarity_img'] = df_merged.apply(
    lambda row: calculate_cosine_similarity(row['user_img_embedding_profile'], row['Image_embedding']),
    axis=1
)

Calculando similitud coseno entre perfiles de usuario y embeddings de juego...


In [7]:

numerical_features = (
    # desc_embedding_cols +
    # img_embedding_cols +
        user_desc_profile_cols +
        user_img_profile_cols +
        ['cosine_similarity_desc', 'cosine_similarity_img']
)

df_model = df_merged.drop(columns=[
    'Description_embedding', 'Image_embedding',
    'user_desc_embedding_profile', 'user_img_embedding_profile',
], errors='ignore')


In [8]:

X_train_df = df_model[df_model['isTest'] == False].copy()
y_train = X_train_df['Rating']
X_train = X_train_df.drop(columns=['Rating', 'BGGId', 'Username', 'isTest'], errors='ignore')

X_test_df = df_model[df_model['isTest'] == True].copy()
y_test = X_test_df['Rating']
X_test = X_test_df.drop(columns=['Rating', 'BGGId', 'Username', 'isTest'], errors='ignore')

print(f"Dimensiones de X_train antes del preprocesamiento: {X_train.shape}")
print(f"Dimensiones de X_test antes del preprocesamiento: {X_test.shape}")
print(f"Columnas numéricas a escalar (solo embeddings y similitudes): {numerical_features}")
print(f"Columnas : {X_train.columns}")

Dimensiones de X_train antes del preprocesamiento: (550982, 32)
Dimensiones de X_test antes del preprocesamiento: (13049, 32)
Columnas numéricas a escalar (solo embeddings y similitudes): ['user_desc_profile_0', 'user_desc_profile_1', 'user_desc_profile_2', 'user_desc_profile_3', 'user_desc_profile_4', 'user_desc_profile_5', 'user_desc_profile_6', 'user_desc_profile_7', 'user_desc_profile_8', 'user_desc_profile_9', 'user_desc_profile_10', 'user_desc_profile_11', 'user_desc_profile_12', 'user_desc_profile_13', 'user_desc_profile_14', 'user_img_profile_0', 'user_img_profile_1', 'user_img_profile_2', 'user_img_profile_3', 'user_img_profile_4', 'user_img_profile_5', 'user_img_profile_6', 'user_img_profile_7', 'user_img_profile_8', 'user_img_profile_9', 'user_img_profile_10', 'user_img_profile_11', 'user_img_profile_12', 'user_img_profile_13', 'user_img_profile_14', 'cosine_similarity_desc', 'cosine_similarity_img']
Columnas : Index(['user_desc_profile_0', 'user_desc_profile_1', 'user_desc_

In [9]:
class XGBWithFeatureWeights(BaseEstimator, RegressorMixin):
    def __init__(self, feature_weights_dict=None, **xgb_params):
        self.feature_weights_dict = feature_weights_dict or {}
        self.xgb_params = xgb_params
        self.booster = None
        self.feature_names_ = None

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.feature_names_ = (
            X.columns.tolist() if isinstance(X, pd.DataFrame) else [f'f{i}' for i in range(X.shape[1])]
        )

        dtrain = xgb.DMatrix(X, label=y, feature_names=self.feature_names_)

        weights = np.ones(X.shape[1])
        for i, name in enumerate(self.feature_names_):
            weights[i] = self.feature_weights_dict.get(name, 1.0)
        dtrain.set_info(feature_weights=weights)

        xgb_params = self.xgb_params.copy()
        num_boost_round = xgb_params.pop("n_estimators", 100)

        self.booster = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=num_boost_round
        )

        return self

    def predict(self, X):
        X = check_array(X)
        dtest = xgb.DMatrix(X, feature_names=self.feature_names_)
        return self.booster.predict(dtest)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
    ],
    remainder='passthrough'
)

feature_weights = {
    'cosine_similarity_desc': 2.0,
    'cosine_similarity_img': 2.0
}

full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBWithFeatureWeights(
        feature_weights_dict=feature_weights,
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=16,
        subsample=0.9,
        colsample_bytree=0.7,
        random_state=432,
        nthread=-1
    ))
])

full_pipeline.fit(X_train, y_train)


In [10]:

y_pred = full_pipeline.predict(X_test)

print(f"MAE: {mae(y_test, y_pred)}")
print(f"RMSE: {rmse(y_test, y_pred)}")

MAE: 0.9064645516914919
RMSE: 1.1814321195367057


In [11]:
def get_recommend_list(user_name, verbose=False, top_n=10):
    games_rated_by_user = df_users[df_users['Username'] == user_name]['BGGId'].tolist()
    games_to_recommend_df_raw = df_games[~df_games['BGGId'].isin(games_rated_by_user)].copy()

    if games_to_recommend_df_raw.empty:
        print(
            "Este usuario ya ha calificado todos los juegos disponibles. No se pueden generar nuevas recomendaciones.")
    else:

        df_predict_raw = games_to_recommend_df_raw.copy()
        df_predict_raw['Username'] = user_name

        df_predict_raw = pd.merge(df_predict_raw, user_df, on='Username', how='left', suffixes=('_game', '_user'))

        df_predict_raw['cosine_similarity_desc'] = df_predict_raw.apply(
            lambda row: calculate_cosine_similarity(row['user_desc_embedding_profile'], row['Description_embedding']),
            axis=1
        )
        df_predict_raw['cosine_similarity_img'] = df_predict_raw.apply(
            lambda row: calculate_cosine_similarity(row['user_img_embedding_profile'], row['Image_embedding']),
            axis=1
        )

        df_predict_raw[desc_embedding_cols] = pd.DataFrame(
            np.vstack(df_predict_raw['Description_embedding'].values),
            index=df_predict_raw.index
        )
        df_predict_raw[img_embedding_cols] = pd.DataFrame(
            np.vstack(df_predict_raw['Image_embedding'].values),
            index=df_predict_raw.index
        )
        df_predict_raw[user_desc_profile_cols] = pd.DataFrame(
            np.vstack(df_predict_raw['user_desc_embedding_profile'].values),
            index=df_predict_raw.index
        )
        df_predict_raw[user_img_profile_cols] = pd.DataFrame(
            np.vstack(df_predict_raw['user_img_embedding_profile'].values),
            index=df_predict_raw.index
        )

        X_predict = df_predict_raw[X_train.columns]

        predicted_ratings = full_pipeline.predict(X_predict)
        df_predict_raw['predicted_rating'] = predicted_ratings

        recommendations = df_predict_raw.sort_values(by='predicted_rating', ascending=False).head(top_n)

        if verbose:
            print(f"\nTop {top_n} juegos recomendados para {user_name}:")
            for idx, row in recommendations.iterrows():
                print(f"- {row['Name']} (BGGId: {row['BGGId']}) - Rating Estimado: {row['predicted_rating']:.2f}")
        return recommendations[['BGGId', 'predicted_rating']].set_index('BGGId').T


In [12]:
example_user_df = df_users[df_users['isTest'] == False]['Username'].sample(1)
example_user = example_user_df.iloc[0]

get_recommend_list(example_user, True)


Top 10 juegos recomendados para iamohcy:
- Star Wars: Destiny (BGGId: 205359) - Rating Estimado: 8.61
- Animal Upon Animal: Crest Climbers (BGGId: 111081) - Rating Estimado: 8.44
- For the People (BGGId: 833) - Rating Estimado: 8.42
- Advanced Squad Leader (BGGId: 243) - Rating Estimado: 8.42
- Dixit (BGGId: 39856) - Rating Estimado: 8.37
- Animal Upon Animal (BGGId: 17329) - Rating Estimado: 8.24
- Unconditional Surrender! World War 2 in Europe (BGGId: 61487) - Rating Estimado: 8.23
- Magic Maze Kids (BGGId: 242740) - Rating Estimado: 8.21
- A Fake Artist Goes to New York (BGGId: 135779) - Rating Estimado: 8.18
- Doomtown: Reloaded (BGGId: 156714) - Rating Estimado: 8.17


BGGId,205359,111081,833,243,39856,17329,61487,242740,135779,156714
predicted_rating,8.612644,8.440524,8.415916,8.415916,8.365713,8.239415,8.233847,8.210233,8.177531,8.168461


In [13]:
def games_min_n(n=9):
    aux = df_games[(df_games.MinPlayers <= n) & (df_games.MaxPlayers >= n)]
    return aux.BGGId

def get_preds_xgb(user_list):
    df = []
    n = len(user_list)
    filter_list = games_min_n(n)

    for usr in user_list:
        usr_pred = get_recommend_list(usr)
        usr_pred = usr_pred.loc[:, usr_pred.columns.isin(filter_list)]

        df.append(usr_pred)
    df = pd.concat(df, axis=0).reset_index(drop=True).fillna(1)
    return df

# df_groups['preds_xgb'] = df_groups['members2'].apply(lambda x: get_preds_xgb(x))

In [14]:

def get_recommendations_batch(user_list, top_n=10):

    rated_games = df_users[df_users['Username'].isin(user_list)][['Username', 'BGGId']].set_index(['Username', 'BGGId'])

    all_possible_pairs = pd.DataFrame(product(user_list, df_games['BGGId']), columns=['Username', 'BGGId'])
    all_possible_pairs = all_possible_pairs.set_index(['Username', 'BGGId'])

    unrated_pairs_index = all_possible_pairs.index.difference(rated_games.index)

    df_predict = unrated_pairs_index.to_frame(index=False)

    if df_predict.empty:
        print("Todos los usuarios en la lista han calificado todos los juegos.")
        return {}

    df_predict = pd.merge(df_predict, user_df, on='Username', how='left')
    df_predict = pd.merge(df_predict, df_games, on='BGGId', how='left', suffixes=('_game', '_user'))

    user_desc_embeddings = np.vstack(df_predict['user_desc_embedding_profile'].values)
    game_desc_embeddings = np.vstack(df_predict['Description_embedding'].values)
    user_img_embeddings = np.vstack(df_predict['user_img_embedding_profile'].values)
    game_img_embeddings = np.vstack(df_predict['Image_embedding'].values)

    df_predict['cosine_similarity_desc'] = np.einsum('ij,ij->i', user_desc_embeddings, game_desc_embeddings) / (np.linalg.norm(user_desc_embeddings, axis=1) * np.linalg.norm(game_desc_embeddings, axis=1))
    df_predict['cosine_similarity_img'] = np.einsum('ij,ij->i', user_img_embeddings, game_img_embeddings) / (np.linalg.norm(user_img_embeddings, axis=1) * np.linalg.norm(game_img_embeddings, axis=1))

    df_predict[desc_embedding_cols] = pd.DataFrame(game_desc_embeddings, index=df_predict.index)
    df_predict[img_embedding_cols] = pd.DataFrame(game_img_embeddings, index=df_predict.index)
    df_predict[user_desc_profile_cols] = pd.DataFrame(user_desc_embeddings, index=df_predict.index)
    df_predict[user_img_profile_cols] = pd.DataFrame(user_img_embeddings, index=df_predict.index)

    X_predict = df_predict[X_train.columns]

    predicted_ratings = full_pipeline.predict(X_predict)
    df_predict['predicted_rating'] = predicted_ratings


    recommendations = (df_predict.sort_values(by=['predicted_rating'], ascending=[False])
                       .groupby('Username')
                       .head(top_n))

    # 7. Formatear la salida en un diccionario
    results = {
        user: group[['BGGId', 'predicted_rating']].set_index('BGGId').T
        for user, group in recommendations.groupby('Username')
    }

    return pd.concat(results.values(), axis=0, keys=results.keys())


In [15]:
df_groups['preds_xgb'] = None
df_groups.loc[:, 'preds_xgb'] = df_groups.loc[:, 'members2'].apply(lambda x: get_recommendations_batch(x))

In [16]:
df_groups.to_pickle('df_groups_svdpp_random_popular_xgb')