In [2]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import kendalltau
from tqdm import tqdm

In [2]:
# Load the per-user ratings table from CSV and preview the first rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like an index written by an
# earlier to_csv call — confirm, and consider reading with index_col=0 if so.
df_users = pd.read_csv('../data/df_users_train2.csv')
df_users.head()

Unnamed: 0,BGGId,Rating,Username
0,35865,10.0,zebracat
1,35865,9.0,Pfahrer
2,35865,9.0,gregorus
3,35865,9.0,tryytty
4,35865,9.0,Tolkana


# 1. Formar grupos

## 1.1 Verificar que todos los usuarios del test tengan al menos 3 juegos con nota mayor a 5

In [5]:
# Per-user count of ratings strictly above 5.
valid_games_count = df_users.query('Rating > 5').groupby('Username').size()

# Users with at least three such ratings; compare against the total number of distinct users.
valid_users = valid_games_count.loc[lambda counts: counts >= 3].index.tolist()
len(valid_users), df_users.Username.unique().size

(8231, 8231)

## 1.2 Agrupar

In [29]:
def _pair_is_similar(ratings_a: pd.Series, ratings_b: pd.Series, tau_sim: float) -> bool:
    """Return True when two users' co-rated items reach Kendall tau >= ``tau_sim``.

    Pairs with fewer than two co-rated items, or with an undefined (NaN)
    correlation, are reported as not similar.
    """
    common_items = ratings_a.index.intersection(ratings_b.index)
    if len(common_items) < 2:
        return False

    tau, _ = kendalltau(ratings_a.loc[common_items], ratings_b.loc[common_items])
    # kendalltau returns NaN when either side is constant. A plain `tau < tau_sim`
    # test is False for NaN and would wrongly accept such a pair, so reject NaN explicitly.
    return bool(not np.isnan(tau) and tau >= tau_sim)


def random_similar_groups(
        data: pd.DataFrame,
        l_groups: int,
        s_min: int = 2,
        s_max: int = 10,
        tau_sim: float = 0.5,
        max_attempts: int = 1000
) -> list[list[str]]:
    """Randomly build groups of users whose ratings are pairwise similar.

    Group sizes are drawn uniformly from ``[s_min, s_max]``. Members are drawn
    at random and a candidate joins only if its Kendall tau against every
    current member, computed on co-rated items, is at least ``tau_sim``.
    A group that cannot be completed within ``max_attempts`` draws is discarded
    and a new one (with a newly drawn size) is started.

    Randomness comes from NumPy's global RNG; call ``np.random.seed`` beforehand
    for reproducible groups. The same user may appear in several groups.

    Args:
        data: Ratings table with 'Username', 'BGGId' and 'Rating' columns.
        l_groups: Number of groups to produce.
        s_min: Smallest group size (inclusive).
        s_max: Largest group size (inclusive).
        tau_sim: Minimum Kendall tau required between every pair of members.
        max_attempts: Candidate draws allowed per group before giving up on it.

    Returns:
        A list of ``l_groups`` groups, each a list of usernames.

    Raises:
        ValueError: If ``s_max < s_min``, or if there are fewer users than
            ``s_min`` (no group could ever be completed).
    """
    user_item_matrix = data.pivot_table(index='Username', columns='BGGId', values='Rating')
    users = user_item_matrix.index.tolist()

    if l_groups > 0:
        if s_max < s_min:
            raise ValueError(f"s_max ({s_max}) must be >= s_min ({s_min})")
        if len(users) < s_min:
            # Without this guard the outer loop below would never terminate.
            raise ValueError(
                f"need at least s_min={s_min} users to form a group, got {len(users)}"
            )

    rated_items: dict = {}    # username -> Series of that user's non-missing ratings
    pair_verdicts: dict = {}  # frozenset({u1, u2}) -> similarity verdict

    def is_compatible(candidate, member) -> bool:
        # The verdict for a pair never changes, so compute it once and reuse it.
        key = frozenset((candidate, member))
        if key not in pair_verdicts:
            for user in (candidate, member):
                if user not in rated_items:
                    rated_items[user] = user_item_matrix.loc[user].dropna()
            pair_verdicts[key] = _pair_is_similar(
                rated_items[candidate], rated_items[member], tau_sim
            )
        return pair_verdicts[key]

    groups = []

    with tqdm(total=l_groups, desc="Grouping") as pbar:
        while len(groups) < l_groups:
            group_size = np.random.randint(s_min, s_max + 1)

            selected_users = []
            attempts = 0

            while len(selected_users) < group_size and attempts < max_attempts:
                # Draw an index rather than passing the list, so NumPy does not
                # rebuild an array of all usernames on every draw.
                candidate_user = users[np.random.choice(len(users))]
                attempts += 1

                if candidate_user in selected_users:
                    continue

                # Current members are already pairwise similar, so only the
                # candidate's pairs need checking.
                if all(is_compatible(candidate_user, member) for member in selected_users):
                    selected_users.append(candidate_user)

            if len(selected_users) == group_size:
                groups.append(selected_users)
                pbar.update(1)

    return groups

In [31]:
# Seed NumPy's global RNG (used inside random_similar_groups) so the random
# grouping is reproducible after a kernel restart.
SEED = 42
N_GROUPS = 249
np.random.seed(SEED)
groups = random_similar_groups(df_users, N_GROUPS)

Grouping: 100%|██████████| 250/250 [07:02<00:00,  1.69s/it]


In [54]:
# Sanity check: number of groups produced.
len(groups)

249

# 2. Separar entrenamiento y test

In [55]:
# Distinct users that belong to at least one group.
users_in_groups = {member for group in groups for member in group}
len(users_in_groups)

467

In [56]:
# Total number of distinct usernames in the ratings table, for comparison with the grouped-user count.
df_users.Username.unique().size

8231

In [57]:
def assign_test_items_group_wise(df_users: pd.DataFrame, groups: list[list[str]]) -> pd.DataFrame:
    """Flag as test every rating of an item that all members of a group have rated.

    The 'isTest' column of ``df_users`` is reset to False and then set to True,
    for each group, on the members' rows whose 'BGGId' was rated by every member
    of that group. Groups with no item in common are printed and left unflagged;
    empty groups are skipped.

    Args:
        df_users: Ratings table with 'Username' and 'BGGId' columns.
            It is modified in place.
        groups: Groups of usernames.

    Returns:
        The same ``df_users`` object, with the boolean 'isTest' column set.
    """
    df_users['isTest'] = False

    with tqdm(total=len(groups), desc="Grouping") as pbar:
        for group in groups:
            if group:
                member_rows = df_users['Username'].isin(group)

                # Number of distinct group members that rated each item.
                item_user_counts = (
                    df_users.loc[member_rows].groupby('BGGId')['Username'].nunique()
                )
                common_items_in_group = item_user_counts[
                    item_user_counts == len(group)
                ].index.tolist()

                if common_items_in_group:
                    # One vectorised assignment covers every member of the group.
                    df_users.loc[
                        member_rows & df_users['BGGId'].isin(common_items_in_group),
                        'isTest'
                    ] = True
                else:
                    print(group)

            # Advance for every group so the bar always reaches its total.
            pbar.update(1)

    return df_users


In [58]:
# Flag the group-wise test rows; this sets the boolean 'isTest' column on df_users in place.
assign_test_items_group_wise(df_users, groups)

Grouping: 100%|██████████| 249/249 [00:09<00:00, 24.98it/s]


In [60]:
# One row per group; the 'members' column holds each group's list of usernames.
df_groups = pd.DataFrame({'members': groups})
print(df_groups.shape)
df_groups.head()

(249, 1)


Unnamed: 0,members
0,"[Snawk, Opal82]"
1,"[David546, taragalinas]"
2,"[Vadorojo, Travellingmatti, DanKill]"
3,"[HeavyAdge, Jawaswag]"
4,"[Qelha14, MarcusK]"


In [62]:
# Train/test balance counted per rating row.
df_users.isTest.value_counts()

isTest
False    550982
True      13049
Name: count, dtype: int64

# 3. Consistencia entre test y train

In [13]:
# Users with at least one training row, and users with at least one test row.
train_users = set(df_users.loc[df_users['isTest'].eq(False), 'Username'].unique())
test_users = set(df_users.loc[df_users['isTest'].eq(True), 'Username'].unique())

In [14]:
# Users that have test rows but no training rows at all.
test_users - train_users

{'UsuarioMedio'}

In [9]:
# A user whose every rating was flagged as test has no training data, so move all
# of that user's rows back to the training split. Deriving the users from the sets
# keeps this correct when the random grouping changes between runs.
users_without_train = test_users - train_users
df_users.loc[df_users.Username.isin(users_without_train), 'isTest'] = False

In [18]:
# Drop every group containing a user that had no training rows. The users are
# derived from the train/test sets rather than hard-coded from a single run.
users_without_train = test_users - train_users
has_user_without_train = df_groups['members'].apply(
    lambda members: any(user in members for user in users_without_train)
)
df_groups = df_groups[~has_user_without_train]
df_groups.head()

Unnamed: 0,members
0,"['Snawk', 'Opal82']"
1,"['David546', 'taragalinas']"
2,"['Vadorojo', 'Travellingmatti', 'DanKill']"
3,"['HeavyAdge', 'Jawaswag']"
4,"['Qelha14', 'MarcusK']"


In [19]:
# Persist the groups table without the index.
# NOTE(review): the 'members' lists are presumably written as their string form, so a
# loader would need to parse them back into lists — confirm against the consumer.
df_groups.to_csv('../data/df_groups.csv', index=False)

In [20]:
# Persist the ratings table, including the 'isTest' flag, without the index.
# NOTE(review): this writes to the same path the notebook reads df_users from, so a
# re-run starts from the already-modified file — confirm the overwrite is intended.
df_users.to_csv('../data/df_users_train2.csv', index=False)