## Jupyter Notebook para desarrollar la Task 3

Sistema de recomendación de 5 productos para cada sesión nueva.

Lectura de los DataFrames:

In [1]:
from pathlib import Path    
import pandas as pd

RAW_DATA_PATH = Path('../../data/raw')

train_df = pd.read_csv(         RAW_DATA_PATH / 'train.csv')
test_df = pd.read_csv(          RAW_DATA_PATH / 'test.csv')
products_df = pd.read_pickle(   RAW_DATA_PATH / 'products.pkl')
users_df = pd.read_csv(         RAW_DATA_PATH / 'users_data.csv')

In [2]:
import ast

# Extraer datos del string (JSON) de la columna 'values'
def extract_data_from_string(df, column_name):
    
    df[column_name] = df[column_name].apply(ast.literal_eval)
    
    df['country'] = df[column_name].apply(lambda x: x['country'])
    df['R'] = df[column_name].apply(lambda x: x['R'])
    df['F'] = df[column_name].apply(lambda x: x['F'])
    df['M'] = df[column_name].apply(lambda x: x['M'])
    
    df = df.drop(columns=[column_name])
    
    df = df.explode(['country', 'R', 'F', 'M'])
    
    df['country'] = df['country'].astype(int)
    df['R'] = df['R'].astype(int)
    df['F'] = df['F'].astype(int)
    df['M'] = df['M'].astype(float)
    
    return df

users_df = extract_data_from_string(users_df, 'values')

In [3]:
import pandas as pd
import numpy as np

# Products DataFrame Processing
products_df['discount'] = products_df['discount'].astype(int)
products_df['cod_section'] = products_df['cod_section'].fillna(products_df['cod_section'].median()).astype(int)

# Train DataFrame Processing
train_df['user_id'] = train_df['user_id'].fillna(0).astype(int)
train_df['timestamp_local'] = pd.to_datetime(train_df['timestamp_local'])
train_df['pagetype'] = train_df['pagetype'].fillna(train_df['pagetype'].mode()[0]).astype(int)

test_df['user_id'] = test_df['user_id'].fillna(0).astype(int)
test_df['timestamp_local'] = pd.to_datetime(test_df['timestamp_local'])
test_df['pagetype'] = test_df['pagetype'].fillna(test_df['pagetype'].mode()[0]).astype(int)

# Verify conversions
print("Users DataFrame dtypes:\n", users_df.dtypes)
print("\nTrain DataFrame dtypes:\n", train_df.dtypes)
print("\nTest DataFrame dtypes:\n", test_df.dtypes)
print("\nProducts DataFrame dtypes:\n", products_df.dtypes)

Users DataFrame dtypes:
 user_id      int64
country      int32
R            int32
F            int32
M          float64
dtype: object

Train DataFrame dtypes:
 session_id                  int64
date                       object
timestamp_local    datetime64[ns]
add_to_cart                 int64
user_id                     int32
country                     int64
partnumber                  int64
device_type                 int64
pagetype                    int32
dtype: object

Test DataFrame dtypes:
 session_id                  int64
date                       object
timestamp_local    datetime64[ns]
user_id                     int32
country                     int64
partnumber                  int64
device_type                 int64
pagetype                    int32
dtype: object

Products DataFrame dtypes:
 discount        int32
embedding      object
partnumber      int32
color_id        int32
cod_section     int32
family          int32
dtype: object


In [4]:
# Convertimos las variables a categoricas, nos conviene para que el modelo las interprete como tal
users_df['country'] = users_df['country'].astype('category')

categorical_cols_train = ['device_type', 'pagetype', 'country']
for col in categorical_cols_train:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

categorical_cols_products = ['color_id', 'cod_section', 'family']
for col in categorical_cols_products:
    products_df[col] = products_df[col].astype('category')

print("Users DataFrame dtypes:\n", users_df.dtypes)
print("\nTrain DataFrame dtypes:\n", train_df.dtypes)
print("\nTest DataFrame dtypes:\n", test_df.dtypes)
print("\nProducts DataFrame dtypes:\n", products_df.dtypes)

Users DataFrame dtypes:
 user_id       int64
country    category
R             int32
F             int32
M           float64
dtype: object

Train DataFrame dtypes:
 session_id                  int64
date                       object
timestamp_local    datetime64[ns]
add_to_cart                 int64
user_id                     int32
country                  category
partnumber                  int64
device_type              category
pagetype                 category
dtype: object

Test DataFrame dtypes:
 session_id                  int64
date                       object
timestamp_local    datetime64[ns]
user_id                     int32
country                  category
partnumber                  int64
device_type              category
pagetype                 category
dtype: object

Products DataFrame dtypes:
 discount          int32
embedding        object
partnumber        int32
color_id       category
cod_section    category
family         category
dtype: object


In [5]:
# El user_id trae muchos valores nulos, por lo que se nos ocurre crear dos nuevos ids user_id, session_id sin colisiones entre ellos.
# El objetivo es reeplazar los nulos en user_id por el session_id, de forma que si no tenemos el dato de usuario, al menos
#     somos capaces de relacionar registros por sesión (supongamos, por ejemplo que son usuarios no registrados en la web)

# Usamos columnas temporales para crear los nuevos
users_df['temp_user_id'] = users_df['user_id'] * 10
train_df['temp_session_id'] = train_df['session_id'] * 10 + 1
train_df['temp_user_id'] = train_df['user_id'] * 10
test_df['temp_session_id'] = test_df['session_id'] * 10 + 1
test_df['temp_user_id'] = test_df['user_id'] * 10

train_df.loc[train_df['temp_user_id'] == 0, 'temp_user_id'] = train_df.loc[train_df['temp_user_id'] == 0, 'temp_session_id'].astype('int32')
test_df.loc[test_df['temp_user_id'] == 0, 'temp_user_id'] = test_df.loc[test_df['temp_user_id'] == 0, 'temp_session_id'].astype('int32')

users_df['user_id'] = users_df['temp_user_id']
train_df['session_id'] = train_df['temp_session_id']
train_df['user_id'] = train_df['temp_user_id']
test_df['session_id'] = test_df['temp_session_id']
test_df['user_id'] = test_df['temp_user_id']

users_df.drop('temp_user_id', axis=1, inplace=True)
train_df.drop(['temp_session_id', 'temp_user_id'], axis=1, inplace=True)
test_df.drop(['temp_session_id', 'temp_user_id'], axis=1, inplace=True)

# Print, y verificar que no hay colisiones entre los ids.
print(train_df[['session_id', 'user_id']].head())
print(test_df[['session_id', 'user_id']].head())
print(users_df['user_id'].head())

print("\nExisten colisiones:", 
      bool(set(users_df['user_id']).intersection(set(train_df['session_id']))))

   session_id  user_id
0         641      641
1        1171     1171
2        1171     1171
3        5791     5791
4       12201  4807290
   session_id  user_id
0        7461     7461
1        7461     7461
2        7461     7461
3        7461     7461
4        7461     7461
0    1803480
1    1754230
1    1754230
2    1803490
3    1803500
Name: user_id, dtype: int64

Existen colisiones: False


In [6]:
def add_user_features(df: pd.DataFrame) -> pd.DataFrame:
    """Añadimos variables de interés calculadas a nivel de user_id"""
    
    print("Transformamos timestamps...")
    df['total_seconds'] = df['timestamp_local'].astype(np.int64) // 1e9
    df['hour'] = df['timestamp_local'].dt.hour
    
    print("Agrupamos por user_id...")
    grouped = df.groupby('user_id', observed=True).agg({
        'partnumber': 'nunique',
        'pagetype': 'nunique',
        'total_seconds': ['min', 'max'],
        'hour': 'first',
        'user_id': 'size' # Interacciones
    }).reset_index()
    
    grouped.columns = ['user_id', 'unique_products', 'unique_pagetypes', 
                      'min_time', 'max_time', 'first_interaction_hour',
                      'total_interactions']
    
    # Calculamos el tiempo
    grouped['total_user_time'] = (grouped['max_time'] - grouped['min_time']).round(2)
    grouped = grouped.drop(['min_time', 'max_time'], axis=1)
    
    # Merge
    print("Merging results...")
    result = df.merge(grouped, on='user_id')
    
    return result

In [7]:
train_df = add_user_features(train_df)
test_df = add_user_features(test_df)

Transformamos timestamps...
Agrupamos por user_id...
Merging results...
Transformamos timestamps...
Agrupamos por user_id...
Merging results...


In [8]:
# Añadiremos tres columnas, el ratio de exito del usuario con el producto, el ratio de exito global del producto, y el numero de veces que se ha añadido al carrito globalmente cada producto.
user_product_metrics = (
    train_df
    .groupby(['user_id', 'partnumber'])
    .agg({
        'add_to_cart': ['sum', 'count']
    })
    .reset_index()
)

user_product_metrics.columns = ['user_id', 'partnumber', 'adds_to_cart', 'total_visits']
user_product_metrics['success_rate'] = user_product_metrics['adds_to_cart'] / user_product_metrics['total_visits']

train_df = train_df.merge(
    user_product_metrics[['user_id', 'partnumber', 'success_rate']], 
    on=['user_id', 'partnumber'], 
    how='left'
)

product_metrics = (
    train_df
    .groupby('partnumber')
    .agg({
        'add_to_cart': ['sum', 'count']
    })
    .reset_index()
)

product_metrics.columns = ['partnumber', 'total_adds_to_cart', 'total_visits']
product_metrics['global_success_rate'] = product_metrics['total_adds_to_cart'] / product_metrics['total_visits']

products_df = products_df.merge(
    product_metrics[['partnumber', 'total_adds_to_cart', 'global_success_rate']], 
    on='partnumber', 
    how='left'
)

# Cubrimos nulos
train_df['success_rate'] = train_df['success_rate'].fillna(0)
products_df['total_adds_to_cart'] = products_df['total_adds_to_cart'].fillna(0)
products_df['global_success_rate'] = products_df['global_success_rate'].fillna(0)

In [9]:
import pandas as pd
import numpy as np

def analyze_dataframes(dfs_dict):
    for name, df in dfs_dict.items():
        print(f"\n{'='*20} {name} Analysis {'='*20}")
        
        print(f"\nShape: {df.shape}")
        print(f"Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB")
        
        print("\nColumns and Types:")
        print(df.dtypes)
        
        print("\nMissing Values:")
        print(df.isnull().sum())

        print("\nNumerical Columns Statistics:")
        print(df.describe())
        
        print("\nSample Data:")
        print(df.head())

dfs = {
    'Train': train_df,
    'Test': test_df,
    'Users': users_df,
    'Products': products_df
}

analyze_dataframes(dfs)



Shape: (46551445, 17)
Memory Usage: 4572.68 MB

Columns and Types:
session_id                         int64
date                              object
timestamp_local           datetime64[ns]
add_to_cart                        int64
user_id                            int32
country                         category
partnumber                         int64
device_type                     category
pagetype                        category
total_seconds                    float64
hour                               int32
unique_products                    int64
unique_pagetypes                   int64
first_interaction_hour             int32
total_interactions                 int64
total_user_time                  float64
success_rate                     float64
dtype: object

Missing Values:
session_id                0
date                      0
timestamp_local           0
add_to_cart               0
user_id                   0
country                   0
partnumber                0
device_

## Entrenamiento del modelo: Versión 1

In [10]:
import pandas as pd
from lightgbm import LGBMRegressor
from pathlib import Path
import json

def prepare_features(df, users_df, products_df):
    
    # Temporales
    df['hour'] = pd.to_datetime(df['timestamp_local']).dt.hour.astype('int8')
    df['day'] = pd.to_datetime(df['timestamp_local']).dt.day.astype('int8')
    
    # Merge de los datasets
    df = df.merge(users_df[['user_id', 'country', 'R', 'F', 'M']], on=['user_id','country'], how='left')
    df = df.merge(products_df[['partnumber', 'discount']], on='partnumber', how='left')
    

    features = ['hour', 'day', 'country', 'device_type', 'pagetype',
                'R', 'F', 'M', 'discount', 'total_seconds',
                'unique_products', 'unique_pagetypes', 'first_interaction_hour', 'total_interactions', 'total_user_time']

    return df[features]

# Prepare train and test data
X_train = prepare_features(train_df, users_df, products_df)
y_train = train_df['success_rate']
X_test = prepare_features(test_df, users_df, products_df)

print('Entrenando modelo...')

# Entrenamos el modelo
model = LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    random_state=42
)
model.fit(X_train, y_train)

# Prediccion
test_df['predicted_success_rate'] = model.predict(X_test)

Entrenando modelo...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.128495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1877
[LightGBM] [Info] Number of data points in the train set: 46551445, number of used features: 15
[LightGBM] [Info] Start training from score 0.058984


In [14]:

# Completaremos las recomendaciones con productos populares
popular_products = (
    products_df
    .sort_values('global_success_rate', ascending=False)
    ['partnumber']
    .tolist()
)

enriched_test = (
    test_df
    .merge(
        products_df[['partnumber', 'global_success_rate']], 
        on='partnumber', 
        how='left'
    )
)

# Crearemos una composición para las recomendaciones, ponderando popularidad global y predicción
PREDICTION_WEIGHT = 0.8
GLOBAL_WEIGHT = 0.2

enriched_test['composite_score'] = (
    PREDICTION_WEIGHT * enriched_test['predicted_success_rate'] + 
    GLOBAL_WEIGHT * enriched_test['global_success_rate'].fillna(0)
)

recommendations = (
    enriched_test
    .sort_values(['composite_score'], ascending=[False])
    .groupby('session_id')
    .agg({
        'partnumber': lambda x: list(x)
    }) 
)

def pad_recommendations(prods):
    prods = list(dict.fromkeys(prods))  
    
    if len(prods) >= 5:
        return prods[:5] 
    
    remaining = [p for p in popular_products if p not in prods]
    return prods + remaining[:5-len(prods)]

result_dict = {
    "target": {
        str(session_id)[:-1]: pad_recommendations(prods)
        for session_id, prods in recommendations['partnumber'].items()
    }
}

for session_id, prods in result_dict['target'].items():
    assert len(prods) == len(set(prods)) == 5, f"Session {session_id} has duplicates or wrong length"

# Guardamos las predicciones en un archivo JSON
output_path = Path('../../predictions/predictions_3.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    json.dump(result_dict, f, indent=4)

In [13]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import json
from pathlib import Path

# Entrenamiento de modelo rapido, no supervisado, de productos similares.
product_features = products_df[['discount', 'total_adds_to_cart', 'global_success_rate','cod_section','family','color_id']].values
nn_model = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
nn_model.fit(product_features)

distances, indices = nn_model.kneighbors(product_features)

# Mapeamos cada valor de products_df con sus 5 K-vecinos del modelo anterior (lo usaremos en las recomendaciones)
product_similarities = {
    products_df.iloc[i]['partnumber']: products_df.iloc[indices[i]]['partnumber'].tolist()
    for i in range(len(products_df))
}

def get_similar_products(partnumber, k=3):
    return list(dict.fromkeys(product_similarities.get(partnumber, [])))[:k]

def pad_recommendations(prods):
    prods = list(dict.fromkeys(prods))  # Valores unicos
    
    if len(prods) >= 5:
        return prods[:5]

    # Si el modelo no es capaz de recomendar 5 productos, añadimos productos similares 
    similar_prods = []
    for p in prods:
        similar_prods.extend(get_similar_products(p, k=3))

    combined_prods = list(dict.fromkeys(prods + similar_prods))  # Valores unicos

    if len(combined_prods) >= 5:
        return combined_prods[:5]

    # Rellenamos con productos populares (puede suceder que no tengamos recomendaciones, por ejemplo un nuevo usuario, en ese caso recomendaremos productos que tienen un exito general)
    remaining = [p for p in popular_products if p not in combined_prods]
    final_recommendations = (combined_prods + remaining)[:5]

    return final_recommendations

result_dict = {
    "target": {
        str(session_id)[:-1]: pad_recommendations(prods)
        for session_id, prods in recommendations['partnumber'].items()
    }
}

# Debugging
for session_id, prods in result_dict['target'].items():
    if len(prods) != len(set(prods)) or len(prods) != 5:
        print(f"⚠️ Debug: Session {session_id} - Recommendations: {prods}")

for session_id, prods in result_dict['target'].items():
    assert len(prods) == len(set(prods)) == 5, f"⚠️ AssertionError: Session {session_id} has duplicates or wrong length"

# Guardar JSON
output_path = Path('../../predictions/predictions_3.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    json.dump(result_dict, f, indent=4)

print("✅ Recomendaciones guardadas!")


✅ Recomendaciones guardadas!


## Entrenamiento del modelo: Versión 2

In [10]:
# Calcular historial del usuario en train_df
user_history = (
    train_df
    .groupby('user_id')
    .agg(
        avg_success_rate_per_user=('success_rate', 'mean'),
        num_unique_products_interacted=('partnumber', 'nunique')
    )
    .reset_index()
)

In [12]:
print(user_history.head())


   user_id  avg_success_rate_per_user  num_unique_products_interacted
0       10                       0.00                               5
1       11                       0.00                               1
2       20                       0.15                               7
3       31                       0.00                               2
4       40                       0.00                               4


In [11]:
from lightgbm import LGBMRanker
import pandas as pd

def prepare_features(df, users_df, products_df):
    
    # Temporales
    df['hour'] = pd.to_datetime(df['timestamp_local']).dt.hour.astype('int8')
    df['day'] = pd.to_datetime(df['timestamp_local']).dt.day.astype('int8')
    
    # Merge de los datasets
    df = df.merge(users_df[['user_id', 'country', 'R', 'F', 'M']], on=['user_id','country'], how='left')
    df = df.merge(products_df[['partnumber', 'discount']], on='partnumber', how='left')

    features = ['hour', 'day', 'country', 'device_type', 'pagetype',
                'R', 'F', 'M', 'discount', 'total_seconds',
                'unique_products', 'unique_pagetypes', 'first_interaction_hour', 'total_interactions', 'total_user_time']

    return df[features]

# Preparar características
X_train = prepare_features(train_df, users_df, products_df)
X_train = X_train.merge(user_history, on='user_id', how='left')
y_train = train_df['success_rate']

X_test = prepare_features(test_df, users_df, products_df)
X_test = X_test.merge(user_history, on='user_id', how='left')

# Definir grupos (número de interacciones por sesión)
train_groups = train_df.groupby("user_id").size().tolist()
test_groups = test_df.groupby("user_id").size().tolist()

# Entrenar modelo de ranking
model = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    random_state=42
)
model.fit(X_train, y_train, group=train_groups)

# Predicción y ordenamiento
test_df['predicted_success_rate'] = model.predict(X_test)
test_df = test_df.sort_values(by=['user_id', 'predicted_success_rate'], ascending=[True, False])


KeyError: 'user_id'