# Noteboook destinada al entrenamiento del modelo

## 1. Importar librerías y raw data
Todas las librerías (Pandas, NumPy, Seaborn...) y el propio código modular `src/`.

In [119]:
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import importlib
import feature_engineering 
import preprocesing
importlib.reload(feature_engineering)
importlib.reload(preprocesing)
import numpy as np
from feature_engineering import get_cat_num_features
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [120]:
df = pd.read_csv('../data/processed/df_feat.csv')
df['fecha'] = pd.to_datetime(df['fecha'])

target = 'minutos_visitados_mes'

In [121]:
# Probar modelar sin los outliers mayores al percentil 0.99 (sino comentar)

#df = df[(df[target] < df[target].quantile(0.99))]
#df[target].hist()
#df['cliente_id'].nunique()

### Split temporal Train - Valid - Test

In [122]:
df_train = df[df['aniomes'] < 202504]
df_valid = df[(df['aniomes'] == 202504)]
df_test = df[df['aniomes'] == 202505]

### Agrego una feature según ranking de ventas. 

La idea es codificar a aquellos clientes que conforman el 80% de las ventas de la empresa

Grupo 1: Clientes que representan el 80% acumulado de las ventas totales de la empresa.

Grupo 0: Clientes que contribuyen al 20% restante de las ventas.

Esto lo hago en el Train para que no haya fuga de datos

In [123]:
ventas_historicas = (
    df_train.groupby("cliente_id")["venta_total_negocios_mes"]
    .sum()
    .reset_index()
    .rename(columns={"venta_total_negocios_mes": "ventas_totales"})
)

ventas_historicas = ventas_historicas.sort_values("ventas_totales", ascending=False)

# Calcular % acumulado
ventas_historicas["porc_acum"] = ventas_historicas["ventas_totales"].cumsum() / ventas_historicas["ventas_totales"].sum()

# Marcar los que representan el 80% de las ventas
ventas_historicas["top_80"] = ventas_historicas["porc_acum"] <= 0.8

# top 80%
clientes_top80 = ventas_historicas[ventas_historicas["top_80"]]

In [124]:
ventas_historicas

Unnamed: 0,cliente_id,ventas_totales,porc_acum,top_80
3621,4287a574ace354de2bdb68b16cff63df,6699.25876,0.014312,True
9272,a8d7a94c59bd58456b330d1d496227ae,5247.12972,0.025522,True
6734,7c74aa66e32a6ab05b043b02cee501bb,4910.90215,0.036013,True
11415,cfdd0ec5a9c983616e7813cf9607ba92,4062.11506,0.044691,True
5075,5e36317c7dabe9ba2764f6646144087d,3995.40344,0.053227,True
...,...,...,...,...
10061,b79aead1332ed45ab682b3844d069a1e,0.00000,1.000001,False
5957,6e2c22c3085c315374a7078d3d6725d5,0.00000,1.000001,False
12511,e3cea37f6ee0f52810cd0e8f64185f82,0.00000,1.000001,False
5313,62a434488b6f3eb6db0894a94fc4c11f,0.00000,1.000001,False


In [None]:
df_train['top_80'] = df_train['cliente_id'].isin(clientes_top80['cliente_id'])
df_valid['top_80'] = df_valid['cliente_id'].isin(clientes_top80['cliente_id'])
df_test['top_80'] = df_test['cliente_id'].isin(clientes_top80['cliente_id'])


In [126]:
cat_features, num_features = get_cat_num_features(df_train)
num_features = [col for col in num_features if col not in ['aniomes', 'minutos_visitados_mes']]
cat_features = [col for col in cat_features if col not in ['cliente_id']]

3 variables categóricas
19 variables numéricas
22 variables en total


In [127]:
num_features

['cantidad_heladeras',
 'venta_total_negocios_mes',
 'cantidad_productos_total_negocios_vendidos_mes',
 'cantidad_compras_total_negocios_mes',
 'dias_entre_compras_total_negocios_mes',
 'venta_negocio1_mes',
 'venta_negocio2_mes',
 'venta_negocio3_mes',
 'venta_negocio4_mes',
 'productos_por_compra',
 'venta_promedio_x_compra',
 'venta_por_heladera',
 'ratio_neg1',
 'ratio_neg2',
 'ratio_neg3',
 'ratio_neg4',
 'flag_suma_ventas_0']

Separar features de target

In [128]:
X_train, y_train = df_train[cat_features + num_features], df_train[target]
X_valid, y_valid = df_valid[cat_features + num_features], df_valid[target]
X_test, y_test = df_test[cat_features + num_features], df_test[target]

Custom metric:

Se crea una metrica basada en si el error se comete en clientes importantes o no 

In [129]:
def weighted_mae(y_true, y_pred, sample_weight):
    return np.sum(sample_weight * np.abs(y_true - y_pred)) / np.sum(sample_weight)

sample_weight_train = np.where(df_train['top_80'], 2, 1)
sample_weight_valid = np.where(df_valid['top_80'], 2, 1)
sample_weight_test = np.where(df_test['top_80'], 2, 1)

### Linear Regression

In [130]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
    ])

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ]
)
# Pipeline final con modelo
model = Pipeline([
    ('preproc', preprocessor),
    ('regressor', LinearRegression())
])

# Ajustar el modelo
model.fit(X_train, y_train)
import joblib
filename = f"modelo_linear.pkl"
joblib.dump(model, filename)

# Predecir en entrenamiento y validación
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)

# Calcular métricas
mae_train = mean_absolute_error(y_train, y_train_pred)
rmse_train = root_mean_squared_error(y_train, y_train_pred)

mae_valid = mean_absolute_error(y_valid, y_valid_pred)
rmse_valid = root_mean_squared_error(y_valid, y_valid_pred)

print(f"Train MAE (Baseline LR): {mae_train:.4f}")
print(f"Train RMSE (Baseline LR): {rmse_train:.4f}")
print(f"Custom Weighted MAE: {weighted_mae(y_train, y_train_pred, sample_weight_train):.4f}")
print(f"------------------------------")
print(f"Validation MAE (Baseline LR): {mae_valid:.4f}")
print(f"Validation RMSE (Baseline LR): {rmse_valid:.4f}")
print(f"Custom Weighted MAE: {weighted_mae(y_valid, y_valid_pred, sample_weight_valid):.4f}")

Train MAE (Baseline LR): 31.7355
Train RMSE (Baseline LR): 55.3890
Custom Weighted MAE: 36.8699
------------------------------
Validation MAE (Baseline LR): 26.6024
Validation RMSE (Baseline LR): 41.7914
Custom Weighted MAE: 30.3571


### CatBoost

In [131]:
from sklearn.impute import SimpleImputer
X_train_cb = X_train.copy()
X_valid_cb = X_valid.copy()


imputer = SimpleImputer(strategy='median')
X_train_cb[num_features] = imputer.fit_transform(X_train_cb[num_features])
X_valid_cb[num_features] = imputer.transform(X_valid_cb[num_features])


model_cb = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=10,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=42,
        early_stopping_rounds=100,
        verbose=100
    )

model_cb.fit(
        X_train_cb, y_train,
        eval_set=(X_valid_cb, y_valid),
        cat_features=cat_features,
        use_best_model=True,
    )



0:	learn: 42.2738150	test: 39.0203708	best: 39.0203708 (0)	total: 298ms	remaining: 4m 57s
100:	learn: 26.6261610	test: 22.6335972	best: 22.6335972 (100)	total: 19.5s	remaining: 2m 53s
200:	learn: 26.3488255	test: 22.6028087	best: 22.6026525 (198)	total: 38.3s	remaining: 2m 32s
300:	learn: 26.1807835	test: 22.5815385	best: 22.5813155 (292)	total: 59.8s	remaining: 2m 18s
400:	learn: 26.1236784	test: 22.5603947	best: 22.5600820 (398)	total: 1m 21s	remaining: 2m 1s
500:	learn: 26.1115149	test: 22.5623450	best: 22.5592972 (423)	total: 1m 43s	remaining: 1m 43s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 22.55929718
bestIteration = 423

Shrink model to first 424 iterations.


<catboost.core.CatBoostRegressor at 0x1b182852af0>

In [132]:
import joblib
filename = f"modelo_catboost.pkl"
joblib.dump(model_cb, filename)

preds_cb_train = model_cb.predict(X_train_cb)
preds_cb_valid= model_cb.predict(X_valid_cb)

mae_cb_train = mean_absolute_error(y_train, preds_cb_train)
rmse_cb_train = root_mean_squared_error(y_train, preds_cb_train)
print(f"Train MAE CatBoost: {mae_cb_train:.4f}")
print(f"Train RMSE CatBoost: {rmse_cb_train:.4f}")
print(f"Custom Weighted MAE: {weighted_mae(y_train, preds_cb_train, sample_weight_train):.4f}")

mae_cb = mean_absolute_error(y_valid, preds_cb_valid)
rmse_cb = root_mean_squared_error(y_valid, preds_cb_valid)

print(f"------------------------------")
print(f"Validation MAE CatBoost: {mae_cb:.4f}")
print(f"Validation RMSE CatBoost: {rmse_cb:.4f}")
print(f"Custom Weighted MAE: {weighted_mae(y_valid, preds_cb_valid, sample_weight_valid):.4f}")

Train MAE CatBoost: 26.1468
Train RMSE CatBoost: 51.1250
Custom Weighted MAE: 30.9547
------------------------------
Validation MAE CatBoost: 22.5593
Validation RMSE CatBoost: 38.6731
Custom Weighted MAE: 26.2669


### Feature importance

In [133]:
df_importances = pd.DataFrame({
    "feature": X_train_cb.columns,
    "importance": model_cb.feature_importances_
})

# Filtrar solo importancias > 0 y ordenar
df_importances = df_importances[df_importances["importance"] > 0] \
                                .sort_values(by="importance", ascending=False)

print(df_importances)

                                           feature  importance
0                                            canal   54.113977
1                                           region   10.071543
2                               cantidad_heladeras    9.560851
6            dias_entre_compras_total_negocios_mes    4.694238
4   cantidad_productos_total_negocios_vendidos_mes    3.601356
11                            productos_por_compra    2.776224
5              cantidad_compras_total_negocios_mes    2.553693
12                         venta_promedio_x_compra    1.880980
7                               venta_negocio1_mes    1.571591
8                               venta_negocio2_mes    1.528713
3                         venta_total_negocios_mes    1.417613
14                                      ratio_neg1    1.283577
15                                      ratio_neg2    1.096356
13                              venta_por_heladera    0.847091
16                                      ratio_neg3    0

In [134]:
# Calcular porcentaje acumulado
df_importances["cum_importance"] = df_importances["importance"].cumsum()

# Normalizar a porcentaje sobre el total
df_importances["cum_importance_pct"] = 100 * df_importances["cum_importance"] / df_importances["importance"].sum()

# Mostrar las primeras features que cubren el 95% de la importancia
n_features_95 = (df_importances["cum_importance_pct"] <= 95).sum()

print(f"Con las {n_features_95} primeras features cubro el 95% de la importancia total.")
top_features = df_importances.loc[df_importances["cum_importance_pct"] <= 95, "feature"].tolist()

Con las 11 primeras features cubro el 95% de la importancia total.


### Predicción en Test

In [135]:
X_test_cb = X_test.copy()
X_test_cb[num_features] = imputer.transform(X_test_cb[num_features])

y_pred_test = model_cb.predict(X_test_cb)

In [136]:
print(f"Test MAE CatBoost: {mean_absolute_error(y_test, y_pred_test):.4f}")
print(f"Test RMSE CatBoost: {root_mean_squared_error(y_test, y_pred_test):.4f}")
print(f"Test Custom Weighted MAE: {weighted_mae(y_test, y_pred_test, sample_weight_test):.4f}")

Test MAE CatBoost: 26.3660
Test RMSE CatBoost: 46.0008
Test Custom Weighted MAE: 30.6834
