In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [None]:
# --- DATA LOADING AND PREPARATION ---
# Read the training table and show its (rows, columns) shape.
df_full = pd.read_parquet('./data/l_vm_completa_train.parquet')
print(df_full.shape)

# Columns handed to LightGBM as categorical features in the fit() calls below.
categorical_features = [
    'ANIO',
    'MES',
    'TRIMESTRE',
    'ID_CAT1',
    'ID_CAT2',
    'ID_CAT3',
    'ID_BRAND',
    'SKU_SIZE',
    'CUSTOMER_ID',
    'PRODUCT_ID',
    'PLAN_PRECIOS_CUIDADOS',
]

# Cast every categorical column to pandas' `category` dtype in one call.
df_full = df_full.astype({nombre: 'category' for nombre in categorical_features})


In [None]:

# --- TARGET DISTRIBUTION (CLASE) ---
# Work on the target column once and reuse its value counts below.
clase = df_full['CLASE']
conteo_clase = clase.value_counts()

# Histogram of the raw target values.
plt.hist(clase, bins=50)
plt.title('Distribución del target (CLASE)')
plt.show()

# Summary statistics and the 30 most frequent target values.
print(clase.describe())
print(conteo_clase.head(30))

# Share of rows whose target is exactly zero, over the non-null rows
# (Series.count() ignores missing values).
total = clase.count()
ceros = conteo_clase.get(0, 0)
porcentaje_ceros = (ceros / total) * 100
print(f"Porcentaje de ceros: {porcentaje_ceros:.2f}%")



In [None]:


# --- TRAIN/VALID SPLIT ---
# Time-based split on PERIODO: every period strictly before the first
# validation period is used for training; each validation period gets its own
# (X, y) pair in X_val_list / y_val_list, in the order of periodos_valid.
periodos_valid = [201909, 201910]

# Target columns that must never reach the feature matrix.
# CLASE_BIN is derived from CLASE and is added to df_full by the stage-1 cell;
# it only exists here when this cell is re-run after that one, so it is dropped
# only when present (otherwise it would leak the target into X).
columnas_objetivo = ['CLASE', 'CLASE_DELTA']
if 'CLASE_BIN' in df_full.columns:
    columnas_objetivo.append('CLASE_BIN')

# The upper cutoff is derived from periodos_valid instead of being repeated as
# a literal, so the two cannot drift apart.
en_rango = df_full['PERIODO'] <= max(periodos_valid)
X = df_full.loc[en_rango].drop(columns=columnas_objetivo)
y = df_full.loc[en_rango, 'CLASE']

# Training rows: everything before the earliest validation period.
es_train = X['PERIODO'] < min(periodos_valid)
X_train = X[es_train]
y_train = y[es_train]

# One validation fold per period.
X_val_list = [X[X['PERIODO'] == p] for p in periodos_valid]
y_val_list = [y[X['PERIODO'] == p] for p in periodos_valid]


In [None]:

# --- STAGE 1: CLASSIFICATION ---
# Binary target: 1 when CLASE is strictly positive, 0 otherwise.
# NOTE: this adds a column to df_full. X was built from df_full in the split
# cell, so in a top-to-bottom run the new column is not part of the features.
df_full['CLASE_BIN'] = (df_full['CLASE'] > 0).astype(int)
y_train_bin = df_full.loc[X_train.index, 'CLASE_BIN']
y_val_bin_list = [df_full.loc[X_val.index, 'CLASE_BIN'] for X_val in X_val_list]

clf = lgb.LGBMClassifier(
    n_estimators=50000,
    learning_rate=0.0001,
    num_leaves=2048,
    min_child_samples=50,
    # sklearn-API name for LightGBM's `feature_fraction`.
    colsample_bytree=0.8,
    # sklearn-API names for `bagging_fraction` / `bagging_freq`. LightGBM only
    # performs row bagging when the frequency is non-zero, so a fraction on its
    # own is silently ignored; the frequency is set to bag at every iteration.
    subsample=0.8,
    subsample_freq=1,
    max_bin=1024
)

# Early stopping monitors only the first validation period (X_val_list[0]).
clf.fit(
    X_train,
    y_train_bin,
    eval_set=[(X_val_list[0], y_val_bin_list[0])],
    callbacks=[lgb.early_stopping(stopping_rounds=300), lgb.log_evaluation(period=300)],
    categorical_feature=categorical_features
)


In [None]:
# --- STAGE 2: REGRESSION ON ALL THE DATA (zeros included) ---
# The regressor reuses the very same train/validation objects as stage 1;
# these names are aliases, not copies.
X_train_reg, y_train_reg = X_train, y_train
X_val_reg_list, y_val_reg_list = X_val_list, y_val_list

# Hyper-parameters of the regressor.
parametros_reg = {
    'n_estimators': 50000,
    'learning_rate': 0.0001,
    'num_leaves': 2048,
    'max_bin': 1024,
}
reg = lgb.LGBMRegressor(**parametros_reg)

# Early stopping and periodic logging, both every 300 rounds.
callbacks_reg = [
    lgb.early_stopping(stopping_rounds=300),
    lgb.log_evaluation(period=300),
]

# Only the first validation period is used as the evaluation set.
conjunto_eval_reg = [(X_val_reg_list[0], y_val_reg_list[0])]
reg.fit(
    X_train_reg,
    y_train_reg,
    eval_set=conjunto_eval_reg,
    callbacks=callbacks_reg,
    categorical_feature=categorical_features,
)


In [None]:
# Save the trained models: each underlying booster is written to a text file
# in the current working directory (classifier first, then regressor).
for modelo, ruta_modelo in (
    (clf, 'lgbm_classifier.txt'),
    (reg, 'lgbm_regressor.txt'),
):
    modelo.booster_.save_model(ruta_modelo)


In [None]:
# --- PREDICTION AND EVALUATION per period ---
def _wape(y_real, y_pred):
    """Return sum(|y_real - y_pred|) / sum(|y_real|), or None when the denominator is 0.

    Returning None (instead of dividing by zero) lets the caller report the
    metric as not available.
    """
    denominador = np.sum(np.abs(y_real))
    if denominador == 0:
        return None
    return np.sum(np.abs(y_real - y_pred)) / denominador


# Probability threshold above which a row is treated as non-zero by stage 1.
umbral = 0.25

# NOTE(review): the first validation period is also the early-stopping
# evaluation set of both models, so its scores are likely optimistic.
for X_val, y_val, periodo in zip(X_val_list, y_val_list, periodos_valid):
    # Stage 1: probability of the positive (non-zero) class.
    proba_no_cero = clf.predict_proba(X_val)[:, 1]
    pred_bin = (proba_no_cero > umbral)

    # Stage 2: the regressor is applied only to rows flagged as non-zero;
    # every other row keeps a prediction of exactly 0.
    pred_reg = np.zeros(len(X_val))
    if pred_bin.any():
        pred_reg[pred_bin] = reg.predict(X_val[pred_bin])

    y_val_real = y_val.values

    # WAPE restricted to rows whose real value is non-zero.
    mask_nocero = y_val_real != 0
    wape_nocero = _wape(y_val_real[mask_nocero], pred_reg[mask_nocero])
    if wape_nocero is not None:
        print(f"WAPE (no-cero) periodo {periodo}: {wape_nocero:.4f}")
    else:
        print(f"WAPE (no-cero) periodo {periodo}: N/A (no hay valores no-cero)")

    # Global WAPE over every row of the period. Its denominator is zero in
    # exactly the same situation as above, so it gets the same guard.
    wape = _wape(y_val_real, pred_reg)
    if wape is not None:
        print(f"WAPE global periodo {periodo}: {wape:.4f}")
    else:
        print(f"WAPE global periodo {periodo}: N/A (no hay valores no-cero)")

    print(f"Valores distintos de cero en pred_reg: {(pred_reg != 0).sum()} de {len(pred_reg)}")

In [None]:

# --- FEATURE IMPORTANCE ---
# Pair each training column with the regressor's importance score and rank
# the features from most to least important.
importancia = reg.feature_importances_
nombres = X_train_reg.columns
df_importancia = (
    pd.DataFrame({'feature': nombres, 'importance': importancia})
    .sort_values(by='importance', ascending=False)
)
print(df_importancia)

# Horizontal bar chart; the y axis is inverted so the top-ranked feature is
# drawn at the top of the figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(df_importancia['feature'], df_importancia['importance'])
ax.invert_yaxis()
ax.set_title('Importancia de variables LightGBM (Regresión)')
ax.set_xlabel('Importancia')
plt.show()