In [1]:
import numpy as np
import pandas as pd
import gc
import os
import matplotlib.pyplot as plt
import polars as pl
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import Parallel, delayed
from more_itertools import chunked
from functools import reduce
from typing import List
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
import joblib
import os

In [2]:
# Load the training parquet file and its "_pendientes" companion into pandas DataFrames.
# gc.collect() runs first so that garbage from earlier cell executions is
# reclaimed before the (potentially large) reads start.
gc.collect()
df_full = pd.read_parquet('./data/l_vm_completa_train.parquet', engine='fastparquet')
df_pendientes = pd.read_parquet('./data/l_vm_completa_train_pendientes.parquet', engine='fastparquet')

In [3]:
# Find the (PRODUCT_ID, CUSTOMER_ID) pairs whose TN is zero in every record.
def buscar_productos_solo_ceros(df: pd.DataFrame) -> pd.DataFrame:
    """Return the PRODUCT_ID / CUSTOMER_ID pairs whose TN values are all zero.

    The check sums the *absolute* TN per pair: a plain sum would also be 0
    for a pair whose positive and negative values cancel out, wrongly
    flagging it as "only zeros". Missing TN values are skipped by the sum,
    so they count as zero.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns 'PRODUCT_ID', 'CUSTOMER_ID' and 'TN'.

    Returns
    -------
    pd.DataFrame
        One row per zero-only pair, with columns 'PRODUCT_ID', 'CUSTOMER_ID'
        and 'TN' (always 0 in the returned rows).
    """
    # Group the |TN| series by the key columns instead of copying the whole
    # frame; reset_index() restores the three-column layout callers expect.
    abs_tn_por_par = (
        df['TN']
        .abs()
        .groupby([df['PRODUCT_ID'], df['CUSTOMER_ID']])
        .sum()
        .reset_index()
    )
    return abs_tn_por_par[abs_tn_por_par['TN'] == 0]

# Collect the pairs reported by buscar_productos_solo_ceros and print how many there are.
productos_solo_ceros = df_full.pipe(buscar_productos_solo_ceros)
print(f"🔍 Combinaciones PRODUCT_ID + CUSTOMER_ID con TN = 0 en todos sus registros: {productos_solo_ceros.shape[0]}")

# Drop every row whose (PRODUCT_ID, CUSTOMER_ID) pair is listed in productos_solo_ceros.
def eliminar_productos_solo_ceros(df: pd.DataFrame, productos_solo_ceros: pd.DataFrame) -> pd.DataFrame:
    """Return `df` without the rows belonging to the given key pairs.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns 'PRODUCT_ID' and 'CUSTOMER_ID'.
    productos_solo_ceros : pd.DataFrame
        Frame with 'PRODUCT_ID' and 'CUSTOMER_ID' columns listing the pairs
        to remove. May be empty, in which case nothing is removed.

    Returns
    -------
    pd.DataFrame
        The rows of `df` whose pair is not listed; original index labels are
        kept. `df` itself is not modified.

    Side effects
    ------------
    Prints the number of rows removed.
    """
    columnas_clave = ['PRODUCT_ID', 'CUSTOMER_ID']

    # Build the membership test from the two key columns only. Calling
    # set_index() on the whole frame would rebuild every column just to get
    # at the keys, and a Python set of tuples would materialise one object
    # per pair.
    claves_df = pd.MultiIndex.from_frame(df[columnas_clave])
    claves_a_eliminar = pd.MultiIndex.from_frame(productos_solo_ceros[columnas_clave])
    mask = claves_df.isin(claves_a_eliminar)

    cantidad_eliminada = mask.sum()
    print(f"🗑️ Filas eliminadas de df_full: {cantidad_eliminada:,}")

    return df[~mask]

# Rebind df_full to the frame returned by eliminar_productos_solo_ceros.
df_full = df_full.pipe(eliminar_productos_solo_ceros, productos_solo_ceros)


🔍 Combinaciones PRODUCT_ID + CUSTOMER_ID con TN = 0 en todos sus registros: 327068
🗑️ Filas eliminadas de df_full: 6,594,430


In [4]:
# Keep only the rows whose A_PREDECIR is not 'N', then discard that column.
df_full = (
    df_full
    .loc[df_full['A_PREDECIR'] != 'N']
    .drop(columns=['A_PREDECIR'])
)

In [6]:
# Median TN per (CUSTOMER_ID, PRODUCT_ID) pair, ordered from the smallest
# median upwards and re-indexed from 0.
df_mediana = (
    df_full
    .groupby(['CUSTOMER_ID', 'PRODUCT_ID'], as_index=False)
    .agg(TN_median=('TN', 'median'))
    .sort_values(by='TN_median')
    .reset_index(drop=True)
)

In [7]:
# Show the five pairs with the smallest median TN.
print(df_mediana.head(n=5))

   CUSTOMER_ID  PRODUCT_ID  TN_median
0        10637       20741        0.0
1        10285       20409        0.0
2        10285       20411        0.0
3        10285       20414        0.0
4        10285       20416        0.0


In [9]:
# Print the PERIODO and TN columns for a single CUSTOMER_ID / PRODUCT_ID pair.
def imprimir_periodo_tn(df: pd.DataFrame, customer_id: int, product_id: int):
    """Print PERIODO and TN for the rows of `df` matching one customer/product pair.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with 'CUSTOMER_ID', 'PRODUCT_ID', 'PERIODO' and 'TN' columns.
    customer_id : int
        Value to match against 'CUSTOMER_ID'.
    product_id : int
        Value to match against 'PRODUCT_ID'.

    Prints the matching rows, or a "not found" message when there are none.
    Returns None.
    """
    coincide = df['CUSTOMER_ID'].eq(customer_id) & df['PRODUCT_ID'].eq(product_id)
    registros = df.loc[coincide]

    # Guard clause: nothing matched, so report it and stop.
    if registros.empty:
        print(f"No se encontraron registros para CUSTOMER_ID={customer_id} y PRODUCT_ID={product_id}")
        return

    print(registros[['PERIODO', 'TN']])

# Show PERIODO / TN for customer 10074 and product 20032.
imprimir_periodo_tn(df=df_full, customer_id=10074, product_id=20032)

        PERIODO        TN
606697   201902   0.00000
607002   201903   0.00000
607376   201904   0.00000
608383   201905   0.00000
608778   201906  21.32106
609460   201907   0.00000
609848   201908  16.51190
610594   201909  43.91375
610918   201910   0.00000
611679   201911  42.24312
612049   201912   1.72512
