# 📘 Forecasting por Producto usando Regresión Lineal

## ✅ 1. Carga de librerías

In [1]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


## 📌 2. Armado del Dataset


In [2]:
# Read the tab-separated list of products to forecast and keep the distinct ids.
df_ids = pd.read_csv(
    "C:/Developer/Laboratorio_III/data/product_id_apredecir201912.txt",
    sep='\t',
)
product_ids_objetivo = pd.unique(df_ids['product_id'])

# Load the base parquet table.
df = pd.read_parquet(
    "C:/Developer/Laboratorio_III/data/dataset_product_periodo.parquet"
)

# Keep only the rows that belong to the target products (as an independent copy).
es_objetivo = df['product_id'].isin(product_ids_objetivo)
df = df.loc[es_objetivo].copy()

df.head()

Unnamed: 0,product_id,periodo,tn_total,clientes_positivos,cat1,cat2,cat3,brand,sku_size,descripcion,fecha,mm-yyyy,quarter
0,20001,201701,934.77222,186,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,genoma,2017-01-01,01-2017,2017Q1
1,20001,201702,798.0162,185,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,genoma,2017-02-01,02-2017,2017Q1
2,20001,201703,1303.35771,188,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,genoma,2017-03-01,03-2017,2017Q1
3,20001,201704,1069.9613,104,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,genoma,2017-04-01,04-2017,2017Q2
4,20001,201705,1502.20132,238,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,genoma,2017-05-01,05-2017,2017Q2



## 📌 3. Calcular la clase (tn en periodo +2)


In [3]:
# Make sure 'fecha' is a real datetime column. pandas' own dtype predicate is
# used because np.issubdtype raises TypeError on pandas extension dtypes
# (e.g. timezone-aware datetimes) instead of returning a boolean.
if not pd.api.types.is_datetime64_any_dtype(df['fecha']):
    df['fecha'] = pd.to_datetime(df['fecha'])

# Order each product's history chronologically so row shifts follow time.
df = df.sort_values(['product_id', 'fecha'])

# Target: tn_total two rows ahead within the same product. The last two rows of
# every product have no such row and therefore get NaN.
# NOTE(review): "two rows ahead" equals "period + 2" only if every product has
# exactly one row per consecutive month - confirm the parquet has no gaps.
df['clase'] = df.groupby('product_id')['tn_total'].shift(-2)

## ✅ 4. Feature Engineering

In [4]:
# Names of the lag features: tn_1 ... tn_11.
lag_cols = [f'tn_{i}' for i in range(1,12)]

# tn_total grouped by product, reused by every per-product feature below.
tn_por_producto = df.groupby('product_id')['tn_total']

# Lag features: tn_total shifted 1 to 11 rows back within each product.
for lag, nombre_lag in enumerate(lag_cols, start=1):
    df[nombre_lag] = tn_por_producto.shift(lag)

# Difference between the current tn_total and its lag 2 / lag 11 values.
df['diff_tn_lag2'] = df['tn_total'].sub(df['tn_2'])
df['diff_tn_lag11'] = df['tn_total'].sub(df['tn_11'])

# Rolling means over the previous 3, 6, 9 and 12 rows. The shift(1) keeps the
# current row's own tn_total out of its window.
for ventana in (3, 6, 9, 12):
    df[f'rollmean_{ventana}'] = tn_por_producto.transform(
        lambda serie, w=ventana: serie.shift(1).rolling(w).mean()
    )

# Integer-encode every object-dtype column into a sibling '<name>_factorized' column.
columnas_texto = df.select_dtypes(include='object').columns
for col in columnas_texto:
    codigos, _ = pd.factorize(df[col])
    df[col+'_factorized'] = codigos

## ✅ 5. Identificar productos completos e incompletos

In [5]:
# Flag rows with 1 when none of the lag columns is missing, 0 otherwise.
falta_algun_lag = df[lag_cols].isna().any(axis=1)
df['lag_completo'] = (~falta_algun_lag).astype(int)

## ✅ 6. Filtrar datos de entrenamiento SOLO con productos sin NaN en lags

In [10]:
# Training snapshot: rows dated 2018-12-01 that have every lag available and a
# known target. Rows whose 'clase' is NaN (no row two steps ahead for that
# product) are excluded because LinearRegression.fit rejects NaN in y.
es_corte_train = df['fecha'] == '2018-12-01'
tiene_lags = df['lag_completo'] == 1
tiene_clase = df['clase'].notna()
df_train = df[es_corte_train & tiene_lags & tiene_clase].copy()

# Select the final features: current tn_total, the lags, the diff/rolling
# features listed below and every factorized categorical column.
# 'rollmean_12' is not part of this list.
factor_cols = [col for col in df.columns if col.endswith('_factorized')]
roll_cols = ['diff_tn_lag2', 'diff_tn_lag11', 'rollmean_3', 'rollmean_6', 'rollmean_9']

X_train = df_train[['tn_total'] + lag_cols + roll_cols + factor_cols].copy()
y_train = df_train['clase']

# Diagnostics: missing-value count per feature, then any rows that still contain NaN.
print(X_train.isna().sum())
print(X_train[X_train.isna().any(axis=1)])

tn_total                  0
tn_1                      0
tn_2                      0
tn_3                      0
tn_4                      0
tn_5                      0
tn_6                      0
tn_7                      0
tn_8                      0
tn_9                      0
tn_10                     0
tn_11                     0
diff_tn_lag2              0
diff_tn_lag11             0
rollmean_3                0
rollmean_6                0
rollmean_9                0
periodo_factorized        0
cat1_factorized           0
cat2_factorized           0
cat3_factorized           0
brand_factorized          0
descripcion_factorized    0
mm-yyyy_factorized        0
quarter_factorized        0
dtype: int64
Empty DataFrame
Columns: [tn_total, tn_1, tn_2, tn_3, tn_4, tn_5, tn_6, tn_7, tn_8, tn_9, tn_10, tn_11, diff_tn_lag2, diff_tn_lag11, rollmean_3, rollmean_6, rollmean_9, periodo_factorized, cat1_factorized, cat2_factorized, cat3_factorized, brand_factorized, descripcion_factorized, mm-yy

## ✅ 7. Entrenar modelo

In [11]:
# Fit an ordinary least squares model on the training snapshot; fit() returns
# the estimator itself, which is then shown as the cell's output.
lr = LinearRegression().fit(X_train, y_train)
lr

## ✅ 8. Separar productos para predicción

In [12]:
# Rows dated 2019-12-01 are the ones the forecast is made from.
X_pred_df = df.loc[df['fecha'] == '2019-12-01'].copy()

# Recompute the completeness flag on this slice: 1 when all lag columns are present.
X_pred_df['lag_completo'] = X_pred_df[lag_cols].notna().all(axis=1).astype(int)

# Split by the flag: complete rows go to the model, the rest to the fallback.
lags_ok = X_pred_df['lag_completo'].eq(1)
X_pred_completo = X_pred_df.loc[lags_ok].copy()
X_pred_incompleto = X_pred_df.loc[~lags_ok].copy()



## ✅ 9. Predicción para productos completos

In [13]:
# Feature matrix for the forecast, using the same columns (and order) as training.
X_pred = X_pred_completo[['tn_total'] + lag_cols + roll_cols + factor_cols]

if X_pred.empty:
    # predict() rejects a matrix with zero rows; nothing to forecast here.
    y_pred = np.empty(0, dtype=float)
else:
    # A linear model is unbounded and can return negative values; clip at zero.
    # NOTE(review): assumes tn is a sold quantity that cannot be negative - confirm.
    y_pred = np.clip(lr.predict(X_pred), 0, None)

# One row per product that was forecast by the model.
predicciones = pd.DataFrame({
    'product_id': X_pred_completo['product_id'].values,
    'tn': y_pred
})

## ✅ 10. Promedio para productos incompletos

In [14]:
# Mean tn_total per product over the rows dated 2019-01-01 to 2019-12-01 (inclusive).
en_2019 = df['fecha'].between('2019-01-01', '2019-12-01')
media_2019 = df.loc[en_2019].groupby('product_id')['tn_total'].mean()

# Fallback set: every target product that is not forecast by the model. Besides
# the products whose 2019-12-01 row has incomplete lags, this also covers target
# products with no 2019-12-01 row at all, which would otherwise be missing from
# the exported file. np.setdiff1d returns the ids sorted and without duplicates.
ids_fallback = np.setdiff1d(
    product_ids_objetivo,
    X_pred_completo['product_id'].unique(),
)

# Products with no rows in 2019 have no mean; they are forecast as 0.0.
# NOTE(review): 0.0 for products without 2019 data is an assumption - confirm.
promedios = (
    media_2019
    .reindex(ids_fallback)
    .fillna(0.0)
    .rename('tn')
    .rename_axis('product_id')
    .reset_index()
)

## ✅ 11. Unir ambos conjuntos y exportar

In [16]:
# Stack the model forecasts on top of the fallback averages (row-wise).
partes = [predicciones, promedios]
salida_final = pd.concat(partes)

# Write the result without the index column and report the file name.
salida_final.to_csv(
    'pronostico_regresion_lineal_202002_V3.csv',
    index=False,
)
print("📁 Archivo final generado: pronostico_regresion_lineal_202002_V3.csv")

📁 Archivo final generado: pronostico_regresion_lineal_202002_V3.csv
