In [None]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder

# 1. Cargar dataset
df = pd.read_csv("datasetwg_features.csv")
df['periodo'] = pd.to_datetime(df['periodo'])

# 2. Codificar columnas categ√≥ricas
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# 3. Definir features y target
features = [col for col in df.columns if col not in ['product_id', 'periodo', 'tn']]
target = 'tn'

# 4. Separar datos en train y validaci√≥n
df_train = df[df['periodo'] < '2019-12-01']
df_val = df[df['periodo'] == '2019-12-01']

X_train = df_train[features].astype(np.float32)
y_train = df_train[target].astype(np.float32)
X_val = df_val[features].astype(np.float32)
y_val = df_val[target].astype(np.float32)

# 5. Definir y entrenar modelo
modelo = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(63, activation='relu'),
    layers.Dense(28, activation='tanh'),
    layers.Dense(1, activation='linear')
])

modelo.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001552954889894353),
               loss='mse')

modelo.fit(X_train, y_train, epochs=50, batch_size=16, verbose=1, validation_data=(X_val, y_val))

# 6. Base para predicci√≥n enero (diciembre como input)
df_pred_base = df[df['periodo'] == '2019-12-01'].copy()
df_pred_base['periodo'] = pd.to_datetime('2020-01-01')

X_enero = df_pred_base[features].astype(np.float32)
df_pred_base['tn'] = modelo.predict(X_enero).flatten()

# 7. Predicci√≥n febrero (enero como input)
df_pred_feb = df_pred_base.copy()
df_pred_feb['periodo'] = pd.to_datetime('2020-02-01')

X_febrero = df_pred_feb[features].astype(np.float32)
df_pred_feb['tn_predicho'] = modelo.predict(X_febrero).flatten()

# 8. Exportar resultados
resultado = df_pred_feb[['product_id', 'tn_predicho']]
resultado.to_csv("prediccion_feb2020_keras.csv", index=False)
print("‚úÖ Predicci√≥n guardada en prediccion_feb2020_keras.csv")


In [5]:
import pandas as pd

columnas = pd.read_csv("datasetwg_features.csv", nrows=0).columns
print(len(columnas))
print(columnas[:180])  # te muestra las primeras 20 columnas


66
Index(['customer_id', 'product_id', 'periodo', 'plan_precios_cuidados',
       'cust_request_qty', 'cust_request_tn', 'tn', 'stock_final', 'cat1',
       'cat2', 'cat3', 'brand', 'sku_size', 'venta_id',
       'DIFF(cust_request_qty)', 'DIFF(cust_request_tn)',
       'DIFF(plan_precios_cuidados)', 'DIFF(sku_size)', 'DIFF(stock_final)',
       'DIFF(tn)', 'LAG(brand, periodo)', 'LAG(cat1, periodo)',
       'LAG(cat2, periodo)', 'LAG(cat3, periodo)',
       'LAG(cust_request_qty, periodo)', 'LAG(cust_request_tn, periodo)',
       'LAG(plan_precios_cuidados, periodo)', 'LAG(sku_size, periodo)',
       'LAG(stock_final, periodo)', 'LAG(tn, periodo)', 'MONTH(periodo)',
       'ROLLING_MEAN(periodo, cust_request_qty)',
       'ROLLING_MEAN(periodo, cust_request_tn)',
       'ROLLING_MEAN(periodo, plan_precios_cuidados)',
       'ROLLING_MEAN(periodo, sku_size)', 'ROLLING_MEAN(periodo, stock_final)',
       'ROLLING_MEAN(periodo, tn)', 'tn_lag_1', 'tn_lag_2', 'tn_lag_3',
       'tn_lag_4',

In [6]:
df.shape

(9460980, 66)

In [7]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 1. CARGA EFICIENTE DEL CSV (float32 ‚Üì RAM)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
csv_path = "datasetwg_features.csv"

# 1-A. Leer una muestra peque√±a para detectar columnas num√©ricas
sample = pd.read_csv(csv_path, nrows=5_000, low_memory=True)
num_cols = sample.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns

# 1-B. Armar diccionario dtype ‚Üí float32 para esas columnas
dtype_map = {c: "float32" for c in num_cols}

# 1-C. Leer el CSV completo con esos dtypes (low_memory sigue en True)
df = pd.read_csv(csv_path, dtype=dtype_map, low_memory=True,
                 parse_dates=["periodo"])        # ‚Äúperiodo‚Äù pasa directamente a datetime

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 2. CODIFICACI√ìN DE CATEG√ìRICAS (LabelEncoder)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 3. DEFINIR FEATURES & TARGET
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
features = [c for c in df.columns if c not in ["product_id", "periodo", "tn"]]
target   = "tn"

# Split temporal: train hasta nov-2019, valid dic-2019
df_train = df[df["periodo"] <  "2019-12-01"]
df_val   = df[df["periodo"] == "2019-12-01"]

X_train = df_train[features].astype(np.float32)
y_train = df_train[target].astype(np.float32)
X_val   = df_val[features].astype(np.float32)
y_val   = df_val[target].astype(np.float32)

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 4. MODELO KERAS CON HP √ìPTIMOS
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(63, activation="relu"),
    layers.Dense(28, activation="tanh"),
    layers.Dense(1, activation="linear")
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001552954889894353),
    loss="mse"
)

model.fit(
    X_train, y_train,
    epochs=50, batch_size=16, verbose=1,
    validation_data=(X_val, y_val)
)

print("\n‚úÖ MSE validaci√≥n dic-2019:", model.evaluate(X_val, y_val, verbose=0))

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 5. PREDICCI√ìN ENERO 2020 (diciembre como input)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
base_dic = df[df["periodo"] == "2019-12-01"].copy()
base_dic["periodo"] = pd.to_datetime("2020-01-01")

X_enero = base_dic[features].astype(np.float32)
base_dic["tn"] = model.predict(X_enero, verbose=0).flatten()

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 6. PREDICCI√ìN FEBRERO 2020 (enero como input)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
base_feb = base_dic.copy()
base_feb["periodo"] = pd.to_datetime("2020-02-01")

X_feb   = base_feb[features].astype(np.float32)
base_feb["tn_predicho"] = model.predict(X_feb, verbose=0).flatten()

# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 7. EXPORTAR RESULTADOS
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
out = base_feb[["product_id", "tn_predicho"]]
out.to_csv("prediccion_feb2020_keras.csv", index=False)
print("üìÑ Archivo guardado: prediccion_feb2020_keras.csv")


Epoch 1/50
[1m574886/574886[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1863s[0m 3ms/step - loss: 2.7880 - val_loss: 2.4066
Epoch 2/50
[1m574886/574886[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1867s[0m 3ms/step - loss: 2.8486 - val_loss: 2.4055
Epoch 3/50
[1m574886/574886[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1888s[0m 3ms/step - loss: 2.8726 - val_loss: 2.4050
Epoch 4/50
[1m574886/574886[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1867s[0m 3ms/step - loss: 3.0098 - val_loss: 2.4061
Epoch 5/50
[1m574886/574886[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1060s[0m 2ms/step - loss: 2.9546 - val_loss: 2.4053
Epoch 6/50
[1m574886/574886[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m758s[0m 1ms/step - loss: 2.8664 - val_

In [8]:
import pandas as pd

# Cargar CSV
df = pd.read_csv("prediccion_feb2020_keras.csv")

# Revisar duplicados
duplicados = df[df.duplicated('product_id', keep=False)]
print("Duplicados encontrados:")
print(duplicados)

# Opci√≥n: quedarte con la predicci√≥n de mayor tn, menor, promedio, etc.
# Ac√° usamos promedio por producto_id
df_sin_duplicados = df.groupby('product_id', as_index=False).mean()

# Guardar CSV corregido
df_sin_duplicados.to_csv("prediccion_feb2020_keras_sin_duplicados.csv", index=False)


Duplicados encontrados:
        product_id  tn_predicho
0          20001.0     0.173253
1          20001.0     0.173253
2          20001.0     0.173253
3          20001.0     0.173253
4          20001.0     0.173253
...            ...          ...
262800     21276.0     0.173253
262801     21276.0     0.173253
262802     21276.0     0.173253
262803     21276.0     0.173253
262804     21276.0     0.173253

[262805 rows x 2 columns]
