In [1]:

# Entrenamiento de red neuronal con PyTorch usando Weighted MAE Loss

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt


In [2]:

# Parameters
train_path = './data/df_train_limpio.parquet'
val_path = './data/df_val_limpio.parquet'
target_col = 'CLASE_DELTA_ZSCORE'
alpha = 0.5  # scales the extra loss weight given to samples with a large |target| (see weighted_mae)
batch_size = 2048
epochs = 10
learning_rate = 1e-3
seed = 42  # fixed seed so a fresh-kernel "Run All" repeats the same training run

# Seed the global RNGs up front: layer initialisation, dropout masks and the
# DataLoader shuffle order all draw from them, so without this every run differs.
np.random.seed(seed)
torch.manual_seed(seed)

# Load data
df_train = pd.read_parquet(train_path, engine='fastparquet')
df_val = pd.read_parquet(val_path, engine='fastparquet')

# Features: every column of the training frame except the target
all_columns = df_train.columns.tolist()
feature_cols = [col for col in all_columns if col != target_col]

In [3]:
# Custom dataset
class ZScoreDataset(Dataset):
    """Map-style dataset serving ``(features, target)`` pairs from a DataFrame.

    The selected columns are converted to float32 tensors once, at
    construction time, so ``__getitem__`` is a plain tensor lookup instead of
    a per-sample ``DataFrame.iloc`` access plus column selection (which
    dominates training time with large batches).

    Note: the tensors are a snapshot of ``df`` taken in ``__init__``; later
    changes to the DataFrame are not reflected in the samples.

    Args:
        df: DataFrame holding the feature columns and the target column.
        features: list of feature column names.
        target: name of the target column.

    Raises:
        KeyError: if a requested column is missing from ``df``.
        ValueError: if a selected column cannot be converted to float32.
    """

    def __init__(self, df, features, target):
        # Kept as attributes for backward compatibility with existing callers.
        self.df = df
        self.features = features
        self.target = target

        # Only the requested columns are converted, so unrelated non-numeric
        # columns in ``df`` cannot break sample construction.
        # ``torch.tensor`` copies, so the tensors do not alias the DataFrame.
        self._inputs = torch.tensor(
            df[features].to_numpy(dtype=np.float32), dtype=torch.float32
        )
        # Shape (N, 1) so that indexing a single sample yields shape (1,).
        self._targets = torch.tensor(
            df[target].to_numpy(dtype=np.float32), dtype=torch.float32
        ).view(-1, 1)

    def __len__(self):
        """Return the number of samples (rows captured at construction)."""
        return self._inputs.shape[0]

    def __getitem__(self, idx):
        """Return ``(X, y)`` for row ``idx``: X has shape (n_features,), y has shape (1,)."""
        return self._inputs[idx], self._targets[idx]

In [4]:

# Neural network
class MLP(nn.Module):
    """Fully connected regressor: three hidden ReLU layers and a scalar output.

    Dropout follows the first two hidden activations only; the third hidden
    layer feeds the output layer directly.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of each hidden layer.
        dropout: dropout probability used after the first two hidden layers.
    """

    def __init__(self, input_dim, hidden_dim=1024, dropout=0.1):
        super().__init__()

        stack = []
        in_width = input_dim
        # Two regularised hidden blocks: Linear -> ReLU -> Dropout.
        for _ in range(2):
            stack.extend([
                nn.Linear(in_width, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            in_width = hidden_dim
        # Final hidden block (no dropout) followed by the single-unit output.
        stack.extend([
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ])
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the network output."""
        output = self.net(x)
        return output

# Weighted loss
def weighted_mae(pred, real, alpha=1.0):
    """Mean absolute error where each sample is weighted by ``1 + alpha * |real|``.

    Args:
        pred: tensor of predictions.
        real: tensor of targets, combined element-wise with ``pred``.
        alpha: how strongly the weight grows with the target's magnitude;
            ``0`` gives the plain (unweighted) MAE.

    Returns:
        Scalar tensor: the mean of the weighted absolute errors.
    """
    abs_error = (pred - real).abs()
    sample_weight = 1.0 + alpha * real.abs()
    return (sample_weight * abs_error).mean()

# Training
def train_model(model, train_loader, val_loader, n_epochs, lr, alpha):
    """Train ``model`` with Adam on the weighted-MAE loss, plotting validation error.

    For every epoch this runs one optimisation pass over ``train_loader``,
    prints the mean of the per-batch losses, then predicts on ``val_loader``
    and shows a scatter plot of the signed error (prediction minus target)
    against the target value.

    Args:
        model: module to train; it is moved to CUDA when available, else CPU.
        train_loader: iterable of ``(X, y)`` batches used for optimisation.
        val_loader: iterable of ``(X, y)`` batches used for the error plot.
        n_epochs: number of passes over ``train_loader``.
        lr: learning rate passed to Adam.
        alpha: weighting factor forwarded to ``weighted_mae``.

    Returns:
        None. The model is updated in place and is in eval mode after each
        epoch's validation pass.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        # ---- optimisation pass ----
        model.train()
        batch_losses = []
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            batch_loss = weighted_mae(outputs, targets, alpha)
            batch_loss.backward()
            optimizer.step()

            batch_losses.append(batch_loss.item())

        # Mean over batches (not over samples), reported once per epoch.
        avg_loss = sum(batch_losses, 0.0) / len(train_loader)
        print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {avg_loss:.4f}")

        # ---- validation pass ----
        model.eval()
        seen_targets = []
        seen_outputs = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                # Targets are collected as delivered by the loader; only the
                # predictions need to be brought back from the device.
                outputs = model(inputs.to(device)).cpu()
                seen_targets.append(targets)
                seen_outputs.append(outputs)
        y_true = torch.cat(seen_targets)
        y_pred = torch.cat(seen_outputs)
        signed_error = y_pred - y_true

        # Signed error vs. true value. ``alpha=0.2`` below is the marker
        # transparency, unrelated to the loss ``alpha`` argument.
        plt.figure(figsize=(8, 6))
        plt.scatter(y_true.numpy(), signed_error.numpy(), alpha=0.2)
        plt.axhline(0, color='red', linestyle='--')
        plt.xlabel("Valor real")
        plt.ylabel("Error (pred - real)")
        plt.title("Error Signado vs Valor Real")
        plt.grid(True)
        plt.show()


In [8]:
# Clean the input columns of both splits:
#   1. coerce each feature to numeric (unparseable values become NaN),
#   2. fill NaNs with 0 (mean, median, etc. would be alternatives),
#   3. store everything as float32.
for frame in (df_train, df_val):
    for name in feature_cols:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')
    numeric_features = frame[feature_cols].fillna(0)
    frame[feature_cols] = numeric_features.astype(np.float32)


In [9]:
# Coerce the target of both splits to numeric float32, replacing NaNs with 0.
# NOTE(review): rows whose target is missing or unparseable end up with label 0
# instead of being dropped — confirm this is intended.
for frame in (df_train, df_val):
    numeric_target = pd.to_numeric(frame[target_col], errors='coerce')
    frame[target_col] = numeric_target.fillna(0).astype(np.float32)

In [None]:
# Sanity check: list any training feature column still stored as object dtype.
feature_dtypes = df_train[feature_cols].dtypes
print(feature_dtypes[feature_dtypes == 'object'])

Series([], dtype: object)


In [11]:

# Build datasets and loaders (only the training loader is shuffled)
train_dataset = ZScoreDataset(df=df_train, features=feature_cols, target=target_col)
val_dataset = ZScoreDataset(df=df_val, features=feature_cols, target=target_col)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)


In [12]:

# Train the model: one network input per feature column
n_features = len(feature_cols)
model = MLP(input_dim=n_features)
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    n_epochs=epochs,
    lr=learning_rate,
    alpha=alpha,
)


KeyboardInterrupt: 