In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Optional, Union, Tuple, List

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

import torch 
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset
from torch.optim.lr_scheduler import OneCycleLR

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint


# Print the library versions so the cell output shows which versions were used.
print(f"PyTorch version: {torch.__version__}")
print(f"PyTorch Lightning version: {pl.__version__}")

# local modules
# "../src" is a relative path, so the kernel's working directory determines
# where `preproc` is looked up; the path must be appended before the import.
import sys
sys.path.append("../src")
from preproc import preprocess_data

***
### load and preprocess data

In [None]:
# Project directories, all relative to the notebook's working directory.
path_raw = Path("../data/raw")
path_processed = Path("../data/processed")
path_results = Path("../data/results")

# Read the raw train / test tables.
df_train = pd.read_csv(path_raw / "train.csv")
df_test = pd.read_csv(path_raw / "test.csv")

# Preview only the first rows instead of rendering the whole frame.
df_train.head()

In [None]:
# Run the project preprocessing on both frames. The call returns the
# processed train / test frames plus the lists of numerical and categorical
# column names, which the later cells rely on.
# NOTE(review): df_train / df_test are overwritten here, so re-running this
# cell feeds already-processed frames back into preprocess_data — re-run the
# loading cell first.
df_train, df_test, numerical_cols, categorical_cols = preprocess_data(
    df_train, df_test, scale_utility=True
)

# Report how many columns of each kind were returned.
print(f"Numerical Columns: {len(numerical_cols)}")
print(f"Categorical Columns: {len(categorical_cols)}")

In [None]:
# Histogram (100 bins) of the 'utility_agent1_scaled' column of the training frame.
fig, ax = plt.subplots()
ax.hist(df_train['utility_agent1_scaled'], bins=100)
ax.set_title('Distribution of Target Variable')
ax.set_xlabel('Utility Agent 1')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Cast features to the dtypes the tensors are later built with.
df_train[numerical_cols] = df_train[numerical_cols].astype(np.float32)
df_train[categorical_cols] = df_train[categorical_cols].astype(np.int32)

# Size of each embedding table. An embedding with N rows only accepts indices
# 0..N-1, so the table must be at least (largest code + 1) rows. Using the
# number of distinct values alone undersizes the table whenever the integer
# codes are not contiguous from 0; for contiguous codes both quantities agree.
# NOTE(review): assumes the categorical codes are non-negative — confirm
# against preprocess_data.
n_unique = df_train[categorical_cols].nunique(axis=0)
max_code = df_train[categorical_cols].max(axis=0)
cat_input_dims = np.maximum(n_unique, max_code + 1).astype(int).tolist()
print(cat_input_dims)

In [None]:
# Standardise the numerical features of the training frame in place.
# NOTE(review): the scaler is fitted on all of df_train before the
# cross-validation split below, so each validation fold contributes to the
# scaling statistics; df_test is not transformed in this cell.
scaler = StandardScaler()
scaler.fit(df_train[numerical_cols])
df_train[numerical_cols] = scaler.transform(df_train[numerical_cols])

# Confirmation message
print("Numerical columns have been scaled using StandardScaler.")


***
### train model


In [None]:
class MLP(pl.LightningModule):
    """Tabular MLP regressor with one learned embedding per categorical feature.

    Numerical features are concatenated with the embedded categorical features,
    passed through a stack of BatchNorm -> Linear -> ReLU -> Dropout blocks
    (the backbone) and a final BatchNorm -> Linear head. Training and
    validation use MSE loss; the RMSE over the whole validation epoch is
    logged as ``val_rmse``.

    Args:
        num_input_dim: Number of numerical input features.
        cat_input_dims: Number of embedding rows for each categorical feature
            (indices fed to feature ``i`` must be in ``[0, cat_input_dims[i])``).
        output_dim: Number of outputs produced by the head.
        layers: Hyphen-separated hidden layer sizes, e.g. ``"768-384-192"``.
        dropout: Dropout probability applied after every hidden layer.
        learning_rate: Adam learning rate, also used as ``max_lr`` of the
            one-cycle schedule.
        weight_decay: Adam weight decay.
        initialization: Weight init for all ``nn.Linear`` layers; one of
            ``'kaiming_normal'``, ``'kaiming_uniform'``, ``'xavier_normal'``,
            ``'xavier_uniform'``.
        embedding_dim: Optional embedding size per categorical feature.
            Defaults to ``min(50, 1 + ceil(sqrt(cardinality)))``.
        target_range: Optional output bounds. A single ``(min, max)`` pair
            squashes the output with ``tanh`` and rescales it to that interval
            (for ``(-1, 1)`` this is exactly ``tanh``). A sequence of
            ``(min, max)`` pairs, one per output, squashes each output with a
            sigmoid scaled to its own interval. ``None`` leaves the output
            unbounded.

    Raises:
        ValueError: If ``embedding_dim`` and ``cat_input_dims`` differ in
            length, or ``initialization`` is not a supported name.
    """

    def __init__(self, 
            num_input_dim: int,
            cat_input_dims: list[int],
            output_dim: int,
            layers: str,
            dropout: float,
            learning_rate: float = 1e-3,
            weight_decay: float = 1e-5,
            initialization: str = 'kaiming_normal',
            embedding_dim: Optional[List[int]] = None,
            target_range: Optional[Union[Tuple[float, float], List[Tuple[float, float]]]] = None,
        ):
        super().__init__()
        # Stores every constructor argument under self.hparams (and in checkpoints),
        # which is what lets load_from_checkpoint rebuild the model without arguments.
        self.save_hyperparameters()

        self.target_range = target_range

        # Initialize embedding dimensions if not provided
        if embedding_dim is None:
            # Rule of thumb: min(50, 1 + ceil(sqrt(num_unique))) for each categorical feature
            embedding_dim = [min(50, int(1 + np.ceil(np.sqrt(dim)))) for dim in cat_input_dims]

        elif len(embedding_dim) != len(cat_input_dims):
            raise ValueError("Length of embedding_dim must match number of categorical features.")

        self.embedding_dim = embedding_dim

        # Create embedding layers
        self.create_embeddings(cat_input_dims, embedding_dim)

        # Create backbone layers
        self.create_backbone(num_input_dim, layers)

        # Create head layers
        self.create_head(output_dim)

        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.initialization = initialization

        self._init_weights()

        # Per-batch validation targets / predictions, concatenated at epoch end
        self.validation_targets = []
        self.validation_predictions = []

    @staticmethod
    def _is_multi_range(target_range) -> bool:
        """Return True when ``target_range`` is a sequence of (min, max) pairs.

        Deciding on the element type (rather than on list-vs-tuple) means a
        flat ``[min, max]`` list is treated as a single range instead of
        failing when it is unzipped.
        """
        return len(target_range) > 0 and isinstance(target_range[0], (tuple, list))

    def create_embeddings(self, cat_input_dims: list[int], embedding_dim: list[int]):
        """Build one ``nn.Embedding`` per categorical feature."""
        self.embeddings = nn.ModuleList(
            [nn.Embedding(dim, emb_dim) for dim, emb_dim in zip(cat_input_dims, embedding_dim)]
        )

    def create_backbone(self, num_input_dim: int, layers: str):
        """Build the hidden stack described by the hyphen-separated ``layers`` string."""
        # Backbone input = numerical features + all embedding outputs
        total_embedding_dim = sum(self.embedding_dim)
        total_input_dim = num_input_dim + total_embedding_dim

        # Parse layers string
        layer_sizes = [int(size) for size in layers.split('-')]

        # Create backbone network layers
        backbone_layers = []
        prev_size = total_input_dim
        for size in layer_sizes:
            backbone_layers.extend([
                nn.BatchNorm1d(prev_size),
                nn.Linear(prev_size, size),
                nn.ReLU(),
                nn.Dropout(self.hparams.dropout),
            ])
            prev_size = size
        self.backbone = nn.Sequential(*backbone_layers)
        self.backbone_output_size = prev_size

    def create_head(self, output_dim: int):
        """Build the output layer on top of the backbone."""
        self.head = nn.Sequential(
            nn.BatchNorm1d(self.backbone_output_size),
            nn.Linear(self.backbone_output_size, output_dim)
        )

    def _init_weights(self):
        """Initialise every ``nn.Linear`` weight with the configured scheme; biases are zeroed."""
        initializers = {
            'kaiming_normal': lambda w: nn.init.kaiming_normal_(w, nonlinearity='relu'),
            'kaiming_uniform': lambda w: nn.init.kaiming_uniform_(w, nonlinearity='relu'),
            'xavier_normal': nn.init.xavier_normal_,
            'xavier_uniform': nn.init.xavier_uniform_,
        }
        if self.initialization not in initializers:
            raise ValueError(f"Unsupported initialization method: {self.initialization}")
        init_weight = initializers[self.initialization]

        for module in self.modules():
            if isinstance(module, nn.Linear):
                init_weight(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x_num, x_cat):
        """Return predictions for a batch.

        Args:
            x_num: Numerical features; concatenated with the embeddings along dim 1.
            x_cat: Integer category codes; column ``i`` indexes embedding ``i``.

        Returns:
            The head output with a trailing singleton dimension squeezed away,
            optionally bounded according to ``target_range``.
        """
        if len(self.embeddings) > 0:
            # Embed each categorical column and join the results feature-wise
            embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
            embedded = torch.cat(embedded, dim=1)

            # Concatenate numerical and embedded categorical features
            x = torch.cat([x_num, embedded], dim=1)
        else:
            # No categorical features: torch.cat on an empty list would raise
            x = x_num

        # Pass through backbone
        x = self.backbone(x)

        # Pass through head
        x = self.head(x)

        if self.target_range is not None:
            if self._is_multi_range(self.target_range):
                # One (min, max) pair per output: sigmoid scaled to each interval
                min_vals, max_vals = zip(*self.target_range)
                min_vals = torch.tensor(min_vals, dtype=x.dtype, device=x.device).view(1, -1)
                max_vals = torch.tensor(max_vals, dtype=x.dtype, device=x.device).view(1, -1)
                x = torch.sigmoid(x) * (max_vals - min_vals) + min_vals
            else:
                # Single (min, max) pair: tanh rescaled to the interval. For
                # (-1, 1) center is 0 and half_width is 1, so this is plain tanh.
                min_val, max_val = self.target_range
                center = (max_val + min_val) / 2
                half_width = (max_val - min_val) / 2
                x = center + half_width * torch.tanh(x)

        return x.squeeze(-1)  # Squeeze the last dimension to match target shape

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE training loss for one ``(x_num, x_cat, y)`` batch."""
        x_num, x_cat, y = batch
        y_hat = self(x_num, x_cat)
        loss = F.mse_loss(y_hat, y)
        self.log('train_loss', loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        """Compute and log the MSE validation loss and buffer targets / predictions."""
        x_num, x_cat, y = batch
        y_hat = self(x_num, x_cat)
        loss = F.mse_loss(y_hat, y)
        self.log('valid_loss', loss, prog_bar=True)
        # Store targets and predictions for the epoch-level RMSE
        self.validation_targets.append(y)
        self.validation_predictions.append(y_hat)
        return loss
    
    def predict_step(self, batch, batch_idx):
        """Return predictions for a ``(x_num, x_cat)`` or ``(x_num, x_cat, y)`` batch.

        Raises:
            ValueError: If the batch has neither 2 nor 3 elements.
        """
        if len(batch) == 2:
            x_num, x_cat = batch
        elif len(batch) == 3:
            x_num, x_cat, _ = batch
        else:
            raise ValueError(
                f"Expected a batch of 2 or 3 tensors, got {len(batch)} elements."
            )
        y_hat = self(x_num, x_cat)
        return y_hat

    def on_validation_epoch_end(self):
        """Log the RMSE over all buffered validation batches, then reset the buffers."""
        if not self.validation_targets:
            # Nothing was validated this epoch; torch.cat on an empty list would raise
            return
        # Concatenate all targets and predictions
        y = torch.cat(self.validation_targets)
        y_hat = torch.cat(self.validation_predictions)
        rmse = torch.sqrt(F.mse_loss(y_hat, y))
        self.log('val_rmse', rmse, prog_bar=True)
        # Clear the lists for next epoch
        self.validation_targets.clear()
        self.validation_predictions.clear()
                
    def configure_optimizers(self):
        """Return Adam with a per-step one-cycle learning-rate schedule."""
        optimizer = torch.optim.Adam(
            self.parameters(), 
            lr=self.learning_rate, 
            weight_decay=self.weight_decay,
        )
        scheduler = OneCycleLR(
            optimizer,
            max_lr=self.learning_rate,
            # One scheduler step per optimizer step over the whole planned run
            total_steps=self.trainer.estimated_stepping_batches,
            pct_start=0.2,
            anneal_strategy='cos',
            cycle_momentum=True,
            base_momentum=0.85,
            max_momentum=0.95,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
            },
        }

In [None]:
# Early-stopping configuration: monitor 'val_rmse' (lower is better) with a
# patience of 10.
early_stop_callback = EarlyStopping(monitor='val_rmse', mode='min', patience=10, verbose=False)

class LearningRateMonitor(pl.Callback):
    """Logs the learning rate of the first parameter group every 100th training batch."""

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        # Skip every batch whose index is not a multiple of 100
        if batch_idx % 100 != 0:
            return
        first_group = pl_module.optimizers().param_groups[0]
        pl_module.log('learning_rate', first_group['lr'], prog_bar=True)

class BestValRMSELogger(pl.Callback):
    """Keeps the smallest 'val_rmse' observed so far and logs it as 'best_val_rmse'."""

    def __init__(self):
        super().__init__()
        # Start at +inf so the first observed value always becomes the best
        self.best_val_rmse = float('inf')

    def on_validation_epoch_end(self, trainer, pl_module):
        latest = trainer.callback_metrics.get('val_rmse')
        if latest is None:
            # Metric not available yet; nothing to update or log
            return
        self.best_val_rmse = min(self.best_val_rmse, latest)
        pl_module.log('best_val_rmse', self.best_val_rmse, prog_bar=True)


In [None]:
# Define the number of folds for cross-validation
num_folds = 5

# Seed Python, NumPy and PyTorch (and the DataLoader workers) so shuffling,
# weight initialisation and dropout are repeatable between runs.
SEED = 42
pl.seed_everything(SEED, workers=True)

# Group k-fold: rows sharing the same value of groups_col never straddle
# the train / validation boundary of a fold.
groups_col = 'GameRulesetName'
gkf = GroupKFold(n_splits=num_folds)
split_list = gkf.split(df_train, groups=df_train[groups_col])

trained_models = []
# Explicit float dtype: an empty frame would otherwise get object columns,
# which is awkward for the numeric work (RMSE, histograms) done on it later.
oof = pd.DataFrame(
    index=df_train.index,
    columns=['utility_agent1_true', 'utility_agent1_pred'],
    dtype=np.float64,
)
oof_scores = []
true_col = oof.columns.get_loc('utility_agent1_true')
pred_col = oof.columns.get_loc('utility_agent1_pred')


def _make_loader(frame: pd.DataFrame, training: bool) -> DataLoader:
    """Wrap the numerical / categorical / target columns of ``frame`` in a DataLoader."""
    dataset = TensorDataset(
        torch.tensor(frame[numerical_cols].values, dtype=torch.float32),
        torch.tensor(frame[categorical_cols].values, dtype=torch.int32),
        torch.tensor(frame['utility_agent1'].values, dtype=torch.float32),
    )
    return DataLoader(
        dataset,
        batch_size=32,
        shuffle=training,
        # BatchNorm1d cannot run in training mode on a batch of one sample, so
        # a trailing incomplete training batch is dropped. Validation keeps
        # every row because each one needs an out-of-fold prediction.
        drop_last=training,
        num_workers=8,
        persistent_workers=True,
    )


# Perform cross-validation over every fold (no fold is skipped, so the
# out-of-fold frame is complete and the average covers all folds).
for fold, (train_index, val_index) in enumerate(split_list, 1):
    print(f"Fold {fold}")

    # Split the data (the indices produced by the splitter are positional)
    train, valid = df_train.iloc[train_index], df_train.iloc[val_index]

    train_loader = _make_loader(train, training=True)
    valid_loader = _make_loader(valid, training=False)

    model = MLP(
        num_input_dim=len(numerical_cols),
        cat_input_dims=cat_input_dims,
        output_dim=1,
        layers="768-384-192",
        dropout=0.1,  # Updated dropout
        learning_rate=1e-2,  # Updated max learning rate
        weight_decay=5e-5,  # Updated weight decay
        initialization='kaiming_uniform',
        target_range=(-1, 1),
    )
    trainer = pl.Trainer(
        min_epochs=20, 
        max_epochs=100, 
        #deterministic=True,
        # "auto" picks the available accelerator instead of requiring Apple MPS
        accelerator="auto", 
        #devices=1,
        callbacks=[
            # Built fresh for every fold: EarlyStopping keeps its best score
            # and wait counter on the instance, so sharing one object across
            # folds would let an earlier fold's state stop a later fold.
            EarlyStopping(monitor='val_rmse', patience=10, mode='min', verbose=False),
            LearningRateMonitor(), 
            BestValRMSELogger(),
            ModelCheckpoint(monitor='val_rmse', mode='min', save_top_k=1),
        ],
    )
    trainer.fit(
        model, 
        train_loader,
        valid_loader,
    )

    # Load the best model
    best_model_path = trainer.checkpoint_callback.best_model_path
    model = MLP.load_from_checkpoint(best_model_path)
    trained_models.append(model)

    # Predict on validation set using trainer.predict with the prediction DataLoader
    predictions = trainer.predict(model, dataloaders=valid_loader)
    # reshape(-1) keeps a 1-D array even when the fold holds a single row
    y_pred = torch.cat(predictions).reshape(-1).cpu().numpy()
     
    # Compute the fold RMSE against the 'utility_agent1' column
    y_valid = valid['utility_agent1'].values
    rmse = np.sqrt(np.mean((y_pred - y_valid) ** 2))
    print(f"Fold {fold} - RMSE: {rmse}")

    # Save out-of-fold predictions. val_index is positional, so write with
    # iloc; label-based .loc is only equivalent for a default RangeIndex.
    oof.iloc[val_index, true_col] = y_valid
    oof.iloc[val_index, pred_col] = y_pred

    # Save RMSE to the list
    oof_scores.append(rmse)

# Print the list of oof scores and average oof score
print("List of oof scores:", oof_scores)
print("Average oof score:", np.mean(oof_scores))

# Make sure the output directory exists before writing
path_results.mkdir(parents=True, exist_ok=True)
oof.to_csv(path_results / 'oof_mlp.csv', index=False)


In [None]:
# Overlay the distributions of the true targets and the out-of-fold
# predictions. Values are coerced to numeric and rows without a value are
# dropped first, so object-dtype columns or unfilled rows do not reach the
# histogram.
oof_true = pd.to_numeric(oof['utility_agent1_true'], errors='coerce').dropna()
oof_pred = pd.to_numeric(oof['utility_agent1_pred'], errors='coerce').dropna()

fig, ax = plt.subplots()
ax.hist(oof_true, alpha=0.5, label='Target')
ax.hist(oof_pred, alpha=0.5, label='Prediction')
ax.set_title('Out-of-fold predictions vs. targets')
ax.set_xlabel('utility_agent1')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

***