In [1]:
%%capture
import sys, os

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive

    drive.mount('/content/drive', force_remount=False)
    !pip install pytorch_lightning
    !pip install torchmetrics
else:
    print("Not running in Colab.")

def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath
    else:
        from utils import get_project_root
        return get_project_root() + "/" + relativePath

print(f"Root project folder is at {resolve_path_gdrive('.')}")

NUM_WORKERS = 0
CHECKPOINTS_FOLDER = "/checkpoints/tmp_r1"  #stn_2_r2"
import pytorch_lightning as pl
import torch, math, os

seed = 42
pl.seed_everything(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.set_float32_matmul_precision("medium")  # to make lightning happy

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
from sklearn.metrics import r2_score
from typing import Dict, List, Tuple, Optional

class MultiTargetDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        
    def __len__(self) -> int:
        return len(self.X)
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.X[idx], self.y[idx]

class SequentialRegressor(pl.LightningModule):
    def __init__(self, input_dim: int, hidden_dims: List[int] = [512, 256, 128], 
                 dropout_rate: float = 0.2, learning_rate: float = 0.001):
        super().__init__()
        self.save_hyperparameters()
        self.test_step_outputs = []
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
            
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.network(x)
    
    def _compute_loss(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                      stage: str) -> torch.Tensor:
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log(f'{stage}_loss', loss, prog_bar=True)
        return loss
    
    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                     batch_idx: int) -> torch.Tensor:
        return self._compute_loss(batch, 'train')
    
    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                       batch_idx: int) -> torch.Tensor:
        return self._compute_loss(batch, 'val')
    
    def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                  batch_idx: int) -> None:
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('test_loss', loss)
        # Detach tensors before storing
        self.test_step_outputs.append({
            'y_true': y.cpu().detach(),
            'y_pred': y_hat.cpu().detach()
        })
    
    def on_test_epoch_end(self) -> None:
        y_true = torch.cat([out['y_true'] for out in self.test_step_outputs])
        y_pred = torch.cat([out['y_pred'] for out in self.test_step_outputs])
        # Detach tensors before converting to numpy
        r2 = r2_score(y_true.detach().numpy(), y_pred.detach().numpy())
        self.log('test_r2', r2, prog_bar=True)
        self.test_step_outputs.clear()
    
    def configure_optimizers(self) -> Dict:
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=3, verbose=True
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss'
        }
def train_model(X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray,
                y_train: np.ndarray, y_val: np.ndarray, y_test: np.ndarray,
                input_dim: int, batch_size: int = 32) -> pl.LightningModule:
    
    # Create datasets
    train_dataset = MultiTargetDataset(X_train, y_train)
    val_dataset = MultiTargetDataset(X_val, y_val)
    test_dataset = MultiTargetDataset(X_test, y_test)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Initialize model with current input dimension
    model = SequentialRegressor(input_dim=input_dim)
    
    # Set up callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, mode='min'),
        ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1),
        LearningRateMonitor(logging_interval='epoch')
    ]
    
    # Initialize trainer
    trainer = pl.Trainer(
        max_epochs=150,
        callbacks=callbacks,
        accelerator='auto',
        devices=1,
        logger=True,
        log_every_n_steps=10
    )
    
    # Train and test the model
    trainer.fit(model, train_loader, val_loader)
    trainer.test(model, test_loader)
    
    return model

In [3]:
def sequential_training(df: pd.DataFrame, num_features: int = 1479, 
                       num_targets: int = 29, 
                       target_range: Optional[Tuple[int, int]] = None) -> pd.DataFrame:
    X = df.iloc[:, :num_features].values
    y = df.iloc[:, num_features:num_features+num_targets].values
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed, shuffle=False)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Create separate scalers for each target
    y_scalers = [StandardScaler() for _ in range(num_targets)]
    y_train_scaled = np.zeros_like(y_train)
    y_val_scaled = np.zeros_like(y_val)
    y_test_scaled = np.zeros_like(y_test)
    
    # Scale each target separately
    for i in range(num_targets):
        y_train_scaled[:, i] = y_scalers[i].fit_transform(y_train[:, i].reshape(-1, 1)).ravel()
        y_val_scaled[:, i] = y_scalers[i].transform(y_val[:, i].reshape(-1, 1)).ravel()
        y_test_scaled[:, i] = y_scalers[i].transform(y_test[:, i].reshape(-1, 1)).ravel()
    
    final_predictions = pd.DataFrame()
    
    # Initialize current features for each dataset
    current_train_features = X_train_scaled.copy()
    current_val_features = X_val_scaled.copy()
    current_test_features = X_test_scaled.copy()
    
    r2_scores = []
    
    if target_range:
        start, end = target_range
    else:
        start, end = 0, num_targets
    
    for target_idx in range(start, end):
        print(f"\nTraining model for target {target_idx + 1}/{end - start}")
        
        # Get current target
        current_target = y_train_scaled[:, target_idx].reshape(-1, 1)
        current_val_target = y_val_scaled[:, target_idx].reshape(-1, 1)
        current_test_target = y_test_scaled[:, target_idx].reshape(-1, 1)
        
        # Create model with current input dimension
        current_input_dim = current_train_features.shape[1]
        
        model = train_model(
            current_train_features,
            current_val_features,
            current_test_features,
            current_target,
            current_val_target,
            current_test_target,
            input_dim=current_input_dim  # Pass current input dimension
        )
        
        # Make predictions
        model.eval()
        with torch.no_grad():
            # Make predictions for training set
            train_predictions = model(torch.tensor(current_train_features, dtype=torch.float32))
            train_predictions_orig = y_scalers[target_idx].inverse_transform(
                train_predictions.cpu().detach().numpy()
            )
            
            # Make predictions for validation and test sets
            val_predictions = model(torch.tensor(current_val_features, dtype=torch.float32))
            test_predictions = model(torch.tensor(current_test_features, dtype=torch.float32))
            
            # Store predictions for current target
            final_predictions[f'target_{target_idx+1}_pred'] = train_predictions_orig.flatten()
            
            # Update features for next iteration
            current_train_features = np.hstack([
                current_train_features, 
                train_predictions.cpu().detach().numpy()
            ])
            current_val_features = np.hstack([
                current_val_features, 
                val_predictions.cpu().detach().numpy()
            ])
            current_test_features = np.hstack([
                current_test_features, 
                test_predictions.cpu().detach().numpy()
            ])
        
        # Calculate and store R2 score
        test_predictions_orig = y_scalers[target_idx].inverse_transform(
            test_predictions.cpu().detach().numpy()
        )
        r2 = r2_score(y_test[:, target_idx], test_predictions_orig.flatten())
        r2_scores.append(r2)
        print(f"R2 score for target {target_idx + 1}: {r2:.4f}")
    
    print("\nFinal R2 scores for all targets:")
    for i, r2 in enumerate(r2_scores, start=1):
        print(f"Target {i}: {r2:.4f}")
    
    return final_predictions


import numpy as np
datafile='src/pl/merged_data_last29_reordered_byR2.csv'
max_rows=200
df = pd.read_csv(resolve_path_gdrive(datafile), delimiter=',', skiprows=0, dtype=float, nrows=max_rows)
# Run sequential training
predictions = sequential_training(df)
# Save predictions
predictions.to_csv('predictions.csv', index=False)


Training model for target 1/29


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\loops\fit_loop.py:298: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=10). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

R2 score for target 1: 0.8300

Training model for target 2/29


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\loops\fit_loop.py:298: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=10). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

D:\ds\work\utilities\python\envs\llm-dev\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

KeyboardInterrupt: 