In [1]:
%%capture
import sys, os
import pytorch_lightning as pl
import torch, math, os

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive

    drive.mount('/content/drive', force_remount=False)
    !pip install pytorch_lightning
    !pip install torchmetrics
else:
    print("Not running in Colab.")

def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath
    else:
        from utils import get_project_root
        return get_project_root() + "/" + relativePath

print(f"Root project folder is at {resolve_path_gdrive('.')}")

NUM_WORKERS = 0
CHECKPOINTS_FOLDER_BASE = "/checkpoints/stn_r1"
CHECKPOINTS_FOLDER = CHECKPOINTS_FOLDER_BASE
DEBUG = False
seed = 42

pl.seed_everything(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.set_float32_matmul_precision("medium")  # to make lightning happy

In [2]:
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
from sklearn.metrics import r2_score
from typing import Dict, List, Tuple, Optional

class MultiTargetDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        
    def __len__(self) -> int:
        return len(self.X)
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.X[idx], self.y[idx]

In [3]:
from torch import nn, optim
import torchmetrics
import torch.nn.functional as F

class BaseModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.r2 = torchmetrics.R2Score()
        self.loss_fn = nn.MSELoss()
        self.validation_step_outputs = []

    def training_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log_dict(
            {
                "train_loss": loss,
            },
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        accuracy = self.r2(scores, y)
        self.log("train_acc", accuracy, prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("val_loss", loss)
        self.validation_step_outputs.append(loss)
        return loss

    def on_validation_epoch_end(self):
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    def test_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("test_loss", loss)
        return loss

    def _common_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        if DEBUG == True:
            print(f"loss: {loss}, len: {len(y)}")
        return loss, scores, y

    def predict_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        preds = torch.argmax(scores, dim=1)
        return preds

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(lr=self.lr, params=self.parameters())
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, min_lr=0.000000001, threshold=0.001)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

class SingleTargetNet(BaseModel):

    def __init__(self, input_size, learning_rate: float=0.001, dropout_rate: float = 0.2, target=1):
        super(SingleTargetNet, self).__init__()
        self.lr = learning_rate
        self.loss_fn = nn.MSELoss()

        self.fc1 = nn.Linear(input_size, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 1)
        self.fc_skip = nn.Linear(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)
        self.save_hyperparameters()

    def forward(self, x):
        x1 = F.relu(self.bn1(self.fc1(x)))
        x1 = self.dropout(x1)

        x2 = F.relu(self.bn2(self.fc2(x1)))
        x2 = self.dropout(x2)

        # Skip connection
        x2 += self.fc_skip(x1)

        x3 = self.fc3(x2)
        return x3

In [4]:
def train_model(X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray,
                y_train: np.ndarray, y_val: np.ndarray, y_test: np.ndarray,
                input_dim: int, target_num, batch_size: int = 32) -> pl.LightningModule:
    
    # Create datasets
    train_dataset = MultiTargetDataset(X_train, y_train)
    val_dataset = MultiTargetDataset(X_val, y_val)
    test_dataset = MultiTargetDataset(X_test, y_test)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Initialize model with current input dimension
    #model = SequentialRegressor(input_dim=input_dim)
    model = SingleTargetNet (
        input_size=input_dim
    )    
    # Set up callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, mode='min', verbose=True), #monitor="train_loss"
        ModelCheckpoint(
            dirpath=resolve_path_gdrive(f'{CHECKPOINTS_FOLDER}/{target_num}'),
            filename='{epoch:02d}-{val_loss:.2f}',
            save_top_k=1,
            verbose=True,
            monitor='val_loss',
            mode='min'
        ),
        LearningRateMonitor(logging_interval='epoch')
    ]

    # Initialize trainer
    trainer = pl.Trainer(
        max_epochs=150,
        callbacks=callbacks,
        accelerator='auto',
        devices=1,
        logger=True,
        log_every_n_steps=10
    )
    
    # Train and test the model
    trainer.fit(model, train_loader, val_loader)
    trainer.test(model, test_loader)
    
    return model

In [5]:
def sequential_training(df: pd.DataFrame, num_features: int = 1479, 
                       num_targets: int = 29, stack_predictions = True, scale_y = True,
                       target_range: Optional[Tuple[int, int]] = None) -> pd.DataFrame:
    X = df.iloc[:, :num_features].values
    y = df.iloc[:, num_features:num_features+num_targets].values
    global CHECKPOINTS_FOLDER 
    CHECKPOINTS_FOLDER = f"{CHECKPOINTS_FOLDER_BASE}/stack={stack_predictions}-scaleY={scale_y}"
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=True)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed, shuffle=True)
    
    print(f"Sizes: {X_train.shape}, {X_val.shape}, {X_test.shape}, {y_train.shape}, {y_val.shape}")
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    if scale_y:
        # Create separate scalers for each target
        y_scalers = [StandardScaler() for _ in range(num_targets)]
        y_train_scaled = np.zeros_like(y_train)
        y_val_scaled = np.zeros_like(y_val)
        y_test_scaled = np.zeros_like(y_test)
        
        # Scale each target separately
        for i in range(num_targets):
            y_train_scaled[:, i] = y_scalers[i].fit_transform(y_train[:, i].reshape(-1, 1)).ravel()
            y_val_scaled[:, i] = y_scalers[i].transform(y_val[:, i].reshape(-1, 1)).ravel()
            y_test_scaled[:, i] = y_scalers[i].transform(y_test[:, i].reshape(-1, 1)).ravel()
    
    final_predictions = pd.DataFrame()
    
    # Initialize current features for each dataset
    current_train_features = X_train_scaled.copy()
    current_val_features = X_val_scaled.copy()
    current_test_features = X_test_scaled.copy()
    
    r2_scores = []
    
    if target_range:
        start, end = target_range
    else:
        start, end = 0, num_targets
    
    for target_idx in range(start, end):
        print(f"\nTraining model for target {target_idx + 1}/{end - start}. Features: {current_train_features.shape[1]}")
        # Get current target        
        if scale_y:
            current_target = y_train_scaled[:, target_idx].reshape(-1, 1)
            current_val_target = y_val_scaled[:, target_idx].reshape(-1, 1)
            current_test_target = y_test_scaled[:, target_idx].reshape(-1, 1)
        else:
            current_target = y_train[:, target_idx].reshape(-1, 1)
            current_val_target = y_val[:, target_idx].reshape(-1, 1)
            current_test_target = y_test[:, target_idx].reshape(-1, 1)
                
        # Create model with current input dimension
        current_input_dim = current_train_features.shape[1]
        
        model = train_model(
            current_train_features,
            current_val_features,
            current_test_features,
            current_target,
            current_val_target,
            current_test_target,
            input_dim=current_input_dim,
            target_num=target_idx + 1
        )
        # Make predictions
        model.eval()
        with torch.no_grad():
            # Make predictions for training set
            train_predictions = model(torch.tensor(current_train_features, dtype=torch.float32))
            
            # Make predictions for validation and test sets
            val_predictions = model(torch.tensor(current_val_features, dtype=torch.float32))
            test_predictions = model(torch.tensor(current_test_features, dtype=torch.float32))
            
            all_predictions = torch.cat((train_predictions, val_predictions, test_predictions)).cpu().detach().numpy()
            all_targets = np.concatenate((current_target, current_val_target, current_test_target))
            # Store predictions for current target
            # final_predictions[f'train_target_{target_idx+1}_pred'] = train_predictions.flatten()
            # final_predictions[f'train_target_{target_idx+1}'] = current_target
            # 
            # final_predictions[f'val_target_{target_idx+1}_pred'] = test_predictions.flatten()
            # final_predictions[f'val_target_{target_idx+1}'] = current_val_target
            # 
            # final_predictions[f'test_target_{target_idx+1}_pred'] = test_predictions.flatten()
            # final_predictions[f'test_target_{target_idx+1}'] = current_test_target
            
            final_predictions[f'all_target_{target_idx+1}_pred'] = all_predictions.flatten()
            final_predictions[f'all_target_{target_idx+1}'] = all_targets

            if stack_predictions == True:
                # Update features for next iteration
                current_train_features = np.hstack([
                    current_train_features, 
                    train_predictions.cpu().detach().numpy()
                ])
                current_val_features = np.hstack([
                    current_val_features, 
                    val_predictions.cpu().detach().numpy()
                ])
                current_test_features = np.hstack([
                    current_test_features, 
                    test_predictions.cpu().detach().numpy()
                ])
        
        # Calculate and store R2 score
        if scale_y:
            test_predictions_orig = y_scalers[target_idx].inverse_transform(
                test_predictions.cpu().detach().numpy()
            )            
            r2 = r2_score(y_test[:, target_idx], test_predictions_orig.flatten())
        else:
            r2 = r2_score(y_test[:, target_idx], test_predictions.flatten())
            
        r2_scores.append(r2)
        print(f"R2 score for target {target_idx + 1}: {r2:.4f}")
    
    print("\nFinal R2 scores for all targets:")
    for i, r2 in enumerate(r2_scores, start=1):
        print(f"Target {i}: {r2:.4f}")
    
    return final_predictions


In [6]:
import numpy as np
datafile='src/pl/merged_data_last29_reordered_byR2.csv'
max_rows=None
df = pd.read_csv(resolve_path_gdrive(datafile), delimiter=',', skiprows=0, dtype=float, nrows=max_rows)

In [None]:
# Run sequential training
predictions = sequential_training(df, stack_predictions=True, scale_y=True, target_range=None)
# Save predictions
predictions.to_csv('predictions.csv', index=False)

Sizes: (15739, 1479), (5247, 1479), (5247, 1479), (15739, 29), (5247, 29)

Training model for target 1/29. Features: 1479


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\ds\work\utilities\conda\envs\nn_310_2\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
d:\ds\work\utilities\conda\envs\nn_310_2\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

# Model: stacking, test r2, 150 epoch
Final R2 scores for all targets:
Target 1: 0.9889
Target 2: 0.9922
Target 3: 0.9900
Target 4: 0.9893
Target 5: 0.9901
Target 6: 0.9879
Target 7: 0.9669
Target 8: 0.9625
Target 9: 0.9608
Target 10: 0.9448
Target 11: 0.9306
Target 12: 0.9446
Target 13: 0.9376
Target 14: 0.9301
Target 15: 0.9415
Target 16: 0.9219
Target 17: 0.8800
Target 18: 0.8838
Target 19: 0.8417
Target 20: 0.8386
Target 21: 0.8452
Target 22: 0.8136
Target 23: 0.7956
Target 24: 0.7864
Target 25: 0.6782
Target 26: 0.6834
Target 27: 0.5847
Target 28: 0.5586
Target 29: 0.4338

In [22]:
import pandas as pd
from sklearn.metrics import r2_score

# Load the file containing target and predicted values
file_path = '/checkpoints/stn_r1/stack=True-scaleY=True/predictions.csv'
data = pd.read_csv(resolve_path_gdrive(file_path))

# Adjusting the code to match the name pattern 'target_1_pred', 'target_1'
r2_scores = []
[train, test, val] = [15739, 5247, 5247]
for i in range(1, (len(data.columns) // 2) + 1):
    train_targets = data[f'all_target_{i}'][:train]
    train_predictions = data[f'all_target_{i}_pred'][:train]
    val_targets = data[f'all_target_{i}'][train:]
    val_predictions = data[f'all_target_{i}_pred'][train:]
    test_targets = data[f'all_target_{i}'][train+test:]
    test_predictions = data[f'all_target_{i}_pred'][train+test:]
    
    r2 = [r2_score(train_targets, train_predictions), r2_score(val_targets, val_predictions), r2_score(test_targets, test_predictions)]
    r2_scores.append(r2)

# Create a DataFrame to tabulate the results
results = pd.DataFrame({
    'Target': [f'target_{i}' for i in range(1, (len(data.columns) // 2) + 1)],
    'R2 Score [train, val, test]': r2_scores
})

print(results)

       Target                        R2 Score [train, val, test]
0    target_1  [0.9824148100660581, 0.9780502984742219, 0.972...
1    target_2  [0.8979788138926885, 0.8929131427514477, 0.887...
2    target_3  [0.9737432100223783, 0.965676295177207, 0.9575...
3    target_4  [0.9770670726986855, 0.973489571271416, 0.9711...
4    target_5  [0.9785964303605776, 0.9759807859825803, 0.975...
5    target_6  [0.9510843219690669, 0.9478761020596811, 0.947...
6    target_7  [0.9362204056829968, 0.9341844279193448, 0.934...
7    target_8  [0.9442090160299694, 0.9346594316831095, 0.932...
8    target_9  [0.9326175059648436, 0.9258850839999507, 0.926...
9   target_10  [0.9146841886645203, 0.8945233473429507, 0.894...
10  target_11  [0.9061511156763594, 0.8779621690229521, 0.881...
11  target_12  [0.9277067622544798, 0.907809835637711, 0.9164...
12  target_13  [0.9090101159999711, 0.8964180816937394, 0.897...
13  target_14  [0.8816674501433857, 0.8779152018561418, 0.873...
14  target_15  [0.8986952

In [None]:
class SequentialRegressor(pl.LightningModule):
    def __init__(self, input_dim: int, hidden_dims: List[int] = [512, 256, 128], 
                 dropout_rate: float = 0.2, learning_rate: float = 0.001):
        super().__init__()
        self.save_hyperparameters()
        self.test_step_outputs = []
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
            
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.network(x)
    
    def _compute_loss(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                      stage: str) -> torch.Tensor:
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log(f'{stage}_loss', loss, prog_bar=True)
        return loss
    
    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                     batch_idx: int) -> torch.Tensor:
        return self._compute_loss(batch, 'train')
    
    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                       batch_idx: int) -> torch.Tensor:
        return self._compute_loss(batch, 'val')
    
    def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor], 
                  batch_idx: int) -> None:
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('test_loss', loss)
        # Detach tensors before storing
        self.test_step_outputs.append({
            'y_true': y.cpu().detach(),
            'y_pred': y_hat.cpu().detach()
        })
    
    def on_test_epoch_end(self) -> None:
        y_true = torch.cat([out['y_true'] for out in self.test_step_outputs])
        y_pred = torch.cat([out['y_pred'] for out in self.test_step_outputs])
        # Detach tensors before converting to numpy
        r2 = r2_score(y_true.detach().numpy(), y_pred.detach().numpy())
        self.log('test_r2', r2, prog_bar=True)
        self.test_step_outputs.clear()
    
    def configure_optimizers(self) -> Dict:
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=3, verbose=True
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss'
        }
    