<a href="https://colab.research.google.com/github/nirb28/nn_catalyst/blob/main/src/pl/StackModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pytorch_lightning torchmetrics

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl (819 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.3/819.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchmetrics-1.6.1-py3-none-any.whl (927 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927.3/927.3 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.11.9 pytorch_lightning-2.5.0.post0 torchmetrics-1.6.1


In [10]:
%%capture
import sys, os
import pytorch_lightning as pl
import torch, math, os

# Detect Colab by checking whether the google.colab package has already been
# loaded into this interpreter.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive

    # Mount Google Drive at /content/drive; resolve_path_gdrive() below keys
    # off the existence of that directory.
    drive.mount('/content/drive', force_remount=False)
    # Shell installs of the two libraries the notebook imports.
    !pip install pytorch_lightning
    !pip install torchmetrics
else:
    print("Not running in Colab.")

def resolve_path_gdrive(relativePath):
    """Turn a project-relative path into a full path.

    If ``/content/drive`` exists the path is anchored at the repository copy
    inside Google Drive; otherwise it is anchored at whatever
    ``utils.get_project_root()`` returns.  The parts are joined by plain string
    concatenation, so a leading ``/`` in ``relativePath`` is kept as-is.
    """
    drive_mounted = os.path.exists('/content/drive')
    if not drive_mounted:
        # Imported lazily: only reached when /content/drive is absent.
        from utils import get_project_root
        project_root = get_project_root()
        return project_root + "/" + relativePath

    gdrive_root = '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/'
    return gdrive_root + relativePath

print(f"Root project folder is at {resolve_path_gdrive('.')}")

# Notebook-wide configuration.
# NUM_WORKERS: presumably meant for DataLoader(num_workers=...) — confirm against the loaders.
NUM_WORKERS = 0
# The leading "/" is kept when resolve_path_gdrive() concatenates it onto the root.
CHECKPOINTS_FOLDER_BASE = "/checkpoints/stn_r3_f849_tlast29"
# Read by train_model() for the checkpoint directory and rebound (via `global`)
# by sequential_training(), which appends a per-run sub-folder.
CHECKPOINTS_FOLDER = resolve_path_gdrive(CHECKPOINTS_FOLDER_BASE) #f'd:/temp{CHECKPOINTS_FOLDER_BASE}'
# When True, BaseModel._common_step prints the loss and batch length for every batch.
DEBUG = False
# Also used as random_state for the train/val/test splits in sequential_training().
seed = 42
pl.seed_everything(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.set_float32_matmul_precision("medium")  # to make lightning happy

INFO:lightning_fabric.utilities.seed:Seed set to 42


In [11]:
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
from sklearn.metrics import r2_score
from typing import Dict, List, Tuple, Optional

class MultiTargetDataset(Dataset):
    """In-memory dataset that pairs each feature row with its target row.

    Both inputs are copied into float32 tensors once, at construction time, so
    item access is a plain tensor index.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X, self.y = (torch.tensor(values, dtype=torch.float32) for values in (X, y))

    def __len__(self) -> int:
        """Number of samples, taken from the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [12]:
from torch import nn, optim
import torchmetrics
import torch.nn.functional as F

class BaseModel(pl.LightningModule):
    """Shared Lightning plumbing for the regression networks in this notebook.

    Provides the training/validation/test/predict steps, an MSE loss and an
    AdamW + ReduceLROnPlateau optimiser setup.  Subclasses implement ``forward``
    and normally overwrite ``self.lr`` after calling ``super().__init__()``.
    """

    def __init__(self):
        super().__init__()
        self.r2 = torchmetrics.R2Score()
        self.loss_fn = nn.MSELoss()
        # Fallback learning rate so configure_optimizers() works even when a
        # subclass forgets to set one; subclasses may overwrite it.
        self.lr = 0.001
        # Per-batch validation losses; averaged and cleared at the end of each
        # validation epoch.
        self.validation_step_outputs = []

    def training_step(self, batch, batch_idx):
        """Compute the batch loss, log it per epoch, and log the batch R2 as ``train_acc``."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log_dict(
            {"train_loss": loss},
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        # NOTE: despite the key name, this value is an R2 score, not an accuracy.
        accuracy = self.r2(scores, y)
        self.log("train_acc", accuracy, prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` for the batch and remember it for the epoch average."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("val_loss", loss)
        self.validation_step_outputs.append(loss)
        return loss

    def on_validation_epoch_end(self):
        """Log the unweighted mean of the per-batch validation losses."""
        if not self.validation_step_outputs:
            # torch.stack([]) raises; nothing was collected, so nothing to log.
            return
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` for the batch."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("test_loss", loss)
        return loss

    def _common_step(self, batch, batch_idx):
        """Run the forward pass on an ``(x, y)`` batch and return ``(loss, scores, y)``."""
        x, y = batch
        # Collapse everything after the batch dimension into one feature axis.
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        if DEBUG:
            print(f"loss: {loss}, len: {len(y)}")
        return loss, scores, y

    def predict_step(self, batch, batch_idx):
        """Return the raw regression outputs for a batch.

        Accepts either an ``(x, y)`` pair (the target is ignored) or a bare
        feature tensor.  The previous implementation returned
        ``argmax(scores, dim=1)``, which is a classification idiom and is
        always 0 for a single-output regressor.
        """
        x = batch[0] if isinstance(batch, (tuple, list)) else batch
        x = x.reshape(x.size(0), -1)
        return self.forward(x)

    def configure_optimizers(self):
        """AdamW at ``self.lr`` with a ReduceLROnPlateau schedule driven by ``val_loss``."""
        optimizer = torch.optim.AdamW(lr=self.lr, params=self.parameters())
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.1,
            patience=10,
            min_lr=0.000000001,
            threshold=0.001,
        )
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

class SingleTargetNet(BaseModel):
    """MLP regressor with two hidden layers and a linear skip path, one output per sample.

    Layout: ``input_size -> 1024 -> 512 -> 1`` with batch norm, ReLU and dropout
    after each hidden layer; a separate linear map of the first hidden layer is
    added to the second before the output layer.

    Args:
        input_size: number of input features per sample.
        learning_rate: stored as ``self.lr`` and read by ``configure_optimizers``.
        dropout_rate: dropout probability applied after each hidden layer.
        target: not used by the network itself; it is only recorded by
            ``save_hyperparameters()``.
    """

    def __init__(self, input_size, learning_rate: float=0.001, dropout_rate: float = 0.2, target=1):
        # BaseModel.__init__ already installs the MSE loss.
        super().__init__()
        self.lr = learning_rate

        self.fc1 = nn.Linear(input_size, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 1)
        self.fc_skip = nn.Linear(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)
        self.save_hyperparameters()

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, 1)`` predictions."""
        x1 = F.relu(self.bn1(self.fc1(x)))
        x1 = self.dropout(x1)

        x2 = F.relu(self.bn2(self.fc2(x1)))
        x2 = self.dropout(x2)

        # Skip connection. Out-of-place on purpose: when dropout is the identity
        # (dropout_rate == 0, or eval mode) x2 is the ReLU output itself, and an
        # in-place `+=` would overwrite the tensor autograd saved for ReLU's
        # backward pass.
        x2 = x2 + self.fc_skip(x1)

        return self.fc3(x2)

In [13]:
def train_model(X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray,
                y_train: np.ndarray, y_val: np.ndarray, y_test: np.ndarray,
                input_dim: int, target_num, batch_size: int = 32,
                max_epochs: int = 150, patience: int = 10) -> pl.LightningModule:
    """Fit and test one ``SingleTargetNet`` on the given train/val/test arrays.

    Args:
        X_train, X_val, X_test: feature arrays for the three splits.
        y_train, y_val, y_test: matching target arrays.
        input_dim: number of input features handed to the network.
        target_num: label used as the checkpoint sub-folder name under the
            module-level ``CHECKPOINTS_FOLDER``.
        batch_size: mini-batch size for all three loaders.
        max_epochs: upper bound on training epochs (previously fixed at 150).
        patience: early-stopping patience in epochs (previously fixed at 10).

    Returns:
        The model in the state it had when training stopped.  The checkpoint
        with the lowest ``val_loss`` is written to disk but is not reloaded here.
    """

    def make_loader(features: np.ndarray, targets: np.ndarray, shuffle: bool = False) -> DataLoader:
        # One place for the loader settings so all splits honour NUM_WORKERS.
        return DataLoader(
            MultiTargetDataset(features, targets),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=NUM_WORKERS,
        )

    # Only the training split is shuffled.
    train_loader = make_loader(X_train, y_train, shuffle=True)
    val_loader = make_loader(X_val, y_val)
    test_loader = make_loader(X_test, y_test)

    # Initialize model with current input dimension
    model = SingleTargetNet(input_size=input_dim)

    callbacks = [
        # Early stopping watches the training loss, while the checkpoint below
        # tracks val_loss.
        EarlyStopping(monitor='train_loss', patience=patience, mode='min', verbose=True), #monitor="train_loss", val_loss
        ModelCheckpoint(
            dirpath=f'{CHECKPOINTS_FOLDER}/{target_num}',
            filename='{epoch:02d}-{val_loss:.2f}',
            save_top_k=1,
            verbose=True,
            monitor='val_loss',
            mode='min'
        ),
        LearningRateMonitor(logging_interval='epoch')
    ]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        callbacks=callbacks,
        accelerator='auto',
        devices=1,
        logger=True,
        log_every_n_steps=10
    )

    # Train and test the model
    trainer.fit(model, train_loader, val_loader)
    trainer.test(model, test_loader)

    return model

In [14]:
import joblib
def sequential_training(df: pd.DataFrame, num_features: int = 1479,
                       num_targets: int = 29, stack_predictions = True, scale_y = True,
                       target_range: Optional[Tuple[int, int]] = None) -> pd.DataFrame:
    """Train one model per target column, optionally stacking earlier predictions as features.

    The first ``num_features`` columns of ``df`` are the inputs and the next
    ``num_targets`` columns are the targets.  Data is split 60/20/20 into
    train/val/test using the module-level ``seed``.

    Side effects:
        * Rebinds the module-level ``CHECKPOINTS_FOLDER`` to a run-specific
          sub-folder (``.../stack=<bool>-scaleY=<bool>``) and creates it.  Later
          cells read that global.  Calling this function again replaces the
          sub-folder instead of nesting a new one inside the old one.
        * Writes the fitted feature scaler to ``scaler_X.pkl`` in that folder.

    Args:
        df: table holding features followed by targets.
        num_features: number of leading feature columns.
        num_targets: number of target columns following the features.
        stack_predictions: if true, each model's predictions are appended as an
            extra feature column for the models trained after it.
        scale_y: if true, every target is standardised with its own scaler
            fitted on the training split.
        target_range: optional ``(start, end)`` zero-based, end-exclusive range
            of targets to train.  With stacking enabled, targets before
            ``start`` contribute no stacked features.

    Returns:
        DataFrame with two columns per trained target, ``all_target_<k>_pred``
        and ``all_target_<k>`` (1-based ``k``), rows ordered train, then val,
        then test.  Values are in the scaled space when ``scale_y`` is true.
    """
    X = df.iloc[:, :num_features].values
    y = df.iloc[:, num_features:num_features+num_targets].values

    global CHECKPOINTS_FOLDER
    # Drop a run suffix left by a previous call so repeated runs do not nest
    # ".../stack=.../stack=..." folders.
    parent, _, leaf = CHECKPOINTS_FOLDER.rpartition("/")
    base_folder = parent if (leaf.startswith("stack=") and "-scaleY=" in leaf) else CHECKPOINTS_FOLDER
    CHECKPOINTS_FOLDER = f"{base_folder}/stack={stack_predictions}-scaleY={scale_y}"
    # joblib.dump below does not create missing directories.
    os.makedirs(CHECKPOINTS_FOLDER, exist_ok=True)

    # Split data: 20% test, then 25% of the remainder as validation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=True)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed, shuffle=True)

    print(f"Sizes: {X_train.shape}, {X_val.shape}, {X_test.shape}, {y_train.shape}, {y_val.shape}")

    # Scale features (fit on the training split only).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Save the scaler parameters
    joblib.dump(scaler, f"{CHECKPOINTS_FOLDER}/scaler_X.pkl")

    if scale_y:
        # One scaler per target. The buffers are forced to float so that
        # integer-typed targets are not truncated when scaled values are stored.
        y_scalers = [StandardScaler() for _ in range(num_targets)]
        y_train_scaled = np.zeros_like(y_train, dtype=float)
        y_val_scaled = np.zeros_like(y_val, dtype=float)
        y_test_scaled = np.zeros_like(y_test, dtype=float)

        for i in range(num_targets):
            y_train_scaled[:, i] = y_scalers[i].fit_transform(y_train[:, i].reshape(-1, 1)).ravel()
            y_val_scaled[:, i] = y_scalers[i].transform(y_val[:, i].reshape(-1, 1)).ravel()
            y_test_scaled[:, i] = y_scalers[i].transform(y_test[:, i].reshape(-1, 1)).ravel()
        y_train_used, y_val_used, y_test_used = y_train_scaled, y_val_scaled, y_test_scaled
    else:
        y_train_used, y_val_used, y_test_used = y_train, y_val, y_test

    # Features grow by one column per trained target when stacking is enabled.
    current_train_features = X_train_scaled.copy()
    current_val_features = X_val_scaled.copy()
    current_test_features = X_test_scaled.copy()

    if target_range:
        start, end = target_range
    else:
        start, end = 0, num_targets
    if start < 0 or end > num_targets:
        raise ValueError(
            f"target_range {(start, end)} is outside the available targets 0..{num_targets}"
        )

    prediction_columns = {}  # assembled into one DataFrame at the end
    r2_scores = []

    for target_idx in range(start, end):
        print(f"\nTraining model for target {target_idx + 1}/{num_targets}. Features: {current_train_features.shape[1]}")

        current_target = y_train_used[:, target_idx].reshape(-1, 1)
        current_val_target = y_val_used[:, target_idx].reshape(-1, 1)
        current_test_target = y_test_used[:, target_idx].reshape(-1, 1)

        model = train_model(
            current_train_features,
            current_val_features,
            current_test_features,
            current_target,
            current_val_target,
            current_test_target,
            input_dim=current_train_features.shape[1],
            target_num=target_idx + 1
        )

        # Predict on all three splits with the freshly trained model.
        model.eval()
        with torch.no_grad():
            def predict(features: np.ndarray) -> np.ndarray:
                # Build the input on the model's device so this works wherever
                # the trainer left the weights.
                inputs = torch.tensor(features, dtype=torch.float32, device=model.device)
                return model(inputs).cpu().numpy()

            train_predictions = predict(current_train_features)
            val_predictions = predict(current_val_features)
            test_predictions = predict(current_test_features)

        all_predictions = np.concatenate((train_predictions, val_predictions, test_predictions))
        all_targets = np.concatenate((current_target, current_val_target, current_test_target))
        prediction_columns[f'all_target_{target_idx+1}_pred'] = all_predictions.ravel()
        prediction_columns[f'all_target_{target_idx+1}'] = all_targets.ravel()

        if stack_predictions:
            # Update features for next iteration
            current_train_features = np.hstack([current_train_features, train_predictions])
            current_val_features = np.hstack([current_val_features, val_predictions])
            current_test_features = np.hstack([current_test_features, test_predictions])

        # R2 on the test split, in the original (unscaled) target units.
        if scale_y:
            test_predictions_orig = y_scalers[target_idx].inverse_transform(test_predictions)
        else:
            test_predictions_orig = test_predictions
        r2 = r2_score(y_test[:, target_idx], test_predictions_orig.ravel())

        r2_scores.append(r2)
        print(f"R2 score for target {target_idx + 1}: {r2:.4f}")

    print("\nFinal R2 scores for all targets:")
    # Label with the real target number, which differs from the list position
    # when target_range starts above zero.
    for i, r2 in enumerate(r2_scores, start=start + 1):
        print(f"Target {i}: {r2:.4f}")

    return pd.DataFrame(prediction_columns)


In [15]:
import numpy as np
# Project-relative location of the merged feature/target table.
datafile='src/pl/merged_data_f849_tlast29_reordered_byR2.csv'
# Passed to read_csv as nrows; None means no row limit.
max_rows=None
# Every column is read as float; the path is resolved against the project root.
df = pd.read_csv(resolve_path_gdrive(datafile), delimiter=',', skiprows=0, dtype=float, nrows=max_rows)

In [16]:
# Run sequential training target_range=[4,30]
# This run: no stacking, per-target y scaling, all 29 targets, first 849 columns as features.
# Note: sequential_training() also rebinds the global CHECKPOINTS_FOLDER to a run-specific sub-folder.
predictions = sequential_training(df, stack_predictions=False, scale_y=True, target_range=None, num_features=849, num_targets=29)

Sizes: (15739, 849), (5247, 849), (5247, 849), (15739, 29), (5247, 29)


In [None]:
# Save predictions
# CHECKPOINTS_FOLDER is the value left behind by sequential_training() (it rebinds
# the global), so this cell must run after the training cell.
predictions.to_csv(f'{CHECKPOINTS_FOLDER}/predictions.csv', index=False)

In [None]:
import pandas as pd
from sklearn.metrics import r2_score

def computeR2(predictions_file, sizes=(15739, 5247, 5247), identifier='', target_names=None):
    """Tabulate per-target R2 for the train, val, test and combined rows of a predictions file.

    The file is expected to hold column pairs ``all_target_<k>`` /
    ``all_target_<k>_pred`` with rows ordered train, then val, then test (the
    layout written by ``sequential_training``).

    Args:
        predictions_file: project-relative path, resolved with ``resolve_path_gdrive``.
        sizes: ``(n_train, n_val, n_test)`` row counts, in the same order as the
            rows in the file.
        identifier: text appended to the result column headers.
        target_names: sequence where entry ``k - 1`` is the display name of
            target ``k``.  Defaults to the last 29 column names of the global
            ``df`` (assumes those are the target columns — confirm for other
            datasets).

    Returns:
        DataFrame with one row per target found in the file.

    Raises:
        ValueError: if ``sizes`` does not add up to the number of rows in the file.
    """
    # Load the file containing target and predicted values
    data = pd.read_csv(resolve_path_gdrive(predictions_file))

    # Unpack in row order. The previous unpacking order was (train, test, val),
    # which mislabels the val/test slices whenever those two sizes differ.
    n_train, n_val, n_test = sizes
    if n_train + n_val + n_test != len(data):
        raise ValueError(
            f"sizes {tuple(sizes)} sum to {n_train + n_val + n_test}, "
            f"but {predictions_file} has {len(data)} rows"
        )
    val_end = n_train + n_val

    # Take the target numbers from the column names so files produced with a
    # target_range (which do not start at target 1) are handled too.
    prefix, suffix = 'all_target_', '_pred'
    target_ids = sorted(
        int(col[len(prefix):-len(suffix)])
        for col in data.columns
        if col.startswith(prefix) and col.endswith(suffix)
    )

    if target_names is None:
        target_names = df.columns[-29:]

    rows = []
    for i in target_ids:
        actual = data[f'all_target_{i}']
        predicted = data[f'all_target_{i}_pred']
        rows.append({
            f'Target {identifier}': f'target_{i}',
            f'Name': target_names[i - 1],
            f'R2 Train {identifier}': r2_score(actual.iloc[:n_train], predicted.iloc[:n_train]),
            f'R2 Val {identifier}': r2_score(actual.iloc[n_train:val_end], predicted.iloc[n_train:val_end]),
            f'R2 Test {identifier}': r2_score(actual.iloc[val_end:], predicted.iloc[val_end:]),
            f'R2 All {identifier}': r2_score(actual, predicted),
        })

    # Fix the column order explicitly so an empty file still yields the full header.
    columns = [
        f'Target {identifier}',
        f'Name',
        f'R2 Train {identifier}',
        f'R2 Val {identifier}',
        f'R2 Test {identifier}',
        f'R2 All {identifier}',
    ]
    return pd.DataFrame(rows, columns=columns)


# Score the predictions written by the stack=False / scaleY=True run.
results_stack_false = computeR2('/checkpoints/stn_r3_f849_tlast29/stack=False-scaleY=True/predictions.csv', identifier='stack=F')
#results_stack_true = computeR2('/checkpoints/stn_r3_f849_tlast29/stack=True-scaleY=True/predictions.csv', identifier='stack=T')

# Concatenate the two dataframes
#final_results = pd.concat([results_stack_false.set_index('Target stack=F'), results_stack_true.set_index('Target stack=T')], axis=1)
# Only the stack=False results are reported while the stack=True lines above stay commented out.
final_results = results_stack_false
print(final_results)
# Written relative to the current working directory, not to the checkpoints folder.
final_results.to_csv('final_results.csv')

   Target stack=F              Name  R2 Train stack=F  R2 Val stack=F  \
0        target_1         elec_en_r          0.993476        0.993806   
1        target_2         elec_en_n          0.994522        0.994226   
2        target_3           gibbs_r          0.990869        0.993314   
3        target_4         elec_en_o          0.993162        0.992971   
4        target_5           gibbs_n          0.991933        0.992098   
5        target_6           gibbs_o          0.992079        0.993965   
6        target_7  homo_spin_down_o          0.987476        0.964452   
7        target_8    homo_spin_up_o          0.983732        0.962605   
8        target_9            ddg_ox          0.986125        0.956170   
9       target_10            lumo_n          0.986401        0.946999   
10      target_11    lumo_spin_up_o          0.981682        0.932792   
11      target_12  max_charge_pos_n          0.980615        0.936576   
12      target_13  max_charge_pos_o          0.9768

In [None]:
class SequentialRegressor(pl.LightningModule):
    """Plain feed-forward regressor: Linear -> ReLU -> Dropout per hidden layer, one output.

    Args:
        input_dim: number of input features.
        hidden_dims: width of each hidden layer, in order.  The default list is
            only read, never modified.
        dropout_rate: dropout probability after every hidden layer.
        learning_rate: Adam learning rate (read back from ``self.hparams``).
    """

    def __init__(self, input_dim: int, hidden_dims: List[int] = [512, 256, 128],
                 dropout_rate: float = 0.2, learning_rate: float = 0.001):
        super().__init__()
        self.save_hyperparameters()
        # Per-batch test targets/predictions, consumed in on_test_epoch_end.
        self.test_step_outputs = []

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the input through the layer stack."""
        return self.network(x)

    def _compute_loss(self, batch: Tuple[torch.Tensor, torch.Tensor],
                      stage: str) -> torch.Tensor:
        """Return the MSE for an ``(x, y)`` batch and log it as ``<stage>_loss``."""
        x, y = batch
        y_hat = self(x)
        # Functional form: no need to build a fresh loss module on every step.
        loss = nn.functional.mse_loss(y_hat, y)
        self.log(f'{stage}_loss', loss, prog_bar=True)
        return loss

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor],
                     batch_idx: int) -> torch.Tensor:
        return self._compute_loss(batch, 'train')

    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor],
                       batch_idx: int) -> torch.Tensor:
        return self._compute_loss(batch, 'val')

    def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor],
                  batch_idx: int) -> None:
        """Log ``test_loss`` and keep the batch's targets/predictions for the epoch-level R2."""
        x, y = batch
        y_hat = self(x)
        loss = nn.functional.mse_loss(y_hat, y)
        self.log('test_loss', loss)
        # Store detached CPU copies so no graph or device memory is retained.
        self.test_step_outputs.append({
            'y_true': y.detach().cpu(),
            'y_pred': y_hat.detach().cpu()
        })

    def on_test_epoch_end(self) -> None:
        """Compute R2 over every test batch seen this epoch and log it as ``test_r2``."""
        if not self.test_step_outputs:
            # torch.cat([]) raises; nothing to score.
            return
        y_true = torch.cat([out['y_true'] for out in self.test_step_outputs])
        y_pred = torch.cat([out['y_pred'] for out in self.test_step_outputs])
        # Already detached in test_step, so the tensors convert directly.
        r2 = r2_score(y_true.numpy(), y_pred.numpy())
        self.log('test_r2', r2, prog_bar=True)
        self.test_step_outputs.clear()

    def configure_optimizers(self) -> Dict:
        """Adam with a ReduceLROnPlateau schedule that halves the LR when ``val_loss`` stalls."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        # `verbose` is deprecated on PyTorch LR schedulers, so it is no longer passed.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=3
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss'
        }
