<a href="https://colab.research.google.com/github/nirb28/nn_catalyst/blob/main/src/pl/scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
%%capture
DEBUG = False

import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    !pip install pytorch_lightning
    !pip install torchmetrics
else:
    print("Not running in Colab.")

In [41]:
import torch
# Training hyperparameters
INPUT_SIZE = 1479
NUM_TARGETS = 1
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 5

# Dataset
DATA_DIR = "dataset/"
NUM_WORKERS = 15

# Compute related
ACCELERATOR = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICES = [0]
PRECISION = 32

In [42]:
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
import pytorch_lightning as pl
import torch, math, os
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import StandardScaler

pl.seed_everything(1234)

def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath
    else:
        from utils import get_project_root
        return get_project_root() + "/" + relativePath


INFO:lightning_fabric.utilities.seed:Seed set to 1234


In [43]:
import numpy as np
datafile='src/pl/merged_data_last29_reordered_byR2.csv'
xy_orig = np.loadtxt(resolve_path_gdrive(datafile), delimiter=',', skiprows=1, dtype=float, max_rows=100)

In [44]:
class CatalystDataset(Dataset):
    def __init__(self, target_num, datafile='src/pl/merged_data_last29_reordered_byR2.csv',
                 useTargetsAsFeatures=False):
        # Initialize data, download, etc.
        #xy = np.loadtxt(resolve_path_gdrive(datafile), delimiter=',', skiprows=1, dtype=float, max_rows=None)
        xy = xy_orig.copy()
        #xy = StandardScaler().fit_transform(xy)
        self.n_samples = xy.shape[0]
        # here the first column is the class label, the rest are the features
        total_targets = 29
        target_col_start = -(total_targets - target_num + 1)
        target_col_end = target_col_start + 1
        if useTargetsAsFeatures:
            self.x_data = torch.from_numpy(xy[:,:target_col_start]).float()  # size [n_samples, n_features]
        else:
            self.x_data = torch.from_numpy(xy[:,:-total_targets]).float()  # size [n_samples, n_features]
        self.y_data = torch.from_numpy(xy[:,target_col_start:target_col_end]).float()  # size [n_samples, 1]

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        sample = self.x_data[index], self.y_data[index]
        return sample

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples

class CatalystDataModule(pl.LightningDataModule):
    def __init__(self, data_dir, batch_size, num_workers, target_num):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.entire_dataset = CatalystDataset(target_num=target_num,
                                              useTargetsAsFeatures=True)
        self.target_num = target_num

    def prepare_data(self):
        pass

    def setup(self, stage=None):
        train_set_size = int(len(self.entire_dataset) * 0.8)
        test_set_size = int(len(self.entire_dataset) * 0.1)
        valid_set_size = len(self.entire_dataset) - train_set_size - test_set_size
        print(f"Train set size: {train_set_size}, Test set size: {test_set_size}, Valid set size: {valid_set_size}")
        self.train_ds, self.val_ds, self.test_ds = random_split(
            self.entire_dataset, [train_set_size, valid_set_size, test_set_size])
        return

    def train_dataloader(self):
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False,
            persistent_workers=True
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False,
        )

dm1 = CatalystDataModule(
        data_dir=DATA_DIR,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        target_num=3
    )
dm1.prepare_data()
dm1.setup()

Train set size: 80, Test set size: 10, Valid set size: 10


In [45]:
'''
for x,y in dm1.test_dataloader():
    #print(x)
    print(y)
'''

'\nfor x,y in dm1.test_dataloader():\n    #print(x)\n    print(y)\n'

In [46]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pytorch_lightning as pl
import torchmetrics
from torchmetrics.regression import R2Score
import torch.nn.functional as F

class BaseModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.r2 = torchmetrics.R2Score()
        self.loss_fn = nn.MSELoss()
        self.validation_step_outputs = []

    def training_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log_dict(
            {
                "train_loss": loss,
            },
            on_step=True,
            on_epoch=True,
            prog_bar=True,
        )
        accuracy = self.r2(scores, y)
        self.log("train_acc", accuracy, prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("val_loss", loss)
        self.validation_step_outputs.append(loss)
        return loss

    def on_validation_epoch_end(self):
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    def test_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("test_loss", loss)
        return loss

    def _common_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        if DEBUG == True:
            print(f"loss: {loss}, len: {len(y)}")
        return loss, scores, y

    def predict_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        preds = torch.argmax(scores, dim=1)
        return preds

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(lr=self.lr, params=self.parameters())
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, min_lr=0.000000001, threshold=0.001)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

    def on_load_checkpoint(self, checkpoint: dict) -> None:
        state_dict = checkpoint["state_dict"]
        model_state_dict = self.state_dict()
        is_changed = False
        for k in state_dict:
            if k in model_state_dict:
                if state_dict[k].shape != model_state_dict[k].shape:
                    print(f"Skip loading parameter: {k}, "
                                f"required shape: {model_state_dict[k].shape}, "
                                f"loaded shape: {state_dict[k].shape}")
                    state_dict[k] = model_state_dict[k]
                    is_changed = True
            else:
                is_changed = True

        if is_changed:
            checkpoint.pop("optimizer_states", None)

class SingleTargetNet(BaseModel):

    def __init__(self, input_size=INPUT_SIZE, learning_rate=0.001, dropout_rate=0.5, target=1):
        super(SingleTargetNet, self).__init__()
        self.lr = learning_rate
        self.loss_fn = nn.MSELoss()

        self.fc1 = nn.Linear(input_size, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 1)
        self.fc_skip = nn.Linear(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)
        self.save_hyperparameters()

    def forward(self, x):
        x1 = F.relu(self.bn1(self.fc1(x)))
        x1 = self.dropout(x1)

        x2 = F.relu(self.bn2(self.fc2(x1)))
        x2 = self.dropout(x2)

        # Skip connection
        x2 += self.fc_skip(x1)

        x3 = self.fc3(x2)
        return x3


In [None]:
import torch
import pytorch_lightning as pl
#from model import NN
#from dataset import CatalystDataModule
#from config import *
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

torch.set_float32_matmul_precision("medium") # to make lightning happy

if __name__ == "__main__":
    # do everything in a loop for all the targets
    for target in range(1, 30):
        print(f"Target: {target}")
        dm = CatalystDataModule(
            data_dir=DATA_DIR,
            batch_size=BATCH_SIZE,
            num_workers=NUM_WORKERS,
            target_num=target
        )
        dm.prepare_data()
        dm.setup()
        print(f"Train set size: {len(dm.train_ds),dm.train_ds.dataset.x_data.shape[1]}")
        print(f"Test set size: {len(dm.test_ds)}, Valid set size: {len(dm.val_ds)}")

        model = SingleTargetNet (
            input_size=dm.entire_dataset.x_data.shape[1],
            learning_rate=LEARNING_RATE,
        )

        # DEFAULTS used by the Trainer
        checkpoint_callback = ModelCheckpoint(
            dirpath=os.getcwd()+f'/checkpoints/stn_1/{dm.target_num}',
            filename='{epoch:02d}-{val_loss:.2f}',
            save_top_k=3,
            #save_best_only=True,
            verbose=True,
            monitor='val_loss',
            mode='min'
        )

        trainer = pl.Trainer(
            accelerator=ACCELERATOR,
            devices=1,
            min_epochs=1,
            max_epochs=NUM_EPOCHS,
            precision=PRECISION,
            fast_dev_run=False,
            enable_checkpointing=True,
            callbacks=[checkpoint_callback,
                    EarlyStopping(monitor="train_loss", patience=10, verbose=True, mode="min")]
        )
        trainer.fit(model, dm)
        trainer.validate(model, dm)
        trainer.test(model, dm)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.275    Total

Target: 1
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1479)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 1813868.375
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' reached 6925690.50000 (best 6925690.50000), saving model to '/content/checkpoints/stn_1/29/epoch=00-val_loss=6925690.50.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 24151.625 >= min_delta = 0.0. New best score: 1789716.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' reached 6845184.00000 (best 6845184.00000), saving model to '/content/checkpoints/stn_1/29/epoch=01-val_loss=6845184.00.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 27682.000 >= min_delta = 0.0. New best score: 1762034.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 6: 'val_loss' reached 6741408.00000 (best 6741408.00000), saving model to '/content/checkpoints/stn_1/29/epoch=02-val_loss=6741408.00.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 31522.750 >= min_delta = 0.0. New best score: 1730512.000
INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 8: 'val_loss' reached 6623659.50000 (best 6623659.50000), saving model to '/content/checkpoints/stn_1/29/epoch=03-val_loss=6623659.50.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 34547.750 >= min_delta = 0.0. New best score: 1695964.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 10: 'val_loss' reached 6455161.00000 (best 6455161.00000), saving model to '/content/checkpoints/stn_1/29/epoch=04-val_loss=6455161.00.ckpt' as top 3
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /content/checkpoints/stn_1/29 exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropo

Target: 2
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1480)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 2545206.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' reached 1804107.25000 (best 1804107.25000), saving model to '/content/checkpoints/stn_1/29/epoch=00-val_loss=1804107.25.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 26654.500 >= min_delta = 0.0. New best score: 2518552.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' reached 1798709.00000 (best 1798709.00000), saving model to '/content/checkpoints/stn_1/29/epoch=01-val_loss=1798709.00.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 29709.500 >= min_delta = 0.0. New best score: 2488842.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 6: 'val_loss' reached 1722130.62500 (best 1722130.62500), saving model to '/content/checkpoints/stn_1/29/epoch=02-val_loss=1722130.62.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 31998.250 >= min_delta = 0.0. New best score: 2456844.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 8: 'val_loss' reached 1675220.75000 (best 1675220.75000), saving model to '/content/checkpoints/stn_1/29/epoch=03-val_loss=1675220.75.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 53570.750 >= min_delta = 0.0. New best score: 2403273.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 10: 'val_loss' reached 1658192.50000 (best 1658192.50000), saving model to '/content/checkpoints/stn_1/29/epoch=04-val_loss=1658192.50.ckpt' as top 3
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.283    Total

Target: 3
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1481)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 2273212.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 26167.500 >= min_delta = 0.0. New best score: 2247044.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 28959.250 >= min_delta = 0.0. New best score: 2218085.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 6: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 41400.000 >= min_delta = 0.0. New best score: 2176685.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 8: 'val_loss' reached 1685634.75000 (best 1658192.50000), saving model to '/content/checkpoints/stn_1/29/epoch=03-val_loss=1685634.75.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 44669.000 >= min_delta = 0.0. New best score: 2132016.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 10: 'val_loss' reached 1570676.00000 (best 1570676.00000), saving model to '/content/checkpoints/stn_1/29/epoch=04-val_loss=1570676.00.ckpt' as top 3
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.287    Total

Target: 4
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1482)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 2164854.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' reached 1654846.62500 (best 1570676.00000), saving model to '/content/checkpoints/stn_1/29/epoch=00-val_loss=1654846.62.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 23965.000 >= min_delta = 0.0. New best score: 2140889.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' reached 1619319.62500 (best 1570676.00000), saving model to '/content/checkpoints/stn_1/29/epoch=01-val_loss=1619319.62.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 29471.750 >= min_delta = 0.0. New best score: 2111417.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 6: 'val_loss' reached 1578452.00000 (best 1570676.00000), saving model to '/content/checkpoints/stn_1/29/epoch=02-val_loss=1578452.00.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 33095.250 >= min_delta = 0.0. New best score: 2078322.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 8: 'val_loss' reached 1538664.00000 (best 1538664.00000), saving model to '/content/checkpoints/stn_1/29/epoch=03-val_loss=1538664.00.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 35160.000 >= min_delta = 0.0. New best score: 2043162.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 10: 'val_loss' reached 1484422.25000 (best 1484422.25000), saving model to '/content/checkpoints/stn_1/29/epoch=04-val_loss=1484422.25.ckpt' as top 3
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.291    Total

Target: 5
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1483)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 2490150.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' reached 1185046.25000 (best 1185046.25000), saving model to '/content/checkpoints/stn_1/29/epoch=00-val_loss=1185046.25.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 24434.250 >= min_delta = 0.0. New best score: 2465716.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' reached 1149954.50000 (best 1149954.50000), saving model to '/content/checkpoints/stn_1/29/epoch=01-val_loss=1149954.50.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 29209.750 >= min_delta = 0.0. New best score: 2436506.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 6: 'val_loss' reached 1108740.87500 (best 1108740.87500), saving model to '/content/checkpoints/stn_1/29/epoch=02-val_loss=1108740.88.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 38140.000 >= min_delta = 0.0. New best score: 2398366.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 8: 'val_loss' reached 1063848.62500 (best 1063848.62500), saving model to '/content/checkpoints/stn_1/29/epoch=03-val_loss=1063848.62.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 43691.750 >= min_delta = 0.0. New best score: 2354674.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 10: 'val_loss' reached 1023717.00000 (best 1023717.00000), saving model to '/content/checkpoints/stn_1/29/epoch=04-val_loss=1023717.00.ckpt' as top 3
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.295    Total

Target: 6
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1484)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 2563175.750
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 28132.500 >= min_delta = 0.0. New best score: 2535043.250
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 27282.750 >= min_delta = 0.0. New best score: 2507760.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 6: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 33826.000 >= min_delta = 0.0. New best score: 2473934.500
INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 8: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 43420.250 >= min_delta = 0.0. New best score: 2430514.250
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.299    Total

Target: 7
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1485)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 3.051
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' reached 0.54130 (best 0.54130), saving model to '/content/checkpoints/stn_1/29/epoch=00-val_loss=0.54.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 1.381 >= min_delta = 0.0. New best score: 1.670
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' reached 0.79934 (best 0.54130), saving model to '/content/checkpoints/stn_1/29/epoch=01-val_loss=0.80.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 6: 'val_loss' reached 1.73695 (best 0.54130), saving model to '/content/checkpoints/stn_1/29/epoch=02-val_loss=1.74.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 8: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 0.688 >= min_delta = 0.0. New best score: 0.982
INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 10: 'val_loss' was not in top 3
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train set size: 80, Test set size: 10, Valid set size: 10


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.303    Total

Target: 8
Train set size: 80, Test set size: 10, Valid set size: 10
Train set size: (80, 1486)
Test set size: 10, Valid set size: 10
Train set size: 80, Test set size: 10, Valid set size: 10


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved. New best score: 2.102
INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 2: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric train_loss improved by 0.151 >= min_delta = 0.0. New best score: 1.951
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 4: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Exception ignored in: 

In [48]:
#.best_model_path
os.rename(checkpoint_callback.best_model_path, checkpoint_callback.dirpath+"/best.ckpt")

In [49]:
#Load the best checkpoint
model = SingleTargetNet.load_from_checkpoint(checkpoint_callback.dirpath+"/best.ckpt")
model.eval()
model.cpu()

SingleTargetNet(
  (r2): R2Score()
  (loss_fn): MSELoss()
  (fc1): Linear(in_features=1481, out_features=1024, bias=True)
  (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=1, bias=True)
  (fc_skip): Linear(in_features=1024, out_features=512, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [57]:
from torchmetrics import R2Score
from torchmetrics import MeanSquaredError

def r2scoreAndMSE(model, dataloader):
    r2_score_metric = R2Score()
    mse = MeanSquaredError()
    for batch_idx, (data, target) in enumerate(dataloader):
        predictions = model(data)
        #import pdb; pdb.set_trace()
        r2_score_metric.update(predictions, target)
        mse.update(predictions, target)
    return r2_score_metric.compute(), mse.compute()

In [60]:
print(r2scoreAndMSE(model, dm1.train_dataloader()))
print(r2scoreAndMSE(model, dm1.val_dataloader()))
print(r2scoreAndMSE(model, dm1.test_dataloader()))



(tensor(-1.4546), tensor(2044479.2500))
(tensor(-1.9454), tensor(1998815.))
(tensor(-1.9245), tensor(3128534.7500))


In [62]:
# Try inference with checkpoint
checkpoint_path = checkpoint_callback.dirpath+"/best.ckpt"
model1 = SingleTargetNet.load_from_checkpoint(checkpoint_path)
model1.eval()
model1.cpu()
r2scoreAndMSE(model1, dm1.val_dataloader())



(tensor(-1.9454), tensor(1998815.))

In [None]:
import numpy as np
matrix = np.array([[1, 2, 3, 4],
                   [5, 6, 7, 8],
                   [9, 10, 11, 12]])

In [None]:
m = np.delete(matrix, [-1], axis=1)
m