<a href="https://colab.research.google.com/github/nirb28/nn_catalyst/blob/main/src/pl/scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Environment bootstrap. %%capture swallows everything this cell writes, so the
# print() calls and the pip logs below are not shown in the notebook.

# Global verbosity switch; BaseModel._common_step prints per-batch losses when it is True.
DEBUG = False

import sys
# The google.colab module is only present in sys.modules when the kernel runs on Colab.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive
    # Mount Google Drive at /content/drive; resolve_path_gdrive() checks for this directory.
    drive.mount('/content/drive', force_remount=False)
    # NOTE(review): both installs are unpinned, so the resolved versions can change between runs.
    !pip install pytorch_lightning
    !pip install torchmetrics
else:
    print("Not running in Colab.")

In [42]:
import torch

# --- Optimisation settings -------------------------------------------------
INPUT_SIZE = 1479      # default width of the feature vector fed to SingleTargetNet.fc1
NUM_TARGETS = 1
LEARNING_RATE = 1e-3
BATCH_SIZE = 64
NUM_EPOCHS = 150

# --- Data loading ----------------------------------------------------------
DATA_DIR = "dataset/"
# NOTE(review): the recorded run in this notebook died with "DataLoader worker ...
# exited unexpectedly" while using these 15 worker processes; the cause is not
# visible here -- confirm whether worker processes are usable in this environment.
NUM_WORKERS = 15

# --- Hardware --------------------------------------------------------------
if torch.cuda.is_available():
    ACCELERATOR = 'cuda'
else:
    ACCELERATOR = 'cpu'
DEVICES = [0]
PRECISION = 32

In [43]:
import os

import numpy as np

# Repository-relative location of the merged dataset.
datafile = 'src/pl/merged_data_last29_reordered_byR2.csv'

# Resolve the path inline instead of calling resolve_path_gdrive(): that helper is
# only defined in a later cell, so on a fresh kernel (Restart & Run All) the call
# raised NameError before any data was read.
if os.path.exists('/content/drive'):
    # Google Drive is mounted (Colab): the checkout lives under MyDrive.
    datafile_path = '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + datafile
else:
    # Local run: resolve against the project root reported by the project's utils module.
    from utils import get_project_root
    datafile_path = get_project_root() + "/" + datafile

# skiprows=1 skips the first line of the file; max_rows=10 reads only a ten-row
# sample, which is why the splits printed below are 8/1/1.
xy_orig = np.loadtxt(datafile_path, delimiter=',', skiprows=1, dtype=float, max_rows=10)

In [44]:
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
import pytorch_lightning as pl
import torch, math, os
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import StandardScaler

# Seed the random number generators through Lightning's helper (it logs
# "Seed set to 1234") before any dataset split or model is created.
pl.seed_everything(1234)

def resolve_path_gdrive(relativePath):
    """Turn a repository-relative path into a path usable in the current environment.

    When ``/content/drive`` exists the path is rooted at the nn_catalyst checkout
    inside the mounted Google Drive; otherwise it is rooted at whatever
    ``utils.get_project_root()`` returns.

    Args:
        relativePath: Path relative to the repository root, without a leading slash.

    Returns:
        The prefixed path as a string.
    """
    gdrive_mount = '/content/drive'
    if not os.path.exists(gdrive_mount):
        # No Drive mount: fall back to the local project root. The import is kept
        # local so that environments with a Drive mount never need the utils module.
        from utils import get_project_root
        return get_project_root() + "/" + relativePath
    return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath

class CatalystDataset(Dataset):
    """In-memory regression dataset exposing one target column at a time.

    The slicing assumes a 2-D table whose last 29 columns are targets and whose
    remaining leading columns are features.

    Args:
        target_num: 1-based index of the target to predict (1..29).
        datafile: Currently not read; kept so existing callers keep working. The
            rows come from ``data`` or, by default, from the module-level ``xy_orig``.
        useTargetsAsFeatures: When True, the targets that precede ``target_num``
            are appended to the features; when False only the feature columns
            are used.
        data: Optional 2-D array-like to use instead of the global ``xy_orig``.

    Raises:
        ValueError: If ``target_num`` is outside 1..29 or the table is not 2-D
            with at least 29 columns.

    Attributes set: ``n_samples``, ``x_data`` (float32, [n_samples, n_features])
    and ``y_data`` (float32, [n_samples, 1]).
    """

    def __init__(self, target_num, datafile='src/pl/merged_data_last29_reordered_byR2.csv', useTargetsAsFeatures=False, data=None):
        total_targets = 29
        if not 1 <= target_num <= total_targets:
            raise ValueError(f"target_num must be in 1..{total_targets}, got {target_num}")

        # Work on a private copy so the shared source array is never modified.
        xy = xy_orig.copy() if data is None else np.array(data, dtype=float)
        if xy.ndim != 2 or xy.shape[1] < total_targets:
            raise ValueError(
                f"expected a 2-D table with at least {total_targets} columns, got shape {xy.shape}")
        self.n_samples = xy.shape[0]

        # Positive column indices. The previous negative-index form produced the
        # slice [-1:0] for the last target, which is empty, so y_data had no column.
        first_target_col = xy.shape[1] - total_targets
        target_col = first_target_col + target_num - 1

        # Features end either right before the selected target (earlier targets
        # included) or right before the first target column.
        feature_end = target_col if useTargetsAsFeatures else first_target_col
        self.x_data = torch.from_numpy(xy[:, :feature_end]).float()  # size [n_samples, n_features]
        self.y_data = torch.from_numpy(xy[:, target_col:target_col + 1]).float()  # size [n_samples, 1]

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.n_samples

class CatalystDataModule(pl.LightningDataModule):
    """Lightning data module that splits one CatalystDataset into train/val/test.

    Args:
        data_dir: Stored on the instance; not otherwise used by this class.
        batch_size: Batch size shared by all three loaders.
        num_workers: Worker processes per DataLoader (0 loads in the main process).
        target_num: 1-based target index forwarded to CatalystDataset.
    """

    def __init__(self, data_dir, batch_size, num_workers, target_num):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.entire_dataset = CatalystDataset(target_num=target_num, useTargetsAsFeatures=True)
        # Filled in by the first setup() call.
        self.train_ds = None
        self.val_ds = None
        self.test_ds = None

    def prepare_data(self):
        """Nothing to download or preprocess; the dataset is already in memory."""
        pass

    def setup(self, stage=None):
        """Create the 80/10/10 train/test/validation split exactly once.

        setup() runs again for every manual call and for each of the trainer's
        fit/validate/test entry points. Re-drawing the random split on each call
        let validation and test subsets contain rows the model had already been
        trained on, so later calls now reuse the first split.
        """
        if self.train_ds is not None:
            return
        train_set_size = int(len(self.entire_dataset) * 0.8)
        test_set_size = int(len(self.entire_dataset) * 0.1)
        # Validation takes the remainder so the three sizes always add up.
        valid_set_size = len(self.entire_dataset) - train_set_size - test_set_size
        print(f"Train set size: {train_set_size}, Test set size: {test_set_size}, Valid set size: {valid_set_size}")
        self.train_ds, self.val_ds, self.test_ds = random_split(
            self.entire_dataset, [train_set_size, valid_set_size, test_set_size])
        return

    def _loader(self, dataset, shuffle, persistent_workers=False):
        """Build a DataLoader with the module-wide batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
            persistent_workers=persistent_workers,
        )

    def train_dataloader(self):
        """Shuffled loader over the training subset."""
        return self._loader(self.train_ds, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the validation subset.

        Workers are kept alive between epochs only when there are workers:
        DataLoader rejects persistent_workers=True together with num_workers=0.
        """
        return self._loader(self.val_ds, shuffle=False, persistent_workers=self.num_workers > 0)

    def test_dataloader(self):
        """Ordered loader over the test subset."""
        return self._loader(self.test_ds, shuffle=False)

# Data module for target 1, configured from the notebook-level constants.
dm1 = CatalystDataModule(DATA_DIR, BATCH_SIZE, NUM_WORKERS, target_num=1)
dm1.prepare_data()
dm1.setup()

Seed set to 1234


Train set size: 8, Test set size: 1, Valid set size: 1


In [45]:
# Disabled debugging loop: it is wrapped in a bare string literal, so nothing is
# executed and the cell merely echoes the text back as its output.
'''
for x,y in dm1.test_dataloader():
    #print(x)
    print(y)
'''

'\nfor x,y in dm1.test_dataloader():\n    #print(x)\n    print(y)\n'

In [46]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pytorch_lightning as pl
import torchmetrics
from torchmetrics.regression import R2Score
import torch.nn.functional as F

class BaseModel(pl.LightningModule):
    """Shared train/validation/test/predict plumbing for the regression networks here.

    Subclasses must implement ``forward`` and set ``self.lr``, which
    ``configure_optimizers`` reads.
    """

    def __init__(self):
        super().__init__()
        self.r2 = torchmetrics.R2Score()
        self.loss_fn = nn.MSELoss()
        # Per-batch validation losses, averaged and cleared at the end of each epoch.
        self.validation_step_outputs = []

    def training_step(self, batch, batch_idx):
        """Compute the batch loss, log it per step and per epoch, and log the batch R2."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log_dict(
            {
                "train_loss": loss,
            },
            on_step=True,
            on_epoch=True,
            prog_bar=True,
        )
        # The value logged under "train_acc" is the R2 score, not a classification accuracy.
        accuracy = self.r2(scores, y)
        self.log("train_acc", accuracy, prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` for the batch and remember it for the epoch average."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("val_loss", loss)
        self.validation_step_outputs.append(loss)
        return loss

    def on_validation_epoch_end(self):
        """Log the mean of the collected validation losses and reset the buffer."""
        if not self.validation_step_outputs:
            # torch.stack raises on an empty list; with no batches there is nothing to log.
            return
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` for the batch."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("test_loss", loss)
        return loss

    def _common_step(self, batch, batch_idx):
        """Flatten the inputs, run the forward pass and return ``(loss, scores, y)``."""
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        if DEBUG:
            print(f"loss: {loss}, len: {len(y)}")
        return loss, scores, y

    def predict_step(self, batch, batch_idx):
        """Return the regression outputs for a batch.

        The previous implementation applied ``argmax(dim=1)``, a classification
        idiom; on a single-column regression output it always yields index 0.
        Accepts either an ``(x, y)`` pair or a bare feature tensor.
        """
        x = batch[0] if isinstance(batch, (list, tuple)) else batch
        x = x.reshape(x.size(0), -1)
        return self.forward(x)

    def configure_optimizers(self):
        """AdamW plus a ReduceLROnPlateau schedule driven by ``val_loss``."""
        optimizer = torch.optim.AdamW(lr=self.lr, params=self.parameters())
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, min_lr=0.000000001, threshold=0.001)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

    def on_load_checkpoint(self, checkpoint: dict) -> None:
        """Make a checkpoint loadable when its tensors no longer match this model.

        Entries whose shape differs from the current model are replaced by the
        model's own tensors. If any entry was replaced, or the checkpoint holds
        a key the model does not have, the saved optimizer state is dropped.
        """
        state_dict = checkpoint["state_dict"]
        model_state_dict = self.state_dict()
        is_changed = False
        for k in state_dict:
            if k in model_state_dict:
                if state_dict[k].shape != model_state_dict[k].shape:
                    print(f"Skip loading parameter: {k}, "
                                f"required shape: {model_state_dict[k].shape}, "
                                f"loaded shape: {state_dict[k].shape}")
                    state_dict[k] = model_state_dict[k]
                    is_changed = True
            else:
                is_changed = True

        if is_changed:
            checkpoint.pop("optimizer_states", None)

class SingleTargetNet(BaseModel):
    """Two-hidden-layer MLP with a linear skip path, predicting a single value.

    Args:
        input_size: Number of input features.
        learning_rate: Stored as ``self.lr`` for ``BaseModel.configure_optimizers``.
        dropout_rate: Dropout probability applied after each hidden layer.
        target: Not used by the network itself; it is recorded with the other
            constructor arguments by ``save_hyperparameters``.
    """

    def __init__(self, input_size=INPUT_SIZE, learning_rate=0.001, dropout_rate=0.5, target=1):
        super().__init__()
        self.lr = learning_rate
        # loss_fn (MSELoss) is already created by BaseModel.__init__.

        self.fc1 = nn.Linear(input_size, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 1)
        self.fc_skip = nn.Linear(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)
        self.save_hyperparameters()

    def forward(self, x):
        """Map a ``[batch, input_size]`` tensor to ``[batch, 1]`` predictions."""
        x1 = self.dropout(F.relu(self.bn1(self.fc1(x))))
        x2 = self.dropout(F.relu(self.bn2(self.fc2(x1))))

        # Skip connection. Use an out-of-place add: when dropout is a pass-through
        # (eval mode or dropout_rate=0) x2 is the ReLU output itself, and modifying
        # it in place invalidates the value autograd needs for the backward pass.
        x2 = x2 + self.fc_skip(x1)

        return self.fc3(x2)
    

In [47]:
import torch
import pytorch_lightning as pl
#from model import NN
#from dataset import CatalystDataModule
#from config import *
from pytorch_lightning.callbacks import EarlyStopping

# Use "medium" float32 matmul precision (per the original note: to make lightning happy).
torch.set_float32_matmul_precision("medium")

from pytorch_lightning.callbacks import ModelCheckpoint

# Keep the three checkpoints with the lowest val_loss under ./checkpoints/stn_1/.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=3,
    dirpath=os.getcwd()+'/checkpoints/stn_1/',
    filename='{epoch:02d}-{val_loss:.2f}',
    verbose=True,
)

if __name__ == "__main__":
    model = SingleTargetNet(input_size=INPUT_SIZE, learning_rate=LEARNING_RATE)
    dm = dm1

    # NOTE(review): early stopping watches train_loss while checkpointing and the
    # LR scheduler watch val_loss -- confirm this asymmetry is intended.
    trainer = pl.Trainer(
        accelerator=ACCELERATOR,
        devices=1,
        precision=PRECISION,
        min_epochs=1,
        max_epochs=NUM_EPOCHS,
        fast_dev_run=False,
        enable_checkpointing=True,
        callbacks=[
            checkpoint_callback,
            EarlyStopping(monitor="train_loss", mode="min", patience=10, verbose=True),
        ],
    )

    # Train first, then score the same model object on the validation and test loaders.
    trainer.fit(model, dm)
    trainer.validate(model, dm)
    trainer.test(model, dm)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type        | Params | Mode 
------------------------------------------------
0 | r2      | R2Score     | 0      | train
1 | loss_fn | MSELoss     | 0      | train
2 | fc1     | Linear      | 1.5 M  | train
3 | bn1     | BatchNorm1d | 2.0 K  | train
4 | fc2     | Linear      | 524 K  | train
5 | bn2     | BatchNorm1d | 1.0 K  | train
6 | fc3     | Linear      | 513    | train
7 | fc_skip | Linear      | 524 K  | train
8 | dropout | Dropout     | 0      | train
------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.275    Total estimated model params size (MB)
9         Modules in train mode
0         Modules in eval mode


Train set size: 8, Test set size: 1, Valid set size: 1


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid(s) 27484, 10472, 7736, 6096, 3748, 30920, 27112, 32272, 15956, 21424, 8700, 28476, 32052, 29528, 27004) exited unexpectedly

In [None]:
# Restore the best checkpoint recorded by checkpoint_callback, then switch the
# model to eval mode and move it to the CPU (both calls return the model, so the
# cell still displays it).
checkpoint_path = checkpoint_callback.best_model_path
model = SingleTargetNet.load_from_checkpoint(checkpoint_path)
model.eval().cpu()

In [None]:
from torchmetrics import R2Score
def r2score(model, dataloader):
    """Compute the R2 score of ``model`` over every batch of ``dataloader``.

    Args:
        model: Callable mapping a batch of features to predictions.
        dataloader: Iterable yielding ``(features, target)`` pairs.

    Returns:
        Whatever ``R2Score.compute()`` returns after all batches were accumulated.
    """
    r2_score_metric = R2Score()
    # Evaluation only: disable autograd so no graph is built or kept per batch.
    with torch.no_grad():
        for data, target in dataloader:
            r2_score_metric.update(model(data), target)
    return r2_score_metric.compute()

In [None]:
# R2 of the restored model on the test split.
r2score(model, dm1.test_dataloader())

In [None]:
# R2 of the restored model on the training split.
r2score(model, dm1.train_dataloader())

In [None]:
# R2 of the restored model on the validation split.
r2score(model, dm1.val_dataloader())

In [None]:
# Reload the best checkpoint into a second model object (eval mode, on the CPU)
# and score it on the validation split.
checkpoint_path = checkpoint_callback.best_model_path
model1 = SingleTargetNet.load_from_checkpoint(checkpoint_path)
model1.eval().cpu()
r2score(model1, dm1.val_dataloader())

In [None]:
import numpy as np
# 3x4 scratch matrix holding 1..12 row by row.
matrix = np.arange(1, 13).reshape(3, 4)

In [None]:
# Copy of `matrix` without its last column; shown as the cell output.
m = np.delete(matrix, -1, axis=1)
m