<a href="https://colab.research.google.com/github/nirb28/nn_catalyst/blob/main/src/pl/scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Environment bootstrap: detect whether the kernel is a Google Colab runtime
# and, if so, mount Google Drive and install the packages used below.
# NOTE: the %%capture cell magic above captures this cell's output, so the
# print() messages and the pip logs are not displayed.
import sys
# True when the `google.colab` module has already been loaded into this kernel.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive
    # resolve_path_gdrive() later checks for this '/content/drive' mount point.
    drive.mount('/content/drive', force_remount=False)
    # NOTE(review): versions are not pinned, so reruns may pick up newer releases.
    !pip install pytorch_lightning
    !pip install torchmetrics
else:
    print("Not running in Colab.")

In [28]:
from pytorch_lightning.loggers import TensorBoardLogger
import torch
# When True, the step/eval helpers below print extra diagnostics.
DEBUG = False

# --- Training hyperparameters -------------------------------------------
INPUT_SIZE = 1479        # default number of input features for the network
NUM_TARGETS = 1
LEARNING_RATE = 1e-3
BATCH_SIZE = 512
NUM_EPOCHS = 150
NUM_WORKERS = 0

# --- Compute related ----------------------------------------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    ACCELERATOR = 'cuda'
else:
    ACCELERATOR = 'cpu'
DEVICES = [0]
PRECISION = 32

# Folder (relative to the project root) holding checkpoints, logs and results.
CHECKPOINTS_FOLDER = "/checkpoints/stn_2_r2"  # alternative used earlier: tmp_r1

In [None]:
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
import pytorch_lightning as pl
import torch, math, os
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Fix the random seeds up front so that repeated runs of the notebook start
# from the same random state.
seed = 1234
pl.seed_everything(seed)
# NOTE(review): seed_everything presumably already seeds torch; the explicit
# torch calls below are kept as an extra safeguard — confirm whether needed.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.set_float32_matmul_precision("medium") # to make lightning happy

def resolve_path_gdrive(relativePath):
    """Turn a project-relative path into a full path string.

    If '/content/drive' exists, the path is anchored at the hard-coded
    project checkout inside Google Drive. Otherwise it is anchored at the
    root returned by ``utils.get_project_root()``.

    Args:
        relativePath: Path relative to the project root
            (e.g. ``'src/pl/data.csv'``).

    Returns:
        The root and ``relativePath`` joined by plain string concatenation;
        no normalisation is applied.
    """
    drive_is_mounted = os.path.exists('/content/drive')
    if not drive_is_mounted:
        # Imported lazily: only this branch needs the project helper.
        from utils import get_project_root
        project_root = get_project_root()
        return project_root + "/" + relativePath
    return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath

# Sanity check: show which root folder resolve_path_gdrive() uses in this environment.
print(f"Root project folder is at {resolve_path_gdrive('.')}")

In [None]:
#%load_ext tensorboard
#%tensorboard --logdir /content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst{CHECKPOINTS_FOLDER}/lightning_logs

In [None]:
import numpy as np
# Source CSV, given relative to the project root.
datafile = 'src/pl/merged_data_last29_reordered_byR2.csv'
# None loads every row; set an int to load only that many rows for a quick run.
max_rows = None
# Full feature + target matrix; the first line of the file is skipped
# (presumably the column header).
xy_orig = np.loadtxt(
    resolve_path_gdrive(datafile),
    delimiter=',',
    skiprows=1,
    dtype=float,
    max_rows=max_rows,
)

In [None]:
class BaseModel(pl.LightningModule):
    """Shared Lightning train/validation/test/predict logic for regression nets.

    Subclasses must implement ``forward`` and set ``self.lr`` (it is read by
    ``configure_optimizers``).
    """

    def __init__(self):
        super().__init__()
        self.r2 = torchmetrics.R2Score()
        self.loss_fn = nn.MSELoss()
        # Per-batch validation losses; averaged and cleared at each epoch end.
        self.validation_step_outputs = []

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log loss and R2."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log_dict(
            {
                "train_loss": loss,
            },
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        # The value is an R2 score; the "train_acc" key is kept so existing
        # logs and dashboards keep working.
        accuracy = self.r2(scores, y)
        self.log("train_acc", accuracy, prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the loss for one validation batch."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("val_loss", loss)
        self.validation_step_outputs.append(loss)
        return loss

    def on_validation_epoch_end(self):
        """Log the mean of the per-batch validation losses, then reset them."""
        if not self.validation_step_outputs:
            # torch.stack raises on an empty list (e.g. empty val loader).
            return
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory

    def test_step(self, batch, batch_idx):
        """Compute and log the loss for one test batch."""
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("test_loss", loss)
        return loss

    def _common_step(self, batch, batch_idx):
        """Run the forward pass and loss for an ``(x, y)`` batch.

        Returns:
            Tuple ``(loss, scores, y)``.
        """
        x, y = batch
        # Flatten everything after the batch dimension.
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        if DEBUG:
            print(f"loss: {loss}, len: {len(y)}")
        return loss, scores, y

    def predict_step(self, batch, batch_idx):
        """Return the raw regression outputs for an ``(x, y)`` batch.

        The previous implementation applied ``argmax(dim=1)``, which is a
        classification idiom; on a single-output head it always yields 0.
        """
        x, _ = batch
        x = x.reshape(x.size(0), -1)
        return self.forward(x)

    def configure_optimizers(self):
        """Create AdamW plus a ReduceLROnPlateau scheduler driven by ``val_loss``."""
        optimizer = torch.optim.AdamW(lr=self.lr, params=self.parameters())
        # Referenced through `torch.optim`, like the optimizer above, so this
        # method does not depend on a separate `optim` import.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, patience=10, min_lr=0.000000001, threshold=0.001)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

class SingleTargetNet(BaseModel):
    """Fully connected regressor with one skip connection and a single output.

    Layout: ``input -> 1024 -> 512 -> 1``. The 1024-wide activation is also
    projected to 512 by ``fc_skip`` and added to the second hidden layer.
    """

    def __init__(self, input_size=INPUT_SIZE, learning_rate=0.001, dropout_rate=0.5, target=1):
        """
        Args:
            input_size: Number of input features.
            learning_rate: Stored as ``self.lr`` for ``configure_optimizers``.
            dropout_rate: Drop probability shared by both dropout applications.
            target: Not used by the network itself; it is still recorded by
                ``save_hyperparameters()`` together with the other arguments.
        """
        super().__init__()
        self.lr = learning_rate
        self.loss_fn = nn.MSELoss()

        self.fc1 = nn.Linear(input_size, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 1)
        self.fc_skip = nn.Linear(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)
        self.save_hyperparameters()

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, 1)`` predictions."""
        x1 = F.relu(self.bn1(self.fc1(x)))
        x1 = self.dropout(x1)

        x2 = F.relu(self.bn2(self.fc2(x1)))
        x2 = self.dropout(x2)

        # Skip connection. Added out-of-place: when dropout is the identity
        # (p=0 or eval mode) `x2` is the ReLU output itself, and an in-place
        # `+=` would overwrite a tensor autograd needs for the backward pass.
        x2 = x2 + self.fc_skip(x1)

        return self.fc3(x2)


In [None]:
# prompt: write a function that takes in a numpy array and splits it into train, test and validation. it then scales all the data including the target columns. finally create a dataset and dataloader for all the 3 and wrap it into a datamodule

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class CatalystDataModule(pl.LightningDataModule):
    """Splits, standardises and serves a feature+target matrix.

    ``data`` is a 2-D array (or CPU tensor) whose last column is the target
    and whose remaining columns are features. It is split 80/10/10 into
    train/validation/test with a fixed random state, and both features and
    target are standardised with statistics fitted on the training split.

    The fitted scalers are kept as ``x_scaler`` / ``y_scaler`` so predictions
    can be mapped back to the original units with ``inverse_transform``.
    """

    def __init__(self, data, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS):
        super().__init__()
        self.data = data
        self.batch_size = batch_size
        self.num_workers = num_workers
        # Populated by _split_and_scale().
        self.x_scaler = None
        self.y_scaler = None
        self.train_data = None
        self.val_data = None
        self.test_data = None
        # Populated by setup().
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def prepare_data(self):
        """Split and scale the data (kept for callers that invoke it directly).

        ``setup`` performs the same preparation when needed, so the module
        does not rely on this hook having run in the current process.
        """
        self._split_and_scale()

    def _split_and_scale(self):
        """Create the scaled train/val/test arrays once; later calls are no-ops."""
        if self.train_data is not None:
            return

        data = np.asarray(self.data)
        X = data[:, :-1]
        y = data[:, -1]
        # 80% train, then the remaining 20% is halved into validation and test.
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

        # Separate scalers for features and target: reusing one object would
        # discard the feature statistics when it is refitted on the target.
        self.x_scaler = StandardScaler()
        X_train = self.x_scaler.fit_transform(X_train)
        X_val = self.x_scaler.transform(X_val)
        X_test = self.x_scaler.transform(X_test)

        self.y_scaler = StandardScaler()
        y_train = self.y_scaler.fit_transform(y_train.reshape(-1, 1))
        y_val = self.y_scaler.transform(y_val.reshape(-1, 1))
        y_test = self.y_scaler.transform(y_test.reshape(-1, 1))

        # Re-attach the target as the last column, as CatalystDataset expects.
        self.train_data = np.concatenate((X_train, y_train), axis=1)
        self.val_data = np.concatenate((X_val, y_val), axis=1)
        self.test_data = np.concatenate((X_test, y_test), axis=1)

    def setup(self, stage=None):
        """Build the three datasets; safe to call once per trainer stage."""
        self._split_and_scale()
        if self.train_dataset is None:
            self.train_dataset = CatalystDataset(self.train_data)
            self.val_dataset = CatalystDataset(self.val_data)
            self.test_dataset = CatalystDataset(self.test_data)

    def train_dataloader(self):
        # Reshuffle every epoch so mini-batch composition varies during training.
        return DataLoader(self.train_dataset, batch_size=self.batch_size,
                          num_workers=self.num_workers, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

class CatalystDataset(Dataset):
    """In-memory dataset over a 2-D matrix whose last column is the target.

    Attributes:
        data: The matrix exactly as passed in.
        x: float32 tensor holding every column except the last.
        y: float32 tensor of shape ``[n_samples, 1]`` holding the last column.
    """

    def __init__(self, data):
        self.data = data
        features = data[:, :-1]
        target = data[:, -1]
        self.x = torch.tensor(features, dtype=torch.float32)
        # Keep the target two-dimensional so it lines up with the model output.
        self.y = torch.tensor(target, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        # One sample per row of the matrix.
        return self.x.shape[0]

    def __getitem__(self, idx):
        sample = (self.x[idx], self.y[idx])
        return sample

In [None]:
from torch import nn, optim
import torchmetrics
import torch.nn.functional as F
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.callbacks import RichProgressBar

def prepare_data_module(xy):
    """Create a CatalystDataModule for ``xy`` and run its preparation hooks.

    Args:
        xy: Feature+target matrix handed to the data module unchanged.

    Returns:
        The data module after ``prepare_data()`` and ``setup()`` have run,
        in that order.
    """
    data_module = CatalystDataModule(data=xy)
    data_module.prepare_data()
    data_module.setup()
    return data_module

def prepare_trainer(target, num_epochs=NUM_EPOCHS, fast_dev_run=False, early_stop_monitor="train_loss"):
    """Build a Lightning Trainer with logging, checkpointing and early stopping.

    Args:
        target: Target identifier; used as the TensorBoard run name and as
            the checkpoint sub-folder under ``CHECKPOINTS_FOLDER``.
        num_epochs: Upper bound on training epochs.
        fast_dev_run: Passed to ``pl.Trainer``. It used to be hard-coded to
            ``True``, which turns every run into a short smoke test; it now
            defaults to ``False`` so that training really runs. Pass ``True``
            for a quick debugging run.
        early_stop_monitor: Logged metric watched by EarlyStopping. Defaults
            to ``"train_loss"``, the value used so far.

    Returns:
        The configured ``pl.Trainer``.
    """
    tensorboard = TensorBoardLogger(resolve_path_gdrive(f'{CHECKPOINTS_FOLDER}/lightning_logs'), name=f"{target}")
    # Keep only the checkpoint with the lowest validation loss.
    checkpoint_callback = ModelCheckpoint(
        dirpath=resolve_path_gdrive(f'{CHECKPOINTS_FOLDER}/{target}'),
        filename='{epoch:02d}-{val_loss:.2f}',
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )
    lr_monitor = LearningRateMonitor(logging_interval='step')
    # NOTE(review): checkpointing and the LR scheduler both follow val_loss,
    # while early stopping follows train_loss by default — confirm this is
    # intended before changing `early_stop_monitor`.
    early_stopping = EarlyStopping(monitor=early_stop_monitor, patience=10, verbose=True, mode="min")
    trainer = pl.Trainer(
        accelerator=ACCELERATOR,
        devices=1,
        min_epochs=1,
        max_epochs=num_epochs,
        precision=PRECISION,
        fast_dev_run=fast_dev_run,
        enable_checkpointing=True,
        enable_progress_bar=True,
        log_every_n_steps=20,
        logger=tensorboard,
        callbacks=[checkpoint_callback, lr_monitor, RichProgressBar(), early_stopping]
    )
    return trainer

def prepare_model(input_size):
    """Instantiate a SingleTargetNet for ``input_size`` features.

    The learning rate is taken from the module-level ``LEARNING_RATE``.
    """
    return SingleTargetNet(input_size=input_size, learning_rate=LEARNING_RATE)

def iterate_all_targets(xy_data, total_targets, target_start_range=1, target_stop_range=None):
    """Train, validate and test one model per target column.

    The last ``total_targets`` columns of ``xy_data`` are the targets. For
    target ``k`` (1-based) the model sees every column before it — the base
    features plus targets ``1..k-1`` — and learns target ``k`` as its label.

    Args:
        xy_data: 2-D array of shape ``(n_samples, n_features + total_targets)``.
        total_targets: Number of trailing target columns in ``xy_data``.
        target_start_range: First target number to process (1-based, inclusive).
        target_stop_range: Last target number to process (inclusive);
            ``None`` means ``total_targets``.
    """
    total_cols = xy_data.shape[1]
    if target_stop_range is None:
        target_stop_range = total_targets
    for target_num in range(target_start_range, target_stop_range + 1):
        # Exclusive end column of the slice whose last column is this target.
        target_col_start = total_cols - (total_targets - target_num)
        print(f'Target {target_num}, target_col {target_col_start}')
        # Slice the argument (not a notebook global) and keep the parameter
        # itself untouched so every iteration slices the full matrix.
        target_xy = torch.as_tensor(xy_data[:, :target_col_start], dtype=torch.float32)
        dm = prepare_data_module(target_xy)
        model = prepare_model(input_size=dm.train_dataset.x.shape[1])
        trainer = prepare_trainer(target=target_num, num_epochs=NUM_EPOCHS)
        trainer.fit(model, dm)
        trainer.validate(model, dm)
        trainer.test(model, dm)


In [None]:
# ******** TRAINING CELL *********
# Set TRAINING = True to (re)train the models for targets 1..5 of the 29
# target columns; left False so a full notebook run skips training.
TRAINING = False
if TRAINING:
    iterate_all_targets(xy_orig, total_targets=29, target_start_range=1, target_stop_range=5)

In [29]:
from torchmetrics import R2Score
from torchmetrics import MeanSquaredError

def r2scoreAndMSE(model, dataloader):
    """Compute R2 and mean squared error of ``model`` over a dataloader.

    Args:
        model: Callable mapping a batch of inputs to predictions.
        dataloader: Iterable yielding ``(data, target)`` batches.

    Returns:
        Tuple ``(r2, mse)`` of Python floats accumulated over all batches.
    """
    r2_score_metric = R2Score()
    mse = MeanSquaredError()
    # Pure evaluation: disable autograd so no graph is built per batch.
    with torch.no_grad():
        for data, target in dataloader:
            predictions = model(data)
            r2_score_metric.update(predictions, target)
            mse.update(predictions, target)
    return r2_score_metric.compute().item(), mse.compute().item()

In [32]:
from pathlib import Path
from itertools import chain

def measure_all_targets(xy_data, total_targets, target_start_range=1, target_stop_range=None):
    """Evaluate the saved checkpoints of every target in the given range.

    Uses the same column layout as training: the last ``total_targets``
    columns of ``xy_data`` are targets, and target ``k`` (1-based) is
    evaluated on all columns before it with column ``k`` as the label.

    Args:
        xy_data: 2-D array of shape ``(n_samples, n_features + total_targets)``.
        total_targets: Number of trailing target columns in ``xy_data``.
        target_start_range: First target number to evaluate (1-based, inclusive).
        target_stop_range: Last target number to evaluate (inclusive);
            ``None`` means ``total_targets``.
    """
    total_cols = xy_data.shape[1]
    if target_stop_range is None:
        target_stop_range = total_targets
    for target_num in range(target_start_range, target_stop_range + 1):
        # Exclusive end column of the slice whose last column is this target.
        target_col_start = total_cols - (total_targets - target_num)
        print(f'Target {target_num}, target_col {target_col_start}')
        # Slice the argument (not a notebook global) without shadowing it.
        target_xy = torch.as_tensor(xy_data[:, :target_col_start], dtype=torch.float32)
        eval_model(target_xy, target_num)
        
def eval_model(xy_data, target_num):
        dm = prepare_data_module(xy_data)
        if DEBUG == True:
            print(f"Train set size: {len(dm.train_ds),dm.train_ds.dataset.x.shape[1]}")
            print(f"Test set size: {len(dm.test_ds)}, Valid set size: {len(dm.val_ds)}")

        checkpoint_path=resolve_path_gdrive(f'{CHECKPOINTS_FOLDER}/{target_num}')
        pathlist = Path(checkpoint_path).glob('**/*.ckpt')
        for path in pathlist:
            # because path is object not string
            model = SingleTargetNet.load_from_checkpoint(str(path))
            model.eval()
            model.cpu()
            # add row to resultsDF
            train_r2, train_mse = r2scoreAndMSE(model, dm.train_dataloader())
            val_r2, val_mse = r2scoreAndMSE(model, dm.val_dataloader())
            test_r2, test_mse = r2scoreAndMSE(model, dm.test_dataloader())

            results.append([target_num, os.path.basename(path), train_r2, train_mse, val_r2, val_mse, test_r2, test_mse])    

# ******** EVALUATION CELL *********
# Set TESTING = False to skip scoring the saved checkpoints.
TESTING = True
results = []  # reset on every run; eval_model() appends one row per checkpoint
if TESTING:
    measure_all_targets(xy_orig, total_targets=29,target_start_range=1, target_stop_range=5)

Target 1, target_col 1480
Target 2, target_col 1481
Target 3, target_col 1482
Target 4, target_col 1483
Target 5, target_col 1484


In [33]:
import pandas as pd
# Column labels, in the same order as the rows appended by eval_model().
resultsDFcolumns = [
    "Target", "ModelFile",
    "Train R2", "Train MSE",
    "Val R2", "Val MSE",
    "Test R2", "Test MSE",
]
# Build the frame once from the collected rows, then display it.
resultsDF = pd.DataFrame(results, columns=resultsDFcolumns)
resultsDF

Unnamed: 0,Target,ModelFile,Train R2,Train MSE,Val R2,Val MSE,Test R2,Test MSE
0,1,epoch=26-val_loss=0.00.ckpt,0.996414,0.003586,0.996376,0.003866,0.996377,0.003434
1,2,epoch=35-val_loss=0.00.ckpt,0.997682,0.002318,0.997154,0.003036,0.997258,0.002598
2,3,epoch=73-val_loss=0.00.ckpt,0.998498,0.001502,0.997765,0.002384,0.997934,0.001958
3,4,epoch=60-val_loss=0.00.ckpt,0.998913,0.001087,0.998615,0.001478,0.998345,0.001568
4,5,epoch=55-val_loss=0.00.ckpt,0.998662,0.001338,0.998204,0.001916,0.998139,0.001764


In [None]:
# Persist the metrics table next to the checkpoints (row index not written).
resultsDF.to_csv(resolve_path_gdrive(f'{CHECKPOINTS_FOLDER}/results.csv'), index=False)