## Load Libraries and import modules 

In [1]:
# Load all the libraries 
import numpy as np
import pandas as pd
import numpy.random as nrd
import os 
import sys

# Pytorch modules 
import torch
import torch.nn.functional as F
from torch import nn
import torch.optim as optim

# this for the custom Dataset 
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

# Import tqdm for progress bar
from tqdm.auto import tqdm

# for timing functions
from timeit import default_timer as timer 

### Configure Project Parameters

In [2]:
# Show the current working directory. The data path used later in this notebook is
# built from os.getcwd(), so this should be the project root (the folder that holds `data/`).
os.getcwd()

'c:\\Users\\gpano\\Desktop\\github_py\\proteomics_latent_space'

**Important:** Import the configuration module `configs.py` first. Importing it and setting the seed and device parameters before importing any of the other modules ensures that everything stays in sync.

**Important:** If you want to *change the configuration parameters*, do so before importing the other modules and running the pipeline.

In [3]:
from models_util import configs

Importing models_util.configs module
First set device and seed for reproducibility.
-----------------------------------------------


In [4]:
# Report the project-wide seed and device; nothing has been set yet, so both are None
configs.get_configs()

'Seed: None, Device: None'

In [5]:
# The same settings are also exposed as module-level attributes of `configs`
print(configs.project_seed, configs.project_device)

None None


In [6]:
# Fix the random seed and choose the compute device for the whole project
configs.set_seed(888)
device = configs.set_device(force_cpu=True)  # force_cpu=True -> the printed device below is "cpu"

# the module-level attributes of `configs` now reflect the new settings
print(configs.project_seed, configs.project_device)

During configuration random seed 888 has been set.
888 cpu


In [7]:
# Confirm that the getter reports the updated seed and device as well
configs.get_configs()

'Seed: 888, Device: cpu'

Now that all the configurations values are assigned globally, we can import the modules. If this is working, we expect each module to access the **same** **seed** and **device** we set. We are also expecting generated numbers **inside the modules** to be reproducible.

In [8]:
# Load home modules and check the device where they are running 
from models_util import utility_functions as uf

During configuration random seed 888 has been set.
Importing models_util.utility_functions, running in cpu with seed: 888


In [9]:
from models_util import custom_dataset as cd

During configuration random seed 888 has been set.
Importing models_util.custom_dataset, running in cpu with seed: 888


In [10]:
from models_util import cost_functions as cf

During configuration random seed 888 has been set.
Importing models_util.cost_functions, running in cpu with seed: 888


In [11]:
from models_util import VAE1 as v1 


During configuration random seed 888 has been set.
Importing models_util.VAE1, running in cpu with seed: 888


## SCBC Data scale and split for VAE
- We will apply min-max scaling to the TMT ratios of the proteomic SCBC data. <br>
- It is important to scale row-by-row, using each row's own non-missing minimum and maximum values. <br>

In [12]:
# Build the data directory from the working directory with os.path.join so the path
# separator is right on every OS (the hard-coded "\\" separators only worked on Windows).
# The trailing "" keeps a trailing separator, so `data_path + "<file>"` still works.
data_path = os.path.join(os.getcwd(), "data", "processed", "")

# Read the tab-separated SCBC protein quantification table
scbc = pd.read_csv(os.path.join(data_path, "protein_quant_merged.txt"), delimiter="\t")

In [13]:
# Work on a plain NumPy matrix from here on and count how many entries are missing
npscbc = scbc.to_numpy()
nan_mask = np.isnan(npscbc)
nan_mask.sum()


np.int64(104200)

In [14]:
# Get the extreme (non-missing) values of each row.
# axis=1 reduces across the columns; keepdims=True keeps a trailing dimension of size 1
# so the results broadcast against the full matrix.
scbc_min = np.nanmin(npscbc, axis=1, keepdims=True)  # row minimum among non-NaN entries
scbc_max = np.nanmax(npscbc, axis=1,keepdims=True)  # row maximum among non-NaN entries

# Sanity check: print both shapes and the number of NaNs left in each vector
print(scbc_max.shape,scbc_min.shape,np.isnan(scbc_max).sum(), np.isnan(scbc_min).sum())

(10439, 1) (10439, 1) 0 0


In [15]:
# Row-wise min-max scaling. The small epsilon in the denominator avoids a division
# by zero when a row's maximum equals its minimum.
row_offset = npscbc - scbc_min
row_range = scbc_max - scbc_min + 1e-8
npscbc_scaled = row_offset / row_range
npscbc_scaled.shape

# npscbc_scaled[0]

(10439, 130)

In [16]:
# Shuffle the rows in place using NumPy's global random state.
# NOTE(review): this mutates npscbc_scaled, so re-running the cell reshuffles the
# already-shuffled matrix and advances the RNG; run it once per fresh kernel if the
# downstream split has to be reproducible.
np.random.shuffle(npscbc_scaled)
npscbc_scaled.shape
# npscbc_scaled[0]

(10439, 130)

### Split Data 

In [17]:
# Partition the shuffled matrix into train / validation / test subsets.
# With test_perc=0.15 and val_perc=0.1 the resulting shapes (shown below) correspond to
# roughly 75% / 10% / 15% of the rows.
train_data, val_data, test_data = uf.create_data_partition(
    npscbc_scaled, test_perc=0.15, val=True, val_perc=0.1
)
# display the three shapes as a quick check of the split
train_data.shape, val_data.shape, test_data.shape

((7829, 130), (1044, 130), (1566, 130))

You can test reproducibility by re-running the function and checking the data in the first index of the matrix. We expect it to be the same.

### Pass data to Custom Dataset and DataLoaders 
- check that your data is numpy matrix.
- check if data is scaled to (0,1).
- create three custom dataset instances.
- the custom dataset will save all the data to memory and create a mask where NaNs are located.
- the numpy arrays will be converted to tensors of appropriate dimensions and NaNs to zeroes.
- then we pass the custom dataset to the dataloader object.
- The DataLoader object contains for each row (training example) i) a tensor of 1 x 130 columns with 0-1 scaled values, ii) a 1x130 mask indicating NA positions and iii) index of the examples per batch (could be 64, 128,..., batch_size). 

In [18]:
# Wrap each partition in the project's ProteinDataset (one instance per split).
# Per the notes above, the dataset is expected to hold the data in memory and to record
# where the NaNs are -- see models_util.custom_dataset for the details.
train_dataset = cd.ProteinDataset(train_data)
val_dataset = cd.ProteinDataset(val_data)
test_dataset = cd.ProteinDataset(test_data)

Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified


In [48]:
# Batch size shared by the three loaders (one constant instead of three literals)
BATCH_SIZE = 128

# Dedicated generator for the training shuffle. Seeding it with the project seed makes
# the batch order reproducible every time this cell is (re-)run, independent of how much
# of the global torch RNG has been consumed elsewhere in the notebook.
loader_rng = torch.Generator()
if configs.project_seed is not None:
    loader_rng.manual_seed(configs.project_seed)

# pass data to the DataLoader; only the training set is shuffled
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, generator=loader_rng
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [67]:
# Peek at one batch of training examples. As the output shows, a batch is a list of three
# tensors: the scaled values, a boolean mask and the row indices of the examples.
# The train loader shuffles, so which rows appear here depends on the state of the RNG
# that drives the shuffle; seed it (e.g. with the line below) to get the same batch every time.
# torch.manual_seed(888)


next(iter(train_loader))

[tensor([[0.1777, 0.1589, 0.3951,  ..., 0.8203, 0.7897, 0.7998],
         [0.0389, 0.1591, 0.4674,  ..., 0.3939, 0.5666, 0.6010],
         [0.2107, 0.1048, 0.7604,  ..., 0.1447, 0.1145, 0.1311],
         ...,
         [0.7318, 0.7253, 0.5029,  ..., 0.1658, 0.1598, 0.1481],
         [0.1812, 0.2756, 0.0876,  ..., 0.7633, 0.5545, 0.6024],
         [0.6118, 0.6419, 0.4321,  ..., 0.0808, 0.1044, 0.0679]]),
 tensor([[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]]),
 tensor([ 625, 1896, 1534, 3794, 4551,  726, 4525, 5994, 2348,  912, 4147, 1732,
         2168,   19, 5119, 2514, 3839, 1346,  145, 7515, 3907, 1813, 6666, 3275,
         2968, 3535, 4981, 7389, 7084, 3903, 3380, 5775, 7344,

## Training-Validation Loop 
This section covers the training and validation sets. VAEs inherently tend to overfit, so it is important to hold out the test set until after the training loop. The whole loop is parametrized in a function: <br>
- The function starts with a pre-training evaluation to initialize metrics at epoch = 0 <br>
- Then training of the model begins and after each epoch, the validation set is passed through the model to get the validation - epoch metrics.<br>


During training, these are computed:
- KL, Gaussian log-likelihood error, and total error are monitored per training batch, and also averaged every n batches.
- KL, Gaussian log-likelihood error, and total error are monitored per validation round (per epoch).
- Also average KL/dimension is monitored per epoch at the validation set (how information is distributed and whether there are inactive dimensions)
- Plot average KL per dimension and Reconstruction error per epoch.

In [68]:
# Hyperparameters of the first VAE, collected in one place
vae_kwargs = dict(
    n_features=130,
    hidden_layer=True,
    hidden_dim=65,
    latent_dim=25,
    sigmoid=True,
)

# Build the model and move it to the configured device
model1 = v1.VAE(**vae_kwargs).to(device)

# TODO: derive the model name automatically instead of hard-coding it
model_name = "model0"


In [None]:
# ---------------------------------------------------------------------------
# Training / validation loop
#
# Epoch 0 is a pre-training evaluation that records the baseline losses of the
# untrained model; epochs 1..`epoch` each run one pass over `train_loader`
# followed by an evaluation on `val_loader`. The test set is deliberately not
# touched in this cell, so it stays unseen until the final test-set analysis.
# ---------------------------------------------------------------------------
epoch = 50  # number of training epochs (the loop runs epoch + 1 rounds, round 0 = baseline)
learn_r = 0.0025
model = model1
loss_fun = cf.loss_fun
freebits = 0.5  # forwarded unchanged to loss_fun
batch_size = train_loader.batch_size  # read from the loader so logs and names cannot drift from the real value
norm = 2.5 # for gradient clipping - optional 

optimizer = optim.Adam(model.parameters(), lr=learn_r)

# Single quotes inside the replacement field: re-using the outer quote character inside
# an f-string is a SyntaxError on Python versions before 3.12.
hyperparam_str = f"norm{norm}_bits{freebits}_bs{batch_size}_lr{optimizer.param_groups[0]['lr']}"


# Storage
# for each batch/iteration
batch_dict = {
    "iteration": [],
    "Train total Loss": [],
    "Train KL Loss": [], 
    "Train Rec Loss": []
    }

# for each epoch
epoch_dict = {
    "epoch": [],
    "Train total Loss": [],
    "Train KL Loss": [], 
    "Train Rec Loss": [],
    "Val total Loss": [],
    "Val KL Loss": [],
    "Val Rec Loss": []
    }


def _evaluate(net, loader):
    """Average the total, KL and reconstruction losses of `net` over `loader`.

    The model is put in eval mode and run under `torch.inference_mode()`, so no
    gradients are tracked and no parameters are updated. Batches and masks are
    moved to `device` before the forward pass.

    Returns:
        tuple: (total_loss, kl_loss, rec_loss), each divided by the number of batches.
    """
    total, kl, rec = 0, 0, 0
    parts = []  # loss_fun appends its per-batch components; [-1] is read as KL, [-2] as reconstruction
    net.eval()
    with torch.inference_mode():
        for ebatch, emask, _ in loader:
            ebatch, emask = ebatch.to(device), emask.to(device)
            x_mu, x_logvar, z_mu, z_logvar = net(ebatch)
            loss = loss_fun(ebatch, x_mu, x_logvar, z_mu, z_logvar, parts, mask=emask, freebits=freebits)
            total += loss.detach().item()
            kl += parts[-1]
            rec += parts[-2]
    n_batches = len(loader)
    return total / n_batches, kl / n_batches, rec / n_batches


# Running count of optimizer steps across ALL epochs. It is kept outside the epoch
# loop so the per-batch log has a monotonically increasing x-axis, and it is not
# called `iter` so the builtin stays usable in the other cells.
global_step = 0

# `epoch_idx` (not `epoch`) is the loop variable, so the epoch count set above is not overwritten
for epoch_idx in tqdm(range(epoch + 1)):

    # initialize the loss metrics at epoch zero
    if epoch_idx == 0:
        print(f"Performing pre-training evaluation on the model in epoch {epoch_idx}")
        val_loss, val_kl, val_rl = _evaluate(model, val_loader)

        # nothing has been trained yet: the train curves start from the same baseline values
        epoch_dict["epoch"].append(epoch_idx)
        epoch_dict["Train total Loss"].append(val_loss)
        epoch_dict["Train KL Loss"].append(val_kl)
        epoch_dict["Train Rec Loss"].append(val_rl)
        epoch_dict["Val total Loss"].append(val_loss)
        epoch_dict["Val KL Loss"].append(val_kl)
        epoch_dict["Val Rec Loss"].append(val_rl)

        print(f"\nVal loss: {val_loss:.3f}| Val KL: {val_kl} | Val Rec: {val_rl:.3f}\n")
        continue

    # ---- training pass ----
    print(f"Epoch {epoch_idx}\n--------------------")
    train_loss, train_kl, train_rl = 0, 0, 0
    lst = []  # this list stores the averaged losses/batch that are computed from the loss
    model.train()  # _evaluate() leaves the model in eval mode, so switch back once per epoch
    for batch, (xbatch, xmask, xidx) in enumerate(train_loader):
        # device
        xbatch, xmask = xbatch.to(device), xmask.to(device)

        optimizer.zero_grad()

        x_mu, x_logvar, z_mu, z_logvar = model(xbatch)

        loss = loss_fun(xbatch, x_mu, x_logvar, z_mu, z_logvar, lst, mask=xmask, freebits=freebits)
        batch_loss = loss.detach().item()
        batch_kl = lst[-1]
        batch_rl = lst[-2]

        # running sums for the epoch averages
        train_loss += batch_loss
        train_kl += batch_kl
        train_rl += batch_rl

        loss.backward()

        # Optional gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=norm)
        optimizer.step()

        # update the batch dictionary - no val since #iterations are not the same
        batch_dict["iteration"].append(global_step)
        batch_dict["Train total Loss"].append(batch_loss)
        batch_dict["Train KL Loss"].append(batch_kl)
        batch_dict["Train Rec Loss"].append(batch_rl)

        global_step += 1

        # print every round of 10 batches the running-average losses - smooths the results
        if batch % 10 == 0:
            print(f"Iter {batch} and a total {batch*batch_size}/{len(train_loader.dataset)} proteins have passed.")
            print(f"Current Loss: {train_loss/(batch+1)} | KL Loss: {train_kl/(batch+1)}| Rec Loss: {train_rl/(batch+1)}")

    # calculate per epoch the metrics - divide by number of batches
    train_loss = train_loss/len(train_loader)
    train_kl = train_kl/len(train_loader)
    train_rl = train_rl/len(train_loader)

    # add them to the dictionary
    epoch_dict["epoch"].append(epoch_idx)
    epoch_dict["Train total Loss"].append(train_loss)
    epoch_dict["Train KL Loss"].append(train_kl)
    epoch_dict["Train Rec Loss"].append(train_rl)

    # ---- validation pass (on the validation split, not the test split) ----
    val_loss, val_kl, val_rl = _evaluate(model, val_loader)

    epoch_dict["Val total Loss"].append(val_loss)
    epoch_dict["Val KL Loss"].append(val_kl)
    epoch_dict["Val Rec Loss"].append(val_rl)

    ## Print out what's happening
    print(f"\nTrain loss: {train_loss:.3f}|Train Rec: {train_rl:.3f} | Val loss: {val_loss:.3f}, Val Rec: {val_rl:.3f}\n")

## Test Set Analysis 
It comprises the analysis of the training and validation set. VAE inherently have a tendency to overfit, so it is important to keep the test set after training loop. During training:
