## Load Libraries and import modules 

In [1]:
# Load all the vanila libraries 
import numpy as np
import pandas as pd
import numpy.random as nrd
import os
import pathlib 
import sys
from typing import Callable
import itertools
import gc
import time
from functools import reduce



#
import umap

# Pytorch modules 
import torch
import torch.nn.functional as F
from torch import nn
import torch.optim as optim

# this for the custom Dataset 
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D


#sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, pairwise_distances
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



# Import tqdm for progress bar
from tqdm.auto import tqdm

# for timing functions
from timeit import default_timer as timer 

### Configure Project Parameters

In [2]:
# check your current directory
os.getcwd()

'c:\\Users\\gpano\\Desktop\\github_py\\proteomics_latent_space'

**Important:** Run the configuration file first `configs.py`. Importing this script and setting the seed and device parameters before importing any of the other modules ensures that evereything is sync.

**Important** If you want *change the configuration parameters*, change them before importing and running the pipeline. 

In [3]:
from models_util import configs

Importing models_util.configs module
First set device and seed for reproducibility.
-----------------------------------------------


In [4]:
configs.get_configs()

'Seed: None, Device: None'

In [5]:
# print the global variables
print(configs.project_seed, configs.project_device)

None None


In [6]:
configs.set_seed(789)
device = configs.set_device(force_cpu=True)

# global variables have changed too
print(configs.project_seed, configs.project_device)

During configuration random seed 789 has been set.
789 cpu


In [7]:
# lets see if the get function also agrees:
configs.get_configs()

'Seed: 789, Device: cpu'

Now that all the configurations values are assigned globally, we can import the modules. If this is working, we expect each module to access the **same** **seed** and **device** we set. We are also expecting generated numbers **inside the modules** to be reproducible.

In [8]:
# Load home modules and check the device where they are running 
from models_util import utility_functions as uf

During configuration random seed 789 has been set.
Importing models_util.utility_functions, running in cpu with seed: 789


In [9]:
from models_util import custom_dataset as cd

During configuration random seed 789 has been set.
Importing models_util.custom_dataset, running in cpu with seed: 789


In [10]:
from models_util import cost_functions as cf

During configuration random seed 789 has been set.
Importing models_util.cost_functions, running in cpu with seed: 789


In [11]:
from models_util import VAE1 as v1 


During configuration random seed 789 has been set.
Importing models_util.VAE1, running in cpu with seed: 789


##  Data scale and split for VAE
- We will perform min-max scaling to the TMT-Ratios of the proteomic SCBC data. <br>
- We will scale the array version of our scbc data, the `npdata` matrix.
- Then we will copy this scaled matrix and reshuffle the copy. The `npscbc_scaled_shuffled` will be used for the model training and performance evaluattion. <br>
- The `npdata_scaled` matrix with the original order of rows will be used later for the validation of the latent variables. <br> 
- It is important to use the non-missing min and max values of dataset row-by-row <br>

In [12]:
# create path and read the scbc data
data_path = os.getcwd() + "\\data\\processed\\" 
data = pd.read_csv(data_path+"prot_abms_norm.txt",delimiter="\t")

In [13]:
# convert to numpy 
npdata = data.to_numpy()
np.isnan(npdata).sum()


np.int64(15306)

In [14]:
# Get extreme values (non-missing) frome ach row. 
data_min = np.nanmin(npdata, axis=1, keepdims=True)  # minimum among non-NaN
data_max = np.nanmax(npdata, axis=1,keepdims=True)  # maximum among non-NaN

# check that that shapes and values are as expected 
print(data_max.shape,data_min.shape,np.isnan(data_max).sum(), np.isnan(data_min).sum())

(11209, 1) (11209, 1) 0 0


In [15]:
# scale data 
npdata_scaled = (npdata - data_min) /(data_max - data_min + 1e-8)
npdata_scaled.shape

# npscbc_scaled[0:2,]

(11209, 54)

In [16]:
# shuffle the rows but keep scaled original
npdata_scaled_shuffled = npdata_scaled.copy()
np.random.shuffle(npdata_scaled_shuffled)
# npscbc_scaled[1,],scbc.iloc[1,:12]


### Split Data 

In [17]:
train_data, val_data, test_data = uf.create_data_partition(
    npdata_scaled_shuffled, test_perc=0.15, val=True, val_perc=0.1
)
train_data.shape, val_data.shape, test_data.shape

((8406, 54), (1121, 54), (1682, 54))

You can test reproducibility by re-runing the function and checking the data in the first index of the matrix. We expect it to be the same. 

### Pass data to Custom Dataset and DataLoaders 
- check that your data is numpy matrix.
- check if data is scaled to (0,1).
- create three custom dataset instances.
- the custom dataset will save all the data to memory and create a mask where NaNs are located.
- the numpy arrays will be converted to tensors of appropriate dimensions and NaNs to zeroes.
- then we pass the custom dataset to the dataloader object.
- The DataLoader object contains for each row (training example) i) a tensor of 1 x 130 columns with 0-1 scaled values, ii) a 1x130 mask indicating NA positions and iii) index of the examples per batch (could be 64, 128,..., batch_size). 

In [18]:
train_dataset = cd.ProteinDataset(train_data)
val_dataset = cd.ProteinDataset(val_data)
test_dataset = cd.ProteinDataset(test_data)
whole_dataset = cd.ProteinDataset(npdata_scaled)

Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified


In [19]:
# pass data to the dataLoader
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False,drop_last=True)
whole_loader = DataLoader(whole_dataset, batch_size=128, shuffle=False, drop_last=False)

In [20]:
# the train loader is not reproducible bcs it shuffles but it is not seeded yet. 
# here is one batch of training examples 
# torch.manual_seed(888)


next(iter(train_loader))

[tensor([[0.8556, 0.8802, 0.8614,  ..., 0.6423, 0.6196, 0.5990],
         [0.0229, 0.0020, 0.0356,  ..., 0.2527, 0.2068, 0.2431],
         [0.4199, 0.4289, 0.4633,  ..., 0.1165, 0.0860, 0.1325],
         ...,
         [0.0666, 0.0915, 0.0735,  ..., 0.0984, 0.1327, 0.1055],
         [0.1673, 0.2089, 0.2423,  ..., 0.2401, 0.2461, 0.2781],
         [0.0000, 0.1242, 0.0000,  ..., 0.0000, 0.1622, 0.0000]]),
 tensor([[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [ True, False,  True,  ...,  True, False,  True]]),
 tensor([4533, 8395, 1556, 4706, 3183, 6129, 8275, 4601, 7598, 4491,  766, 7960,
         2285, 7439, 2658, 3824, 1335, 3125, 8246, 3767, 7144, 1220, 3377, 6766,
         6048, 3638, 1801, 2293, 5473, 2720, 6120,  415, 6202,

## VAE dimensions optimization Loop with secondary task 
It comprises the run of the training and validation set. VAE inherently have a tendency to overfit, so it is important to keep the test set after training loop. In this tutorial we run one model. The name is based on a simple numbering system and its layers to track it down. Furthermore the train_val_loop creates a hyperparameter string to track other parameters. The whole loop is parametrized in a function: <br>
- The function starts with a pre-training evaluation to initialize metrics at epoch = 0 <br>
- Then training of the model begins and after each epoch, the validation set is passed through the model to get the validation - epoch metrics.<br>


During training, these are computed:
- KL, Gaussian Logliklihood error, and Total Error are monitored per training batch, and also averaged every n batches.
- KL, Gaussian Logliklihood error, and Total Error are monitored per validation round (per epoch).

In [21]:
# set the combinations of the VAE layers to check. 
comb_list = list()
for comb in list(itertools.product([54,50, 45,40,30],[30,25,20,15])):
    if comb[0] >= 1.5*comb[1]:
        comb_list.append(comb)

In [23]:
# load groudtruth pairs 
pairs_df = pd.read_csv(os.getcwd() + "\\data\\processed\\" + "merged_pairs.txt", delimiter="\t")
pairs_df.head()

Unnamed: 0,V1,V2,complex
1,EP300,CREBBP,Multisubunit ACTR coactivator complex
2,KAT2B,EP300,Multisubunit ACTR coactivator complex
3,NCOA3,EP300,Multisubunit ACTR coactivator complex
4,KAT2B,CREBBP,Multisubunit ACTR coactivator complex
5,NCOA3,KAT2B,Multisubunit ACTR coactivator complex


In [None]:
# dimlist = [(54,30)]
# future for loop for at least 10 different seeds 
outer_final_df = None
outer_metrics_df = None

for seed in [12,13,14,15,16,17,18,19,20,22]:

    # here we concatanate all the dataframes that are generated per seed (with all the different combinations)
    # set the umap function for the embeddings 
    umap_model = umap.UMAP(n_neighbors=20,
                           min_dist=0.1,
                           n_components=3,
                           metric="euclidean",
                           random_state=seed)

    seed = seed
    torch.manual_seed(seed)

    # here we store the metrics of the models for one seed 
    final_df = None
    final_feature_df = None
    model_list = []

    # iterate over the combinations of the VAE layers 
    for i, tup in enumerate(comb_list):
        hidden_dim, latent_dim, n = tup[0], tup[1], i
        print(i, hidden_dim, latent_dim)

        # Instantiate the model
        model1 = v1.VAE(
            n_features=54,
            latent_dim=latent_dim,
            hidden_layer=True,
            hidden_dim=hidden_dim,
            output_activ=nn.Sigmoid(),
        ).to(device)

        # need to set the model name with the layers - usefull for creating its unique folder 
        model_name = f"abms_iter_{n}_{hidden_dim}_{latent_dim}"
        model_list.append(model_name)
        # the optimizer is in the train-val loop 


        ## Create a "models" folder and the specifics model's directory to save figures  

        # create the models directory path 
        path_dir = os.getcwd() + "\\models"

        # Check if the models directory exists, if not, create it
        if not os.path.exists(path_dir):
            os.makedirs(path_dir)
            print(f"Created directory: {path_dir}")
        else:
            print(f"Directory already exists: {path_dir}")

        # create a subdirectory for each model based on #number and name (the dims of layers)
        model_path = os.path.join(path_dir,model_name)
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_path


        ## Final Important part

        # set all the parameters to variables because all functions depend on them
        model = model1
        loss_fun = cf.loss_fun_gauss
        model_name=model_name
        path=model_path
        epoch = 200
        learn_r = 0.005
        freebits = 4
        batch_size = 128
        norm = 0

        # the path where this model is going to be saved 
        print(f"model path {path}")

        # run the training for the model
        # Run the loop - see the parameters 

        batch_dict, epoch_dict,hyperparam_str = uf.train_val_loop_v3(
        model = model,
        train_loader=train_loader,
        val_loader=val_loader, 
        loss_fun = loss_fun,
        model_name=model_name,
        model_path=path,
        epoch = epoch,
        patience = 7,
        learn_r = learn_r,
        freebits = freebits,
        batch_size = batch_size,
        norm = norm
        )

        # write the full model id
        model_id = model_name + "_" + hyperparam_str
        print(f"Model: {model_id} has been trained")


        # next run the test set analysis for the eaxh model and get the results test_iter_dict and test_metrics 
        test_iter_dict, test_metrics = uf.test_set_analysis(
        model=model,
        test_loader=test_loader,
        loss_fun = loss_fun,
        freebits=freebits,
        model_id=model_id
    )
        # create the the dataframe to store per model metrics 
        test_df = pd.DataFrame([test_metrics])
        test_df["hidden_dim"] = hidden_dim
        test_df["latent_dim"] = latent_dim
        test_df["seed"] = seed
        
        # extract latent variables from the whole dataset using the trained model
        # latent list
        latent_list = list()
        with torch.inference_mode():
            model.eval()
            for batch, _, _ in whole_loader:
                batch = batch.to(device)
                latent, _ = model.get_latent_variables(batch, detach=True)
                latent_list.append(latent.cpu().detach().numpy())
            
        latent_arr = np.vstack(latent_list)
        latent_df = pd.DataFrame(latent_arr, index=data.index, columns=[f"z{str(i)}" for i in range(latent_dim)])

        # after gettng the latent variables perform umap on them 
        umap_latent = umap_model.fit_transform(latent_df)
        umap_df = pd.DataFrame(umap_latent, index=data.index, columns=[f"dim1",f"dim2",f"dim3"])

        # create the classifier and use the latent dimension and the umap embeddings to the predictor - use df_pairs as ground truth, 

        # thing to remove after the iteration is finished umap_latent, latent_df, umap_dist 
        if final_df is None:
            final_df = test_df
        else:
            final_df = pd.concat([final_df,test_df],axis=0)
        
        gc.collect()

        # get the feature from the specific model parameters
    
        # proteins that exist withing ground truth databases
        proteins_in_pairs = set(pairs_df.iloc[:, 0]).union(set(pairs_df.iloc[:, 1]))

        # subset by what is in the ground truth pairs
        latent_df_sub = latent_df.loc[latent_df.index.intersection(proteins_in_pairs)]
        umap_df_sub = umap_df.loc[umap_df.index.intersection(proteins_in_pairs)]


        # perform corralation analysis for latent and euclidean distance calculation for umap embeddings



        corr_matrix = pd.DataFrame(np.corrcoef(latent_df_sub),
                                index=latent_df_sub.index,
                                columns=latent_df_sub.index)

        # fix the matrices of each feature type 

        cor_feat = (corr_matrix
                    .reset_index()
                    .melt(id_vars="index", var_name="Var2", value_name=f"cor_feature_{model_name}")
                    .rename(columns={"index":"Var1"})
        )

        ##
        umap_dist = pd.DataFrame(pairwise_distances(umap_df_sub, metric="euclidean"),
                                index = umap_df_sub.index,
                                columns=umap_df_sub.index)

        umap_dist = umap_dist.reset_index().melt(id_vars="index", var_name="Var2", value_name=f"euc_feature_{model_name}").rename(columns={"index":"Var1"})



        ## filter duplicates and self-correlations/distances, keep only Var1>Var2
        cor_feat = cor_feat[cor_feat["Var1"]!=cor_feat["Var2"]]
        cor_feat = cor_feat[cor_feat["Var1"]>cor_feat["Var2"]]

        umap_dist = umap_dist[umap_dist["Var1"]!=umap_dist["Var2"]]
        umap_dist = umap_dist[umap_dist["Var1"]>umap_dist["Var2"]]


        ## get a true protein pairs character vector
        pair_chars = pairs_df["V1"].astype(str) + "_" + pairs_df["V2"].astype(str)

        ## get ground truth classes of each feature category
        cor_feat["db"] = np.where((cor_feat["Var1"].astype(str) + "_" + cor_feat["Var2"].astype(str)).isin(pair_chars), 1, 0)
        umap_dist["db"] = np.where((umap_dist["Var1"].astype(str) + "_" + umap_dist["Var2"].astype(str)).isin(pair_chars), 1, 0)

        # merge both dataframes using reduce and lambda function
        feature_df = reduce(lambda left, right: pd.merge(left, right, on=["Var1", "Var2", "db"], how="inner"), [cor_feat, umap_dist])

        if final_feature_df is None:
            final_feature_df = feature_df
        else:
            final_feature_df = reduce(lambda left, right: pd.merge(left, right, on=["Var1", "Var2", "db"], how="inner"), [final_feature_df, feature_df])
        
        del cor_feat, umap_dist, pair_chars, corr_matrix, latent_df_sub, umap_df_sub, umap_latent, feature_df
        
        gc.collect()


    # out of the first loop 


    # create cor features from the base 
    proteins_in_pairs = set(pairs_df.iloc[:, 0]).union(set(pairs_df.iloc[:, 1]))

    data_sub = data.loc[data.index.intersection(proteins_in_pairs)]
    corr_matrix_base = data_sub.T.corr(method="pearson")

    cor_feat_base = (corr_matrix_base
                .reset_index()
                .melt(id_vars="index", var_name="Var2", value_name=f"cor_base")
                .rename(columns={"index":"Var1"})
    )
    cor_feat_base = cor_feat_base[cor_feat_base["Var1"]!=cor_feat_base["Var2"]]
    cor_feat_base = cor_feat_base[cor_feat_base["Var1"]>cor_feat_base["Var2"]]

    pair_chars = pairs_df["V1"].astype(str) + "_" + pairs_df["V2"].astype(str)
    cor_feat_base["db"] = np.where((cor_feat_base["Var1"].astype(str) + "_" + cor_feat_base["Var2"].astype(str)).isin(pair_chars), 1, 0) 

    # add the base cor-predictor to the feature list as well as to the model list
    model_list.append("cor_base")
    final_feature_df= final_feature_df.merge(cor_feat_base, on=["Var1","Var2","db"], how="inner")




    ###### from this part and so on feature extraction for all combinations ######
    metrics_df = None
    # all combinations should be based on the same sampling procedure for more accurate comparisons 

    # begin with the L2 regression 
    pos_df = final_feature_df[final_feature_df['db'] == 1]
    neg_df = final_feature_df[final_feature_df['db'] == 0]

    # sample the negatives with randomization and create final dataset df
    neg_df = neg_df.sample(n=pos_df.shape[0], random_state=seed)
    sample_df = pd.concat([pos_df,neg_df]).sample(frac=1, random_state=seed)
    del pos_df, neg_df

    X = sample_df.drop(["Var1", "Var2", "db"], axis=1) 
    # umapX = sample_df[["euc_feature"]]
    y = sample_df["db"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    classifier_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(penalty="l2", solver="liblinear", random_state=seed))
    ])

    param_grid = {
        'logreg__C': [0.01]
    }

    for model_n in model_list:
        print(model_n)
        if model_n != "cor_base":

            # base_cor_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)
            cor_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)
            euc_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)


            # base_cor_grid.fit(X_train[["cor_base"]], y_train)
            cor_grid.fit(X_train[[f"cor_feature_{model_n}"]], y_train)
            euc_grid.fit(X_train[[f"euc_feature_{model_n}"]], y_train)

            # basecor_cv_auc = base_cor_grid.best_score_
            cor_cv_auc = cor_grid.best_score_
            euc_cv_auc = euc_grid.best_score_

            # y_pred_basecor = base_cor_grid.predict_proba(X_test[["cor_base"]])[:, 1]
            y_pred_cor = cor_grid.predict_proba(X_test[[f"cor_feature_{model_n}"]])[:, 1]
            y_pred_euc = euc_grid.predict_proba(X_test[[f"euc_feature_{model_n}"]])[:, 1]

            # basecor_test_auc = roc_auc_score(y_test, y_pred_basecor)
            cor_test_auc = roc_auc_score(y_test, y_pred_cor)
            euc_test_auc = roc_auc_score(y_test, y_pred_euc)

            # final_df["test_basecor"] = round(float(basecor_test_auc),3)
            res_df = pd.DataFrame({
                    "model" : [model_n],
                    "test_cor" : [round(float(cor_test_auc),3)],
                    "test_umap" : [round(float(euc_test_auc),3)],
                    "seed" : seed
                })

            if metrics_df is None:
                metrics_df = res_df
            else:
                metrics_df = pd.concat([metrics_df,res_df],axis=0)
            
            gc.collect()
        else:
            cor_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)
            cor_grid.fit(X_train[[model_n]], y_train)
            y_pred_cor = cor_grid.predict_proba(X_test[[model_n]])[:, 1]
            cor_test_auc = roc_auc_score(y_test, y_pred_cor)
            res_df = pd.DataFrame({
                    "model" : [model_n],
                    "test_cor" : [round(float(cor_test_auc),3)],
                    "test_umap" : 0,
                    "seed" : seed
                })
            
            if metrics_df is None:
                metrics_df = res_df
            else:
                metrics_df = pd.concat([metrics_df,res_df],axis=0)

    del X_train, X_test, y_train, y_test, param_grid, classifier_pipe, res_df
    gc.collect()


    if outer_final_df is None:
        outer_final_df = final_df
    else:
        outer_final_df = pd.concat([outer_final_df, final_df], axis=0)

    if outer_metrics_df is None:
        outer_metrics_df = metrics_df
    else:
        outer_metrics_df = pd.concat([outer_metrics_df, metrics_df], axis=0)
    # last step - make predictions for this batch of features


outer_final_df.shape, outer_metrics_df.shape 

0 54 30
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 84.131| Val KL: 83.17765045166016 | Val Rec: 0.953

Patience exceeded at 60 with last checkpoint saved at 52
changed learning rate to 0.001
Early stopping at epoch 94 with last checkpoint saved at 83
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30
Model: abms_iter_0_54_30_ep83_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_0_54_30_ep83_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




1 54 25
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 70.273| Val KL: 69.31470489501953 | Val Rec: 0.958

Patience exceeded at 54 with last checkpoint saved at 46
changed learning rate to 0.001
Early stopping at epoch 69 with last checkpoint saved at 58
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25
Model: abms_iter_1_54_25_ep58_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_1_54_25_ep58_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




2 54 20
abms_iter_0_54_30
abms_iter_1_54_25
cor_base
0 54 30
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 84.129| Val KL: 83.17765045166016 | Val Rec: 0.952

Patience exceeded at 56 with last checkpoint saved at 48
changed learning rate to 0.001
Early stopping at epoch 76 with last checkpoint saved at 65
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30
Model: abms_iter_0_54_30_ep65_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_0_54_30_ep65_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




1 54 25
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 70.277| Val KL: 69.31470489501953 | Val Rec: 0.962

Patience exceeded at 52 with last checkpoint saved at 44
changed learning rate to 0.001
Early stopping at epoch 73 with last checkpoint saved at 62
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25
Model: abms_iter_1_54_25_ep62_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_1_54_25_ep62_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




2 54 20
abms_iter_0_54_30
abms_iter_1_54_25
cor_base


((4, 8), (6, 4))

In [None]:
outer_metrics_df.to_csv(os.getcwd()+"\\data\\processed\\abms_iter_minmax_metrics.csv")
outer_final_df.to_csv(os.getcwd()+"\\data\\processed\\abms_iter_minmax_finaldf.csv")

In [103]:
pd.read_csv(os.getcwd()+"\\data\\processed\\abms_iter_minmax_finaldf.csv")

Unnamed: 0.1,Unnamed: 0,model_id,bits,avg_total_loss,avg_kl_loss,avg_rl_loss,hidden_dim,latent_dim,seed
0,0,abms_iter_0_54_30_ep83_norm0_bits4_bs128_lr0.001,4,82.131075,83.178672,-1.047598,54,30,12
1,0,abms_iter_1_54_25_ep58_norm0_bits4_bs128_lr0.001,4,68.286223,69.31565,-1.029426,54,25,12
2,0,abms_iter_0_54_30_ep65_norm0_bits4_bs128_lr0.001,4,82.14843,83.179176,-1.030745,54,30,13
3,0,abms_iter_1_54_25_ep62_norm0_bits4_bs128_lr0.001,4,68.286371,69.316413,-1.030042,54,25,13


### Checkpoint

##  Data scale and split for VAE
- We will perform min-max scaling to the TMT-Ratios of the proteomic SCBC data. <br>
- We will scale the array version of our scbc data, the `npdata` matrix.
- Then we will copy this scaled matrix and reshuffle the copy. The `npscbc_scaled_shuffled` will be used for the model training and performance evaluattion. <br>
- The `npdata_scaled` matrix with the original order of rows will be used later for the validation of the latent variables. <br> 
- It is important to use the non-missing min and max values of dataset row-by-row <br>

In [104]:
# create path and read the scbc data
data_path = os.getcwd() + "\\data\\processed\\" 
data = pd.read_csv(data_path+"protein_quant_merged.txt",delimiter="\t")

In [105]:
# convert to numpy 
npdata = data.to_numpy()
np.isnan(npdata).sum()


np.int64(104200)

In [106]:
# Get extreme values (non-missing) frome ach row. 
data_min = np.nanmin(npdata, axis=1, keepdims=True)  # minimum among non-NaN
data_max = np.nanmax(npdata, axis=1,keepdims=True)  # maximum among non-NaN

# check that that shapes and values are as expected 
print(data_max.shape,data_min.shape,np.isnan(data_max).sum(), np.isnan(data_min).sum())

(10439, 1) (10439, 1) 0 0


In [107]:
# scale data 
npdata_scaled = (npdata - data_min) /(data_max - data_min + 1e-8)
npdata_scaled.shape

# npscbc_scaled[0:2,]

(10439, 130)

In [108]:
# shuffle the rows but keep scaled original
npdata_scaled_shuffled = npdata_scaled.copy()
np.random.shuffle(npdata_scaled_shuffled)
# npscbc_scaled[1,],scbc.iloc[1,:12]


### Split Data 

In [109]:
train_data, val_data, test_data = uf.create_data_partition(
    npdata_scaled_shuffled, test_perc=0.15, val=True, val_perc=0.1
)
train_data.shape, val_data.shape, test_data.shape

((7829, 130), (1044, 130), (1566, 130))

You can test reproducibility by re-runing the function and checking the data in the first index of the matrix. We expect it to be the same. 

### Pass data to Custom Dataset and DataLoaders 
- check that your data is numpy matrix.
- check if data is scaled to (0,1).
- create three custom dataset instances.
- the custom dataset will save all the data to memory and create a mask where NaNs are located.
- the numpy arrays will be converted to tensors of appropriate dimensions and NaNs to zeroes.
- then we pass the custom dataset to the dataloader object.
- The DataLoader object contains for each row (training example) i) a tensor of 1 x 130 columns with 0-1 scaled values, ii) a 1x130 mask indicating NA positions and iii) index of the examples per batch (could be 64, 128,..., batch_size). 

In [110]:
train_dataset = cd.ProteinDataset(train_data)
val_dataset = cd.ProteinDataset(val_data)
test_dataset = cd.ProteinDataset(test_data)
whole_dataset = cd.ProteinDataset(npdata_scaled)

Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified
Protein Dataset is passed to memory
No Protein Symbols were identified


In [111]:
# pass data to the dataLoader
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False,drop_last=True)
whole_loader = DataLoader(whole_dataset, batch_size=128, shuffle=False, drop_last=False)

## VAE dimensions optimization Loop with secondary task 
It comprises the run of the training and validation set. VAE inherently have a tendency to overfit, so it is important to keep the test set after training loop. In this tutorial we run one model. The name is based on a simple numbering system and its layers to track it down. Furthermore the train_val_loop creates a hyperparameter string to track other parameters. The whole loop is parametrized in a function: <br>
- The function starts with a pre-training evaluation to initialize metrics at epoch = 0 <br>
- Then training of the model begins and after each epoch, the validation set is passed through the model to get the validation - epoch metrics.<br>


During training, these are computed:
- KL, Gaussian Logliklihood error, and Total Error are monitored per training batch, and also averaged every n batches.
- KL, Gaussian Logliklihood error, and Total Error are monitored per validation round (per epoch).

In [114]:
# set the combinations of the VAE layers to check. 
comb_list = list()
for comb in list(itertools.product([90,75,65,50,40],[45,30,25,20,15,10])):
    if comb[0] >= 1.5*comb[1]:
        comb_list.append(comb)

In [115]:
comb_list

[(90, 45),
 (90, 30),
 (90, 25),
 (90, 20),
 (90, 15),
 (90, 10),
 (75, 45),
 (75, 30),
 (75, 25),
 (75, 20),
 (75, 15),
 (75, 10),
 (65, 30),
 (65, 25),
 (65, 20),
 (65, 15),
 (65, 10),
 (50, 30),
 (50, 25),
 (50, 20),
 (50, 15),
 (50, 10),
 (40, 25),
 (40, 20),
 (40, 15),
 (40, 10)]

In [None]:
# set the umap function for the embeddings 
umap_model = umap.UMAP(n_neighbors=20,
                           min_dist=0.1,
                           n_components=3,
                           metric="euclidean",
                           random_state=88)

In [None]:
# load groudtruth pairs 
pairs_df = pd.read_csv(os.getcwd() + "\\data\\processed\\" + "merged_pairs.txt", delimiter="\t")
pairs_df.head()

Unnamed: 0,V1,V2,complex
1,EP300,CREBBP,Multisubunit ACTR coactivator complex
2,KAT2B,EP300,Multisubunit ACTR coactivator complex
3,NCOA3,EP300,Multisubunit ACTR coactivator complex
4,KAT2B,CREBBP,Multisubunit ACTR coactivator complex
5,NCOA3,KAT2B,Multisubunit ACTR coactivator complex


In [None]:
# dimlist = [(54,30)]
# future for loop for at least 10 different seeds 
outer_final_df = None
outer_metrics_df = None

for seed in [22,23,24,25,26,27,28,29,30,31]:

    # here we concatanate all the dataframes that are generated per seed (with all the different combinations)

    seed = seed
    torch.manual_seed(seed)
    umap_model = umap.UMAP(n_neighbors=20,
                           min_dist=0.1,
                           n_components=3,
                           metric="euclidean",
                           random_state=seed)


    # here we store the metrics of the models for one seed 
    final_df = None
    final_feature_df = None
    model_list = []

    # iterate over the combinations of the VAE layers 
    for i, tup in enumerate(comb_list):
        hidden_dim, latent_dim, n = tup[0], tup[1], i
        print(i, hidden_dim, latent_dim)

        # Instantiate the model
        model1 = v1.VAE(
            n_features=130,
            latent_dim=latent_dim,
            hidden_layer=True,
            hidden_dim=hidden_dim,
            output_activ=nn.Sigmoid(),
        ).to(device)

        # need to set the model name with the layers - usefull for creating its unique folder 
        model_name = f"abms_iter_{n}_{hidden_dim}_{latent_dim}"
        model_list.append(model_name)
        # the optimizer is in the train-val loop 


        ## Create a "models" folder and the specifics model's directory to save figures  

        # create the models directory path 
        path_dir = os.getcwd() + "\\models"

        # Check if the models directory exists, if not, create it
        if not os.path.exists(path_dir):
            os.makedirs(path_dir)
            print(f"Created directory: {path_dir}")
        else:
            print(f"Directory already exists: {path_dir}")

        # create a subdirectory for each model based on #number and name (the dims of layers)
        model_path = os.path.join(path_dir,model_name)
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_path


        ## Final Important part

        # set all the parameters to variables because all functions depend on them
        model = model1
        loss_fun = cf.loss_fun_gauss
        model_name=model_name
        path=model_path
        epoch = 200
        learn_r = 0.005
        freebits = 4
        batch_size = 128
        norm = 0

        # the path where this model is going to be saved 
        print(f"model path {path}")

        # run the training for the model
        # Run the loop - see the parameters 

        batch_dict, epoch_dict,hyperparam_str = uf.train_val_loop_v3(
        model = model,
        train_loader=train_loader,
        val_loader=val_loader, 
        loss_fun = loss_fun,
        model_name=model_name,
        model_path=path,
        epoch = epoch,
        patience = 7,
        learn_r = learn_r,
        freebits = freebits,
        batch_size = batch_size,
        norm = norm
        )

        # write the full model id
        model_id = model_name + "_" + hyperparam_str
        print(f"Model: {model_id} has been trained")


        # next run the test set analysis for the eaxh model and get the results test_iter_dict and test_metrics 
        test_iter_dict, test_metrics = uf.test_set_analysis(
        model=model,
        test_loader=test_loader,
        loss_fun = loss_fun,
        freebits=freebits,
        model_id=model_id
    )
        # create the the dataframe to store per model metrics 
        test_df = pd.DataFrame([test_metrics])
        test_df["hidden_dim"] = hidden_dim
        test_df["latent_dim"] = latent_dim
        test_df["seed"] = seed
        
        # extract latent variables from the whole dataset using the trained model
        # latent list
        latent_list = list()
        with torch.inference_mode():
            model.eval()
            for batch, _, _ in whole_loader:
                batch = batch.to(device)
                latent, _ = model.get_latent_variables(batch, detach=True)
                latent_list.append(latent.cpu().detach().numpy())
            
        latent_arr = np.vstack(latent_list)
        latent_df = pd.DataFrame(latent_arr, index=data.index, columns=[f"z{str(i)}" for i in range(latent_dim)])

        # after gettng the latent variables perform umap on them 
        umap_latent = umap_model.fit_transform(latent_df)
        umap_df = pd.DataFrame(umap_latent, index=data.index, columns=[f"dim1",f"dim2",f"dim3"])

        # create the classifier and use the latent dimension and the umap embeddings to the predictor - use df_pairs as ground truth, 

        # thing to remove after the iteration is finished umap_latent, latent_df, umap_dist 
        if final_df is None:
            final_df = test_df
        else:
            final_df = pd.concat([final_df,test_df],axis=0)
        
        gc.collect()

        # get the feature from the specific model parameters
    
        # proteins that exist withing ground truth databases
        proteins_in_pairs = set(pairs_df.iloc[:, 0]).union(set(pairs_df.iloc[:, 1]))

        # subset by what is in the ground truth pairs
        latent_df_sub = latent_df.loc[latent_df.index.intersection(proteins_in_pairs)]
        umap_df_sub = umap_df.loc[umap_df.index.intersection(proteins_in_pairs)]


        # perform corralation analysis for latent and euclidean distance calculation for umap embeddings



        corr_matrix = pd.DataFrame(np.corrcoef(latent_df_sub),
                                index=latent_df_sub.index,
                                columns=latent_df_sub.index)

        # fix the matrices of each feature type 

        cor_feat = (corr_matrix
                    .reset_index()
                    .melt(id_vars="index", var_name="Var2", value_name=f"cor_feature_{model_name}")
                    .rename(columns={"index":"Var1"})
        )

        ##
        umap_dist = pd.DataFrame(pairwise_distances(umap_df_sub, metric="euclidean"),
                                index = umap_df_sub.index,
                                columns=umap_df_sub.index)

        umap_dist = umap_dist.reset_index().melt(id_vars="index", var_name="Var2", value_name=f"euc_feature_{model_name}").rename(columns={"index":"Var1"})



        ## filter duplicates and self-correlations/distances, keep only Var1>Var2
        cor_feat = cor_feat[cor_feat["Var1"]!=cor_feat["Var2"]]
        cor_feat = cor_feat[cor_feat["Var1"]>cor_feat["Var2"]]

        umap_dist = umap_dist[umap_dist["Var1"]!=umap_dist["Var2"]]
        umap_dist = umap_dist[umap_dist["Var1"]>umap_dist["Var2"]]


        ## get a true protein pairs character vector
        pair_chars = pairs_df["V1"].astype(str) + "_" + pairs_df["V2"].astype(str)

        ## get ground truth classes of each feature category
        cor_feat["db"] = np.where((cor_feat["Var1"].astype(str) + "_" + cor_feat["Var2"].astype(str)).isin(pair_chars), 1, 0)
        umap_dist["db"] = np.where((umap_dist["Var1"].astype(str) + "_" + umap_dist["Var2"].astype(str)).isin(pair_chars), 1, 0)

        # merge both dataframes using reduce and lambda function
        feature_df = reduce(lambda left, right: pd.merge(left, right, on=["Var1", "Var2", "db"], how="inner"), [cor_feat, umap_dist])

        if final_feature_df is None:
            final_feature_df = feature_df
        else:
            final_feature_df = reduce(lambda left, right: pd.merge(left, right, on=["Var1", "Var2", "db"], how="inner"), [final_feature_df, feature_df])
        
        del cor_feat, umap_dist, pair_chars, corr_matrix, latent_df_sub, umap_df_sub, umap_latent, feature_df
        
        gc.collect()


    # out of the first loop 


    # create cor features from the base 
    proteins_in_pairs = set(pairs_df.iloc[:, 0]).union(set(pairs_df.iloc[:, 1]))

    data_sub = data.loc[data.index.intersection(proteins_in_pairs)]
    corr_matrix_base = data_sub.T.corr(method="pearson")

    cor_feat_base = (corr_matrix_base
                .reset_index()
                .melt(id_vars="index", var_name="Var2", value_name=f"cor_base")
                .rename(columns={"index":"Var1"})
    )
    cor_feat_base = cor_feat_base[cor_feat_base["Var1"]!=cor_feat_base["Var2"]]
    cor_feat_base = cor_feat_base[cor_feat_base["Var1"]>cor_feat_base["Var2"]]

    pair_chars = pairs_df["V1"].astype(str) + "_" + pairs_df["V2"].astype(str)
    cor_feat_base["db"] = np.where((cor_feat_base["Var1"].astype(str) + "_" + cor_feat_base["Var2"].astype(str)).isin(pair_chars), 1, 0) 

    # add the base cor-predictor to the feature list as well as to the model list
    model_list.append("cor_base")
    final_feature_df= final_feature_df.merge(cor_feat_base, on=["Var1","Var2","db"], how="inner")




    ###### from this part and so on feature extraction for all combinations ######
    metrics_df = None
    # all combinations should be based on the same sampling procedure for more accurate comparisons 

    # begin with the L2 regression 
    pos_df = final_feature_df[final_feature_df['db'] == 1]
    neg_df = final_feature_df[final_feature_df['db'] == 0]

    # sample the negatives with randomization and create final dataset df
    neg_df = neg_df.sample(n=pos_df.shape[0], random_state=seed)
    sample_df = pd.concat([pos_df,neg_df]).sample(frac=1, random_state=seed)
    del pos_df, neg_df

    X = sample_df.drop(["Var1", "Var2", "db"], axis=1) 
    # umapX = sample_df[["euc_feature"]]
    y = sample_df["db"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    classifier_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(penalty="l2", solver="liblinear", random_state=seed))
    ])

    param_grid = {
        'logreg__C': [0.01]
    }

    for model_n in model_list:
        print(model_n)
        if model_n != "cor_base":

            # base_cor_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)
            cor_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)
            euc_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)


            # base_cor_grid.fit(X_train[["cor_base"]], y_train)
            cor_grid.fit(X_train[[f"cor_feature_{model_n}"]], y_train)
            euc_grid.fit(X_train[[f"euc_feature_{model_n}"]], y_train)

            # basecor_cv_auc = base_cor_grid.best_score_
            cor_cv_auc = cor_grid.best_score_
            euc_cv_auc = euc_grid.best_score_

            # y_pred_basecor = base_cor_grid.predict_proba(X_test[["cor_base"]])[:, 1]
            y_pred_cor = cor_grid.predict_proba(X_test[[f"cor_feature_{model_n}"]])[:, 1]
            y_pred_euc = euc_grid.predict_proba(X_test[[f"euc_feature_{model_n}"]])[:, 1]

            # basecor_test_auc = roc_auc_score(y_test, y_pred_basecor)
            cor_test_auc = roc_auc_score(y_test, y_pred_cor)
            euc_test_auc = roc_auc_score(y_test, y_pred_euc)

            # final_df["test_basecor"] = round(float(basecor_test_auc),3)
            res_df = pd.DataFrame({
                    "model" : [model_n],
                    "test_cor" : [round(float(cor_test_auc),3)],
                    "test_umap" : [round(float(euc_test_auc),3)],
                    "seed" : seed
                })

            if metrics_df is None:
                metrics_df = res_df
            else:
                metrics_df = pd.concat([metrics_df,res_df],axis=0)
            
            gc.collect()
        else:
            cor_grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=1)
            cor_grid.fit(X_train[[model_n]], y_train)
            y_pred_cor = cor_grid.predict_proba(X_test[[model_n]])[:, 1]
            cor_test_auc = roc_auc_score(y_test, y_pred_cor)
            res_df = pd.DataFrame({
                    "model" : [model_n],
                    "test_cor" : [round(float(cor_test_auc),3)],
                    "test_umap" : 0,
                    "seed" : seed
                })
            
            if metrics_df is None:
                metrics_df = res_df
            else:
                metrics_df = pd.concat([metrics_df,res_df],axis=0)

    del X_train, X_test, y_train, y_test, param_grid, classifier_pipe, res_df
    gc.collect()


    if outer_final_df is None:
        outer_final_df = final_df
    else:
        outer_final_df = pd.concat([outer_final_df, final_df], axis=0)

    if outer_metrics_df is None:
        outer_metrics_df = metrics_df
    else:
        outer_metrics_df = pd.concat([outer_metrics_df, metrics_df], axis=0)
    # last step - make predictions for this batch of features


outer_final_df.shape, outer_metrics_df.shape 

0 54 30
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 84.131| Val KL: 83.17765045166016 | Val Rec: 0.953

Patience exceeded at 60 with last checkpoint saved at 52
changed learning rate to 0.001
Early stopping at epoch 94 with last checkpoint saved at 83
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30
Model: abms_iter_0_54_30_ep83_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_0_54_30_ep83_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




1 54 25
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 70.273| Val KL: 69.31470489501953 | Val Rec: 0.958

Patience exceeded at 54 with last checkpoint saved at 46
changed learning rate to 0.001
Early stopping at epoch 69 with last checkpoint saved at 58
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25
Model: abms_iter_1_54_25_ep58_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_1_54_25_ep58_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




2 54 20
abms_iter_0_54_30
abms_iter_1_54_25
cor_base
0 54 30
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 84.129| Val KL: 83.17765045166016 | Val Rec: 0.952

Patience exceeded at 56 with last checkpoint saved at 48
changed learning rate to 0.001
Early stopping at epoch 76 with last checkpoint saved at 65
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_0_54_30
Model: abms_iter_0_54_30_ep65_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_0_54_30_ep65_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




1 54 25
Directory already exists: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models
model path c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25


  0%|          | 0/201 [00:00<?, ?it/s]

Performing pre-training evaluation on the model in epoch 0

Val loss: 70.277| Val KL: 69.31470489501953 | Val Rec: 0.962

Patience exceeded at 52 with last checkpoint saved at 44
changed learning rate to 0.001
Early stopping at epoch 73 with last checkpoint saved at 62
Model saved at: c:\Users\gpano\Desktop\github_py\proteomics_latent_space\models\abms_iter_1_54_25
Model: abms_iter_1_54_25_ep62_norm0_bits4_bs128_lr0.001 has been trained
Using this model abms_iter_1_54_25_ep62_norm0_bits4_bs128_lr0.001
The decoder output is transformed with an activation function, so reoconstructions are scaled.




2 54 20
abms_iter_0_54_30
abms_iter_1_54_25
cor_base


((4, 8), (6, 4))

In [None]:
outer_metrics_df.to_csv(os.getcwd()+"\\data\\processed\\scbc_iter_minmax_metrics.csv")
outer_final_df.to_csv(os.getcwd()+"\\data\\processed\\scbc_iter_minmax_finaldf.csv")

Unnamed: 0.1,Unnamed: 0,model_id,bits,avg_total_loss,avg_kl_loss,avg_rl_loss,hidden_dim,latent_dim,seed
0,0,abms_iter_0_54_30_ep83_norm0_bits4_bs128_lr0.001,4,82.131075,83.178672,-1.047598,54,30,12
1,0,abms_iter_1_54_25_ep58_norm0_bits4_bs128_lr0.001,4,68.286223,69.31565,-1.029426,54,25,12
2,0,abms_iter_0_54_30_ep65_norm0_bits4_bs128_lr0.001,4,82.14843,83.179176,-1.030745,54,30,13
3,0,abms_iter_1_54_25_ep62_norm0_bits4_bs128_lr0.001,4,68.286371,69.316413,-1.030042,54,25,13


### Checkpoint