# Tutorial 3: How to train the DeepRBP predictor from scratch.

In [39]:
import argparse
import pandas as pd
import os
import json
import torch
import sys
import uuid
import joblib
import numpy as np

from utils.Utils import get_data, scale_rbp_input_data, create_data_loader, build_optimizer
#from utils.Plots import *
from utils.Config import Config
from DLModelClass.modelsNN import DeepRBP

## 1. Setting Up Model Configuration and Saving Results 

First, we are going to create the Config object where we define various parameters that will be used to configure the model. These 
parameters include the maximum number of nodes in the hidden layers (`max_node`), the number of hidden layers (`num_hidden_layers`), 
the learning rate (`learning_rate`), batch size (`batch_size`), the number of epochs (`epochs`), among others.

A unique identifier unique_id is generated using uuid.uuid4().hex[:8], and a 'path_save_files' path is 
created to save the results in a specific folder within the output directory (`./output/`). If the folder doesn't exist, it is created.

After that we save the created model configuration to a JSON File in the results folder.

In [10]:
config = Config(
             max_node = 1024,
             num_hidden_layers = 2,
             learning_rate = 0.001,
             batch_size = 128,
             epochs = 100,
             same_num_nodes = False, 
             node_divition_factor = 4,
             activation_layer = 'relu',
             optimizer = 'adamW',
             source_train = 'TCGA',
             train_tumor_types = 'all')
print(config.get_config())

# Create the results folder
unique_id = uuid.uuid4().hex[:8]
path_save_files = f'./output/{unique_id}'

if not os.path.exists(path_save_files):
    os.makedirs(path_save_files)
    
# We are going to create the .json for the model we're gonna train with the actual settings
with open(path_save_files+'/config.json', 'w') as file:
   json.dump(config.get_config(), file)

{'max_node': 1024, 'num_hidden_layers': 2, 'learning_rate': 0.001, 'batch_size': 128, 'epochs': 100, 'same_num_nodes': False, 'node_divition_factor': 4, 'activation_layer': 'relu', 'optimizer': 'adamW', 'source_train': 'TCGA', 'train_tumor_types': 'all', 'test_tumor_types': 'all'}


## 2. Get and process the data

In this tutorial, we will use the TCGA dataset to train the model saved in `path_data`. If you haven't downloaded and processed this data yet, please run the following two shell scripts:

```bash
sh data/input_create_model/raw/download_script.sh
sh data/input_create_model/processed/create_data.sh
```

In [12]:
path_data = '../data/input_create_model/processed'

### 2.1. Get the data
In this step, we'll gather the training raw data for RBP, gene, and transcript expression, utilizing the tumor type samples specified in `config.train_tumor_types` from the study defined in `source_train`.

To conserve computational resources, we set `toy_set` to true, indicating that this is a toy example using a 15% fraction of all TCGA samples, specified by the variable `frac`.

The data will be divided into training and validation sets, and the combined data will be returned in the `data_raw` variable.

In [13]:
data_raw = get_data(config=config, path=path_data, toy_set=True, frac=0.15)

[Utils][get_data] set_mode is training
[Utils][get_data] path_raw:  ../data/input_create_model/processed/splitted_datasets/TCGA/training/Lung_Squamous_Cell_Carcinoma
[Utils][get_data] tumor_type & source columns added to this set ... -> DONE
[Utils][get_data] You are taking 15% of the samples for Lung_Squamous_Cell_Carcinoma tumor type
[Utils][get_data] You are performing a train-test split
[Utils][get_data] Divide data in Lung_Squamous_Cell_Carcinoma in Training/Validation ... -> DONE
[Utils][get_data] df_train.shape: (52, 1284)
[Utils][get_data] df_validation.shape: (14, 1284)
[Utils][get_data] train_labels.shape: (52, 11459)
[Utils][get_data] valid_labels.shape: (14, 11459)
[Utils][get_data] train_gn.shape: (52, 11459)
[Utils][get_data] valid_gn.shape: (14, 11459)
[Utils][get_data] path_raw:  ../data/input_create_model/processed/splitted_datasets/TCGA/training/Cholangiocarcinoma
[Utils][get_data] tumor_type & source columns added to this set ... -> DONE
[Utils][get_data] You are tak

Let's take a closer look at the data stored in the `data_raw` variable:

* `df_train`
 and `df_val`: these contain the training and validation sets with RBP expression in *log2(TPM+1)*, with dimensions *num_samples x n_rbps*.
* `train_labels` and `valid_labels`: these hold the training and validation sets with transcript expression in *log2(TPM+1)*, with dimensions *num_samples x n_transcripts*.
* `train_gn` and `valid_gn`: these store the training and validation sets with gene expression in *TPM* for each transcript, with dimensions *num_samples x n_transcripts*.

Therefore, if you want to introduce different data instead of TCGA, it should have the format mentioned here.

In [23]:
df_train = data_raw.df_train.drop(['source', 'tumor_type'], axis=1)
df_val = data_raw.df_val.drop(['source','tumor_type'], axis=1)

train_labels = data_raw.train_labels
valid_labels = data_raw.valid_labels

train_gn = data_raw.train_gn
valid_gn = data_raw.valid_gn

In [25]:
df_train.head()

Unnamed: 0,A1CF,AATF,ABCF1,ABT1,ACAA2,ACIN1,ACTB,ACTN1,ACTN4,ADAR,...,ZNF598,ZNF622,ZNF638,ZNF768,ZNFX1,ZNHIT6,ZRANB2,ZRSR1,ZRSR2,ZYX
TCGA-33-4533-01,0.070387,5.527001,5.32189,4.051357,5.597576,5.75543,12.267192,7.127147,7.295404,6.233582,...,4.231916,5.884603,5.437253,4.507785,2.526098,4.414803,5.13705,0.367373,4.204,6.016536
TCGA-90-7964-01,0.014355,5.640976,5.593659,3.828865,3.685968,6.430408,11.635853,6.484289,8.40097,6.305439,...,5.077624,4.524808,5.750032,5.04435,3.662187,3.838908,4.606989,0.176318,3.092509,7.434058
TCGA-LA-A7SW-01,0.454178,6.271676,6.24868,4.618787,4.041774,6.714093,12.719414,8.564711,9.030559,6.183862,...,6.326768,4.832396,6.322818,4.854982,3.769736,3.873821,5.0237,0.0,4.225762,7.277917
TCGA-66-2793-01,0.0,5.066562,6.45779,4.438928,5.535708,5.759159,11.43592,5.795485,8.420812,6.688343,...,6.731032,4.109319,6.553823,6.188399,3.375697,4.516009,6.128647,0.0,3.411442,6.39404
TCGA-21-1076-01,0.02148,4.748974,4.878253,3.51347,5.373491,6.320794,12.007832,7.067684,8.458759,6.723045,...,4.267917,4.571335,6.087984,4.030755,4.488948,3.302328,5.143042,0.0,4.063533,7.481918


In [26]:
train_labels.head()

Unnamed: 0,ENST00000006015,ENST00000006750,ENST00000009530,ENST00000027335,ENST00000034275,ENST00000052569,ENST00000053243,ENST00000066544,ENST00000074304,ENST00000078429,...,ENST00000631199,ENST00000631253,ENST00000631589,ENST00000631641,ENST00000631857,ENST00000632133,ENST00000632375,ENST00000633063,ENST00000633317,ENST00000633691
TCGA-33-4533-01,4.133355,1.214107,6.648637,0.0,3.947667,3.466004,0.59455,4.519834,2.606401,5.209098,...,0.0,0.042644,0.286878,0.0,0.111034,0.275004,0.0,3.278002,0.0,1.163497
TCGA-90-7964-01,0.321936,1.042632,7.993168,0.400527,0.505893,3.423589,0.321936,3.71477,2.664475,3.994575,...,0.0,0.367373,0.0,0.0,0.356135,0.432948,0.137501,2.792876,0.0,0.879716
TCGA-LA-A7SW-01,1.608791,0.0,5.670969,0.097613,0.0,4.131752,0.0,3.829887,1.835903,4.839923,...,0.0,0.310339,0.0,0.0,0.432948,0.400527,0.0,2.833883,0.0,1.604087
TCGA-66-2793-01,0.903017,0.555816,5.37154,0.201631,3.729004,5.146087,0.516027,4.827861,2.61351,4.95232,...,0.0,0.0,0.0,0.250954,0.678058,0.516027,0.0,3.484117,0.0,0.0
TCGA-21-1076-01,0.327691,2.467268,9.127541,0.035623,3.657098,4.031684,3.106875,3.668464,3.008072,4.152988,...,0.0,0.097613,0.0,0.0,0.891411,1.675827,0.042646,3.421571,0.0,0.0


In [27]:
train_gn.head()

Unnamed: 0,ENST00000006015,ENST00000006750,ENST00000009530,ENST00000027335,ENST00000034275,ENST00000052569,ENST00000053243,ENST00000066544,ENST00000074304,ENST00000078429,...,ENST00000631199,ENST00000631253,ENST00000631589,ENST00000631641,ENST00000631857,ENST00000632133,ENST00000632375,ENST00000633063,ENST00000633317,ENST00000633691
TCGA-33-4533-01,17.940144,2.760017,828.810379,0.070002,42.970255,16.789811,0.510002,39.15876,6.429808,39.959424,...,7.250101,53.861153,10.779857,104.690463,9.420129,4177.718746,15.960231,13.340047,15.960231,61.960493
TCGA-90-7964-01,0.250007,2.260061,2078.027895,0.34999,37.908828,10.509796,0.250007,30.740166,7.910219,16.979417,...,1.860028,77.218858,9.030089,54.65091,9.640115,2456.68882,9.490262,9.820205,9.490262,77.1814
TCGA-LA-A7SW-01,2.309972,1.909963,439.645447,0.070002,47.520221,16.619594,0.049999,52.069704,5.990013,43.961454,...,3.259967,43.988887,5.660169,81.001807,5.980055,2038.368669,1.94,14.920286,1.94,102.899059
TCGA-66-2793-01,0.869973,0.99003,281.813103,0.149998,41.67948,35.779459,0.430012,59.659328,8.939776,38.920624,...,1.460044,53.689688,6.830015,128.301013,71.73871,1461.565747,4.489886,19.720536,4.489886,121.615908
TCGA-21-1076-01,0.310004,11.190079,5615.525038,0.099998,37.679484,18.22517,7.949859,23.120124,8.810139,24.119871,...,12.585022,71.771825,11.150227,78.746159,35.480236,17462.184252,7.674869,18.900149,7.674869,77.481564


In [24]:
print(f'Number of samples for training: {df_train.shape[0]}')
print(f'Number of samples for validation: {df_val.shape[0]}')

Number of samples for training: 993
Number of samples for validation: 265


### 2.2. Scale the RBP input data
Next, we'll scale the dataset with RBP expression. We'll fit the training set to the standard scaler to compute the scaling parameters and then transform both the training and validation sets.

Afterwards, we'll constrain the values between *-2sigma* and *2sigma*, shift them by *2sigma*, and divide by *4sigma*, so that the expression values are normalized between 0 and 1.

The `data_scale` variable will contain the scaled training and validation sets, along with the scaler object and the sigma value used (a value that tends to 1).

In [30]:
data_scale = scale_rbp_input_data(df_train, df_val)

In [31]:
data_scale.scaledTrain_df.head()

Unnamed: 0,A1CF,AATF,ABCF1,ABT1,ACAA2,ACIN1,ACTB,ACTN1,ACTN4,ADAR,...,ZNF598,ZNF622,ZNF638,ZNF768,ZNFX1,ZNHIT6,ZRANB2,ZRSR1,ZRSR2,ZYX
TCGA-33-4533-01,0.396504,0.586747,0.549004,0.63823,0.545793,0.431549,0.73403,0.664078,0.368639,0.563407,...,0.39282,1.0,0.386229,0.34604,0.135015,0.911408,0.600449,0.591811,0.707293,0.299165
TCGA-90-7964-01,0.385842,0.631865,0.646026,0.536686,0.214678,0.66892,0.507299,0.535379,0.654791,0.583991,...,0.658109,0.465896,0.495723,0.54268,0.474366,0.718825,0.428438,0.446836,0.296959,0.67217
TCGA-LA-A7SW-01,0.46953,0.881535,0.879868,0.897202,0.276308,0.768684,0.896434,0.951876,0.817747,0.549164,...,1.0,0.599804,0.696238,0.47328,0.50649,0.7305,0.563665,0.313044,0.715327,0.631083
TCGA-66-2793-01,0.383111,0.404478,0.95452,0.815115,0.535077,0.432861,0.435499,0.397482,0.659927,0.693679,...,1.0,0.285013,0.777106,0.961948,0.388791,0.945252,0.922233,0.313044,0.414701,0.3985
TCGA-21-1076-01,0.387198,0.278757,0.390626,0.392742,0.506979,0.630372,0.640887,0.652174,0.669749,0.70362,...,0.404113,0.486151,0.61403,0.171219,0.721319,0.539389,0.602393,0.313044,0.655436,0.684763


In [32]:
data_scale.scaledValidation_df.head()

Unnamed: 0,A1CF,AATF,ABCF1,ABT1,ACAA2,ACIN1,ACTB,ACTN1,ACTN4,ADAR,...,ZNF598,ZNF622,ZNF638,ZNF768,ZNFX1,ZNHIT6,ZRANB2,ZRSR1,ZRSR2,ZYX
TCGA-33-4566-01,0.383111,0.446194,0.469529,0.65162,0.169466,0.266505,0.671017,0.589874,0.175058,0.60074,...,0.302774,0.899345,0.173778,0.0,0.302199,0.352018,0.309458,0.313044,0.0,0.413533
TCGA-60-2706-01,0.383111,0.257331,0.838424,0.907845,0.526877,0.567132,0.59727,0.558254,0.615572,0.774972,...,0.939604,0.435736,0.299208,1.0,0.527763,0.409715,0.420132,0.43711,0.199989,0.532922
TCGA-90-6837-11,0.383111,0.435958,0.549387,0.468396,0.613895,0.386991,0.726741,0.621069,0.912473,0.631288,...,0.316958,0.408216,0.281443,0.303724,0.776966,0.516103,0.370867,0.512642,0.474307,0.915242
TCGA-22-1000-01,0.383111,0.393612,0.389386,0.354163,0.431194,0.362745,0.577524,0.746405,0.633684,0.487077,...,0.488239,0.315244,0.193619,0.204915,0.584074,0.485913,0.352935,0.313044,0.295721,0.704985
TCGA-85-8277-01,0.383111,0.810636,0.945909,0.111489,0.220996,0.364053,0.369337,0.489238,0.438195,0.65027,...,0.637849,0.468603,0.1729,0.534934,0.324916,0.528255,0.401149,0.649664,0.468899,0.337096


### 2.3. Create training and validation data loaders
Next, we'll create the training and validation data loaders (`train_loader` and `val_loader`). These loaders will use the normalized RBP expression, transcript expression, and gene expression, needed to fit into the model. In this step, we use the `config` object to determine the batch size. For the validation set, since we are testing, the batch size is set to twice the value assigned in the config.


In [33]:
train_loader = create_data_loader(scaled_rbps_df=data_scale.scaledTrain_df, labels_df=train_labels, gn_df=train_gn, config=config)
val_loader = create_data_loader(scaled_rbps_df=data_scale.scaledValidation_df, labels_df=valid_labels, gn_df=valid_gn, config=config, set_mode='test')

[Utils][create_data_loader] Creating a training data_loader
[Utils][create_data_loader] Convert to PyTorch dataset ... -> DONE 
[Utils][create_data_loader] Creating a test data_loader
[Utils][create_data_loader] Convert to PyTorch dataset ... -> DONE 


## 3. Create the DeepRBP model object and get the optimizer

In [34]:
model = DeepRBP(n_inputs=df_train.shape[1], n_outputs=train_labels.shape[1], config=config)
optimizer = build_optimizer(model, config.optimizer, config.learning_rate)
print(model)

You are using a model with 2 hidden layers
DeepRBP(
  (linear0): Linear(in_features=1282, out_features=1024, bias=True)
  (bn0): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (a_function0): ReLU()
  (linear1): Linear(in_features=1024, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (a_function1): ReLU()
  (final_layer): Linear(in_features=256, out_features=11459, bias=True)
)


## 4. Train the DeepRBP model

In [35]:
model.fit(
        epochs=config.epochs, 
        train_loader=train_loader, 
        val_loader=val_loader, 
        optimizer=optimizer, 
        path=path_save_files, 
        model_name='model.pt')

Training:   1%|▎                             | 1/100 [00:00<01:16,  1.30epoch/s]

Epoch 0/100 - Training Loss: {'loss': 9.460707664489746}, Validation Loss: {'val_loss': 7.767425060272217}


Training:   6%|█▊                            | 6/100 [00:03<00:50,  1.86epoch/s]

Epoch 5/100 - Training Loss: {'loss': 0.6503445506095886}, Validation Loss: {'val_loss': 0.5434252619743347}


Training:  11%|███▏                         | 11/100 [00:06<00:46,  1.91epoch/s]

Epoch 10/100 - Training Loss: {'loss': 0.22756969928741455}, Validation Loss: {'val_loss': 0.23799046874046326}


Training:  16%|████▋                        | 16/100 [00:08<00:43,  1.91epoch/s]

Epoch 15/100 - Training Loss: {'loss': 0.18126490712165833}, Validation Loss: {'val_loss': 0.19313272833824158}


Training:  21%|██████                       | 21/100 [00:11<00:41,  1.92epoch/s]

Epoch 20/100 - Training Loss: {'loss': 0.16029749810695648}, Validation Loss: {'val_loss': 0.1718820482492447}


Training:  26%|███████▌                     | 26/100 [00:13<00:39,  1.90epoch/s]

Epoch 25/100 - Training Loss: {'loss': 0.14951136708259583}, Validation Loss: {'val_loss': 0.15762443840503693}


Training:  31%|████████▉                    | 31/100 [00:16<00:36,  1.88epoch/s]

Epoch 30/100 - Training Loss: {'loss': 0.13836590945720673}, Validation Loss: {'val_loss': 0.14914976060390472}


Training:  36%|██████████▍                  | 36/100 [00:19<00:35,  1.79epoch/s]

Epoch 35/100 - Training Loss: {'loss': 0.13165168464183807}, Validation Loss: {'val_loss': 0.14232420921325684}


Training:  41%|███████████▉                 | 41/100 [00:22<00:35,  1.68epoch/s]

Epoch 40/100 - Training Loss: {'loss': 0.1266697496175766}, Validation Loss: {'val_loss': 0.13670961558818817}


Training:  46%|█████████████▎               | 46/100 [00:25<00:32,  1.64epoch/s]

Epoch 45/100 - Training Loss: {'loss': 0.12023498117923737}, Validation Loss: {'val_loss': 0.13191097974777222}


Training:  51%|██████████████▊              | 51/100 [00:28<00:29,  1.68epoch/s]

Epoch 50/100 - Training Loss: {'loss': 0.11590653657913208}, Validation Loss: {'val_loss': 0.12817241251468658}


Training:  56%|████████████████▏            | 56/100 [00:31<00:26,  1.68epoch/s]

Epoch 55/100 - Training Loss: {'loss': 0.11284075677394867}, Validation Loss: {'val_loss': 0.12525072693824768}


Training:  61%|█████████████████▋           | 61/100 [00:34<00:23,  1.67epoch/s]

Epoch 60/100 - Training Loss: {'loss': 0.10986991971731186}, Validation Loss: {'val_loss': 0.12374722957611084}


Training:  66%|███████████████████▏         | 66/100 [00:37<00:19,  1.73epoch/s]

Epoch 65/100 - Training Loss: {'loss': 0.10635685920715332}, Validation Loss: {'val_loss': 0.12124522030353546}


Training:  71%|████████████████████▌        | 71/100 [00:40<00:17,  1.65epoch/s]

Epoch 70/100 - Training Loss: {'loss': 0.10363112390041351}, Validation Loss: {'val_loss': 0.11846330016851425}


Training:  76%|██████████████████████       | 76/100 [00:43<00:15,  1.58epoch/s]

Epoch 75/100 - Training Loss: {'loss': 0.1018759235739708}, Validation Loss: {'val_loss': 0.11704249680042267}


Training:  81%|███████████████████████▍     | 81/100 [00:46<00:11,  1.65epoch/s]

Epoch 80/100 - Training Loss: {'loss': 0.10018760710954666}, Validation Loss: {'val_loss': 0.11553354561328888}


Training:  86%|████████████████████████▉    | 86/100 [00:49<00:08,  1.71epoch/s]

Epoch 85/100 - Training Loss: {'loss': 0.09544279426336288}, Validation Loss: {'val_loss': 0.11382366716861725}


Training:  91%|██████████████████████████▍  | 91/100 [00:52<00:05,  1.72epoch/s]

Epoch 90/100 - Training Loss: {'loss': 0.09476589411497116}, Validation Loss: {'val_loss': 0.11165716499090195}


Training:  96%|███████████████████████████▊ | 96/100 [00:55<00:02,  1.77epoch/s]

Epoch 95/100 - Training Loss: {'loss': 0.09429702907800674}, Validation Loss: {'val_loss': 0.11143597215414047}


Training: 100%|████████████████████████████| 100/100 [00:57<00:00,  1.74epoch/s]

Model saved succesfully in ./output/5303fd9c





([{'loss': 9.460707664489746},
  {'loss': 6.607574939727783},
  {'loss': 4.02331018447876},
  {'loss': 2.2119243144989014},
  {'loss': 1.1717698574066162},
  {'loss': 0.6503445506095886},
  {'loss': 0.42609813809394836},
  {'loss': 0.3288208544254303},
  {'loss': 0.2753663659095764},
  {'loss': 0.24565927684307098},
  {'loss': 0.22756969928741455},
  {'loss': 0.21668753027915955},
  {'loss': 0.20208634436130524},
  {'loss': 0.19472339749336243},
  {'loss': 0.18751324713230133},
  {'loss': 0.18126490712165833},
  {'loss': 0.1753537505865097},
  {'loss': 0.17490161955356598},
  {'loss': 0.16850495338439941},
  {'loss': 0.16486100852489471},
  {'loss': 0.16029749810695648},
  {'loss': 0.1580357849597931},
  {'loss': 0.1569432020187378},
  {'loss': 0.15430499613285065},
  {'loss': 0.15190644562244415},
  {'loss': 0.14951136708259583},
  {'loss': 0.14530618488788605},
  {'loss': 0.1434699296951294},
  {'loss': 0.14069858193397522},
  {'loss': 0.1402304321527481},
  {'loss': 0.13836590945720

## 5. Save the trained DeepRBP model parameters, the scaler and sigma used in results folder

In [40]:
filename_scaler = os.path.join(path_save_files, 'scaler_sfs.joblib')
filename_sigma = os.path.join(path_save_files, 'sigma_sfs.txt')
joblib.dump(data_scale.scaler_sfs, filename_scaler)
np.savetxt(filename_sigma, [data_scale.sigma_sfs])