# BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

## CELL LINE: K562

In [1]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# create a database to store optuna studies with sqlite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [3]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [4]:
# Cell line analysed in this notebook: entry 5 of CELL_LINES, i.e. 'K562'
# (echoed as the cell output).
cell_line = CELL_LINES[5]
cell_line

'K562'

---

In [5]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [6]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [7]:
# Reload the scores saved by earlier runs so this notebook adds to them.
# On a first run the file does not exist yet, so start from an empty store
# instead of failing here (every later cell depends on results_dict).
try:
    with open('results_dict.pickle', 'rb') as fin:
        # NOTE: pickle.load can execute arbitrary code; only load a file
        # written by these notebooks.
        results_dict = pickle.load(fin)
except FileNotFoundError:
    results_dict = {}

# Re-wrap as nested defaultdicts so results_dict[cell_line][task][name] can be
# assigned without creating the intermediate levels by hand.
results_dict = defaultdict(lambda: defaultdict(dict), results_dict)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [7]:
# First entry of TASKS: 'active_E_vs_inactive_E' (echoed as the cell output).
task = TASKS[0]
task

'active_E_vs_inactive_E'

In [9]:
# Build the data-loading pipeline from the pickle file named after the current task.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


### 1. FFNN

In [8]:
# Model class used by the following cells: passed to Kfold_CV and used, via
# model.__name__, in the study names and result keys.
model=FFNN

In [11]:
# IMBALANCED
# 3-fold cross-validation of the current model with the 'TPE' sampler.
# The augmentation mode is forwarded as type_augm_genfeatures and is also
# appended to the study name and test-model path so each mode is stored separately.
type_augm_genfeatures = 'smote'

kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    type_augm_genfeatures=type_augm_genfeatures,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:11:53,742][0m Using an existing study with name 'K562_active_E_vs_inactive_E_FFNN_smote_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16219378913991303
  Params: 
    dropout_l0: 0.0
    lr: 0.0012310377211073446
    n_layers: 1
    n_units_l0: 128
    optimizer: Adam
    weight_decay: 0.009841838092178281

AUPRC test score: 0.16011940526804078


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:11:55,924][0m Using an existing study with name 'K562_active_E_vs_inactive_E_FFNN_smote_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16827608778971453
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    lr: 4.8514294313735435e-05
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 64
    optimizer: RMSprop
    weight_decay: 0.004683515072552518

AUPRC test score: 0.17065584203541573


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:11:58,065][0m Using an existing study with name 'K562_active_E_vs_inactive_E_FFNN_smote_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16956568449297407
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    lr: 0.0001857319156178135
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 64
    optimizer: Adam
    weight_decay: 0.00033637939831044635

AUPRC test score: 0.18528740201451152



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.17202


In [12]:
# Record this run's scores under cell line -> task -> '<model name>_<augmentation mode>'.
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = kf_CV.scores_dict

In [13]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

In [14]:
# IMBALANCED
# Same 3-fold cross-validation as the previous run, but with the 'double'
# augmentation mode. The mode is part of the study name and test-model path,
# so it does not collide with the other mode's entries.
type_augm_genfeatures = 'double'

kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    type_augm_genfeatures=type_augm_genfeatures,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:11:59,837][0m Using an existing study with name 'K562_active_E_vs_inactive_E_FFNN_double_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1677854905698403
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    lr: 0.00015084299101051534
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 128
    optimizer: Nadam
    weight_decay: 0.0024008620194533293

AUPRC test score: 0.1728176878689416


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:00,818][0m Using an existing study with name 'K562_active_E_vs_inactive_E_FFNN_double_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1694133113594793
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.2
    dropout_l2: 0.5
    lr: 0.00012879035355452782
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 32
    n_units_l2: 4
    optimizer: RMSprop
    weight_decay: 0.00010338545774886194

AUPRC test score: 0.16360713833317334


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:01,672][0m Using an existing study with name 'K562_active_E_vs_inactive_E_FFNN_double_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1713202482979311
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    dropout_l2: 0.4
    dropout_l3: 0.5
    lr: 0.0009722589684935593
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 4
    n_units_l3: 16
    optimizer: Nadam
    weight_decay: 0.0010135573545622541

AUPRC test score: 0.17771453884486205



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.17138


In [15]:
# Record this run's scores under cell line -> task -> '<model name>_<augmentation mode>'.
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = kf_CV.scores_dict

In [16]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [12]:
# Switch the model class to CNN for the next run; model.__name__ also feeds
# the study name and the result key.
model=CNN

In [12]:
# IMBALANCED
# 3-fold cross-validation of the current model with the 'TPE' sampler,
# this time with sequence=True. Study name and test-model path are built
# from cell line, task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:46,549][0m Using an existing study with name 'K562_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.09978065942930114
  Params: 
    dropout_l0: 0.3
    kernel_size_l0: 5
    lr: 0.00012553999377602288
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 32
    weight_decay: 0.0016199060493007837

AUPRC test score: 0.0951873587475576


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:49,389][0m Using an existing study with name 'K562_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.0988930744073319
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    kernel_size_l0: 11
    kernel_size_l1: 5
    lr: 5.54911804804769e-05
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 32
    weight_decay: 0.009098234131602656



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:50,069][0m Using an existing study with name 'K562_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.09326586105610829


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.09695771162235443
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 5
    lr: 3.799406964424947e-05
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 32
    weight_decay: 0.00020887449695996396

AUPRC test score: 0.09966244852811575



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.09604


In [13]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [14]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [16]:
# Second entry of TASKS: 'active_P_vs_inactive_P' (echoed as the cell output).
task = TASKS[1]
task

'active_P_vs_inactive_P'

In [18]:
# Rebuild the data-loading pipeline for the new task (reads '<task>.pickle').
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---

### 1. FFNN

In [17]:
# Switch the model class back to FFNN for this task's first run.
model=FFNN

In [20]:
# 3-fold cross-validation of the current model on the current task with the
# 'TPE' sampler. Study name and test-model path are built from cell line,
# task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:03,363][0m Using an existing study with name 'K562_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3380595667968206
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.3
    dropout_l2: 0.0
    dropout_l3: 0.5
    lr: 0.0005332619896751221
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 64
    n_units_l2: 16
    n_units_l3: 32
    optimizer: Adam
    weight_decay: 0.004412853454868649

AUPRC test score: 0.346751362564637


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:04,171][0m Using an existing study with name 'K562_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.350482604022387
  Params: 
    dropout_l0: 0.4
    lr: 2.477173483044694e-05
    n_layers: 1
    n_units_l0: 32
    optimizer: Adam
    weight_decay: 0.0006494048404091282

AUPRC test score: 0.3352240270461264


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:04,828][0m Using an existing study with name 'K562_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.34804258357286855
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.4
    lr: 0.00026726630089396025
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 16
    n_units_l2: 64
    n_units_l3: 4
    optimizer: Adam
    weight_decay: 0.0013683895425364268

AUPRC test score: 0.34359407251470625



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.34186


In [21]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [22]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [21]:
# Switch the model class to CNN for this task's second run.
model=CNN

In [18]:
# 3-fold cross-validation of the current model on the current task with the
# 'TPE' sampler and sequence=True. Study name and test-model path are built
# from cell line, task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:51,698][0m Using an existing study with name 'K562_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2220656349623457
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 11
    lr: 0.0027388698322440117
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 32
    weight_decay: 0.00038939741174232517



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:52,580][0m Using an existing study with name 'K562_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.2506220632779133


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.23292874499208988
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    dropout_l2: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 11
    kernel_size_l2: 11
    lr: 0.00024803777510764175
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 64
    out_channels_l2: 96
    weight_decay: 0.07700078693031231



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:53,359][0m Using an existing study with name 'K562_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.24488248850567054


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.23553865239452412
  Params: 
    dropout_l0: 0
    kernel_size_l0: 15
    lr: 0.00021244455977337997
    n_layers: 1
    optimizer: RMSprop
    out_channels_l0: 32
    weight_decay: 0.0005241393348139716

AUPRC test score: 0.23183123862186264



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.24245


In [19]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [20]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [25]:
# Third entry of TASKS: 'active_E_vs_active_P' (echoed as the cell output).
task = TASKS[2]
task

'active_E_vs_active_P'

In [24]:
# Rebuild the data-loading pipeline for the new task (reads '<task>.pickle').
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [26]:
# Switch the model class back to FFNN for this task's first run.
model=FFNN

In [26]:
# 3-fold cross-validation of the current model on the current task with the
# 'TPE' sampler. Study name and test-model path are built from cell line,
# task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:07,299][0m Using an existing study with name 'K562_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.8128214035270347
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.2
    dropout_l2: 0.4
    dropout_l3: 0.4
    lr: 0.00029769984912220675
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 32
    n_units_l3: 4
    optimizer: Adam
    weight_decay: 0.00016478680755945124

AUPRC test score: 0.7854633119056109


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:07,527][0m Using an existing study with name 'K562_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:07,748][0m Using an existing study with name 'K562_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


  Value:  0.7245543772030697
  Params: 
    dropout_l0: 0.4
    lr: 0.003674307342965946
    n_layers: 1
    n_units_l0: 256
    optimizer: Nadam
    weight_decay: 0.03197094386568194

AUPRC test score: 0.7893468130245469


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3113636363636364
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    dropout_l2: 0.0
    dropout_l3: 0.0
    lr: 0.042745343586120754
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 128
    n_units_l2: 16
    n_units_l3: 4
    optimizer: RMSprop
    weight_decay: 0.0001300309601605759

AUPRC test score: 0.7890401006034164



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.78795


In [27]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [28]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [30]:
# Switch the model class to CNN for this task's second run.
model=CNN

In [24]:
# 3-fold cross-validation of the current model on the current task with the
# 'TPE' sampler and sequence=True. Study name and test-model path are built
# from cell line, task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:57,538][0m Using an existing study with name 'K562_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7286561239644743
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0
    kernel_size_l0: 15
    kernel_size_l1: 5
    lr: 0.0004982788834045453
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 64
    weight_decay: 0.021726943154521127

AUPRC test score: 0.6855680410677872




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:57,829][0m Using an existing study with name 'K562_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7015452722820358
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    dropout_l2: 0
    dropout_l3: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 5
    kernel_size_l2: 15
    kernel_size_l3: 11
    lr: 0.0011585087764247083
    n_layers: 4
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 32
    out_channels_l2: 256
    out_channels_l3: 512
    weight_decay: 0.00042547132851450213



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:17:58,041][0m Using an existing study with name 'K562_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.6008439690405243


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7275269835889884
  Params: 
    dropout_l0: 0.4
    kernel_size_l0: 11
    lr: 0.003082244888755402
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 64
    weight_decay: 0.00013395551039375276

AUPRC test score: 0.6625969489850172



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.64967


In [25]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [26]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [34]:
# Fourth entry of TASKS: 'inactive_E_vs_inactive_P' (echoed as the cell output).
task = TASKS[3]
task

'inactive_E_vs_inactive_P'

In [30]:
# Rebuild the data-loading pipeline for the new task (reads '<task>.pickle').
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [35]:
# Switch the model class back to FFNN for this task's first run.
model=FFNN

In [32]:
# 3-fold cross-validation of the current model on the current task with the
# 'TPE' sampler. Study name and test-model path are built from cell line,
# task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:13,851][0m Using an existing study with name 'K562_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5819520169868239
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.4
    lr: 3.50561735656237e-05
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 16
    optimizer: Nadam
    weight_decay: 0.00021922031857164005

AUPRC test score: 0.580510224033992


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:17,342][0m Using an existing study with name 'K562_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5756115192740492
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.5
    dropout_l3: 0.0
    lr: 0.001602823665786899
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 16
    n_units_l2: 64
    n_units_l3: 16
    optimizer: RMSprop
    weight_decay: 0.0006936528791634974

AUPRC test score: 0.5812594405034212


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:19,468][0m Using an existing study with name 'K562_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5948708238528561
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.3
    dropout_l2: 0.4
    lr: 0.00020452329281468794
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 128
    n_units_l2: 16
    optimizer: Nadam
    weight_decay: 0.00013712344273594207

AUPRC test score: 0.6041359617355831



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.58864


In [37]:
# One-off repair of a saved test checkpoint: load it, attach the hyper-parameters
# of CV iteration 2 (copied from that iteration's "Best trial" printout above),
# and write it back to the same path.
# NOTE(review): this import sits mid-notebook; consider moving it to the first cell.
from BIOINF_tesi.visual import parse_as_dict

# Index used in the checkpoint file name -- presumably the CV iteration; confirm.
i=2

d = torch.load(f'{cell_line}_{model.__name__}_{task}_{i}_test_.pt', map_location=torch.device(device))

# Parameter block pasted verbatim from the study output.
s =  """dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.5
    dropout_l3: 0.0
    lr: 0.001602823665786899
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 16
    n_units_l2: 64
    n_units_l3: 16
    optimizer: RMSprop
    weight_decay: 0.0006936528791634974"""

# NOTE(review): per the printed output, parse_as_dict returns every numeric field
# as a float (e.g. n_layers -> 4.0); confirm the code that reads 'model_params'
# accepts that.
d['model_params']=parse_as_dict(s)
print(d['model_params'])

# Overwrites the original checkpoint in place.
torch.save(d, f'{cell_line}_{model.__name__}_{task}_{i}_test_.pt')

OrderedDict([('dropout_l0', 0.0), ('dropout_l1', 0.3), ('dropout_l2', 0.5), ('dropout_l3', 0.0), ('lr', 0.001602823665786899), ('n_layers', 4.0), ('n_units_l0', 64.0), ('n_units_l1', 16.0), ('n_units_l2', 64.0), ('n_units_l3', 16.0), ('optimizer', 'RMSprop'), ('weight_decay', 0.0006936528791634974)])


In [33]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [34]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [29]:
# Switch the model class to CNN for this task's second run.
model=CNN

In [30]:
# 3-fold cross-validation of the current model on the current task with the
# 'TPE' sampler and sequence=True. Study name and test-model path are built
# from cell line, task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:18:03,436][0m Using an existing study with name 'K562_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.591144432230329
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    kernel_size_l0: 5
    kernel_size_l1: 15
    lr: 0.0001820835437571587
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 32
    weight_decay: 0.006765423040120965



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:18:03,724][0m Using an existing study with name 'K562_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.5871187361009541


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:18:03,972][0m Using an existing study with name 'K562_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.571171052848243
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    dropout_l2: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 11
    kernel_size_l2: 5
    lr: 1.0030637855044293e-05
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 32
    out_channels_l2: 96
    weight_decay: 0.004065285254537316

AUPRC test score: 0.5352137027949346


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5551096218445121
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    kernel_size_l0: 11
    kernel_size_l1: 15
    lr: 0.001107600674576402
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 96
    weight_decay: 0.019164801464340447

AUPRC test score: 0.5700204649743834



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.56412

In [31]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [32]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [35]:
# Fifth entry of TASKS: 'active_EP_vs_inactive_rest' (echoed as the cell output).
task = TASKS[4]
task

'active_EP_vs_inactive_rest'

In [36]:
# Rebuild the data-loading pipeline for the new task (reads '<task>.pickle').
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [37]:
# Unlike the other tasks, this FFNN section has no separate `model=FFNN` cell,
# and the previous section leaves `model` bound to CNN. Bind it here so a
# top-to-bottom run trains the FFNN and stores the study and results under
# the FFNN name instead of silently reusing CNN with sequence=False.
model = FFNN

# 3-fold cross-validation of the FFNN on the current task with the 'TPE'
# sampler. Study name and test-model path are built from cell line, task
# and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:23,644][0m Using an existing study with name 'K562_active_EP_vs_inactive_rest_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.24976247049442898
  Params: 
    dropout_l0: 0.4
    lr: 0.003438936914934226
    n_layers: 1
    n_units_l0: 32
    optimizer: Adam
    weight_decay: 0.06493399527089351

AUPRC test score: 0.24378712698281457


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:26,623][0m Using an existing study with name 'K562_active_EP_vs_inactive_rest_FFNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.27158689336547526
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    lr: 1.527256203374856e-05
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 32
    optimizer: RMSprop
    weight_decay: 0.0009640697605213797

AUPRC test score: 0.2541543567751261


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:12:28,612][0m Using an existing study with name 'K562_active_EP_vs_inactive_rest_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2637937075091922
  Params: 
    dropout_l0: 0.3
    lr: 0.00027857622669425354
    n_layers: 1
    n_units_l0: 64
    optimizer: Adam
    weight_decay: 0.0002545785445432793

AUPRC test score: 0.26238839435842365



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.25344


In [38]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [39]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [35]:
# Switch the model class to CNN for this task's second run.
model=CNN

In [36]:
# 3-fold cross-validation of the current model on the current task with the
# 'TPE' sampler and sequence=True. Study name and test-model path are built
# from cell line, task and model name.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:18:10,912][0m Using an existing study with name 'K562_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.139048418634575
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    kernel_size_l2: 15
    lr: 0.009227513116884434
    n_layers: 3
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 64
    out_channels_l2: 96
    weight_decay: 0.006337792359963763

AUPRC test score: 0.13659568120528381


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:18:11,398][0m Using an existing study with name 'K562_active_EP_vs_inactive_rest_CNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.19997609406816955
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 11
    kernel_size_l2: 5
    kernel_size_l3: 5
    lr: 0.000806439220515186
    n_layers: 4
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 96
    out_channels_l2: 256
    out_channels_l3: 256
    weight_decay: 0.0023283575781733686

AUPRC test score: 0.18857143471901733


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-26 07:18:11,716][0m Using an existing study with name 'K562_active_EP_vs_inactive_rest_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.18611026449949544
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 0.00014725046227743286
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 32
    weight_decay: 0.010034936987591187

AUPRC test score: 0.1824606721545208



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.16921


In [37]:
# Record this run's scores under cell line -> task -> model name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [38]:
# Persist the results collected so far. The outer defaultdict is converted to an
# OrderedDict before pickling (its lambda default factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---