## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: HEPG2

In [1]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# One-time setup: create the SQLite database that stores the Optuna studies.
# Left commented out; uncomment only when the database file has to be created.
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [3]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [4]:
# Pick the cell line analysed in this notebook by its position in CELL_LINES
# (index 4 evaluates to 'HEPG2' in the recorded output below).
cell_line = CELL_LINES[4]
cell_line

'HEPG2'

---

In [5]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [6]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [7]:
# Load the results saved by earlier runs so this notebook adds to them instead of
# starting over.
# NOTE(security): pickle.load can execute arbitrary code; only open a results file
# that this project wrote itself.
try:
    with open('results_dict.pickle', 'rb') as fin:
        saved_results = pickle.load(fin)
except FileNotFoundError:
    # First run: no results file exists yet, so start from an empty mapping.
    saved_results = {}

# Re-wrap in nested defaultdicts so results_dict[cell_line][task][model_name] can be
# assigned without creating the intermediate levels by hand.
results_dict = defaultdict(lambda: defaultdict(dict), saved_results)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [8]:
# Select the classification task for this section by its position in TASKS.
task = TASKS[0]
task

'active_E_vs_inactive_E'

In [9]:
# Build the data-loader pipeline for the current task, pointing it at '<task>.pickle'.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




### 1. FFNN

In [9]:
# Use FFNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=FFNN

In [10]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=False presumably selects the non-sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:14,417][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.34461230235589685
  Params: 
    dropout_l0: 0.4
    lr: 0.0021576922256036526
    n_layers: 1
    n_units_l0: 64
    optimizer: Nadam
    weight_decay: 0.0019464696791820569

AUPRC test score: 0.30299326208687005


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:14,856][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.35512637059862623
  Params: 
    dropout_l0: 0.4
    lr: 0.00045899492953244955
    n_layers: 1
    n_units_l0: 64
    optimizer: Nadam
    weight_decay: 0.0005422040805272998



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:15,254][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_FFNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.3487990289680023


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3403655946017572
  Params: 
    dropout_l0: 0.0
    lr: 0.0006608957924102211
    n_layers: 1
    n_units_l0: 64
    optimizer: Adam
    weight_decay: 0.00010198512970011743

AUPRC test score: 0.34595893719530785



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.33258


In [None]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [10]:
# Use CNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=CNN

In [11]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=True presumably selects the sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:00,512][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14063598978328698
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0
    kernel_size_l0: 15
    kernel_size_l1: 5
    lr: 0.0011192497589899114
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 32
    weight_decay: 0.00014622816854972708



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:05,484][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.13195522985003827


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.11347227975697324
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    dropout_l2: 0
    kernel_size_l0: 11
    kernel_size_l1: 11
    kernel_size_l2: 5
    lr: 0.00017584881129923064
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 96
    out_channels_l2: 64
    weight_decay: 0.006294208282606327



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:06,228][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.1157149950347567


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14553339901205997
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0
    kernel_size_l0: 11
    kernel_size_l1: 5
    lr: 1.734142191582984e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 32
    weight_decay: 0.0077286379938048865

AUPRC test score: 0.13798453735798627



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.12855


In [12]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [13]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [14]:
# Select the classification task for this section by its position in TASKS.
task = TASKS[1]
task

'active_P_vs_inactive_P'

In [15]:
# Build the data-loader pipeline for the current task, pointing it at '<task>.pickle'.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---

### 1. FFNN

In [13]:
# Use FFNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=FFNN

In [14]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=False presumably selects the non-sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:19,476][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.142325795182938
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    dropout_l2: 0.0
    lr: 0.0010482022189361125
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 32
    n_units_l2: 4
    optimizer: Nadam
    weight_decay: 0.003956782096372866

AUPRC test score: 0.16601433296906926


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:19,686][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14206012134583562
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    dropout_l2: 0.4
    lr: 0.08190424637713349
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 64
    n_units_l2: 16
    optimizer: RMSprop
    weight_decay: 0.014448774379742731

AUPRC test score: 0.1426805962542999


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:19,874][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14246460746460748
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    dropout_l2: 0.0
    dropout_l3: 0.4
    lr: 9.341904042058524e-05
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 128
    n_units_l2: 16
    n_units_l3: 32
    optimizer: Nadam
    weight_decay: 0.002719188503890816

AUPRC test score: 0.14117249372223298



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.14996


In [None]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [16]:
# Use CNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=CNN

In [17]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=True presumably selects the sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:16,159][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14428432960074555
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 15
    lr: 1.6072890360528345e-05
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 32
    weight_decay: 0.013712189694200698



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:17,287][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.14759311461002628


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1774029185190475
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    dropout_l2: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 15
    kernel_size_l2: 5
    lr: 0.00012396457839585287
    n_layers: 3
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 32
    out_channels_l2: 128
    weight_decay: 0.06932909806638518

AUPRC test score: 0.1688655893772809


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:18,601][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1567154433695652
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 11
    lr: 3.2393867505748626e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 96
    weight_decay: 0.0010483987710159643

AUPRC test score: 0.16659279637241375



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.16102


In [18]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [19]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [20]:
# Select the classification task for this section by its position in TASKS.
task = TASKS[2]
task

'active_E_vs_active_P'

In [21]:
# Build the data-loader pipeline for the current task, pointing it at '<task>.pickle'.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---
### 1. FFNN

In [17]:
# Use FFNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=FFNN

In [18]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=False presumably selects the non-sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:22,540][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:22,649][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7780236348074855
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.2
    dropout_l2: 0.4
    dropout_l3: 0.0
    lr: 0.0003082149764107701
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 128
    n_units_l2: 64
    n_units_l3: 32
    optimizer: Nadam
    weight_decay: 0.0005798740768213723

AUPRC test score: 0.7787056480265978


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:22,807][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7926214817335216
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    dropout_l2: 0.5
    lr: 0.008956164265347446
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 64
    optimizer: Nadam
    weight_decay: 0.004001217923626742

AUPRC test score: 0.7708154935520501


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7531508803610644
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.0
    dropout_l2: 0.5
    lr: 0.00021283856738588385
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 16
    n_units_l2: 32
    optimizer: Nadam
    weight_decay: 0.021147138081129408

AUPRC test score: 0.7622453470530369



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.77059


In [None]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [22]:
# Use CNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=CNN

In [23]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=True presumably selects the sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:23,554][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6745521606210675
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    dropout_l2: 0.4
    kernel_size_l0: 15
    kernel_size_l1: 11
    kernel_size_l2: 11
    lr: 0.03837315829669874
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 32
    out_channels_l2: 96
    weight_decay: 0.04789666981199902



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:24,817][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.338425925925926


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7129530683641129
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    dropout_l2: 0.5
    dropout_l3: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 5
    kernel_size_l2: 11
    kernel_size_l3: 11
    lr: 0.0003185113310791346
    n_layers: 4
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 32
    out_channels_l2: 96
    out_channels_l3: 256
    weight_decay: 0.0049442769021249946



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:25,863][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.6739333313359013


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.675210885487434
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    dropout_l2: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 15
    kernel_size_l2: 5
    lr: 0.00048371731651232936
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 96
    out_channels_l2: 64
    weight_decay: 0.06359178795156419

AUPRC test score: 0.7125649787404698



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.57497


In [24]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [25]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [26]:
# Select the classification task for this section by its position in TASKS.
task = TASKS[3]
task

'inactive_E_vs_inactive_P'

In [27]:
# Build the data-loader pipeline for the current task, pointing it at '<task>.pickle'.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---
### 1. FFNN

In [21]:
# Use FFNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=FFNN

In [22]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=False presumably selects the non-sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:26,147][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5541917016114689
  Params: 
    dropout_l0: 0.2
    lr: 0.00024114594487533076
    n_layers: 1
    n_units_l0: 32
    optimizer: Adam
    weight_decay: 0.01957758455949045

AUPRC test score: 0.5530124697612042


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:27,200][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5987803515868848
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.4
    lr: 0.0008662921816230916
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 64
    n_units_l2: 64
    n_units_l3: 16
    optimizer: Adam
    weight_decay: 0.00027025441301827295

AUPRC test score: 0.5941765166710042


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:27,907][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5895004158073325
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    dropout_l2: 0.0
    dropout_l3: 0.5
    lr: 0.00022975529164695645
    n_layers: 4
    n_units_l0: 32
    n_units_l1: 128
    n_units_l2: 64
    n_units_l3: 32
    optimizer: RMSprop
    weight_decay: 0.0012991028234900016

AUPRC test score: 0.5883148326958885



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.5785


In [None]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [28]:
# Use CNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=CNN

In [29]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=True presumably selects the sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:45,433][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5771526412066551
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 2.932875610681385e-05
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 96
    weight_decay: 0.00019849581036934485



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:47,281][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.574325173354808


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5800134424759591
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0
    kernel_size_l0: 15
    kernel_size_l1: 11
    lr: 0.00016866066721187343
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 32
    weight_decay: 0.003431920457464381



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:14:48,135][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.5826386867170965


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5836571411134077
  Params: 
    dropout_l0: 0
    kernel_size_l0: 5
    lr: 0.0003931376208904803
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 64
    weight_decay: 0.0003599336045179877

AUPRC test score: 0.5680021549313335



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.57499


In [30]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [31]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [32]:
# Select the classification task for this section by its position in TASKS.
task = TASKS[4]
task

'active_EP_vs_inactive_rest'

In [33]:
# Build the data-loader pipeline for the current task, pointing it at '<task>.pickle'.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---
### 1. FFNN

In [25]:
# Use FFNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model = FFNN

In [26]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=False presumably selects the non-sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:30,322][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_FFNN_1' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.13310744801921537
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    dropout_l2: 0.4
    lr: 5.998338033733112e-05
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 16
    n_units_l2: 4
    optimizer: Adam
    weight_decay: 0.0004688562495101427

AUPRC test score: 0.13234881043920838


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:31,058][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2064006090494295
  Params: 
    dropout_l0: 0.3
    lr: 1.527372556511274e-05
    n_layers: 1
    n_units_l0: 256
    optimizer: Adam
    weight_decay: 0.000547008990795143

AUPRC test score: 0.22398137483428093


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:20:31,914][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.19483375497085736
  Params: 
    dropout_l0: 0.4
    lr: 1.9080043880953393e-05
    n_layers: 1
    n_units_l0: 64
    optimizer: RMSprop
    weight_decay: 0.0010501357936829625

AUPRC test score: 0.2068492896179175



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.18773


In [None]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [34]:
# Use CNN as the model for this section; model.__name__ labels the study,
# checkpoint paths and results entry in the cells below.
model=CNN

In [35]:
# 3-fold cross-validation of the selected model on the current task and cell line.
# The study name and both checkpoint paths share the
# '<cell_line>_<task>_<model name>' prefix.
# NOTE(review): sequence=True presumably selects the sequence input — confirm in Kfold_CV.
kf_CV = Kfold_CV()
study_prefix = f'{cell_line}_{task}_{model.__name__}'

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=study_prefix,
    hp_model_path=f'{study_prefix}_HP',
    test_model_path=f'{study_prefix}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),


>>> ITERATION N. 1



[32m[I 2021-10-05 16:15:18,409][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.18623663235921137
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    dropout_l2: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 15
    kernel_size_l2: 5
    lr: 0.0001222044258431401
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 32
    out_channels_l2: 256
    weight_decay: 0.0018403439299137187



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:15:20,414][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.18914325908899268


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16994189490128797
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 11
    lr: 4.179945473211488e-05
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 64
    weight_decay: 0.003175597101654849



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:15:21,867][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.1567329521933549


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.13374709283250702
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    dropout_l2: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 5
    kernel_size_l2: 5
    lr: 0.005363692114327795
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 32
    out_channels_l2: 64
    weight_decay: 0.003136012992131029

AUPRC test score: 0.12931242177722163



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.1584


In [36]:
# Store this run's cross-validation scores under
# results_dict[cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [37]:
# Write the accumulated results back to disk. The top level is copied into an
# OrderedDict first, presumably because the lambda default factory given to
# results_dict when it was loaded cannot be pickled.
results_snapshot = OrderedDict(results_dict)
with open('results_dict.pickle', 'wb') as fout:
    pickle.dump(results_snapshot, fout)