## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: H1

In [1]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
# create a database to store optuna studies with sqlite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [3]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [4]:
cell_line = CELL_LINES[2]
cell_line

'H1'

---

In [5]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [6]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [7]:
with open ('results_dict.pickle', 'rb') as fin:
    results_dict = pickle.load(fin)
    results_dict = defaultdict(lambda: defaultdict(dict), results_dict)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [7]:
task = TASKS[0]
task

'active_E_vs_inactive_E'

In [8]:
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


### 1. FFNN

In [10]:
model=FFNN

In [11]:
# IMBALANCED
type_augm_genfeatures = 'smote'

kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=False,
                type_augm_genfeatures=type_augm_genfeatures,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}_TEST')

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:01,845][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_smote_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1404646773945613
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.0
    lr: 8.388792484415416e-05
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 16
    n_units_l2: 32
    n_units_l3: 16
    optimizer: Adam
    weight_decay: 0.00020791131976909724



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:02,305][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_smote_1_2' instead of creating a new one.[0m


AUPRC test score: 0.13647388452967094


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1462092251963442
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    dropout_l2: 0.4
    lr: 0.02524811508332391
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 32
    n_units_l2: 32
    optimizer: RMSprop
    weight_decay: 0.00010366811690427747

AUPRC test score: 0.16499727007614856


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:02,652][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_smote_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.15327575074265942
  Params: 
    dropout_l0: 0.0
    lr: 0.00022996454359441556
    n_layers: 1
    n_units_l0: 64
    optimizer: Nadam
    weight_decay: 0.0005013773082437128

AUPRC test score: 0.13944441271332556



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.14697


In [12]:
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = kf_CV.scores_dict

In [13]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

In [14]:
# IMBALANCED
type_augm_genfeatures = 'double'

kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=False,
                type_augm_genfeatures=type_augm_genfeatures,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}_TEST')

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:03,255][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_double_1' instead of creating a new one.[0m


Study statistics: 


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:03,454][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_double_1_2' instead of creating a new one.[0m


  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.13875496810796648
  Params: 
    dropout_l0: 0.3
    lr: 0.0007153757603697248
    n_layers: 1
    n_units_l0: 32
    optimizer: Nadam
    weight_decay: 0.00013395887919276042

AUPRC test score: 0.13550014103227337


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:03,749][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_double_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.12927519660333892
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.2
    dropout_l2: 0.5
    lr: 2.742801956754972e-05
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 64
    n_units_l2: 64
    optimizer: Nadam
    weight_decay: 0.0002323326966982178

AUPRC test score: 0.1387255878075459


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14548172325676753
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.0
    dropout_l3: 0.0
    lr: 0.013796285178862397
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 64
    n_units_l2: 4
    n_units_l3: 32
    optimizer: Adam
    weight_decay: 0.00033074317004100546

AUPRC test score: 0.14069933455136133



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.13831


In [15]:
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = kf_CV.scores_dict

In [16]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [14]:
model=CNN

In [11]:
# IMBALANCED
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=True,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:39:53,281][0m Using an existing study with name 'H1_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1





Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.0839994015615884
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 15
    kernel_size_l2: 5
    lr: 0.002966594346874513
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 96
    out_channels_l2: 128
    weight_decay: 0.011442938555931578

AUPRC test score: 0.08075767615737826




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:39:55,908][0m Using an existing study with name 'H1_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials: 

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:39:56,222][0m Using an existing study with name 'H1_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


 0
  Number of complete trials:  3
Best trial:
  Value:  0.08967892556988034
  Params: 
    dropout_l0: 0
    kernel_size_l0: 11
    lr: 0.00040854348190510754
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 32
    weight_decay: 0.00039999368794673996

AUPRC test score: 0.07669281276498159


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.08066668081991031
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    dropout_l2: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 15
    kernel_size_l2: 15
    lr: 0.00014128249200612766
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 96
    out_channels_l2: 128
    weight_decay: 0.02593205648211903

AUPRC test score: 0.09386995221457561



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.08377


In [12]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [13]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [7]:
task = TASKS[1]
task

'active_P_vs_inactive_P'

In [18]:
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---

### 1. FFNN

In [25]:
model=FFNN

In [20]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=False,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:09,588][0m Using an existing study with name 'H1_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2863761152816352
  Params: 
    dropout_l0: 0.3
    lr: 0.06702402559531472
    n_layers: 1
    n_units_l0: 32
    optimizer: RMSprop
    weight_decay: 0.011619034006685102



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:09,784][0m Using an existing study with name 'H1_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.24084418666708585


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3019769625009417
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.0
    lr: 0.011839528743459985
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 64
    optimizer: RMSprop
    weight_decay: 0.0012198181546860037



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:10,019][0m Using an existing study with name 'H1_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.3055288667455186


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.13241036955322666
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    dropout_l2: 0.5
    dropout_l3: 0.0
    lr: 0.05777301501720411
    n_layers: 4
    n_units_l0: 32
    n_units_l1: 128
    n_units_l2: 32
    n_units_l3: 16
    optimizer: Adam
    weight_decay: 0.035722268161443554

AUPRC test score: 0.13270974180670916



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.22636


In [21]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [22]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [8]:
model=CNN

In [17]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=True,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:39:58,366][0m Using an existing study with name 'H1_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:39:58,660][0m Using an existing study with name 'H1_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.23541767892308152
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    kernel_size_l0: 15
    kernel_size_l1: 15
    lr: 2.565402548964133e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 32
    weight_decay: 0.04656752699327209

AUPRC test score: 0.23557775770994369


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:39:58,908][0m Using an existing study with name 'H1_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2463172490146871
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 15
    lr: 1.0901345541621569e-05
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 32
    weight_decay: 0.036839351159283516

AUPRC test score: 0.2382536103295317


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.22482077529053046
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 5
    lr: 0.04307874929953698
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 32
    weight_decay: 0.0002145846181217555

AUPRC test score: 0.13274418904127233



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.20219


In [11]:
from BIOINF_tesi.visual import parse_as_dict

i=3

d = torch.load(f'{cell_line}_{model.__name__}_{task}_{i}_test_.pt', map_location=torch.device(device))

s =  """dropout_l0: 0.2
    kernel_size_l0: 5
    lr: 0.04307874929953698
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 32
    weight_decay: 0.0002145846181217555"""

d['model_params']=parse_as_dict(s)
print(d['model_params'])

torch.save(d, f'{cell_line}_{model.__name__}_{task}_{i}_test_.pt')

OrderedDict([('dropout_l0', 0.2), ('kernel_size_l0', 5.0), ('lr', 0.04307874929953698), ('n_layers', 1.0), ('optimizer', 'Nadam'), ('out_channels_l0', 32.0), ('weight_decay', 0.0002145846181217555)])


In [18]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [19]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [33]:
task = TASKS[2]
task

'active_E_vs_active_P'

In [24]:
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [34]:
model=FFNN

In [26]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=False,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:14,472][0m Using an existing study with name 'H1_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:14,625][0m Using an existing study with name 'H1_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5399161385431701
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.2
    dropout_l2: 0.0
    dropout_l3: 0.5
    lr: 0.03231466001603883
    n_layers: 4
    n_units_l0: 32
    n_units_l1: 64
    n_units_l2: 16
    n_units_l3: 4
    optimizer: RMSprop
    weight_decay: 0.000267984322347798

AUPRC test score: 0.5926446543612682


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:14,770][0m Using an existing study with name 'H1_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5776680552691617
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.5
    dropout_l3: 0.0
    lr: 0.00024250447613984245
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 32
    n_units_l2: 16
    n_units_l3: 32
    optimizer: Nadam
    weight_decay: 0.0003958948680868851

AUPRC test score: 0.5336821189017318


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5751381642276375
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.2
    lr: 0.06797237205015519
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 64
    optimizer: RMSprop
    weight_decay: 0.00046428364653413245

AUPRC test score: 0.46348376551134557



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.52994


In [27]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [28]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [38]:
model=CNN

In [23]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=True,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:04,978][0m Using an existing study with name 'H1_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.40329137059768944
  Params: 
    dropout_l0: 0.3
    kernel_size_l0: 5
    lr: 4.533635731053414e-05
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 16
    weight_decay: 0.014830838754160311



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:05,495][0m Using an existing study with name 'H1_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.4972313000361089


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.4682954553004083
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    dropout_l2: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 11
    kernel_size_l2: 5
    lr: 9.208111192385932e-05
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 32
    out_channels_l2: 128
    weight_decay: 0.0003666370793026254



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:06,107][0m Using an existing study with name 'H1_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.33518044342181125


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.4310892010813663
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.4
    dropout_l3: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 15
    kernel_size_l2: 5
    kernel_size_l3: 11
    lr: 0.019049489901312926
    n_layers: 4
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 32
    out_channels_l2: 128
    out_channels_l3: 256
    weight_decay: 0.023777722365210998

AUPRC test score: 0.18487012987012985



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.33909


In [24]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [25]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [42]:
task = TASKS[3]
task

'inactive_E_vs_inactive_P'

In [30]:
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [43]:
model=FFNN

In [32]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=False,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:18,863][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.569631467812587
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.0
    lr: 1.8177580600376535e-05
    n_layers: 2
    n_units_l0: 256
    n_units_l1: 128
    optimizer: RMSprop
    weight_decay: 0.006083287174451145

AUPRC test score: 0.5723879336768627


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:19,265][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:19,539][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


  Value:  0.5765155029389104
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.3
    lr: 3.484653048577119e-05
    n_layers: 2
    n_units_l0: 128
    n_units_l1: 128
    optimizer: Nadam
    weight_decay: 0.005051745594459535

AUPRC test score: 0.5738098812393496


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5710105939949554
  Params: 
    dropout_l0: 0.4
    lr: 0.00871683043555429
    n_layers: 1
    n_units_l0: 32
    optimizer: RMSprop
    weight_decay: 0.00011903344141523859

AUPRC test score: 0.5799715907410307



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.57539


In [33]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [34]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [47]:
model=CNN

In [31]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=True,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:17,382][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5978849252036323
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 5.790729239174156e-05
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 32
    weight_decay: 0.004747788863421461



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:17,674][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.5980582333623193


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:17,924][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5991319792027304
  Params: 
    dropout_l0: 0.3
    kernel_size_l0: 5
    lr: 0.0013939424243341444
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 16
    weight_decay: 0.00013104073881237878

AUPRC test score: 0.5919393079244233


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.598135303334985
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 5
    lr: 0.0008406654128772997
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 64
    weight_decay: 0.0020267974010821483

AUPRC test score: 0.6090944568964746



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.5997


In [32]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [33]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [51]:
task = TASKS[4]
task

'active_EP_vs_inactive_rest'

In [36]:
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [52]:
model=FFNN

In [38]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=False,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:23,484][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2758455013741939
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.2
    dropout_l2: 0.5
    dropout_l3: 0.4
    lr: 0.00031239823744060816
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 32
    n_units_l3: 4
    optimizer: Adam
    weight_decay: 0.0008329074529329131

AUPRC test score: 0.2659557602996507


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:23,823][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_FFNN_1_2' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:09:24,026][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.27424284563910345
  Params: 
    dropout_l0: 0.0
    lr: 0.0005465537864537738
    n_layers: 1
    n_units_l0: 256
    optimizer: RMSprop
    weight_decay: 0.0027931439817977474

AUPRC test score: 0.26284722388109344


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.25704633450114966
  Params: 
    dropout_l0: 0.4
    lr: 0.00026290390483834643
    n_layers: 1
    n_units_l0: 128
    optimizer: Adam
    weight_decay: 0.008473548140911768

AUPRC test score: 0.2649957510438808



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.2646


In [39]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [40]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [56]:
model=CNN

In [37]:
kf_CV = Kfold_CV()

kf_CV(build_dataloader_pipeline = pipe_data_load,
                num_epochs = 100,
                n_folds=3,
                cell_line=cell_line,
                sequence=True,
                model = model,
                device = device,
                task=task,
                sampler = 'TPE',
                study_name = f'{cell_line}_{task}_{model.__name__}',
                test_model_path = f'{cell_line}_{task}_{model.__name__}_TEST')

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:24,072][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:24,372][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_CNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.18789850931021715
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    kernel_size_l0: 11
    kernel_size_l1: 5
    lr: 1.4816335581390753e-05
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 32
    weight_decay: 0.0007618536696654548

AUPRC test score: 0.20723833623827714


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:40:24,657][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2036399880954602
  Params: 
    dropout_l0: 0.3
    kernel_size_l0: 11
    lr: 1.1993177743036288e-05
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 32
    weight_decay: 0.012919499281693347

AUPRC test score: 0.20236471231526587


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.20321003958377995
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0
    kernel_size_l0: 15
    kernel_size_l1: 5
    kernel_size_l2: 15
    lr: 0.0014666724983444786
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 32
    out_channels_l2: 96
    weight_decay: 0.03727375336122134

AUPRC test score: 0.16067930891423635



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.19009


In [38]:
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [39]:
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---