## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: H1

In [1]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# create a database to store optuna studies with sqlite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [3]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [4]:
# Pick the cell line analysed in this notebook: index 2 of CELL_LINES
# (echoed as 'H1' in the saved output). Later cells use it in study names,
# model paths and as the first key of results_dict.
cell_line = CELL_LINES[2]
cell_line

'H1'

---

In [5]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [6]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [7]:
# Load the scores saved by earlier runs and wrap them so that
# results_dict[cell_line][task][key] = ... works without pre-creating levels.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# results_dict.pickle that was written by this project.
try:
    with open('results_dict.pickle', 'rb') as fin:
        stored_results = pickle.load(fin)
except FileNotFoundError:
    # First run on a fresh checkout: nothing has been saved yet, so start
    # from an empty store instead of aborting the notebook.
    stored_results = {}

# Missing cell lines become defaultdict(dict), so a task level can be indexed
# directly; entries that were loaded keep the objects stored in the file.
results_dict = defaultdict(lambda: defaultdict(dict), stored_results)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [8]:
# Select the first entry of TASKS (echoed as 'active_E_vs_inactive_E' in the
# saved output); later cells use `task` for the data file name, study names
# and the results_dict key.
task = TASKS[0]
task

'active_E_vs_inactive_E'

In [9]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
task_pickle = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle)

Data Preprocessing Done!




### 1. FFNN

In [9]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=FFNN

In [10]:
# Task marked IMBALANCED by the author: rebalancing is switched on and the
# augmentation type for this run is 'smote'.
type_augm_genfeatures = 'smote'

kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler.
# sequence=False: presumably the non-sequence input is used (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=True,
    type_augm_genfeatures=type_augm_genfeatures,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:44,567][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_smote_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:




  Value:  0.19169590505564696
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    dropout_l2: 0.0
    dropout_l3: 0.4
    lr: 0.0012591071540920468
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 4
    n_units_l3: 32
    optimizer: RMSprop
    weight_decay: 0.0001564177920100629

AUPRC test score: 0.19323627095832827


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:44,857][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_smote_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.10080042660189174
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.2
    dropout_l2: 0.5
    lr: 0.0001198983872717062
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 16
    n_units_l2: 4
    optimizer: Nadam
    weight_decay: 0.0002581327498102977

AUPRC test score: 0.04757696127110227


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:45,135][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_smote_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1725241960327473
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.0
    lr: 0.005043865553651833
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 64
    n_units_l2: 64
    optimizer: Nadam
    weight_decay: 0.0011695088418004964

AUPRC test score: 0.20344155478394108



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.14808


In [12]:
# Register this run's scores under cell line -> task -> '<model>_<augmentation>'.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = cv_scores

In [13]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

In [11]:
# Task marked IMBALANCED by the author: rebalancing is switched on and the
# augmentation type for this run is 'double'.
type_augm_genfeatures = 'double'

kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler.
# sequence=False: presumably the non-sequence input is used (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=True,
    type_augm_genfeatures=type_augm_genfeatures,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:46,547][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_double_1' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:46,668][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_double_1_2' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.17070996021336224
  Params: 
    dropout_l0: 0.0
    lr: 0.026259044880863083
    n_layers: 1
    n_units_l0: 64
    optimizer: RMSprop
    weight_decay: 0.006393040934977423

AUPRC test score: 0.04902929493545182


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:46,783][0m Using an existing study with name 'H1_active_E_vs_inactive_E_FFNN_double_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.15166185731326576
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    lr: 0.0011143841529507625
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 16
    optimizer: Adam
    weight_decay: 0.004280247499739179

AUPRC test score: 0.14139986010013464


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.17946310536099278
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    dropout_l2: 0.0
    lr: 0.001071953356668121
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 16
    optimizer: Nadam
    weight_decay: 0.001327045783660192

AUPRC test score: 0.19134309736391397



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.12726


In [15]:
# Register this run's scores under cell line -> task -> '<model>_<augmentation>'.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = cv_scores

In [16]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---
### 2. CNN

In [12]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=CNN

In [13]:
# Task marked IMBALANCED by the author: rebalancing is switched on.
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler.
# sequence=True: presumably the sequence input is used (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:09:43,661][0m Using an existing study with name 'H1_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.09625769016971096
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 5
    lr: 1.1465493564666274e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 96
    weight_decay: 0.0009246569089574612



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:09:45,170][0m Using an existing study with name 'H1_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.09354446762173997


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.048133802816901404
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    kernel_size_l0: 11
    kernel_size_l1: 15
    lr: 0.016709426400280756
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 96
    weight_decay: 0.032184745676596065



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:09:46,537][0m Using an existing study with name 'H1_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.04773336643495529


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.10383787266835554
  Params: 
    dropout_l0: 0.4
    kernel_size_l0: 5
    lr: 0.0034196905881384073
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 64
    weight_decay: 0.00017740089252482645

AUPRC test score: 0.06389892808835214



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.06839


In [14]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [15]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [16]:
# Switch to the second entry of TASKS (echoed as 'active_P_vs_inactive_P' in
# the saved output); later cells use `task` for the data file name, study
# names and the results_dict key.
task = TASKS[1]
task

'active_P_vs_inactive_P'

In [17]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
task_pickle = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle)

Data Preprocessing Done!




---

### 1. FFNN

In [14]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=FFNN

In [15]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=False: presumably the non-sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:50,964][0m Using an existing study with name 'H1_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16865752028670983
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    lr: 0.004529479886188263
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 64
    optimizer: Adam
    weight_decay: 0.00036821770018561435

AUPRC test score: 0.17728534918963768


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:51,154][0m Using an existing study with name 'H1_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:51,267][0m Using an existing study with name 'H1_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.13336183121897408
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    lr: 0.002431341338777055
    n_layers: 2
    n_units_l0: 256
    n_units_l1: 64
    optimizer: RMSprop
    weight_decay: 0.01713687561946092

AUPRC test score: 0.13392725187922025


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.19593157241508405
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    lr: 0.0005626575263702496
    n_layers: 2
    n_units_l0: 256
    n_units_l1: 64
    optimizer: RMSprop
    weight_decay: 0.0025038496452685576

AUPRC test score: 0.17501579560650324



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.16208


In [21]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [22]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---
### 2. CNN

In [18]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=CNN

In [19]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=True: presumably the sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:09:53,959][0m Using an existing study with name 'H1_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16608732667995563
  Params: 
    dropout_l0: 0
    kernel_size_l0: 15
    lr: 0.0005516445082442662
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 32
    weight_decay: 0.001301118849981746



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:09:55,259][0m Using an existing study with name 'H1_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.16383131326251466


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.15563833150290746
  Params: 
    dropout_l0: 0.3
    kernel_size_l0: 11
    lr: 0.00015524335404584936
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 64
    weight_decay: 0.00014885666001690376



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:09:57,141][0m Using an existing study with name 'H1_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.15264495000223485


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.22473176942367348
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0
    kernel_size_l0: 15
    kernel_size_l1: 11
    lr: 4.739550963035682e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 32
    weight_decay: 0.015518033673366122

AUPRC test score: 0.2189019667140438



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.17846


In [20]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [21]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [22]:
# Switch to the third entry of TASKS (echoed as 'active_E_vs_active_P' in the
# saved output); later cells use `task` for the data file name, study names
# and the results_dict key.
task = TASKS[2]
task

'active_E_vs_active_P'

In [23]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
task_pickle = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle)

Data Preprocessing Done!




---
### 1. FFNN

In [18]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=FFNN

In [19]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=False: presumably the non-sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:55,406][0m Using an existing study with name 'H1_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:55,499][0m Using an existing study with name 'H1_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.53648705290612
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.4
    dropout_l2: 0.0
    lr: 0.0424877482728396
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 64
    n_units_l2: 4
    optimizer: Nadam
    weight_decay: 0.0012537888344100038

AUPRC test score: 0.5749952077002526


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6037152549727937
  Params: 
    dropout_l0: 0.3
    lr: 0.01339591480404898
    n_layers: 1
    n_units_l0: 128
    optimizer: Nadam
    weight_decay: 0.0002389239261981375

AUPRC test score: 0.6107974884285954


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:55,586][0m Using an existing study with name 'H1_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1923961218836565
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.0
    dropout_l2: 0.4
    dropout_l3: 0.4
    lr: 5.460475053247793e-05
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 16
    n_units_l2: 32
    n_units_l3: 16
    optimizer: Nadam
    weight_decay: 0.043230580471915594

AUPRC test score: 0.18772727272727271



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.45784


In [27]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [28]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---
### 2. CNN

In [24]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=CNN

In [25]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=True: presumably the sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:04,072][0m Using an existing study with name 'H1_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.33711665737827695
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    dropout_l2: 0
    kernel_size_l0: 15
    kernel_size_l1: 5
    kernel_size_l2: 15
    lr: 0.0025515339805708193
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 64
    out_channels_l2: 96
    weight_decay: 0.0018416757825418667



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:05,104][0m Using an existing study with name 'H1_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.41314218882986353


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5442957294577445
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 0.0011664818820017037
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 96
    weight_decay: 0.0017223377843116829

AUPRC test score: 0.5551891242492616




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:07,046][0m Using an existing study with name 'H1_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.20583924881161783
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0
    dropout_l2: 0.5
    dropout_l3: 0
    kernel_size_l0: 11
    kernel_size_l1: 15
    kernel_size_l2: 5
    kernel_size_l3: 15
    lr: 3.835802105956803e-05
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 32
    out_channels_l2: 64
    out_channels_l3: 128
    weight_decay: 0.00011682287246153977

AUPRC test score: 0.2090264000336895



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.39245


In [26]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [27]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [28]:
# Switch to the fourth entry of TASKS (echoed as 'inactive_E_vs_inactive_P' in
# the saved output); later cells use `task` for the data file name, study
# names and the results_dict key.
task = TASKS[3]
task

'inactive_E_vs_inactive_P'

In [29]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
task_pickle = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle)

Data Preprocessing Done!




---
### 1. FFNN

In [22]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=FFNN

In [23]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=False: presumably the non-sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:59,265][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5301425214901895
  Params: 
    dropout_l0: 0.3
    lr: 2.281764458735932e-05
    n_layers: 1
    n_units_l0: 128
    optimizer: Adam
    weight_decay: 0.0014670566392869298

AUPRC test score: 0.5388982485902519


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:59,444][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5791828183760261
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.4
    lr: 0.0017512624458024975
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 64
    n_units_l2: 64
    n_units_l3: 16
    optimizer: RMSprop
    weight_decay: 0.005709711535457803

AUPRC test score: 0.41267776584317967


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-03 20:18:59,610][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5857068887935787
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.3
    dropout_l2: 0.0
    dropout_l3: 0.0
    lr: 0.00048764268506689903
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 128
    n_units_l2: 4
    n_units_l3: 32
    optimizer: Adam
    weight_decay: 0.0026716656034675174

AUPRC test score: 0.5915464052733119



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.51437


In [33]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [34]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---
### 2. CNN

In [30]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=CNN

In [31]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=True: presumably the sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:12,729][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5985109560594046
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    dropout_l2: 0.5
    dropout_l3: 0
    kernel_size_l0: 15
    kernel_size_l1: 5
    kernel_size_l2: 11
    kernel_size_l3: 15
    lr: 0.0006177288276810144
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 64
    out_channels_l2: 96
    out_channels_l3: 256
    weight_decay: 0.011506970047176721



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:13,686][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.5764858683549248


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5757595975625552
  Params: 
    dropout_l0: 0.3
    kernel_size_l0: 5
    lr: 3.370179556588162e-05
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 16
    weight_decay: 0.0042142139300556155



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:14,586][0m Using an existing study with name 'H1_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.5820401718614242


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5693037198442992
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 15
    lr: 0.0038336073622503852
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 64
    weight_decay: 0.0015880269793153365

AUPRC test score: 0.5853241468121042



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.58128


In [32]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [33]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [11]:
# Switch to the fifth entry of TASKS (echoed as 'active_EP_vs_inactive_rest'
# in the saved output); later cells use `task` for the data file name, study
# names and the results_dict key.
task = TASKS[4]
task

'active_EP_vs_inactive_rest'

In [12]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
task_pickle = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle)

Data Preprocessing Done!


---
### 1. FFNN

In [13]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=FFNN

In [14]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=False: presumably the non-sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-06 18:45:23,293][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.09907990314769975
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.0
    lr: 0.0023514887528533306
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 32
    n_units_l2: 4
    n_units_l3: 16
    optimizer: RMSprop
    weight_decay: 0.03743320502655477



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-06 18:45:23,468][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_FFNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.10259492685963278


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1016334512944682
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    dropout_l2: 0.4
    lr: 0.00030943655547470704
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 128
    n_units_l2: 4
    optimizer: Nadam
    weight_decay: 0.00041599661217944046

AUPRC test score: 0.09926879084967324


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-06 18:45:23,599][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.10518820883698873
  Params: 
    dropout_l0: 0.4
    lr: 0.0003076475445070861
    n_layers: 1
    n_units_l0: 128
    optimizer: Adam
    weight_decay: 0.0032920292591549494

AUPRC test score: 0.10630054769390786



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.10272


In [15]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [16]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

---
### 2. CNN

In [36]:
# Architecture under test for the next cells: they pass `model` to Kfold_CV
# and use model.__name__ in study names, model paths and result keys.
model=CNN

In [37]:
kf_CV = Kfold_CV()

# Single tag reused for the study name and both model-path arguments.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with num_epochs=100 and the 'TPE' sampler, without
# rebalancing. sequence=True: presumably the sequence input is used
# (confirm in Kfold_CV).
cv_settings = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)
kf_CV(**cv_settings)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:19,569][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1024703332784021
  Params: 
    dropout_l0: 0.3
    kernel_size_l0: 11
    lr: 2.6196700583316394e-05
    n_layers: 1
    optimizer: RMSprop
    out_channels_l0: 16
    weight_decay: 0.0074563135020834235



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:20,411][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.1033010095781835


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.11952412780693837
  Params: 
    dropout_l0: 0
    kernel_size_l0: 5
    lr: 4.4946131492384865e-05
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 16
    weight_decay: 0.005984387460470812



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-05 16:10:21,551][0m Using an existing study with name 'H1_active_EP_vs_inactive_rest_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.12255407536407693


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.11379471746021276
  Params: 
    dropout_l0: 0
    kernel_size_l0: 11
    lr: 0.0004306828345584087
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 32
    weight_decay: 0.0009460647752329262

AUPRC test score: 0.11402469784651957



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.11329


In [38]:
# Register this run's scores under cell line -> task -> model class name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [39]:
# Overwrite results_dict.pickle with the whole store. The outer mapping is
# copied into an OrderedDict before dumping, presumably because the
# lambda-backed defaultdict cannot be pickled directly.
with open('results_dict.pickle', mode='wb') as fout:
    snapshot = OrderedDict(results_dict)
    pickle.dump(snapshot, fout)

In [19]:
# Scratch check of math.modf: for 1.0 the saved output is (0.0, 1.0).
# NOTE(review): nothing else in the visible notebook uses modf; this looks
# like a leftover experiment and is a candidate for removal.
from math import modf
modf(1.0)

(0.0, 1.0)