## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: A549

In [2]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

# Default to the CPU and switch to the GPU only when PyTorch can see one.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [3]:
# create a database to store optuna studies with sqlite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [4]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [5]:
# Work on the first entry of CELL_LINES for the whole notebook; the bare name
# below echoes the selection as the cell output.
cell_line = CELL_LINES[0]
cell_line

'A549'

---

In [6]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [7]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [17]:
# Load the scores accumulated by earlier sessions, or start empty on a first
# run (every later cell rewrites this file, so it may not exist yet).
# NOTE(security): pickle.load can execute arbitrary code from the file -- only
# load a results_dict.pickle that this notebook wrote itself.
try:
    with open('results_dict.pickle', 'rb') as fin:
        saved_results = pickle.load(fin)
except FileNotFoundError:
    saved_results = {}

# Rebuild both auto-creating levels so that
# results_dict[cell_line][task][run_key] = scores works for unseen cell lines
# and unseen tasks alike, whatever mapping type the pickle contained.
results_dict = defaultdict(
    lambda: defaultdict(dict),
    {cell: defaultdict(dict, per_task) for cell, per_task in saved_results.items()},
)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [18]:
# Select the first task in TASKS; the bare name echoes it as the cell output.
task = TASKS[0]
task

'active_E_vs_inactive_E'

In [9]:
# Build the data-loading pipeline from the pickle named after the current task.
# The object is passed to every Kfold_CV run of this section.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


### 1. FFNN

In [19]:
# Model class (not an instance) handed to Kfold_CV below; its __name__ also
# goes into the study names and the result keys.
model=FFNN

In [11]:
# IMBALANCED task: pass an augmentation strategy for the genomic features
# (presumably used to rebalance the classes -- confirm against Kfold_CV).
type_augm_genfeatures = 'smote'

kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=False,
    type_augm_genfeatures=type_augm_genfeatures,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:06:56,144][0m Using an existing study with name 'A549_active_E_vs_inactive_E_FFNN_smote_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.08143502984111718
  Params: 
    dropout_l0: 0.4
    lr: 0.027111894800821008
    n_layers: 1
    n_units_l0: 128
    optimizer: Adam
    weight_decay: 0.0006750514030212153



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:06:56,678][0m Using an existing study with name 'A549_active_E_vs_inactive_E_FFNN_smote_1_2' instead of creating a new one.[0m


AUPRC test score: 0.10078988584259727


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.057860915492957736
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.4
    dropout_l2: 0.4
    dropout_l3: 0.5
    lr: 0.0069134605688635576
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 64
    n_units_l2: 32
    n_units_l3: 32
    optimizer: Adam
    weight_decay: 0.004263230767303585

AUPRC test score: 0.05554865938430984


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:06:57,269][0m Using an existing study with name 'A549_active_E_vs_inactive_E_FFNN_smote_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.13661488396684307
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.4
    lr: 0.017134120009264942
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 32
    n_units_l2: 64
    optimizer: RMSprop
    weight_decay: 0.004466602964923941

AUPRC test score: 0.05852035749751737



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.07162


In [12]:
# Record the scores of the run above (kf_CV.scores_dict) under
# cell line -> task -> '<model name>_<augmentation>'.
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = kf_CV.scores_dict

In [13]:
# Persist all results collected so far. The outer defaultdict is copied into an
# OrderedDict first: its lambda default factory cannot be pickled.
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

In [14]:
# IMBALANCED task: same experiment as the 'smote' run, with the 'double'
# augmentation strategy for the genomic features
# (exact semantics live in Kfold_CV / the data pipeline -- confirm there).
type_augm_genfeatures = 'double'

kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
# test_model_path used to end in '_TEST.pt' here, while every other run in this
# notebook passes '..._TEST' without an extension; it now follows the same
# convention. If a checkpoint was already written under the old name, rename
# it on disk accordingly.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=False,
    type_augm_genfeatures=type_augm_genfeatures,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:06:58,433][0m Using an existing study with name 'A549_active_E_vs_inactive_E_FFNN_double_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.11359246252835949
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.2
    dropout_l2: 0.0
    lr: 0.024731376245428784
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 128
    n_units_l2: 16
    optimizer: RMSprop
    weight_decay: 0.004078330488802357



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:06:58,692][0m Using an existing study with name 'A549_active_E_vs_inactive_E_FFNN_double_1_2' instead of creating a new one.[0m


AUPRC test score: 0.05687437934458789


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.05786374654964966
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    dropout_l2: 0.0
    lr: 5.65581253133464e-05
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 64
    n_units_l2: 64
    optimizer: Adam
    weight_decay: 0.004385557345504528



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:06:58,879][0m Using an existing study with name 'A549_active_E_vs_inactive_E_FFNN_double_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.11023519722453504


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1269234089835595
  Params: 
    dropout_l0: 0.4
    lr: 0.0013624976247216787
    n_layers: 1
    n_units_l0: 32
    optimizer: Adam
    weight_decay: 0.0009621113605341923

AUPRC test score: 0.13371450487634975



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.10027


In [15]:
# Store the 'double' run's scores next to the 'smote' ones; the augmentation
# suffix in the key keeps the two FFNN entries apart.
results_dict[cell_line][task][f'{model.__name__}_{type_augm_genfeatures}'] = kf_CV.scores_dict

In [16]:
# Rewrite the results file with the new entry included. A plain OrderedDict
# copy is dumped because the defaultdict's lambda factory is not picklable.
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [26]:
# Switch the model class used by the following Kfold_CV run to the CNN.
model=CNN

In [11]:
# IMBALANCED task, sequence input: unlike the FFNN runs, no
# type_augm_genfeatures is passed and sequence is True.
kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=True,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:37:43,397][0m Using an existing study with name 'A549_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1





Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.06671030052771904
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0
    kernel_size_l0: 5
    kernel_size_l1: 15
    lr: 0.002492549875641731
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 96
    weight_decay: 0.05400548483737392

AUPRC test score: 0.056770109235352535




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:37:46,521][0m Using an existing study with name 'A549_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.07353027134563184
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    kernel_size_l0: 15
    kernel_size_l1: 5
    lr: 0.003187423399350215
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 64
    weight_decay: 0.010152542998255033



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:37:47,152][0m Using an existing study with name 'A549_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.06519933360703083


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.07250059734214663
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.4
    dropout_l3: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 11
    kernel_size_l2: 15
    kernel_size_l3: 5
    lr: 0.0010533905245534373
    n_layers: 4
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 64
    out_channels_l2: 96
    out_channels_l3: 512
    weight_decay: 0.07991688720405098

AUPRC test score: 0.05836395233366432



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.06011


In [12]:
# Record the CNN scores under cell line -> task -> model name
# (no augmentation suffix for this run).
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [13]:
# Save the updated results; the OrderedDict copy drops the unpicklable
# lambda default factory of the outer defaultdict.
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [12]:
# Move on to the second task in TASKS; the bare name echoes the selection.
task = TASKS[1]
task

'active_P_vs_inactive_P'

In [18]:
# Rebuild the data-loading pipeline from the new task's pickle; this replaces
# the pipeline of the previous section.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---

### 1. FFNN

In [32]:
# Model class for the next Kfold_CV run (also used in study and result names).
model=FFNN

In [20]:
# Cross-validate the FFNN on the current task; no augmentation argument is
# passed for this run.
kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=False,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:03,471][0m Using an existing study with name 'A549_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.24630239801613507
  Params: 
    dropout_l0: 0.0
    lr: 0.010822302474528958
    n_layers: 1
    n_units_l0: 256
    optimizer: RMSprop
    weight_decay: 0.0037445355021975568

AUPRC test score: 0.24068126121120859


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:03,696][0m Using an existing study with name 'A549_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2496577966428925
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.2
    lr: 0.0003444879502087814
    n_layers: 2
    n_units_l0: 128
    n_units_l1: 16
    optimizer: RMSprop
    weight_decay: 0.009321065770065128



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:03,921][0m Using an existing study with name 'A549_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.23681676820641315


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.10209689281117858
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.4
    dropout_l2: 0.4
    dropout_l3: 0.4
    lr: 0.0002866616626840838
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 64
    n_units_l3: 4
    optimizer: Adam
    weight_decay: 0.006557897177013657

AUPRC test score: 0.24753521221225544



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.24168


In [21]:
# Record the FFNN scores for this task under the model's class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [22]:
# Persist the results; dumped as an OrderedDict because the defaultdict's
# lambda factory cannot be pickled.
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [37]:
# Use the CNN class for the next Kfold_CV run.
model=CNN

In [17]:
# Cross-validate the CNN on the current task with sequence input
# (sequence=True, whereas the FFNN runs pass False).
kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=True,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:37:55,021][0m Using an existing study with name 'A549_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.18273752343785848
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 6.581272290135745e-05
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 64
    weight_decay: 0.0006137804736567615



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:37:55,363][0m Using an existing study with name 'A549_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.19503897469253328


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.18840931902263708
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 1.7628967520980997e-05
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 32
    weight_decay: 0.0006201936675350008



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:37:55,811][0m Using an existing study with name 'A549_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.1907454524469494


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.18569023842685717
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    kernel_size_l0: 11
    kernel_size_l1: 15
    lr: 0.0011673180152785819
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 96
    weight_decay: 0.012785315175687432

AUPRC test score: 0.19543550008918104



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.19374


In [18]:
# Record the CNN scores for this task under the model's class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [19]:
# Write the results back to disk (OrderedDict copy: the outer defaultdict's
# lambda factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [41]:
# Move on to the third task in TASKS; the bare name echoes the selection.
task = TASKS[2]
task

'active_E_vs_active_P'

In [24]:
# Rebuild the data-loading pipeline from the pickle of the task just selected.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [42]:
# Model class for the next Kfold_CV run (also used in study and result names).
model=FFNN

In [26]:
# Cross-validate the FFNN on the current task; no augmentation argument is
# passed for this run.
kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=False,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:08,391][0m Using an existing study with name 'A549_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7166961657151243
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.3
    lr: 0.0008497074355355139
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 32
    optimizer: RMSprop
    weight_decay: 0.00042750672867661294

AUPRC test score: 0.6895415141808625


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:08,561][0m Using an existing study with name 'A549_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:08,677][0m Using an existing study with name 'A549_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6473271526808818
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.0
    dropout_l2: 0.4
    lr: 0.007660288906168663
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 32
    n_units_l2: 4
    optimizer: Adam
    weight_decay: 0.0011567097743807203

AUPRC test score: 0.727450023846858


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6407601539356916
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.0
    lr: 0.00015428408333952455
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 32
    optimizer: Nadam
    weight_decay: 0.02186164909249316

AUPRC test score: 0.6277023073733831



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.68156


In [27]:
# Record the FFNN scores for this task under the model's class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [28]:
# Persist the results; dumped as an OrderedDict because the defaultdict's
# lambda factory cannot be pickled.
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [48]:
# Use the CNN class for the next Kfold_CV run.
model=CNN

In [23]:
# Cross-validate the CNN on the current task with sequence input
# (sequence=True, whereas the FFNN runs pass False).
kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=True,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:00,600][0m Using an existing study with name 'A549_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5907730514883093
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    kernel_size_l0: 15
    kernel_size_l1: 5
    lr: 0.0002075881270271377
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 96
    weight_decay: 0.0022148936628602002

AUPRC test score: 0.6564181982824108




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:01,476][0m Using an existing study with name 'A549_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6022678317974148
  Params: 
    dropout_l0: 0
    kernel_size_l0: 11
    lr: 0.0005568882336287384
    n_layers: 1
    optimizer: RMSprop
    out_channels_l0: 16
    weight_decay: 0.007168711522114368



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:01,877][0m Using an existing study with name 'A549_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.6782589514404268


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6304423926366542
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    kernel_size_l0: 15
    kernel_size_l1: 11
    lr: 0.0009214251171952463
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 96
    weight_decay: 0.00030578141855603757

AUPRC test score: 0.6385317881614837



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.65774


In [24]:
# Record the CNN scores for this task under the model's class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [25]:
# Write the results back to disk (OrderedDict copy: the outer defaultdict's
# lambda factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [53]:
# Move on to the fourth task in TASKS; the bare name echoes the selection.
task = TASKS[3]
task

'inactive_E_vs_inactive_P'

In [30]:
# Rebuild the data-loading pipeline from the pickle of the task just selected.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [54]:
# Model class for the next Kfold_CV run (also used in study and result names).
model=FFNN

In [32]:
# Cross-validate the FFNN on the current task; no augmentation argument is
# passed for this run.
kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=False,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:13,948][0m Using an existing study with name 'A549_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5745674373714987
  Params: 
    dropout_l0: 0.2
    lr: 5.008317052796455e-05
    n_layers: 1
    n_units_l0: 128
    optimizer: Adam
    weight_decay: 0.00020326126474239946

AUPRC test score: 0.573199921595868


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:14,291][0m Using an existing study with name 'A549_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.4002612818136451
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    lr: 0.05242370181027648
    n_layers: 2
    n_units_l0: 256
    n_units_l1: 64
    optimizer: Nadam
    weight_decay: 0.09411938478516767

AUPRC test score: 0.39936786105460803


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:14,736][0m Using an existing study with name 'A549_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.566402609611672
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    lr: 1.104105076424251e-05
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 16
    optimizer: RMSprop
    weight_decay: 0.0003154095452807073

AUPRC test score: 0.5672468849646535



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.51327


In [33]:
# Record the FFNN scores for this task under the model's class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [34]:
# Persist the results; dumped as an OrderedDict because the defaultdict's
# lambda factory cannot be pickled.
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---
### 2. CNN

In [59]:
# Use the CNN class for the next Kfold_CV run.
model=CNN

In [29]:
# Cross-validate the CNN on the current task with sequence input
# (sequence=True, whereas the FFNN runs pass False).
kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=True,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:07,012][0m Using an existing study with name 'A549_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5760663763670382
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 5
    lr: 0.00037378373926965676
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 32
    weight_decay: 0.0019414154771545153



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:07,625][0m Using an existing study with name 'A549_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.5864192948037895


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5639809297465003
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    dropout_l2: 0
    kernel_size_l0: 11
    kernel_size_l1: 5
    kernel_size_l2: 5
    lr: 0.0008113426985655316
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 96
    out_channels_l2: 96
    weight_decay: 0.05459213463084904



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:08,122][0m Using an existing study with name 'A549_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.5719893161752271


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5888828656642322
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 11
    lr: 3.0731683605684234e-05
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 32
    weight_decay: 0.014016142066702372

AUPRC test score: 0.5942429027008511



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.58422


In [30]:
# Record the CNN scores for this task under the model's class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [31]:
# Write the results back to disk (OrderedDict copy: the outer defaultdict's
# lambda factory is not picklable).
with open ('results_dict.pickle', 'wb') as fout:
    pickle.dump(OrderedDict(results_dict), fout)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [8]:
# Move on to the fifth task in TASKS; the bare name echoes the selection.
task = TASKS[4]
task

'active_EP_vs_inactive_rest'

In [36]:
# Rebuild the data-loading pipeline from the pickle of the task just selected.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [9]:
# Model class for the next Kfold_CV runs (also used in study and result names).
model=FFNN

In [38]:
# IMBALANCED task: pass an augmentation strategy for the genomic features
# (presumably used to rebalance the classes -- confirm against Kfold_CV).
type_augm_genfeatures = 'smote'

kf_CV = Kfold_CV()

# The study name and the test-model path share one prefix, so build it once.
run_name = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

# 3-fold cross-validation, up to 100 epochs, 'TPE' sampler.
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    cell_line=cell_line,
    task=task,
    model=model,
    sequence=False,
    type_augm_genfeatures=type_augm_genfeatures,
    n_folds=3,
    num_epochs=100,
    sampler='TPE',
    device=device,
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:19,645][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_FFNN_smote_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.21535957579289228
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    dropout_l2: 0.0
    lr: 0.001651837888927492
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 32
    optimizer: Nadam
    weight_decay: 0.0015060098914122845

AUPRC test score: 0.21303423144813163


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:23,538][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_FFNN_smote_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.21718298679155643
  Params: 
    dropout_l0: 0.0
    lr: 0.0019073114038689288
    n_layers: 1
    n_units_l0: 128
    optimizer: Nadam
    weight_decay: 0.00016946403036089493

AUPRC test score: 0.20922790198786095


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:28,970][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_FFNN_smote_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2198683219329508
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.0
    dropout_l3: 0.4
    lr: 0.0019100681965982071
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 16
    n_units_l3: 16
    optimizer: RMSprop
    weight_decay: 0.0005802275484579843

AUPRC test score: 0.20377105632873682



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.20868


In [12]:
from BIOINF_tesi.visual import parse_as_dict

# Index used in the checkpoint file name below.
i = 3

# The same file is read and then overwritten, so build its path only once.
checkpoint_path = f'{cell_line}_{model.__name__}_{task}_{i}_test_.pt'

# Load the saved object onto whichever device this session is using.
d = torch.load(checkpoint_path, map_location=torch.device(device))

# Hyper-parameters in "name: value" form; the values match the best-trial
# block printed for iteration 3 of the preceding cross-validation output.
s = """dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.0
    dropout_l3: 0.4
    lr: 0.0019100681965982071
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 16
    n_units_l3: 16
    optimizer: RMSprop
    weight_decay: 0.0005802275484579843"""

# Attach the parsed parameters to the loaded object and show what was stored.
# NOTE(review): the recorded output shows integer settings coming back as
# floats (e.g. n_layers -> 4.0); confirm the code that consumes
# 'model_params' tolerates that.
d['model_params'] = parse_as_dict(s)
print(d['model_params'])

# Write the patched object back over the original file.
torch.save(d, checkpoint_path)

OrderedDict([('dropout_l0', 0.0), ('dropout_l1', 0.3), ('dropout_l2', 0.0), ('dropout_l3', 0.4), ('lr', 0.0019100681965982071), ('n_layers', 4.0), ('n_units_l0', 256.0), ('n_units_l1', 128.0), ('n_units_l2', 16.0), ('n_units_l3', 16.0), ('optimizer', 'RMSprop'), ('weight_decay', 0.0005802275484579843)])


In [39]:
# File the scores of the latest cross-validation run under
# "<model name>_<augmentation type>" for the current cell line and task.
scores_key = f'{model.__name__}_{type_augm_genfeatures}'
results_dict[cell_line][task][scores_key] = kf_CV.scores_dict

In [40]:
# Persist the accumulated results. The top-level mapping is copied into an
# OrderedDict first: the defaultdict created when the file was loaded uses a
# lambda factory, which pickle cannot serialise.
with open('results_dict.pickle', 'wb') as fout:
    plain_results = OrderedDict(results_dict)
    pickle.dump(plain_results, fout)

In [41]:
# IMBALANCED
# Same cross-validation call as before, now with type_augm_genfeatures='double'.
type_augm_genfeatures = 'double'

# Shared prefix of the Optuna study name and of the saved test-model path.
run_name = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    type_augm_genfeatures=type_augm_genfeatures,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:32,057][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_FFNN_double_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2018778874464189
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    dropout_l2: 0.0
    lr: 3.781803941584326e-05
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 64
    n_units_l2: 32
    optimizer: Adam
    weight_decay: 0.000333225560015162

AUPRC test score: 0.21236833529824256


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:32,350][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_FFNN_double_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.21006128981487612
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.0
    lr: 4.769799031853187e-05
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 128
    n_units_l2: 4
    optimizer: RMSprop
    weight_decay: 0.00014174160214194557

AUPRC test score: 0.20921430575346636


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-11-04 07:07:32,681][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_FFNN_double_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.20398065072386948
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    dropout_l2: 0.4
    lr: 1.2005247227684579e-05
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 32
    n_units_l2: 16
    optimizer: Nadam
    weight_decay: 0.0009092847518404929

AUPRC test score: 0.20387677608140506



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.20849


In [42]:
# File the scores of the latest cross-validation run under
# "<model name>_<augmentation type>" for the current cell line and task.
scores_key = f'{model.__name__}_{type_augm_genfeatures}'
results_dict[cell_line][task][scores_key] = kf_CV.scores_dict

In [43]:
# Persist the accumulated results. The top-level mapping is copied into an
# OrderedDict first: the defaultdict created when the file was loaded uses a
# lambda factory, which pickle cannot serialise.
with open('results_dict.pickle', 'wb') as fout:
    plain_results = OrderedDict(results_dict)
    pickle.dump(plain_results, fout)

---
### 2. CNN

In [68]:
# Model class used by the cross-validation cells of this section.
model = CNN

In [38]:
# IMBALANCED
# Cross-validation with sequence=True and no type_augm_genfeatures argument.

# Shared prefix of the Optuna study name and of the saved test-model path.
run_name = f'{cell_line}_{task}_{model.__name__}'

kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_name,
    test_model_path=f'{run_name}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:25,279][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14372508208904314
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    dropout_l2: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 15
    kernel_size_l2: 15
    lr: 0.00011225256307637438
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 64
    out_channels_l2: 64
    weight_decay: 0.022471260724093313

AUPRC test score: 0.15808317888679663


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:25,991][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_CNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.15414174923759058
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    dropout_l2: 0.5
    dropout_l3: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 15
    kernel_size_l2: 15
    kernel_size_l3: 15
    lr: 0.0011837942258440628
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 96
    out_channels_l2: 64
    out_channels_l3: 128
    weight_decay: 0.0007216288878670173

AUPRC test score: 0.15534093185276715


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:38:26,540][0m Using an existing study with name 'A549_active_EP_vs_inactive_rest_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  6
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16006682488349847
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    dropout_l2: 0.5
    dropout_l3: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 15
    kernel_size_l2: 15
    kernel_size_l3: 5
    lr: 0.00034665706509859866
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 32
    out_channels_l2: 256
    out_channels_l3: 512
    weight_decay: 0.006522553452181734

AUPRC test score: 0.12690914560097605



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.14678


In [39]:
# File the scores of the latest cross-validation run under the model's name
# for the current cell line and task.
scores_key = model.__name__
results_dict[cell_line][task][scores_key] = kf_CV.scores_dict

In [40]:
# Persist the accumulated results. The top-level mapping is copied into an
# OrderedDict first: the defaultdict created when the file was loaded uses a
# lambda factory, which pickle cannot serialise.
with open('results_dict.pickle', 'wb') as fout:
    plain_results = OrderedDict(results_dict)
    pickle.dump(plain_results, fout)

---