## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: MCF7

In [1]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# create a database to store optuna studies with sqlite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [3]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [4]:
cell_line = CELL_LINES[6]
# This notebook is dedicated to MCF7, and every study name, model path and
# results key below embeds `cell_line`. CELL_LINES is indexed by position, so
# fail fast if its order ever changes instead of silently mixing cell lines.
assert cell_line == 'MCF7', f'CELL_LINES[6] is {cell_line!r}, expected MCF7'
cell_line

'MCF7'

---

In [5]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [6]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [75]:
# NOTE: pickle.load can execute arbitrary code; only open a results file that
# was written by these notebooks.
try:
    with open('results_dict.pickle', 'rb') as fin:
        stored_results = pickle.load(fin)
except FileNotFoundError:
    # First run: nothing has been saved yet, so start from an empty store.
    stored_results = {}

# Results are kept as cell line -> task -> model key. Wrapping the loaded
# mapping lets a new cell line get its nested levels on first assignment.
results_dict = defaultdict(lambda: defaultdict(dict), stored_results)

In [77]:
# Print the stored hierarchy: cell line, then its tasks, then the model keys
# saved under each task. A blank separator follows every cell line.
for cell_line_name, tasks_by_name in results_dict.items():
    print(cell_line_name)
    for task_name, scores_by_model in tasks_by_name.items():
        print(task_name)
        for model_key in scores_by_model:
            print(model_key)
    print('\n')

A549
active_E_vs_inactive_E
FFNN_smote
FFNN_double
active_P_vs_inactive_P
FFNN
active_E_vs_active_P
FFNN
inactive_E_vs_inactive_P
FFNN
active_EP_vs_inactive_rest
FFNN_smote
FFNN_double


GM12878
active_E_vs_inactive_E
FFNN
active_P_vs_inactive_P
FFNN
active_E_vs_active_P
FFNN
inactive_E_vs_inactive_P
FFNN
active_EP_vs_inactive_rest
FFNN


HEK293
active_E_vs_inactive_E
FFNN
active_P_vs_inactive_P
FFNN
active_E_vs_active_P
FFNN
inactive_E_vs_inactive_P
FFNN
active_EP_vs_inactive_rest
FFNN


K562
active_E_vs_inactive_E
FFNN_smote
FFNN_double
active_P_vs_inactive_P
FFNN
active_E_vs_active_P
FFNN
inactive_E_vs_inactive_P
FFNN
active_EP_vs_inactive_rest
FFNN


MCF7
active_E_vs_inactive_E
CNN
FFNN_smote
FFNN_double
active_P_vs_inactive_P
CNN
FFNN
active_E_vs_active_P
CNN
FFNN
inactive_E_vs_inactive_P
CNN
FFNN
active_EP_vs_inactive_rest
CNN
FFNN


H1
active_E_vs_inactive_E
FFNN_smote
FFNN_double
active_P_vs_inactive_P
FFNN
active_E_vs_active_P
FFNN
inactive_E_vs_inactive_P
FFNN
active_EP_vs_in

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [8]:
task = TASKS[0]
# TASKS is indexed by position; results and study names below are keyed by
# `task`, so fail fast if the list order no longer matches this section.
assert task == 'active_E_vs_inactive_E', f'TASKS[0] is {task!r}, expected active_E_vs_inactive_E'
task

'active_E_vs_inactive_E'

In [9]:
# Build the data-loading pipeline from this task's pickle file.
task_pickle_name = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle_name)

Data Preprocessing Done!


### 1. FFNN

In [10]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = FFNN

In [11]:
# IMBALANCED
type_augm_genfeatures = 'smote'

# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

# 3-fold cross-validation with rebalancing switched on.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=True,
    type_augm_genfeatures=type_augm_genfeatures,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:22,645][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_FFNN_smote_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1445522993803127
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.2
    lr: 0.000717191305294996
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 64
    optimizer: Nadam
    weight_decay: 0.008209367329633885



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:23,222][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_FFNN_smote_1_2' instead of creating a new one.[0m


AUPRC test score: 0.1499894959309805


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.12911526731494335
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.0
    dropout_l2: 0.0
    dropout_l3: 0.5
    lr: 0.09198801413997783
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 128
    n_units_l2: 32
    n_units_l3: 32
    optimizer: RMSprop
    weight_decay: 0.0224501751210264



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:23,687][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_FFNN_smote_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.0803227408142999


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14322283514320844
  Params: 
    dropout_l0: 0.4
    lr: 0.0013961250157577215
    n_layers: 1
    n_units_l0: 32
    optimizer: Adam
    weight_decay: 0.0015015402150240415

AUPRC test score: 0.14668329615962725



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.12567


In [12]:
# Display the name under which the run above was registered.
'_'.join([cell_line, task, model.__name__, type_augm_genfeatures])

'MCF7_active_E_vs_inactive_E_FFNN_smote'

In [13]:
# Store this run's scores under cell line -> task -> '<model>_<augmentation>'.
model_key = f'{model.__name__}_{type_augm_genfeatures}'
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [14]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

In [15]:
# IMBALANCED
type_augm_genfeatures = 'double'

# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}_{type_augm_genfeatures}'

# 3-fold cross-validation with rebalancing switched on.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=True,
    type_augm_genfeatures=type_augm_genfeatures,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:24,203][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_FFNN_double_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.14725989179288396
  Params: 
    dropout_l0: 0.4
    lr: 0.05379494618917977
    n_layers: 1
    n_units_l0: 32
    optimizer: Nadam
    weight_decay: 0.006888892446787857

AUPRC test score: 0.1305424947202898


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:24,401][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_FFNN_double_1_2' instead of creating a new one.[0m



Study statistics: 


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:24,553][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_FFNN_double_1_2_3' instead of creating a new one.[0m


  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.07672535211267607
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.3
    dropout_l2: 0.5
    dropout_l3: 0.4
    lr: 0.0024910306364775616
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 16
    n_units_l3: 4
    optimizer: Nadam
    weight_decay: 0.00196668286965003

AUPRC test score: 0.1600558274061459


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.11495131592013766
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    lr: 0.01434909560566445
    n_layers: 2
    n_units_l0: 64
    n_units_l1: 32
    optimizer: RMSprop
    weight_decay: 0.00032918265075784753

AUPRC test score: 0.1409686497070065



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.14386


In [16]:
# Store this run's scores under cell line -> task -> '<model>_<augmentation>'.
model_key = f'{model.__name__}_{type_augm_genfeatures}'
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [17]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

In [18]:
 #checkpoint = torch.load('models/HEPG2_active_EP_vs_inactive_rest_FFNN_TEST.pt')

---
### 2. CNN

In [19]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = CNN

In [20]:
# IMBALANCED
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation with rebalancing switched on.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:26,061][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.09389196027710629
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 11
    lr: 0.00025087213223384634
    n_layers: 1
    optimizer: RMSprop
    out_channels_l0: 32
    weight_decay: 0.003070739372241233

AUPRC test score: 0.10003335708702521


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:26,457][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.09150154817619814
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 5
    lr: 7.209500322705577e-05
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 64
    weight_decay: 0.00028139605023063427



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:26,801][0m Using an existing study with name 'MCF7_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.09937864570737716


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.09905333668116595
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    dropout_l2: 0.4
    dropout_l3: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 11
    kernel_size_l2: 5
    kernel_size_l3: 5
    lr: 3.829983183128241e-05
    n_layers: 4
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 96
    out_channels_l2: 96
    out_channels_l3: 512
    weight_decay: 0.0005940623477646493

AUPRC test score: 0.10141603934021945



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.10028


In [21]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [22]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [23]:
task = TASKS[1]
# TASKS is indexed by position; results and study names below are keyed by
# `task`, so fail fast if the list order no longer matches this section.
assert task == 'active_P_vs_inactive_P', f'TASKS[1] is {task!r}, expected active_P_vs_inactive_P'
task

'active_P_vs_inactive_P'

In [24]:
# Build the data-loading pipeline from this task's pickle file.
task_pickle_name = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle_name)

Data Preprocessing Done!


---

### 1. FFNN

In [25]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = FFNN

In [26]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:28,445][0m Using an existing study with name 'MCF7_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.315137409532656
  Params: 
    dropout_l0: 0.4
    lr: 0.008984685326587456
    n_layers: 1
    n_units_l0: 64
    optimizer: Adam
    weight_decay: 0.00015096825300081571

AUPRC test score: 0.3221291200095142


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:28,600][0m Using an existing study with name 'MCF7_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:28,751][0m Using an existing study with name 'MCF7_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.31439846738766786
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.3
    dropout_l2: 0.4
    lr: 0.00764701679582116
    n_layers: 3
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 16
    optimizer: Nadam
    weight_decay: 0.0002468820967513187

AUPRC test score: 0.3154837498069207


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.31228434731514454
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.2
    dropout_l2: 0.0
    dropout_l3: 0.4
    lr: 0.013549864334306648
    n_layers: 4
    n_units_l0: 32
    n_units_l1: 16
    n_units_l2: 4
    n_units_l3: 4
    optimizer: RMSprop
    weight_decay: 0.00030756802184957253

AUPRC test score: 0.31977422932589095



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.31913


In [27]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [28]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---
### 2. CNN

In [29]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = CNN

In [30]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:29,268][0m Using an existing study with name 'MCF7_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.21755355160397039
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.4
    kernel_size_l0: 15
    kernel_size_l1: 11
    kernel_size_l2: 15
    kernel_size_l3: 15
    lr: 0.0031281302922769295
    n_layers: 4
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 32
    out_channels_l2: 64
    out_channels_l3: 256
    weight_decay: 0.0004295828911804462

AUPRC test score: 0.24170841944289315


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:29,449][0m Using an existing study with name 'MCF7_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:29,616][0m Using an existing study with name 'MCF7_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.22281379371296492
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0
    kernel_size_l0: 5
    kernel_size_l1: 15
    lr: 0.002318493036521783
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 32
    weight_decay: 0.0001800519654237394

AUPRC test score: 0.23492085542110488


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.13697801195388568
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    dropout_l2: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 11
    kernel_size_l2: 5
    lr: 3.677475902026386e-05
    n_layers: 3
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 64
    out_channels_l2: 256
    weight_decay: 0.013255794749337402

AUPRC test score: 0.18198899214900563



3-FOLD CROSS-VALIDATION 

In [31]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [32]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [33]:
task = TASKS[2]
# TASKS is indexed by position; results and study names below are keyed by
# `task`, so fail fast if the list order no longer matches this section.
assert task == 'active_E_vs_active_P', f'TASKS[2] is {task!r}, expected active_E_vs_active_P'
task

'active_E_vs_active_P'

In [34]:
# Build the data-loading pipeline from this task's pickle file.
task_pickle_name = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle_name)

Data Preprocessing Done!


---
### 1. FFNN

In [35]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = FFNN

In [36]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:30,816][0m Using an existing study with name 'MCF7_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:30,906][0m Using an existing study with name 'MCF7_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7096454663191246
  Params: 
    dropout_l0: 0.3
    lr: 1.6774085362236975e-05
    n_layers: 1
    n_units_l0: 128
    optimizer: Adam
    weight_decay: 0.019451910892673445

AUPRC test score: 0.6950011441334473


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7678653119946519
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.4
    dropout_l2: 0.0
    dropout_l3: 0.5
    lr: 0.0005573033856009564
    n_layers: 4
    n_units_l0: 32
    n_units_l1: 32
    n_units_l2: 4
    n_units_l3: 16
    optimizer: Adam
    weight_decay: 0.00023428768324883526

AUPRC test score: 0.7707464011494527


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:31,011][0m Using an existing study with name 'MCF7_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7112440575704065
  Params: 
    dropout_l0: 0.0
    lr: 0.007165542191669888
    n_layers: 1
    n_units_l0: 128
    optimizer: RMSprop
    weight_decay: 0.03156357401665216

AUPRC test score: 0.7354451483225724



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.73373


In [37]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [38]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---
### 2. CNN

In [39]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = CNN

In [40]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:31,663][0m Using an existing study with name 'MCF7_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:31,794][0m Using an existing study with name 'MCF7_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6484612326102499
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 15
    lr: 0.0003597358552654506
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 32
    weight_decay: 0.0001066404046495287

AUPRC test score: 0.6670092164704676


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:32,019][0m Using an existing study with name 'MCF7_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6323274142001838
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 0.021535828508998656
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 96
    weight_decay: 0.0011004887566763912

AUPRC test score: 0.6929092041182124


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.6856422191760143
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0
    dropout_l2: 0.5
    dropout_l3: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 11
    kernel_size_l2: 5
    kernel_size_l3: 11
    lr: 0.020334941944798873
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 64
    out_channels_l2: 128
    out_channels_l3: 512
    weight_decay: 0.00020706441060796616

In [41]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [42]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [43]:
task = TASKS[3]
# TASKS is indexed by position; results and study names below are keyed by
# `task`, so fail fast if the list order no longer matches this section.
assert task == 'inactive_E_vs_inactive_P', f'TASKS[3] is {task!r}, expected inactive_E_vs_inactive_P'
task

'inactive_E_vs_inactive_P'

In [44]:
# Build the data-loading pipeline from this task's pickle file.
task_pickle_name = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle_name)

Data Preprocessing Done!


---
### 1. FFNN

In [45]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = FFNN

In [46]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:34,225][0m Using an existing study with name 'MCF7_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5756689780331499
  Params: 
    dropout_l0: 0.3
    lr: 0.0022989517016313325
    n_layers: 1
    n_units_l0: 256
    optimizer: Adam
    weight_decay: 0.0001749697831462642



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:34,470][0m Using an existing study with name 'MCF7_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.5691602618017462


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5708677478324361
  Params: 
    dropout_l0: 0.2
    lr: 0.0011757572944647852
    n_layers: 1
    n_units_l0: 256
    optimizer: Adam
    weight_decay: 0.000403575919185443



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:34,636][0m Using an existing study with name 'MCF7_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.574564571611527


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5618202881392927
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.5
    lr: 0.0002694905757456156
    n_layers: 4
    n_units_l0: 32
    n_units_l1: 32
    n_units_l2: 16
    n_units_l3: 32
    optimizer: Adam
    weight_decay: 0.00170263668849003

AUPRC test score: 0.5657412569422515



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.56982


In [47]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [48]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---
### 2. CNN

In [49]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = CNN

In [50]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:34,846][0m Using an existing study with name 'MCF7_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5681047544165029
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 11
    lr: 0.0037647308588027394
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 64
    weight_decay: 0.008370639638239263



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:35,125][0m Using an existing study with name 'MCF7_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.5627756728674096


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5723067781338029
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 5
    lr: 0.00020617600088146414
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 64
    weight_decay: 0.0500321450084389



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:35,341][0m Using an existing study with name 'MCF7_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.5781441243421823


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5791040756455338
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 5
    lr: 0.00011953090520814405
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 96
    weight_decay: 0.027301004324881854

AUPRC test score: 0.5947735538563304



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.57856


In [51]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [52]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [53]:
task = TASKS[4]
# TASKS is indexed by position; results and study names below are keyed by
# `task`, so fail fast if the list order no longer matches this section.
assert task == 'active_EP_vs_inactive_rest', f'TASKS[4] is {task!r}, expected active_EP_vs_inactive_rest'
task

'active_EP_vs_inactive_rest'

In [54]:
# Build the data-loading pipeline from this task's pickle file.
task_pickle_name = f'{task}.pickle'
pipe_data_load = Build_DataLoader_Pipeline(path_name=task_pickle_name)

Data Preprocessing Done!


---
### 1. FFNN

In [55]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = FFNN

In [56]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:37,678][0m Using an existing study with name 'MCF7_active_EP_vs_inactive_rest_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.24223812419447419
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    lr: 0.0016743579481795215
    n_layers: 2
    n_units_l0: 128
    n_units_l1: 32
    optimizer: Adam
    weight_decay: 0.007796462531463298



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:37,920][0m Using an existing study with name 'MCF7_active_EP_vs_inactive_rest_FFNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.2541496830090382


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2736386525778468
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.0
    dropout_l2: 0.4
    lr: 0.0030609091678264833
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 16
    n_units_l2: 32
    optimizer: Adam
    weight_decay: 0.0009475385603190167



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:38,099][0m Using an existing study with name 'MCF7_active_EP_vs_inactive_rest_FFNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.2584451780787411


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.25881126252186665
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    dropout_l2: 0.0
    lr: 0.00023322409021867444
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 128
    n_units_l2: 16
    optimizer: RMSprop
    weight_decay: 0.0005708521575316305

AUPRC test score: 0.256277711331824



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.25629


In [57]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [58]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)

---
### 2. CNN

In [59]:
# Architecture for this subsection; its __name__ labels the study and results key.
model = CNN

In [60]:
# Common prefix of the study name and of both model paths for this run.
run_tag = f'{cell_line}_{task}_{model.__name__}'

# 3-fold cross-validation without rebalancing.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=run_tag,
    hp_model_path=f'{run_tag}_HP',
    test_model_path=f'{run_tag}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:38,285][0m Using an existing study with name 'MCF7_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1855954884146058
  Params: 
    dropout_l0: 0.4
    kernel_size_l0: 5
    lr: 0.00036984561976086263
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 16
    weight_decay: 0.00042275750090350564

AUPRC test score: 0.1933381159622135


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:38,482][0m Using an existing study with name 'MCF7_active_EP_vs_inactive_rest_CNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1975263722231554
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 15
    lr: 3.1489663521196865e-05
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 32
    weight_decay: 0.00036618334676583736

AUPRC test score: 0.1821397192274808


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 10:45:38,646][0m Using an existing study with name 'MCF7_active_EP_vs_inactive_rest_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1819120028971152
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 15
    lr: 1.3565983231173965e-05
    n_layers: 1
    optimizer: RMSprop
    out_channels_l0: 32
    weight_decay: 0.00019074218256160837

AUPRC test score: 0.18436622580657638



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.18661


In [61]:
# Store this run's scores under cell line -> task -> model name.
model_key = model.__name__
results_dict[cell_line][task][model_key] = kf_CV.scores_dict

In [62]:
# Pickle to bytes before opening the file: mode 'wb' truncates on open, so an
# error while pickling inside the `with` would wipe the results saved so far.
# The store is still wrapped in OrderedDict so only plain mappings are pickled.
results_bytes = pickle.dumps(OrderedDict(results_dict))
with open('results_dict.pickle', 'wb') as fout:
    fout.write(results_bytes)