## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: HEK293

In [1]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# create a database to store optuna studies with sqlite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [3]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [4]:
# Index 3 of CELL_LINES is the cell line analysed throughout this notebook;
# the bare name echoes it as the cell output ('HEK293' in the recorded run).
cell_line = CELL_LINES[3]
cell_line

'HEK293'

---

In [5]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [6]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [7]:
# Load the results accumulated by earlier runs so this notebook adds to them.
# SECURITY NOTE: pickle.load can execute arbitrary code; only open a results
# file that you produced yourself.
if os.path.exists('results_dict.pickle'):
    with open('results_dict.pickle', 'rb') as fin:
        saved_results = pickle.load(fin)
else:
    # First run: nothing has been saved yet, so start from an empty mapping
    # instead of failing with FileNotFoundError.
    saved_results = {}

# Re-wrap in nested defaultdicts so that
# results_dict[cell_line][task][model_name] can be assigned without creating
# the intermediate levels by hand.
results_dict = defaultdict(lambda: defaultdict(dict), saved_results)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [8]:
# Select the task for this section; the bare name echoes it as the cell output.
task = TASKS[0]
task

'active_E_vs_inactive_E'

In [9]:
# Build the data-loading pipeline from the pickle file named after the current task.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


### 1. FFNN

In [10]:
# Model class used by the cross-validation cell that follows.
model=FFNN

In [11]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:23,507][0m Using an existing study with name 'HEK293_active_E_vs_inactive_E_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16458739347187304
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    lr: 1.7760222507598077e-05
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 16
    optimizer: Adam
    weight_decay: 0.00035153603567130044

AUPRC test score: 0.16191443666789307


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:24,003][0m Using an existing study with name 'HEK293_active_E_vs_inactive_E_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16593744693547127
  Params: 
    dropout_l0: 0.0
    lr: 0.00017024566211531846
    n_layers: 1
    n_units_l0: 128
    optimizer: Nadam
    weight_decay: 0.0001575567483095393

AUPRC test score: 0.16076440502930825


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:24,374][0m Using an existing study with name 'HEK293_active_E_vs_inactive_E_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16185245696146186
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.0
    dropout_l2: 0.5
    dropout_l3: 0.5
    lr: 5.22980118108992e-05
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 64
    n_units_l2: 4
    n_units_l3: 4
    optimizer: Adam
    weight_decay: 0.00018574717692331917

AUPRC test score: 0.1591607049566432



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.16061


In [12]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [13]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---
### 2. CNN

In [10]:
# Model class used by the cross-validation cell that follows.
model=CNN

In [11]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:08,763][0m Using an existing study with name 'HEK293_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1





Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1133099961743724
  Params: 
    dropout_l0: 0
    kernel_size_l0: 5
    lr: 0.0004381109119950302
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 64
    weight_decay: 0.08954239966464901

AUPRC test score: 0.10284756703078449




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:12,570][0m Using an existing study with name 'HEK293_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.12235637688715266
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 15
    lr: 9.38558837773788e-05
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 16
    out_channels_l1: 96
    weight_decay: 0.00031157750873082526



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:13,718][0m Using an existing study with name 'HEK293_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.12327314678920338


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.11941615382898556
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 15
    lr: 4.4592402172350214e-05
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 32
    weight_decay: 0.05858941378539896

AUPRC test score: 0.11656113702031633



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.11423


In [12]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [13]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [10]:
# Select the task for this section; the bare name echoes it as the cell output.
task = TASKS[1]
task

'active_P_vs_inactive_P'

In [11]:
# Build the data-loading pipeline from the pickle file named after the current task.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---

### 1. FFNN

In [16]:
# Model class used by the cross-validation cell that follows.
model=FFNN

In [17]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:26,911][0m Using an existing study with name 'HEK293_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2988499227774299
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.0
    dropout_l3: 0.0
    lr: 0.0002592171558201749
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 64
    n_units_l3: 32
    optimizer: Nadam
    weight_decay: 0.003364548679653016

AUPRC test score: 0.3135440132988252


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:27,497][0m Using an existing study with name 'HEK293_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.30961353697269156
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.0
    dropout_l3: 0.4
    lr: 0.0014329665555075366
    n_layers: 4
    n_units_l0: 64
    n_units_l1: 32
    n_units_l2: 16
    n_units_l3: 16
    optimizer: RMSprop
    weight_decay: 0.0002560110097076588

AUPRC test score: 0.3131067296755952


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:27,877][0m Using an existing study with name 'HEK293_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3228463156489267
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    lr: 0.0006090341310439356
    n_layers: 2
    n_units_l0: 256
    n_units_l1: 128
    optimizer: Adam
    weight_decay: 0.00018754638517839026

AUPRC test score: 0.31016971641786



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.31227


In [18]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [19]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---
### 2. CNN

In [16]:
# Model class used by the cross-validation cell that follows.
model=CNN

In [17]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:15,266][0m Using an existing study with name 'HEK293_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.23634838115626083
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 5
    lr: 0.0012476395416578523
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 16
    weight_decay: 0.007799346530330436



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:15,833][0m Using an existing study with name 'HEK293_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.23896392245610296


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.25739152247561664
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 5
    lr: 0.00026891573541761195
    n_layers: 1
    optimizer: Nadam
    out_channels_l0: 64
    weight_decay: 0.015210202806871999

AUPRC test score: 0.24010894049732545




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:17,510][0m Using an existing study with name 'HEK293_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.25248577546912715
  Params: 
    dropout_l0: 0.2
    kernel_size_l0: 11
    lr: 0.0015091286858401926
    n_layers: 1
    optimizer: RMSprop
    out_channels_l0: 32
    weight_decay: 0.0006599021522262837

AUPRC test score: 0.24176528259304247



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.24028


In [18]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [19]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [12]:
# Select the task for this section; the bare name echoes it as the cell output.
task = TASKS[2]
task

'active_E_vs_active_P'

In [13]:
# Build the data-loading pipeline from the pickle file named after the current task.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [22]:
# Model class used by the cross-validation cell that follows.
model=FFNN

In [23]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:30,790][0m Using an existing study with name 'HEK293_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:31,224][0m Using an existing study with name 'HEK293_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.8233405492046173
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.3
    lr: 0.021833340366453704
    n_layers: 2
    n_units_l0: 256
    n_units_l1: 32
    optimizer: RMSprop
    weight_decay: 0.0010644867089441496

AUPRC test score: 0.8281184241233259


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:31,418][0m Using an existing study with name 'HEK293_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.8131740437815634
  Params: 
    dropout_l0: 0.2
    lr: 0.0003858198354945763
    n_layers: 1
    n_units_l0: 64
    optimizer: Adam
    weight_decay: 0.0047500877074207765

AUPRC test score: 0.7851906003634145


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.8175466379718149
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.5
    lr: 0.0007211654523065957
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 16
    n_units_l2: 64
    n_units_l3: 16
    optimizer: Adam
    weight_decay: 0.00036872344853272165

AUPRC test score: 0.8169269055580827



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.81008


In [24]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [25]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---
### 2. CNN

In [22]:
# Model class used by the cross-validation cell that follows.
model=CNN

In [23]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:21,801][0m Using an existing study with name 'HEK293_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.664790084372147
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 5
    lr: 0.0003881792019576403
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 96
    weight_decay: 0.0051438368117251035

AUPRC test score: 0.7338670139511426




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:22,293][0m Using an existing study with name 'HEK293_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.74215569832422
  Params: 
    dropout_l0: 0.4
    kernel_size_l0: 15
    lr: 0.03505385342870182
    n_layers: 1
    optimizer: Adam
    out_channels_l0: 16
    weight_decay: 0.014873046346607424

AUPRC test score: 0.3366822867853796


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:22,474][0m Using an existing study with name 'HEK293_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7807566112308788
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 5
    lr: 5.573739705646726e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 64
    weight_decay: 0.00013988365246620204

AUPRC test score: 0.7338253181026665



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.60146


In [24]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [25]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [14]:
# Select the task for this section; the bare name echoes it as the cell output.
task = TASKS[3]
task

'inactive_E_vs_inactive_P'

In [15]:
# Build the data-loading pipeline from the pickle file named after the current task.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [27]:
# Model class used by the cross-validation cell that follows.
model=FFNN

In [28]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:36,529][0m Using an existing study with name 'HEK293_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5802277424662698
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.2
    dropout_l2: 0.0
    lr: 0.0013293172727225032
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 128
    n_units_l2: 64
    optimizer: Nadam
    weight_decay: 0.0003540990190286287

AUPRC test score: 0.576534583647843


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:37,476][0m Using an existing study with name 'HEK293_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5676141434069432
  Params: 
    dropout_l0: 0.2
    lr: 0.00015197979051611893
    n_layers: 1
    n_units_l0: 64
    optimizer: Nadam
    weight_decay: 0.00026931571794091576

AUPRC test score: 0.5677191108271845


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:38,533][0m Using an existing study with name 'HEK293_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5608781202064563
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.3
    lr: 5.4644444349085786e-05
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 64
    optimizer: RMSprop
    weight_decay: 0.0011322628212337555

AUPRC test score: 0.5698621920649399



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.57137


In [29]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [30]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---
### 2. CNN

In [28]:
# Model class used by the cross-validation cell that follows.
model=CNN

In [29]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:27,869][0m Using an existing study with name 'HEK293_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5174594840953733
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    dropout_l2: 0.4
    kernel_size_l0: 15
    kernel_size_l1: 11
    kernel_size_l2: 11
    lr: 0.03357499696572306
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 96
    out_channels_l2: 96
    weight_decay: 0.0006138088608068706

AUPRC test score: 0.39535618495869573


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:28,244][0m Using an existing study with name 'HEK293_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:28,532][0m Using an existing study with name 'HEK293_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3964761904761904
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.4
    dropout_l2: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 5
    kernel_size_l2: 5
    lr: 0.019231152651047648
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 64
    out_channels_l2: 256
    weight_decay: 0.019618584226549756

AUPRC test score: 0.3927159103100528


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5673330264793102
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 5
    lr: 3.058417787608628e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 16
    out_channels_l1: 96
    weight_decay: 0.0021571276491351506

AUPRC test score: 0.578205631109915



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.45543

In [30]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [31]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [16]:
# Select the task for this section; the bare name echoes it as the cell output.
task = TASKS[4]
task

'active_EP_vs_inactive_rest'

In [17]:
# Build the data-loading pipeline from the pickle file named after the current task.
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!


---
### 1. FFNN

In [33]:
# This section evaluates the FFNN, so select it explicitly. The previous
# section leaves `model` bound to CNN; without this assignment a top-to-bottom
# run would train a CNN with sequence=False, name the study "..._CNN" and store
# the scores under the wrong model name.
model = FFNN

# Hyperparameter search and 3-fold cross-validation for the FFNN on this task.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:42,052][0m Using an existing study with name 'HEK293_active_EP_vs_inactive_rest_FFNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.26388295888279484
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.5
    lr: 0.0031293958124626693
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 16
    n_units_l2: 64
    optimizer: RMSprop
    weight_decay: 0.006273691609027729

AUPRC test score: 0.2345843510719315


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:43,669][0m Using an existing study with name 'HEK293_active_EP_vs_inactive_rest_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2489020624218121
  Params: 
    dropout_l0: 0.3
    lr: 0.0010826199496680139
    n_layers: 1
    n_units_l0: 256
    optimizer: Nadam
    weight_decay: 0.00017954223678108932

AUPRC test score: 0.24701594993118636


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 09:26:44,953][0m Using an existing study with name 'HEK293_active_EP_vs_inactive_rest_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.25956349237903914
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    dropout_l2: 0.5
    dropout_l3: 0.0
    lr: 6.48695595106983e-05
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 64
    n_units_l2: 64
    n_units_l3: 16
    optimizer: Nadam
    weight_decay: 0.0010302150932859893

AUPRC test score: 0.24272350580920019



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.24144


In [34]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [35]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)

---
### 2. CNN

In [34]:
# Model class used by the cross-validation cell that follows.
model=CNN

In [35]:
# Hyperparameter search and 3-fold cross-validation for the model selected above.
# The study name and the test-model path share the same
# <cell line>_<task>_<model class name> prefix.
kf_CV = Kfold_CV()

kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    # NOTE(review): presumably switches between sequence and non-sequence
    # input -- confirm in Kfold_CV.
    sequence=True,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:37,836][0m Using an existing study with name 'HEK293_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.12686427716097468
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0
    kernel_size_l0: 5
    kernel_size_l1: 5
    kernel_size_l2: 15
    lr: 0.0004225652013952601
    n_layers: 3
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 96
    out_channels_l2: 256
    weight_decay: 0.0103318395030826

AUPRC test score: 0.21722458669978875




  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:38,412][0m Using an existing study with name 'HEK293_active_EP_vs_inactive_rest_CNN_1_2' instead of creating a new one.[0m


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.22085197786456126
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    dropout_l2: 0
    kernel_size_l0: 5
    kernel_size_l1: 5
    kernel_size_l2: 11
    lr: 0.00019694748738924235
    n_layers: 3
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 32
    out_channels_l2: 64
    weight_decay: 0.006869698350335493

AUPRC test score: 0.22306114772668667


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-25 18:41:38,984][0m Using an existing study with name 'HEK293_active_EP_vs_inactive_rest_CNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1843078454428729
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0
    dropout_l2: 0
    dropout_l3: 0.4
    kernel_size_l0: 11
    kernel_size_l1: 15
    kernel_size_l2: 15
    kernel_size_l3: 11
    lr: 0.000597476123832435
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 96
    out_channels_l2: 96
    out_channels_l3: 128
    weight_decay: 0.0002517323396866886

AUPRC test score: 0.18547161037111656



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.20859


In [36]:
# File this run's scores under cell line -> task -> model class name.
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [37]:
# Save the accumulated results to disk. Only the top level is converted to an
# OrderedDict; that drops the outer defaultdict's lambda factory, which pickle
# cannot serialise.
with open('results_dict.pickle', 'wb') as results_file:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, results_file)