## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: HEPG2

In [1]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3
from sqlalchemy import create_engine

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# Create a database to store the Optuna studies, using the SQLite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [3]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [4]:
# Cell line analysed throughout this notebook, selected by position in CELL_LINES.
cell_line = CELL_LINES[4]
# Guard the positional lookup: if CELL_LINES is ever reordered, fail loudly here
# instead of silently running (and saving results for) a different cell line.
assert cell_line == 'HEPG2', f'unexpected cell line at CELL_LINES[4]: {cell_line!r}'
cell_line

'HEPG2'

---

In [5]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [6]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [7]:
# Load the results saved by earlier runs, or start from an empty mapping when
# no results file exists yet (e.g. the very first run on a fresh checkout).
# NOTE(review): pickle.load can execute arbitrary code; only load a
# 'results_dict.pickle' that was written by this project.
try:
    with open('results_dict.pickle', 'rb') as fin:
        saved_results = pickle.load(fin)
except FileNotFoundError:
    saved_results = {}

# Re-wrap in nested defaultdicts so that
# results_dict[cell_line][task][model_name] can be assigned without creating
# the intermediate levels by hand.
results_dict = defaultdict(lambda: defaultdict(dict), saved_results)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [8]:
# Task for this section: active enhancers vs inactive enhancers.
task = TASKS[0]
# Guard the positional lookup: a reordered TASKS would otherwise silently run
# a different task under this section heading.
assert task == 'active_E_vs_inactive_E', f'unexpected task at TASKS[0]: {task!r}'
task

'active_E_vs_inactive_E'

In [9]:
# Build the data-loading pipeline from the pickle file named after the current task.
# NOTE(review): construction appears to run the preprocessing (it prints
# "Data Preprocessing Done!") - confirm in Build_DataLoader_Pipeline.
pipe_data_load = Build_DataLoader_Pipeline(
    path_name=f'{task}.pickle',
)

Data Preprocessing Done!


### 1. FFNN

In [10]:
# Select FFNN as the model to cross-validate (passed by reference, not instantiated here).
model = FFNN

In [11]:
# 3-fold cross-validation of the selected model on the current task.
# The captured output shows one Optuna study and one AUPRC test score per fold.
# NOTE(review): the logged study names ('..._FFNN_1', '..._FFNN_1_2',
# '..._FFNN_1_2_3') suggest Kfold_CV appends the fold index cumulatively to
# study_name - confirm whether that is intended.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,  # presumably selects the non-sequence input for the FFNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:29,177][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_FFNN_1' instead of creating a new one.[0m







Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.30246773255512566
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.4
    lr: 0.0002751637345988326
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 64
    n_units_l2: 64
    optimizer: RMSprop
    weight_decay: 0.006532121861557937

AUPRC test score: 0.27310734138670323


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:29,963][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.28259727988125605
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.0
    lr: 0.00023647779728299236
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 16
    n_units_l2: 32
    optimizer: Adam
    weight_decay: 0.00018196473815629665

AUPRC test score: 0.2950961664444564


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:30,538][0m Using an existing study with name 'HEPG2_active_E_vs_inactive_E_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2951257922180448
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    lr: 0.0021994398341532444
    n_layers: 2
    n_units_l0: 128
    n_units_l1: 32
    optimizer: Nadam
    weight_decay: 0.00013970265972714527

AUPRC test score: 0.26283101020388605



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.27701


In [12]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [13]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---
### 2. CNN

In [None]:
# Select CNN as the model to cross-validate (passed by reference, not instantiated here).
model = CNN

In [None]:
# 3-fold cross-validation of the selected model on the current task.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,  # presumably selects the sequence input for the CNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

In [None]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [None]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [14]:
# Task for this section: active promoters vs inactive promoters.
task = TASKS[1]
# Guard the positional lookup: a reordered TASKS would otherwise silently run
# a different task under this section heading.
assert task == 'active_P_vs_inactive_P', f'unexpected task at TASKS[1]: {task!r}'
task

'active_P_vs_inactive_P'

In [15]:
# Build the data-loading pipeline from the pickle file named after the current task.
# NOTE(review): construction appears to run the preprocessing (it prints
# "Data Preprocessing Done!") - confirm in Build_DataLoader_Pipeline.
pipe_data_load = Build_DataLoader_Pipeline(
    path_name=f'{task}.pickle',
)

Data Preprocessing Done!


---

### 1. FFNN

In [16]:
# Select FFNN as the model to cross-validate (passed by reference, not instantiated here).
model = FFNN

In [17]:
# 3-fold cross-validation of the selected model on the current task.
# The captured output shows one Optuna study and one AUPRC test score per fold.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,  # presumably selects the non-sequence input for the FFNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:33,209][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_FFNN_1' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.36608951955804225
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.3
    lr: 0.0003182252665246462
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 128
    optimizer: Nadam
    weight_decay: 0.0002921643805447467

AUPRC test score: 0.36443799700960344


>>> ITERATION N. 2



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:33,902][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m
  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:34,161][0m Using an existing study with name 'HEPG2_active_P_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3528417088081342
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.3
    dropout_l2: 0.4
    lr: 5.636442127960079e-05
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 128
    n_units_l2: 32
    optimizer: Adam
    weight_decay: 0.0012035100123673963

AUPRC test score: 0.3585021953383037


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3358176976484098
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.2
    dropout_l2: 0.0
    dropout_l3: 0.0
    lr: 0.0033831488205093526
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 64
    n_units_l2: 64
    n_units_l3: 4
    optimizer: Adam
    weight_decay: 0.00013492296512505818

AUPRC test score: 0.3641387421090204



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.36236


In [18]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [19]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---
### 2. CNN

In [None]:
# Select CNN as the model to cross-validate (passed by reference, not instantiated here).
model = CNN

In [None]:
# 3-fold cross-validation of the selected model on the current task.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,  # presumably selects the sequence input for the CNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

In [None]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [None]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [20]:
# Task for this section: active enhancers vs active promoters.
task = TASKS[2]
# Guard the positional lookup: a reordered TASKS would otherwise silently run
# a different task under this section heading.
assert task == 'active_E_vs_active_P', f'unexpected task at TASKS[2]: {task!r}'
task

'active_E_vs_active_P'

In [21]:
# Build the data-loading pipeline from the pickle file named after the current task.
# NOTE(review): construction appears to run the preprocessing (it prints
# "Data Preprocessing Done!") - confirm in Build_DataLoader_Pipeline.
pipe_data_load = Build_DataLoader_Pipeline(
    path_name=f'{task}.pickle',
)

Data Preprocessing Done!


---
### 1. FFNN

In [22]:
# Select FFNN as the model to cross-validate (passed by reference, not instantiated here).
model = FFNN

In [23]:
# 3-fold cross-validation of the selected model on the current task.
# The captured output shows one Optuna study and one AUPRC test score per fold.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,  # presumably selects the non-sequence input for the FFNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:37,002][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_FFNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:37,363][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_FFNN_1_2' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7705184516710489
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.3
    lr: 0.0012553604113786918
    n_layers: 2
    n_units_l0: 32
    n_units_l1: 16
    optimizer: RMSprop
    weight_decay: 0.001797821834460632

AUPRC test score: 0.7675287883751185


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7966589832684886
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.2
    dropout_l2: 0.5
    lr: 0.0014106447213438282
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 16
    n_units_l2: 16
    optimizer: Nadam
    weight_decay: 0.009287629461518369

AUPRC test score: 0.7531940169435164


>>> ITERATION N. 3



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:37,492][0m Using an existing study with name 'HEPG2_active_E_vs_active_P_FFNN_1_2_3' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7952590857347394
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.2
    dropout_l2: 0.4
    dropout_l3: 0.0
    lr: 0.00811410030330569
    n_layers: 4
    n_units_l0: 256
    n_units_l1: 128
    n_units_l2: 4
    n_units_l3: 16
    optimizer: Nadam
    weight_decay: 0.0002146880530872523

AUPRC test score: 0.7770170680306702



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.76591


In [24]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [25]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---
### 2. CNN

In [None]:
# Select CNN as the model to cross-validate (passed by reference, not instantiated here).
model = CNN

In [None]:
# 3-fold cross-validation of the selected model on the current task.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,  # presumably selects the sequence input for the CNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

In [None]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [None]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [26]:
# Task for this section: inactive enhancers vs inactive promoters.
task = TASKS[3]
# Guard the positional lookup: a reordered TASKS would otherwise silently run
# a different task under this section heading.
assert task == 'inactive_E_vs_inactive_P', f'unexpected task at TASKS[3]: {task!r}'
task

'inactive_E_vs_inactive_P'

In [27]:
# Build the data-loading pipeline from the pickle file named after the current task.
# NOTE(review): construction appears to run the preprocessing (it prints
# "Data Preprocessing Done!") - confirm in Build_DataLoader_Pipeline.
pipe_data_load = Build_DataLoader_Pipeline(
    path_name=f'{task}.pickle',
)

Data Preprocessing Done!


---
### 1. FFNN

In [28]:
# Select FFNN as the model to cross-validate (passed by reference, not instantiated here).
model = FFNN

In [29]:
# 3-fold cross-validation of the selected model on the current task.
# The captured output shows one Optuna study and one AUPRC test score per fold.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,  # presumably selects the non-sequence input for the FFNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:43,675][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_FFNN_1' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5885643242798053
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.0
    dropout_l2: 0.4
    lr: 0.003289156104898304
    n_layers: 3
    n_units_l0: 64
    n_units_l1: 16
    n_units_l2: 16
    optimizer: RMSprop
    weight_decay: 0.0004364986683894866

AUPRC test score: 0.5913131626725182


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:44,974][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5757882018786497
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    dropout_l2: 0.4
    lr: 0.00022350904618234039
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 64
    n_units_l2: 4
    optimizer: Adam
    weight_decay: 0.0017387785562133038

AUPRC test score: 0.5881838278528447


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:46,027][0m Using an existing study with name 'HEPG2_inactive_E_vs_inactive_P_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.593631942556033
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    dropout_l2: 0.4
    lr: 0.001091750594439732
    n_layers: 3
    n_units_l0: 32
    n_units_l1: 16
    n_units_l2: 32
    optimizer: Nadam
    weight_decay: 0.0005246323876219314

AUPRC test score: 0.5938093934064081



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.5911


In [30]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [31]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---
### 2. CNN

In [None]:
# Select CNN as the model to cross-validate (passed by reference, not instantiated here).
model = CNN

In [None]:
# 3-fold cross-validation of the selected model on the current task.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,  # presumably selects the sequence input for the CNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

In [None]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [None]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [32]:
# Task for this section: active enhancers + active promoters vs inactive rest.
task = TASKS[4]
# Guard the positional lookup: a reordered TASKS would otherwise silently run
# a different task under this section heading.
assert task == 'active_EP_vs_inactive_rest', f'unexpected task at TASKS[4]: {task!r}'
task

'active_EP_vs_inactive_rest'

In [33]:
# Build the data-loading pipeline from the pickle file named after the current task.
# NOTE(review): construction appears to run the preprocessing (it prints
# "Data Preprocessing Done!") - confirm in Build_DataLoader_Pipeline.
pipe_data_load = Build_DataLoader_Pipeline(
    path_name=f'{task}.pickle',
)

Data Preprocessing Done!


---
### 1. FFNN

In [34]:
# Select FFNN as the model to cross-validate (passed by reference, not instantiated here).
model = FFNN

In [35]:
# 3-fold cross-validation of the selected model on the current task.
# The captured output shows one Optuna study and one AUPRC test score per fold.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=False,  # presumably selects the non-sequence input for the FFNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

>>> ITERATION N. 1


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:48,535][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_FFNN_1' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.321052896641659
  Params: 
    dropout_l0: 0.2
    lr: 0.0007383437759569108
    n_layers: 1
    n_units_l0: 128
    optimizer: RMSprop
    weight_decay: 0.00022775937514386005

AUPRC test score: 0.31047692071414373


>>> ITERATION N. 2


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:49,686][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_FFNN_1_2' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.3314628684083196
  Params: 
    dropout_l0: 0.0
    dropout_l1: 0.0
    dropout_l2: 0.5
    dropout_l3: 0.5
    lr: 0.009549055080469148
    n_layers: 4
    n_units_l0: 128
    n_units_l1: 16
    n_units_l2: 16
    n_units_l3: 32
    optimizer: Adam
    weight_decay: 0.001467953137838054

AUPRC test score: 0.3215525467656329


>>> ITERATION N. 3


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-13 09:17:50,561][0m Using an existing study with name 'HEPG2_active_EP_vs_inactive_rest_FFNN_1_2_3' instead of creating a new one.[0m



Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.32261183935250076
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.3
    lr: 0.0006436334131323483
    n_layers: 2
    n_units_l0: 128
    n_units_l1: 16
    optimizer: Adam
    weight_decay: 0.0012383837141012068

AUPRC test score: 0.3265612344407428



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.31953


In [36]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [37]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)

---
### 2. CNN

In [None]:
# Select CNN as the model to cross-validate (passed by reference, not instantiated here).
model = CNN

In [None]:
# 3-fold cross-validation of the selected model on the current task.
kf_CV = Kfold_CV()

cv_kwargs = dict(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    sequence=True,  # presumably selects the sequence input for the CNN - confirm
    rebalancing=False,
    model=model,
    device=device,
    task=task,
    sampler='TPE',
    # Study name and checkpoint paths are all derived from cell line, task and model name.
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST',
)
kf_CV(**cv_kwargs)

In [None]:
# Record this run's cross-validation scores under cell line -> task -> model name.
cv_scores = kf_CV.scores_dict
results_dict[cell_line][task][model.__name__] = cv_scores

In [None]:
# Write the updated results back to disk.
# The outer defaultdict uses a lambda default factory, which pickle cannot
# serialize, so it is converted to an OrderedDict before dumping.
with open('results_dict.pickle', mode='wb') as results_file:
    pickle.dump(OrderedDict(results_dict), results_file)