## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

# CELL LINE: GM12878

In [8]:
import pandas as pd 
import numpy as np
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, OrderedDict
import pickle

import sqlite3

from sqlalchemy import create_engine

# Run on the GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [9]:
# create a database to store optuna studies with sqlite backend
#engine = create_engine('sqlite:///BIOINF_optuna_tuning.db')

In [10]:
from BIOINF_tesi.data_pipe import CELL_LINES, TASKS

In [11]:
# Pick the cell line by its position in CELL_LINES (index 1 is 'GM12878' in the
# recorded output); the bare name on the last line displays the selected value.
cell_line = CELL_LINES[1]
cell_line

'GM12878'

---

In [12]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [13]:
from BIOINF_tesi.models import FFNN, CNN, CNN_LSTM
from BIOINF_tesi.models.utils import fit, Param_Search, Kfold_CV

In [14]:
# Scores from earlier runs are accumulated in this pickle and reloaded here.
results_path = 'results_dict.pickle'

if os.path.exists(results_path):
    # SECURITY NOTE: pickle.load can execute arbitrary code -- only load a file
    # that this project produced itself.
    with open(results_path, 'rb') as fin:
        stored_results = pickle.load(fin)
else:
    # First run: nothing has been saved yet, so start from an empty mapping
    # instead of failing with FileNotFoundError.
    stored_results = {}

# Rebuild the nested layout results_dict[cell_line][task][model_name] so that
# new cell lines and new tasks can be assigned without creating the levels
# first. The per-cell-line level is rebuilt as a defaultdict as well, so a new
# task never raises KeyError even if the stored file held plain dicts.
results_dict = defaultdict(lambda: defaultdict(dict))
for stored_cell_line, stored_tasks in stored_results.items():
    results_dict[stored_cell_line].update(stored_tasks)

## 1) ACTIVE ENHANCERS vs INACTIVE ENHANCERS

In [15]:
# Select the task by its position in TASKS (index 0 is 'active_E_vs_inactive_E'
# in the recorded output); the bare name on the last line displays it.
task = TASKS[0]
task

'active_E_vs_inactive_E'

In [16]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
# How path_name is resolved is defined in Build_DataLoader_Pipeline (not visible here).
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




### 1. FFNN

In [None]:
# Select FFNN as the model passed to the cross-validation cell that follows.
model=FFNN

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [17]:
# Select CNN as the model passed to the cross-validation cell that follows.
model=CNN

In [18]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),


>>> ITERATION N. 1



[32m[I 2021-10-01 07:04:31,620][0m Using an existing study with name 'GM12878_active_E_vs_inactive_E_CNN_1' instead of creating a new one.[0m


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1991186373748383
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0
    kernel_size_l0: 15
    kernel_size_l1: 11
    kernel_size_l2: 15
    lr: 0.0002635471074607425
    n_layers: 3
    optimizer: Adam
    out_channels_l0: 16
    out_channels_l1: 32
    out_channels_l2: 64
    weight_decay: 0.0002333090185857455



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:44,135][0m Using an existing study with name 'GM12878_active_E_vs_inactive_E_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.18390019860973192


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.17907570422535207
  Params: 
    dropout_l0: 0
    dropout_l1: 0
    kernel_size_l0: 5
    kernel_size_l1: 15
    lr: 0.00018767996453808377
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 96
    weight_decay: 0.02158692795983577



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:45,708][0m Using an existing study with name 'GM12878_active_E_vs_inactive_E_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.18364697120158888


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.20274777961586543
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.4
    dropout_l2: 0.5
    dropout_l3: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 5
    kernel_size_l2: 11
    kernel_size_l3: 15
    lr: 4.239740950857751e-05
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 64
    out_channels_l2: 128
    out_channels_l3: 256
    weight_decay: 0.00019450532488717148

AUPRC test score: 0.18207547169811322



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.18321


In [19]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [20]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 3. CNN-LSTM

In [None]:
# Select CNN_LSTM as the model passed to the cross-validation cell that follows.
model=CNN_LSTM

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---

## 2) ACTIVE PROMOTERS vs INACTIVE PROMOTERS

In [21]:
# Select the task by its position in TASKS (index 1 is 'active_P_vs_inactive_P'
# in the recorded output); the bare name on the last line displays it.
task = TASKS[1]
task

'active_P_vs_inactive_P'

In [22]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
# How path_name is resolved is defined in Build_DataLoader_Pipeline (not visible here).
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---

### 1. FFNN

In [None]:
# Select FFNN as the model passed to the cross-validation cell that follows.
model=FFNN

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [23]:
# Select CNN as the model passed to the cross-validation cell that follows.
model=CNN

In [24]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:49,650][0m Using an existing study with name 'GM12878_active_P_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.1811703092194355
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 11
    lr: 0.005670670018265359
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 32
    out_channels_l1: 96
    weight_decay: 0.0009343382708774257



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:51,126][0m Using an existing study with name 'GM12878_active_P_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.20569959807547683


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.23481733456892948
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    dropout_l2: 0.5
    dropout_l3: 0.5
    kernel_size_l0: 11
    kernel_size_l1: 11
    kernel_size_l2: 5
    kernel_size_l3: 11
    lr: 8.286146741443369e-05
    n_layers: 4
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 64
    out_channels_l2: 256
    out_channels_l3: 128
    weight_decay: 0.02717582843269129



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:51,875][0m Using an existing study with name 'GM12878_active_P_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.16517134343257353


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.18533643774407194
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 15
    lr: 0.0011492591657041753
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 32
    weight_decay: 0.00021928291292614368

AUPRC test score: 0.18360172907018218



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.18482


In [25]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [26]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 3. CNN-LSTM

In [None]:
# Select CNN_LSTM as the model passed to the cross-validation cell that follows.
model=CNN_LSTM

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---

## 3) ACTIVE ENHANCERS vs ACTIVE PROMOTERS

In [27]:
# Select the task by its position in TASKS (index 2 is 'active_E_vs_active_P'
# in the recorded output); the bare name on the last line displays it.
task = TASKS[2]
task

'active_E_vs_active_P'

In [28]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
# How path_name is resolved is defined in Build_DataLoader_Pipeline (not visible here).
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---
### 1. FFNN

In [None]:
# Select FFNN as the model passed to the cross-validation cell that follows.
model=FFNN

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [29]:
# Select CNN as the model passed to the cross-validation cell that follows.
model=CNN

In [30]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:55,197][0m Using an existing study with name 'GM12878_active_E_vs_active_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.8034771180238469
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 15
    kernel_size_l2: 5
    lr: 0.00010285616674059193
    n_layers: 3
    optimizer: Nadam
    out_channels_l0: 32
    out_channels_l1: 96
    out_channels_l2: 64
    weight_decay: 0.0024106775748631614



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:55,827][0m Using an existing study with name 'GM12878_active_E_vs_active_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.6783653534604998


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7981987863141515
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    dropout_l2: 0.4
    dropout_l3: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    kernel_size_l2: 15
    kernel_size_l3: 5
    lr: 0.0010914744850346143
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 32
    out_channels_l2: 128
    out_channels_l3: 512
    weight_decay: 0.002357720645183837



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:04:56,966][0m Using an existing study with name 'GM12878_active_E_vs_active_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.6817642276635852


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.7765956300025768
  Params: 
    dropout_l0: 0
    dropout_l1: 0.4
    dropout_l2: 0.4
    kernel_size_l0: 15
    kernel_size_l1: 15
    kernel_size_l2: 15
    lr: 0.00026616083877002646
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 64
    out_channels_l2: 64
    weight_decay: 0.013217418070770792

AUPRC test score: 0.7221012747590638



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.69408


In [31]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [32]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 3. CNN-LSTM

In [None]:
# Select CNN_LSTM as the model passed to the cross-validation cell that follows.
model=CNN_LSTM

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---

## 4) INACTIVE ENHANCERS vs INACTIVE PROMOTERS

In [33]:
# Select the task by its position in TASKS (index 3 is 'inactive_E_vs_inactive_P'
# in the recorded output); the bare name on the last line displays it.
task = TASKS[3]
task

'inactive_E_vs_inactive_P'

In [34]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
# How path_name is resolved is defined in Build_DataLoader_Pipeline (not visible here).
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---
### 1. FFNN

In [None]:
# Select FFNN as the model passed to the cross-validation cell that follows.
model=FFNN

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [35]:
# Select CNN as the model passed to the cross-validation cell that follows.
model=CNN

In [36]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:05:10,260][0m Using an existing study with name 'GM12878_inactive_E_vs_inactive_P_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.5518895337960382
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    dropout_l2: 0.4
    dropout_l3: 0.4
    kernel_size_l0: 15
    kernel_size_l1: 15
    kernel_size_l2: 5
    kernel_size_l3: 15
    lr: 6.914570864629654e-05
    n_layers: 4
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 64
    out_channels_l2: 64
    out_channels_l3: 128
    weight_decay: 0.006160775476693151



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:05:10,979][0m Using an existing study with name 'GM12878_inactive_E_vs_inactive_P_CNN_1_2' instead of creating a new one.[0m


AUPRC test score: 0.5258723142208391


>>> ITERATION N. 2

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.552202396000139
  Params: 
    dropout_l0: 0.2
    dropout_l1: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 11
    lr: 0.00019661936905887947
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 64
    weight_decay: 0.0027688814708905084



  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:05:12,003][0m Using an existing study with name 'GM12878_inactive_E_vs_inactive_P_CNN_1_2_3' instead of creating a new one.[0m


AUPRC test score: 0.5729072295632999


>>> ITERATION N. 3

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.564019093532986
  Params: 
    dropout_l0: 0.4
    dropout_l1: 0
    kernel_size_l0: 15
    kernel_size_l1: 11
    lr: 0.002371470963516048
    n_layers: 2
    optimizer: Adam
    out_channels_l0: 64
    out_channels_l1: 96
    weight_decay: 0.0008221576517098549

AUPRC test score: 0.5698042858801128



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.55619


In [37]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [38]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 3. CNN-LSTM

In [None]:
# Select CNN_LSTM as the model passed to the cross-validation cell that follows.
model=CNN_LSTM

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---

## 5) ACTIVE ENHANCERS + ACTIVE PROMOTERS vs INACTIVE REST

In [39]:
# Select the task by its position in TASKS (index 4 is 'active_EP_vs_inactive_rest'
# in the recorded output); the bare name on the last line displays it.
task = TASKS[4]
task

'active_EP_vs_inactive_rest'

In [40]:
# Build the data-loading pipeline for the current task from '<task>.pickle'.
# How path_name is resolved is defined in Build_DataLoader_Pipeline (not visible here).
pipe_data_load = Build_DataLoader_Pipeline(path_name=f'{task}.pickle')

Data Preprocessing Done!




---
### 1. FFNN

In [None]:
# This section is the FFNN run, so select FFNN explicitly. Every other FFNN
# section has its own `model=FFNN` cell; without the assignment this cell
# reuses whatever `model` a previously executed cell left behind, and the
# study name, the two .pt paths and the results_dict key would all be filed
# under that other model.
model = FFNN

# Cross-validate the selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 2. CNN

In [41]:
# Select CNN as the model passed to the cross-validation cell that follows.
model=CNN

In [42]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:05:32,953][0m Using an existing study with name 'GM12878_active_EP_vs_inactive_rest_CNN_1' instead of creating a new one.[0m


>>> ITERATION N. 1

Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.19331737073680072
  Params: 
    dropout_l0: 0.3
    dropout_l1: 0.5
    dropout_l2: 0.4
    kernel_size_l0: 5
    kernel_size_l1: 15
    kernel_size_l2: 15
    lr: 0.0012182006706569824
    n_layers: 3
    optimizer: RMSprop
    out_channels_l0: 64
    out_channels_l1: 32
    out_channels_l2: 128
    weight_decay: 0.0033368457211026977



Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 07:34:36,211][0m A new study created in RDB with name: GM12878_active_EP_vs_inactive_rest_CNN_1_2[0m


EarlyStopping counter: 5 out of 5
Early stopping the training
AUPRC test score: 0.1600325824774355


>>> ITERATION N. 2



Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

	add(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add(Tensor other, *, Number alpha) (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/utils/python_arg_parser.cpp:1025.)
  grad = grad.add(group['weight_decay'], p.data)


EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


[32m[I 2021-10-01 08:00:56,562][0m Trial 0 finished with value: 0.15657990314769973 and parameters: {'n_layers': 3, 'out_channels_l0': 16, 'kernel_size_l0': 15, 'dropout_l0': 0.4, 'out_channels_l1': 96, 'kernel_size_l1': 11, 'dropout_l1': 0, 'out_channels_l2': 64, 'kernel_size_l2': 15, 'dropout_l2': 0.4, 'optimizer': 'Nadam', 'lr': 1.944139544483803e-05, 'weight_decay': 0.0005047961129100892}. Best is trial 0 with value: 0.15657990314769973.[0m


EarlyStopping counter: 5 out of 5
Early stopping the training


Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


[32m[I 2021-10-01 08:12:09,523][0m Trial 1 finished with value: 0.15657990314769976 and parameters: {'n_layers': 3, 'out_channels_l0': 16, 'kernel_size_l0': 5, 'dropout_l0': 0.4, 'out_channels_l1': 32, 'kernel_size_l1': 5, 'dropout_l1': 0, 'out_channels_l2': 128, 'kernel_size_l2': 11, 'dropout_l2': 0.4, 'optimizer': 'Nadam', 'lr': 1.974506666097928e-05, 'weight_decay': 0.010423689158358219}. Best is trial 1 with value: 0.15657990314769976.[0m


EarlyStopping counter: 5 out of 5
Early stopping the training


Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


[32m[I 2021-10-01 08:23:14,859][0m Trial 2 finished with value: 0.16256735389394678 and parameters: {'n_layers': 2, 'out_channels_l0': 64, 'kernel_size_l0': 5, 'dropout_l0': 0, 'out_channels_l1': 32, 'kernel_size_l1': 15, 'dropout_l1': 0.5, 'optimizer': 'Nadam', 'lr': 3.8838808450131e-05, 'weight_decay': 0.00019469079203175214}. Best is trial 2 with value: 0.16256735389394678.[0m


EarlyStopping counter: 5 out of 5
Early stopping the training
Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.16256735389394678
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    kernel_size_l0: 5
    kernel_size_l1: 15
    lr: 3.8838808450131e-05
    n_layers: 2
    optimizer: Nadam
    out_channels_l0: 64
    out_channels_l1: 32
    weight_decay: 0.00019469079203175214



Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


  pruner=optuna.pruners.PatientPruner(optuna.pruners.MedianPruner(), patience=2),
[32m[I 2021-10-01 08:45:24,703][0m A new study created in RDB with name: GM12878_active_EP_vs_inactive_rest_CNN_1_2_3[0m


EarlyStopping counter: 5 out of 5
Early stopping the training
AUPRC test score: 0.17259745190993456


>>> ITERATION N. 3



Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


[32m[I 2021-10-01 09:02:53,240][0m Trial 0 finished with value: 0.15707875457875453 and parameters: {'n_layers': 4, 'out_channels_l0': 32, 'kernel_size_l0': 15, 'dropout_l0': 0.3, 'out_channels_l1': 32, 'kernel_size_l1': 15, 'dropout_l1': 0.4, 'out_channels_l2': 64, 'kernel_size_l2': 5, 'dropout_l2': 0.5, 'out_channels_l3': 256, 'kernel_size_l3': 15, 'dropout_l3': 0.4, 'optimizer': 'RMSprop', 'lr': 1.4686575667808578e-05, 'weight_decay': 0.0001685122844890525}. Best is trial 0 with value: 0.15707875457875453.[0m


EarlyStopping counter: 5 out of 5
Early stopping the training


Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


[32m[I 2021-10-01 09:52:14,959][0m Trial 1 finished with value: 0.15682234432234435 and parameters: {'n_layers': 4, 'out_channels_l0': 16, 'kernel_size_l0': 11, 'dropout_l0': 0.3, 'out_channels_l1': 96, 'kernel_size_l1': 11, 'dropout_l1': 0.5, 'out_channels_l2': 256, 'kernel_size_l2': 15, 'dropout_l2': 0.5, 'out_channels_l3': 512, 'kernel_size_l3': 15, 'dropout_l3': 0.5, 'optimizer': 'RMSprop', 'lr': 5.47242428140986e-05, 'weight_decay': 0.0009535977817711929}. Best is trial 0 with value: 0.15707875457875453.[0m


EarlyStopping counter: 5 out of 5
Early stopping the training


Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5


[32m[I 2021-10-01 10:03:25,944][0m Trial 2 finished with value: 0.15729104229104232 and parameters: {'n_layers': 2, 'out_channels_l0': 32, 'kernel_size_l0': 15, 'dropout_l0': 0, 'out_channels_l1': 64, 'kernel_size_l1': 5, 'dropout_l1': 0.5, 'optimizer': 'RMSprop', 'lr': 0.0017171586784351205, 'weight_decay': 0.0001296996164198851}. Best is trial 2 with value: 0.15729104229104232.[0m


EarlyStopping counter: 5 out of 5
Early stopping the training
Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.15729104229104232
  Params: 
    dropout_l0: 0
    dropout_l1: 0.5
    kernel_size_l0: 15
    kernel_size_l1: 5
    lr: 0.0017171586784351205
    n_layers: 2
    optimizer: RMSprop
    out_channels_l0: 32
    out_channels_l1: 64
    weight_decay: 0.0001296996164198851



Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 1 out of 5
EarlyStopping counter: 2 out of 5
EarlyStopping counter: 3 out of 5
EarlyStopping counter: 4 out of 5
EarlyStopping counter: 5 out of 5
Early stopping the training
AUPRC test score: 0.16498974706876485



3-FOLD CROSS-VALIDATION AUPRC TEST SCORE: 0.16587


In [43]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [44]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---
### 3. CNN-LSTM

In [None]:
# Select CNN_LSTM as the model passed to the cross-validation cell that follows.
model=CNN_LSTM

In [None]:
# Cross-validate the currently selected `model` on `task` for `cell_line`
# (n_folds=3, num_epochs=100, sampler='TPE'). The study name and the two .pt
# paths are derived from the cell line, the task and the model's __name__.
kf_CV = Kfold_CV()
kf_CV(
    build_dataloader_pipeline=pipe_data_load,
    num_epochs=100,
    n_folds=3,
    cell_line=cell_line,
    task=task,
    sequence=True,
    augmentation=False,
    model=model,
    device=device,
    sampler='TPE',
    study_name=f'{cell_line}_{task}_{model.__name__}',
    hp_model_path=f'{cell_line}_{task}_{model.__name__}_HP.pt',
    test_model_path=f'{cell_line}_{task}_{model.__name__}_TEST.pt',
)

In [None]:
# Store this run's kf_CV.scores_dict under [cell line][task][model name].
results_dict[cell_line][task][model.__name__] = kf_CV.scores_dict

In [None]:
# Save the scores gathered so far. results_dict is created in the loading cell
# as a defaultdict with a lambda factory, which pickle cannot serialise, so a
# plain OrderedDict copy of the outer level is written instead.
with open('results_dict.pickle', mode='wb') as fout:
    results_snapshot = OrderedDict(results_dict)
    pickle.dump(results_snapshot, fout)

---