## BIOINFORMATICS THESIS: MULTIMODAL NEURAL NETWORK

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt

import os
import pickle
from tqdm.auto import tqdm
import json
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import gensim

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
import random
import types

#import optuna
import torch.nn as nn
import torch.optim as optim
import pickle
import re
import sqlite3
from sqlalchemy import create_engine

from sklearn.impute import KNNImputer
import torch
import torch.nn.functional as F
import itertools
import optuna


In [15]:
pip -q install optuna

Note: you may need to restart the kernel to use updated packages.


In [2]:
from BIOINF_tesi.data_pipe import Load_Create_Task
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

In [None]:
data = Load_Create_Task()
data.load(verbose=True)

HBox(children=(HTML(value='Loading data'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

In [5]:
data_dict, labels_dict = data.get_task('active_E_vs_inactive_E')

In [6]:
pipe_data_load = Build_DataLoader_Pipeline(data_dict, labels_dict, path_name='__.pickle', verbose=False)

Data transformation Done!

Data Preprocessing Done!


In [7]:
train_loader, test_loader = pipe_data_load.return_data(cell_line='H1', 
                    hyper_tuning=True, 
                    sequence=True,
                    augmentation=True)

In [43]:
# Per-class loss weights derived from the training loader (presumably to offset class imbalance).
# NOTE(review): get_loss_weights_from_dataloader is only imported in a later cell
# (from BIOINF_tesi.models.utils), so on a fresh kernel this cell raises NameError — TODO move the import up.
w_pos, w_neg = get_loss_weights_from_dataloader(train_loader)
# NOTE(review): CrossEntropyLoss applies weight[i] to class index i — confirm that
# (w_pos, w_neg) is the order matching the label encoding used by the loaders.
criterion=nn.CrossEntropyLoss(weight=torch.tensor([w_pos,w_neg]))

## FUNCTIONS SETUP

In [7]:
# if the gpu is available the model is moved on the gpu memory
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
from BIOINF_tesi.models.utils import load_model, save_best_model, plot_model_scores, get_loss_weights_from_dataloader

In [24]:
#pip install pytorchtools

In [9]:
# create a database to store optuna studies with sqlite backend
# NOTE(review): `engine` is not referenced by any other visible cell, and the tuning log
# further down reports "A new study created in memory" — TODO confirm the studies are
# actually persisted to this sqlite file.

engine = create_engine('sqlite:///SA_optuna_tuning.db')

In [16]:
pip -q install botorch # quiet

Note: you may need to restart the kernel to use updated packages.


In [1]:
#!conda install botorch -c pytorch -c gpytorch

In [None]:
from BIOINF_tesi.data_pipe import Build_DataLoader_Pipeline

data_dict, labels_dict = data.get_task('active_E_vs_active_P')

pipe_data_load = Build_DataLoader_Pipeline(data_dict, labels_dict, path_name='n.pickle', 
                                           type_corr='kruskal_wallis_test', intersection=False)

In [None]:
class K_Trials():
    """Repeat hyperparameter tuning followed by a final training run over several seeded trials.

    Used for comparing different types of models, with and without augmentation.
    For every trial ``i`` the two values returned by ``fit`` are stored in
    ``scores_dict['trial_n_<i>']`` under the keys ``'F1_train'`` and ``'F1_test'``.
    """

    def __init__(self):
        # Local imports: neither name is imported at the top of the notebook.
        from collections import defaultdict
        from functools import partial

        # defaultdict requires a *callable* factory (passing a defaultdict instance
        # raises TypeError). partial keeps the nested mapping picklable.
        self.scores_dict = defaultdict(partial(defaultdict, list))
        # Defined up-front so plot_results() is a harmless no-op before run() is called.
        self.k_trials = 0

    def run(self,
            build_dataloader_pipeline,
            cell_line,
            sequence=False,
            model=None,
            augmentation=False,
            random_state=123,
            k_trials=3,
            criterion=None,
            num_epochs=50,
            input_size=None,
            study_name=None,
            hp_model_path=None,
            test_model_path=None
            ):
        """Run ``k_trials`` rounds of tuning + training and record the F1 scores.

        Args:
            build_dataloader_pipeline: object exposing
                ``return_data(cell_line=, hyper_tuning=, sequence=, random_state=, augmentation=)``
                that returns ``(train_loader, test_loader)``.
            cell_line: forwarded to ``return_data``.
            sequence: forwarded to ``return_data`` and ``fit``.
            model: network handed to ``fit``; its ``parameters()`` feed the optimizer. Required.
            augmentation: forwarded to ``return_data`` for both the tuning and the final split.
            random_state: base seed; trial ``i`` uses ``random_state + 100 * i``.
            k_trials: number of trials to run.
            criterion: loss handed to ``Param_Search`` and ``fit``. Required; the ``None``
                default exists only because a non-default parameter cannot follow
                defaulted ones in a Python signature.
            num_epochs: forwarded to ``Param_Search`` and ``fit``.
            input_size: forwarded to ``Param_Search``.
            study_name: forwarded to ``Param_Search``.
            hp_model_path: passed to ``param_search.save_best_model``.
            test_model_path: currently unused (saving the tested model is still a TODO).

        Raises:
            ValueError: if ``criterion`` or ``model`` is not provided.
        """
        if criterion is None:
            raise ValueError('criterion is required')
        if model is None:
            raise ValueError('model is required')

        self.k_trials = k_trials

        # NOTE(review): the same `model` instance is handed to fit() in every trial, so
        # weights presumably carry over between trials; only 'lr' and 'optimizer' of the
        # tuned parameters are applied — TODO confirm this is intended.
        for i in range(self.k_trials):

            print(f'TRIAL N. {i}')
            print('\n===============> HYPERPARAMETERS TUNING')

            # Derive the seed from the base value instead of reassigning random_state,
            # which would compound the offset (+0, +100, +300, ...).
            trial_seed = random_state + 100 * i

            # Use the pipeline passed by the caller, not a notebook-level global.
            train_loader, test_loader = build_dataloader_pipeline.return_data(
                cell_line=cell_line, hyper_tuning=True, sequence=sequence,
                random_state=trial_seed, augmentation=augmentation)

            param_search = Param_Search(train_loader, test_loader,
                                        criterion, num_epochs, input_size=input_size,
                                        n_trials=5, study_name=study_name)

            param_search.run_trial()
            param_search.save_best_model(hp_model_path)  # check the format

            best_params = param_search.get_best_params()

            # Resolve the tuned optimizer by name in torch.optim (e.g. 'Adam', 'RMSprop')
            # so `optimizer` is always bound, whatever the study selected.
            optimizer_cls = getattr(optim, best_params['optimizer'])
            optimizer = optimizer_cls(model.parameters(), lr=best_params['lr'])

            # Honour the caller's augmentation flag for the final split too; hard-coding
            # True here would make the with/without-augmentation comparison meaningless.
            train_loader, test_loader = build_dataloader_pipeline.return_data(
                cell_line=cell_line, hyper_tuning=False, sequence=sequence,
                random_state=trial_seed, augmentation=augmentation)

            print('\n===============> MODEL TESTING')

            # TODO: check the format of filename_path (and whether it is still needed).
            F1_train, F1_test = fit(model, train_loader, test_loader, criterion, optimizer,
                                    num_epochs, filename_path='__', patience=3,
                                    sequence=sequence, verbose=False)

            self.scores_dict[f'trial_n_{i}']['F1_train'] = F1_train
            self.scores_dict[f'trial_n_{i}']['F1_test'] = F1_test
            # TODO: save_best_model(model, test_model_path)

    def plot_results(self):
        """Plot the stored train/test F1 scores of every trial run so far."""
        # k_trials is an int: iterate over the trial indices, not over the number itself.
        for i in range(self.k_trials):
            print(f'TRAIL N. {i}')
            trial_scores = self.scores_dict[f'trial_n_{i}']
            plot_model_scores(trial_scores['F1_train'], trial_scores['F1_test'])
    
    

# 1. FEED FORWARD NN 

In [24]:
class FFNN(nn.Module):
    """Feed-forward classifier with two ReLU hidden layers.

    Layout: Linear(input_size, 100) -> ReLU -> Dropout(0.3)
            -> Linear(100, 50) -> ReLU -> Dropout(0.4) -> Linear(50, 2).
    The two outputs are raw scores; no softmax is applied here.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size

        # Hidden blocks: affine map followed by ReLU.
        first_hidden = nn.Linear(self.input_size, 100)
        self.layer1 = nn.Sequential(first_hidden, nn.ReLU())

        second_hidden = nn.Linear(100, 50)
        self.layer2 = nn.Sequential(second_hidden, nn.ReLU())

        # Output head: one score per class.
        self.last_layer = nn.Linear(50, 2)

        # Dropout applied after each hidden block.
        self.drop_out1 = nn.Dropout(p=0.3)
        self.drop_out2 = nn.Dropout(p=0.4)

    def forward(self, x):
        # The last dimension of x must equal input_size, otherwise the first Linear
        # raises a "mat1 and mat2 shapes cannot be multiplied" RuntimeError.
        hidden = self.drop_out1(self.layer1(x))
        hidden = self.drop_out2(self.layer2(hidden))
        return self.last_layer(hidden)

# DATA

## Hyperparameter tuning

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [29]:
train_loader, test_loader = pipe_data_load.return_data(cell_line='HEPG2', 
                    hyper_tuning=True, 
                    sequence=True)

In [35]:
def get_input_size_FFNN(data_loader):
    """Return the size of dimension 1 of the data tensor in the first batch.

    Args:
        data_loader: iterable yielding ``(data, labels)`` pairs, where ``data``
            exposes ``.shape`` with at least two dimensions.

    Returns:
        ``data.shape[1]`` of the first batch.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    # Only the first batch is needed; the sentinel turns an empty loader into an
    # explicit error instead of an UnboundLocalError on the return statement.
    first_batch = next(iter(data_loader), None)
    if first_batch is None:
        raise ValueError('data_loader is empty: cannot infer the FFNN input size')

    features, _ = first_batch
    return features.shape[1]

In [36]:
input_size = get_input_size_FFNN(train_loader)

In [71]:
# Recompute the class weights for the loader built above and keep them, so the
# criterion defined in the next cell does not silently reuse w_pos / w_neg left in
# the kernel by an earlier cell (computed for a different loader).
w_pos, w_neg = get_loss_weights_from_dataloader(train_loader)
w_pos, w_neg

In [None]:
num_epochs = 30
criterion=nn.CrossEntropyLoss(weight=torch.tensor([w_pos,w_neg]))

In [72]:
param_search = Param_Search(train_loader, test_loader,
            criterion, num_epochs, input_size = input_size,
            n_trials=2, study_name='hp_FFNN')

param_search.run_trial()

  sampler=BoTorchSampler())
[32m[I 2021-06-14 14:09:42,313][0m A new study created in memory with name: hp_FFNN[0m


HBox(children=(HTML(value='Epochs'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Training Model'), FloatProgress(value=0.0, max=404.0), HTML(value='')))




HBox(children=(HTML(value='Testing Model'), FloatProgress(value=0.0, max=36.0), HTML(value='')))




HBox(children=(HTML(value='Training Model'), FloatProgress(value=0.0, max=404.0), HTML(value='')))




HBox(children=(HTML(value='Testing Model'), FloatProgress(value=0.0, max=36.0), HTML(value='')))

[32m[I 2021-06-14 14:14:36,524][0m Trial 0 finished with value: 0.20761762003837356 and parameters: {'n_layers': 3, 'n_units_l0': 369, 'dropout_l0': 0.4357738320752208, 'n_units_l1': 455, 'dropout_l1': 0.4073078141474769, 'n_units_l2': 347, 'dropout_l2': 0.4731143539113111, 'optimizer': 'RMSprop', 'lr': 0.04321534024331291}. Best is trial 0 with value: 0.20761762003837356.[0m






HBox(children=(HTML(value='Epochs'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Training Model'), FloatProgress(value=0.0, max=404.0), HTML(value='')))




HBox(children=(HTML(value='Testing Model'), FloatProgress(value=0.0, max=36.0), HTML(value='')))




HBox(children=(HTML(value='Training Model'), FloatProgress(value=0.0, max=404.0), HTML(value='')))




HBox(children=(HTML(value='Testing Model'), FloatProgress(value=0.0, max=36.0), HTML(value='')))

[32m[I 2021-06-14 14:18:30,514][0m Trial 1 finished with value: 0.8307348694286952 and parameters: {'n_layers': 2, 'n_units_l0': 216, 'dropout_l0': 0.3155312391750696, 'n_units_l1': 459, 'dropout_l1': 0.28529844778180163, 'optimizer': 'RMSprop', 'lr': 2.7520615182966584e-05}. Best is trial 1 with value: 0.8307348694286952.[0m




Study statistics: 
  Number of finished trials:  2
  Number of pruned trials:  0
  Number of complete trials:  2
Best trial:
  Value:  0.8307348694286952
  Params: 
    n_layers: 2
    n_units_l0: 216
    dropout_l0: 0.3155312391750696
    n_units_l1: 459
    dropout_l1: 0.28529844778180163
    optimizer: RMSprop
    lr: 2.7520615182966584e-05




In [None]:
best_model_FFNN_hp = param_search.save_best_model('FFNN/best_model_FFNN_hp.pt')

## Model testing

In [44]:
#train_loader, test_loader = pipe_data_load.return_data(cell_line='H1', 
 #                   hyper_tuning=True, 
  #                  sequence=False)

In [23]:
num_epochs = 20
criterion = nn.CrossEntropyLoss()

In [24]:
from BIOINF_tesi.models.utils import F1, EarlyStopping

In [25]:
# Hard-coded learning rate (presumably copied from an earlier tuning run — TODO read it
# from the study's best params instead).
# NOTE(review): `model` is not defined in any visible cell; on a fresh kernel this cell
# raises NameError — TODO build or load the model before creating the optimizer.
best_lr = 3.0174993222703274e-05 ##TRAIN
optimizer = optim.Adam(model.parameters(), lr=best_lr)

In [None]:
F1_train, F1_test = fit(model, train_loader, test_loader, criterion, optimizer, 
                        num_epochs, filename_path='__', patience=3,
                        sequence=True, verbose=True)

#save_best_model(model, 'FFNN/best_model_FFNN_test.pt')

# REDUCE FILTERS SIZE

HBox(children=(HTML(value='Epochs'), FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(HTML(value='Training model'), FloatProgress(value=0.0, max=105.0), HTML(value='')))

  target = torch.tensor(target)


In [None]:
plot_model_scores(F1_train, F1_test, epochs=20,set_ylim=(0.82,0.88))

In [17]:
num_epochs = 20
criterion = nn.CrossEntropyLoss()

In [18]:
best_lr = 3.0174993222703274e-05 ##TRAIN
optimizer = optim.Adam(model.parameters(), lr=best_lr)

In [19]:
from BIOINF_tesi.models.utils import F1, EarlyStopping

In [None]:
F1_train, F1_test = fit(model, train_loader, test_loader, criterion, optimizer, 
                        num_epochs, pre_trained=True, filename_path='FFNN/ffnn_testing', patience=3,
                        sequence=True, verbose=True)

#save_best_model(model, 'FFNN/best_model_FFNN_test.pt')

# REDUCE FILTERS SIZE

HBox(children=(HTML(value='Epochs'), FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(HTML(value='Testing model'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

# 2. CONVOLUTIONAL NN

## NB: sequence input requires a 1D convolutional network (`nn.Conv1d`)!

In [4]:
list(range(1,3))

[1, 2]

In [11]:
from BIOINF_tesi.models import CNN_define_model, FFNN_define_model

In [None]:
class CNN_multitask(nn.Module):
    """1D convolutional classifier: three Conv1d blocks followed by two linear layers.

    Each block is Conv1d(kernel_size=5, padding=1) -> BatchNorm1d -> ReLU ->
    MaxPool1d(kernel_size=3, stride=1), followed by dropout (p = 0.3, 0.4, 0.5).
    With stride 1 the convolution and the pooling each shorten the sequence by 2,
    so an input of length L leaves the third block with 64 channels of length L - 12.

    Args:
        fc_layer_size: number of features after flattening the last block's output,
            i.e. ``64 * (L - 12)`` for an input of shape ``(batch, 1, L)``.
        classes: number of output scores.
    """

    def __init__(self, fc_layer_size, classes=2):
        super(CNN_multitask, self).__init__()
        self.fc_layer_size = fc_layer_size
        self.classes = classes

        # Input is expected to have a single channel: (batch, 1, length).
        self.layer1 = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=5, stride=1, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=1))

        self.layer2 = nn.Sequential(
            nn.Conv1d(32, 32, kernel_size=5, stride=1, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=1))

        self.layer3 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=5, stride=1, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=1))

        self.drop_out1 = nn.Dropout(p=0.3)
        self.drop_out2 = nn.Dropout(p=0.4)
        self.drop_out3 = nn.Dropout(p=0.5)

        # NOTE(review): there is no non-linearity between the two linear layers, so
        # together they act as a single affine map — TODO confirm this is intended.
        self.last_layer1 = nn.Linear(self.fc_layer_size, 1000)
        self.last_layer2 = nn.Linear(1000, self.classes)

    def forward(self, x):
        # Three convolutional blocks, each followed by dropout.
        out = self.layer1(x)
        out = self.drop_out1(out)
        out = self.layer2(out)
        out = self.drop_out2(out)
        out = self.layer3(out)
        out = self.drop_out3(out)

        # Flatten (batch, channels, length) -> (batch, channels * length).
        out = out.reshape(out.size(0), -1)
        out = self.last_layer1(out)
        out = self.last_layer2(out)

        return out

In [23]:
o.size(1)

256

In [18]:
# Scratch check of the flatten step used in the CNN forward pass:
# (batch, channels, length) -> (batch, channels * length).
# Reshape the tensor defined here (`o`), and pass -1 as its own argument so the
# second dimension is inferred.
o=torch.rand([100,256,4])
out = o.reshape(o.size(0), -1)

In [21]:
print(o.size(0))

100


In [20]:
o.reshape(o.size(0), -1) 

tensor([[0.5383, 0.0530, 0.2218,  ..., 0.5969, 0.7685, 0.9807],
        [0.4448, 0.5935, 0.5454,  ..., 0.2139, 0.2322, 0.3407],
        [0.6433, 0.7899, 0.4144,  ..., 0.6058, 0.4562, 0.6550],
        ...,
        [0.1098, 0.7348, 0.9494,  ..., 0.2709, 0.9399, 0.1409],
        [0.8204, 0.3057, 0.4057,  ..., 0.7834, 0.7073, 0.8876],
        [0.4486, 0.5272, 0.8601,  ..., 0.8017, 0.5678, 0.8476]])

In [84]:
train_loader, test_loader = pipe_data_load.return_data(cell_line='H1', 
                    hyper_tuning=True, 
                    sequence=True)

In [85]:
num_epochs = 20
criterion = nn.CrossEntropyLoss()

In [99]:
# NOTE(review): this cell sits in the CNN section but still passes
# filename_path='FFNN/ffnn_testing' with pre_trained=True. The RuntimeError recorded
# below mentions a 4-D conv weight of size [64, 3, 3, 3], which suggests the network in
# use is a 2-D CNN rather than a Conv1d model — TODO confirm which `model` / checkpoint
# is bound here before re-running.
F1_train, F1_test = fit(model, train_loader, test_loader, criterion, optimizer, 
                        num_epochs, pre_trained=True, filename_path='FFNN/ffnn_testing', patience=3,
                        sequence=True, verbose=True)

#save_best_model(model, 'FFNN/best_model_FFNN_test.pt')

HBox(children=(HTML(value='Epochs'), FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(HTML(value='Testing model'), FloatProgress(value=0.0, max=36.0), HTML(value='')))





RuntimeError: Given groups=1, weight of size [64, 3, 3, 3], expected input[192, 1, 257, 5] to have 3 channels, but got 1 channels instead

In [97]:
F1_test

[]