# Prediction of Active Enhancers with CNN

In [3]:
import pandas as pd
import numpy as np
from typing import Tuple
import os
import compress_json
from tqdm.auto import tqdm
from plot_keras_history import plot_history
from barplots import barplots

#### A generator returns the next value of a sequence each time it is advanced, without retaining the values it has already produced. Generators are useful to avoid keeping all the data in memory.
##### Example:

In [2]:
def my_generator():
    """Yield the integers from 0 to 9, one at a time."""
    yield from range(10)

In [3]:
# Create the generator object; values are only produced when it is advanced.
gen = my_generator()

In [4]:
# Advance the generator by one step; the first call produces 0.
next(gen)

0

In [4]:
models = []
# Simpler models are being discarded because the problem is fairly intricate;
# moving on to more complex models.

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
# Perceptron working directly on the sequences.
perceptron = Sequential(name="Perceptron")
# One-hot encoded sequence: a multi-dimensional input.
perceptron.add(Input(shape=(200, 4)))
# Reshape the input into a flat vector.
perceptron.add(Flatten())
perceptron.add(Dense(1, activation="sigmoid"))

perceptron.compile(
    loss="binary_crossentropy",
    optimizer="nadam",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

models.append(perceptron)

In [17]:
# MLP, as before.
mlp = Sequential(name="MLP")
mlp.add(Input(shape=(200, 4)))
mlp.add(Flatten())
# Two hidden layers of decreasing width, followed by the sigmoid output unit.
mlp.add(Dense(64, activation="relu"))
mlp.add(Dense(32, activation="relu"))
mlp.add(Dense(1, activation="sigmoid"))

mlp.compile(
    loss="binary_crossentropy",
    optimizer="nadam",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

models.append(mlp)

mlp.summary()


Model: "MLP"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_8 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 64)                51264     
_________________________________________________________________
dense_27 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 33        
Total params: 53,377
Trainable params: 53,377
Non-trainable params: 0
_________________________________________________________________


In [5]:
# FFNN like last time, but simplified.
from tensorflow.keras.layers import BatchNormalization, Dropout, Activation

ffnn = Sequential(name="FFNN")
ffnn.add(Input(shape=(200, 4)))
ffnn.add(Flatten())
ffnn.add(Dense(128, activation="relu"))
ffnn.add(Dense(64, activation="relu"))
ffnn.add(Dropout(0.3))
ffnn.add(Dense(64, activation="relu"))
ffnn.add(Dropout(0.3))
ffnn.add(Dense(32, activation="relu"))
ffnn.add(Dense(16, activation="relu"))
ffnn.add(Dense(1, activation="sigmoid"))

ffnn.compile(
    loss="binary_crossentropy",
    optimizer="nadam",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

ffnn.summary()
models.append(ffnn)

Model: "FFNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               102528    
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080   

In [19]:
# Convolutional neural network (CNN).
from tensorflow.keras.layers import Conv2D, Reshape
from tensorflow.keras.layers import BatchNormalization, Dropout, Activation
cnn = Sequential(name="CNN")
cnn.add(Input(shape=(200, 4)))
cnn.add(Reshape((200, 4, 1)))
# kernel_size is the size of the convolution computed by each neuron
# (64 neurons in this case).
cnn.add(Conv2D(64, kernel_size=(10, 2), activation="relu"))
cnn.add(Conv2D(64, kernel_size=(10, 2), activation="relu"))
cnn.add(Dropout(0.3))
# Here the stride is used to reduce the dimensionality;
# a max pooling layer could be used instead.
cnn.add(Conv2D(32, kernel_size=(10, 2), strides=(2, 1), activation="relu"))
cnn.add(Conv2D(32, kernel_size=(10, 1), activation="relu"))
# padding="same" could be added here: with it the number of parameters becomes
# very large, whereas without it the kernel shrinks the input space.
# For a very complex representation such as the outer-encoding it is better to use it.
cnn.add(Conv2D(32, kernel_size=(10, 1), activation="relu"))
cnn.add(Dropout(0.3))
cnn.add(Flatten())
# What follows is an MLP appended at the tail of the convolutional stack.
cnn.add(Dense(32, activation="relu"))
cnn.add(Dense(16, activation="relu"))
cnn.add(Dense(1, activation="sigmoid"))

cnn.compile(
    loss="binary_crossentropy",
    optimizer="nadam",
    sample_weight_mode=None,
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

cnn.summary()
models.append(cnn)

Model: "CNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_3 (Reshape)          (None, 200, 4, 1)         0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 191, 3, 64)        1344      
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 182, 2, 64)        81984     
_________________________________________________________________
dropout_4 (Dropout)          (None, 182, 2, 64)        0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 87, 1, 32)         40992     
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 78, 1, 32)         10272     
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 69, 1, 32)         10272   

In [18]:
# NOTE(review): rebinding `models` to an empty list discards every model appended
# by the cells above, so in a top-to-bottom run only the models appended after
# this cell are left for training. Confirm this is intended.
models=[]

In [7]:
from tensorflow.keras.layers import LSTM  # to use a GRU instead, just import GRU
# Long Short-Term Memory networks:
# built from LSTM cells, special layers with a form of recurrence,
# meant for data with some sequential structure.
# Very slow to train, but good in terms of accuracy.

# Dictionary of parameters: the LSTM has a special implementation that is
# runnable on GPU only with one specific set of parameters.
cudnn_lstm = {
    "activation": "tanh",
    "recurrent_activation": "sigmoid",
    "recurrent_dropout": 0,
    "unroll": False,
    "use_bias": True,
}

lstm = Sequential(name="LSTM")
# When the network is fed this sequence the LSTM cell is repeated 200 times,
# so it effectively becomes 200 layers: very slow to train.
lstm.add(Input(shape=(200, 4)))
# Several LSTM layers can be stacked.
# Alternative recurrent layers: GRU (a trade-off between accuracy and speed) and
# LMU (longer convergence time, no big differences; it requires importing
# LMU_cell from GitHub, which is not worth it).
lstm.add(LSTM(256, **cudnn_lstm))
lstm.add(Flatten())
lstm.add(Dense(256, activation="relu"))
lstm.add(Dropout(0.3))
lstm.add(Dense(32, activation="relu"))
lstm.add(Dense(16, activation="relu"))
lstm.add(Dense(1, activation="sigmoid"))

lstm.compile(
    loss="binary_crossentropy",
    optimizer="nadam",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

lstm.summary()
models.append(lstm)

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               267264    
_________________________________________________________________
flatten_4 (Flatten)          (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 32)                8224      
_________________________________________________________________
dense_15 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 17     

In [6]:
from epigenomic_dataset import load_epigenomes

# Cell line and window size used to retrieve the dataset.
cell_line = "GM12878"
window_size = 200

# Load the FANTOM enhancer regions of the chosen cell line with their labels.
epigenomes, labels = load_epigenomes(
    cell_line=cell_line,
    dataset="fantom",
    regions="enhancers",
    window_size=window_size,
)

#epigenomes = epigenomes.droplevel(1, axis=1) 
# Flatten the labels into a one-dimensional array.
labels = np.ravel(labels.values)

# Extract only the bed coordinates, which are stored in the index of the epigenomes.
bed = epigenomes.reset_index()[epigenomes.index.names]

In [13]:
# Inspect the (rows, columns) size of the extracted bed coordinates.
bed.shape


(65423, 4)

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

# Number of stratified holdouts; each one keeps 20% of the samples for testing
# and the fixed random_state makes the splits reproducible.
splits = 2
holdouts = StratifiedShuffleSplit(n_splits=splits, test_size=0.2, random_state=42)

In [8]:
from ucsc_genomes_downloader import Genome
from keras_bed_sequence import BedSequence
from keras_mixed_sequence import MixedSequence
from tensorflow.keras.utils import Sequence

# NOTE(review): hard-coded, machine-specific absolute path; consider making it
# configurable so the notebook can run on other machines.
genome = Genome(r"C:\Users\matte\OneDrive\Desktop\Bioinformatica\genomes\hg19")

def get_holdout(train:np.ndarray, test:np.ndarray, bed:pd.DataFrame, labels:np.ndarray, genome:Genome, batch_size=1024)->Tuple[Sequence, Sequence]:
    """Return the training and test sequences of a single holdout.

    Parameters
    ----------
    train: np.ndarray
        Positional indices of the samples used for training.
    test: np.ndarray
        Positional indices of the samples used for testing.
    bed: pd.DataFrame
        Bed coordinates of all the samples; rows are selected with ``iloc``.
    labels: np.ndarray
        Labels of all the samples, aligned with the rows of ``bed``.
    genome: Genome
        Genome object handed to ``BedSequence`` together with the coordinates.
    batch_size: int = 1024
        Batch size of both sequences; larger batch sizes give faster results.

    Returns
    -------
    Tuple ``(training sequence, test sequence)``.
    """
    def build_sequence(indices:np.ndarray)->Sequence:
        # Pair the sequences of the selected regions with their labels.
        return MixedSequence(
            x=BedSequence(genome, bed.iloc[indices], batch_size=batch_size),
            y=labels[indices],
            batch_size=batch_size
        )

    return build_sequence(train), build_sequence(test)

HBox(children=(FloatProgress(value=0.0, description='Loading chromosomes for genome C:\\Users\\matte\\OneDrive…

In [9]:
def precomputed(results, model:str, holdout:int)->bool:
    """Tell whether the results already hold an entry for a model and holdout.

    Parameters
    ----------
    results:
        Records collected so far; anything accepted by ``pd.DataFrame``.
    model: str
        Name of the model to look for.
    holdout: int
        Number of the holdout to look for.

    Returns
    -------
    True when at least one record matches both the model and the holdout,
    False otherwise (including when there are no records at all).
    """
    frame = pd.DataFrame(results)
    if not frame.empty:
        same_model = frame.model == model
        same_holdout = frame.holdout == holdout
        return (same_model & same_holdout).any()
    return False

In [23]:
#if os.path.exists("sequence.json"):
 #   results = compress_json.load("sequence.json")
#else:
results = []

# Snapshot the weights every model holds when this cell starts, so that each
# holdout can begin from the same state. Without this reset a model fitted on
# one holdout keeps training on the next one, whose test split overlaps the
# previous training split: the test scores would then be inflated by leakage.
# NOTE(review): only the model weights are restored; the optimizer state is not.
initial_weights = [model.get_weights() for model in models]

for i, (train_index, test_index) in tqdm(enumerate(holdouts.split(bed, labels)), total=splits, desc="Computing holdouts", dynamic_ncols=True):
    train, test = get_holdout(train_index, test_index, bed, labels, genome)
    for model, weights in tqdm(zip(models, initial_weights), total=len(models), desc="Training models", leave=False, dynamic_ncols=True):
        # Skip the (model, holdout) pairs that already have results.
        if precomputed(results, model.name, i):
            continue
        # Start this holdout from the snapshot instead of the previous holdout's fit.
        model.set_weights(weights)
        history = model.fit(
            train,
            steps_per_epoch=train.steps_per_epoch,
            validation_data=test,
            validation_steps=test.steps_per_epoch,
            epochs=1000,
            shuffle=True,
            verbose=False,
            callbacks=[
                # Stop once the validation loss has not improved for 50 epochs.
                EarlyStopping(monitor="val_loss", mode="min", patience=50),
            ]
        ).history
        # Keep the metrics of the last epoch that was run.
        scores = pd.DataFrame(history).iloc[-1].to_dict()
        # Training metrics: every key without the "val_" prefix.
        results.append({
            "model":model.name,
            "run_type":"train",
            "holdout":i,
            **{
                key:value
                for key, value in scores.items()
                if not key.startswith("val_")
            }
        })
        # Test metrics: the "val_" keys, stored without the prefix so that they
        # share their column names with the training metrics.
        results.append({
            "model":model.name,
            "run_type":"test",
            "holdout":i,
            **{
                key[len("val_"):]:value
                for key, value in scores.items()
                if key.startswith("val_")
            }
        })
      #  compress_json.local(results, "sequence.json")

HBox(children=(FloatProgress(value=0.0, description='Computing holdouts', layout=Layout(flex='2'), max=2.0, st…

HBox(children=(FloatProgress(value=0.0, description='Rendering sequences in C:\\Users\\matte\\OneDrive\\Deskto…

HBox(children=(FloatProgress(value=0.0, description='Converting nucleotides to numeric classes', layout=Layout…

HBox(children=(FloatProgress(value=0.0, description='Rendering sequences in C:\\Users\\matte\\OneDrive\\Deskto…

HBox(children=(FloatProgress(value=0.0, description='Converting nucleotides to numeric classes', layout=Layout…

HBox(children=(FloatProgress(value=0.0, description='Training models', layout=Layout(flex='2'), max=1.0, style…

  {0: '...'}
    to  
  ['...']



AttributeError: in converted code:

    C:\Users\matte\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py:677 map_fn
        batch_size=None)
    C:\Users\matte\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py:2481 _standardize_tensors
        feed_sample_weight_modes)
    C:\Users\matte\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py:2480 <listcomp>
        for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
    C:\Users\matte\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_utils.py:964 standardize_weights
        if sample_weight is not None and len(sample_weight.shape) != 1:

    AttributeError: 'dict' object has no attribute 'shape'


In [None]:
# Collect the results in a DataFrame, dropping the holdout number column.
df = pd.DataFrame(results).drop(columns="holdout")

In [25]:
import tensorflow as tf
# Show the TensorFlow version in use.
tf.__version__

'2.1.0'

In [None]:
# Draw horizontal barplots of the collected results, grouped by model and run type.
# The {feature} placeholder in the path is presumably filled in by barplots
# for each plotted column — TODO confirm.
barplots(
    df,
    groupby=["model", "run_type"],
    show_legend=False,
    height=5,
    orientation="horizontal",
    path='barplots/sequence/{feature}.png',
)

In [None]:
from PIL import Image
from glob import glob

# Display every PNG image found under barplots/sequence.
for image_path in glob("barplots/sequence/*.png"):
    barplot_image = Image.open(image_path)
    display(barplot_image)

# Observations:
# - the accuracy is good
# - the AUROC is low => class imbalance

In [None]:
labels.mean()  # check whether there is an imbalance: the mean is not at 0.5,
# so the two classes (0, 1) are imbalanced
