# Prediction of Active Enhancers with CNN

In [1]:
import pandas as pd
import numpy as np
from typing import Tuple
import os
import compress_json
from tqdm.auto import tqdm
from plot_keras_history import plot_history
from barplots import barplots

#### A generator returns the i-th value of a sequence and then discards it, which is useful to avoid keeping all of the data in memory.
##### e.g.:

In [2]:
def my_generator():
    """Yield the integers 0 through 9, one value per request.

    Small demonstration of lazy iteration: each value is produced only when
    the consumer asks for it, so no list of results is built up front.
    """
    yield from range(10)

In [3]:
# Create the generator object; its body does not run until next() is called.
gen = my_generator()

In [4]:
# Advance the generator by one step and show the value it yields.
next(gen)

0

In [5]:
# Collects every compiled model so that they can all be trained in one loop.
models = []
# Simpler models are being skipped because the problem is fairly intricate;
# we move on to more complex models.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
# Perceptron for the sequences: a single sigmoid unit on the flattened input.
perceptron = Sequential(
    [
        Input(shape=(200, 4)),  # one-hot encoded sequence, multi-dimensional input
        Flatten(),  # reshape the input into a flat vector
        Dense(1, activation="sigmoid"),
    ],
    # The name is passed by keyword: in Keras 3 the second positional
    # parameter of Sequential is `trainable`, so a positional name would be
    # lost and `model.name` (used to label the results) would fall back to
    # an auto-generated value.
    name="Perceptron",
)

# Binary classification set-up; AUROC and AUPRC are tracked next to accuracy.
perceptron.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

models.append(perceptron)

In [None]:
# MLP, as before: two hidden ReLU layers on top of the flattened sequence.
mlp = Sequential(
    [
        Input(shape=(200, 4)),
        Flatten(),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ],
    # The name is passed by keyword: in Keras 3 the second positional
    # parameter of Sequential is `trainable`, so a positional name would be
    # lost and `model.name` (used to label the results) would fall back to
    # an auto-generated value.
    name="MLP",
)

# Binary classification set-up; AUROC and AUPRC are tracked next to accuracy.
mlp.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

models.append(mlp)

In [None]:
# FFNN like the previous one, but simplified.
from tensorflow.keras.layers import BatchNormalization, Dropout, Activation

ffnn = Sequential(
    [
        Input(shape=(200, 4)),
        Flatten(),
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid"),
    ],
    # The name is passed by keyword: in Keras 3 the second positional
    # parameter of Sequential is `trainable`, so a positional name would be
    # lost and `model.name` (used to label the results) would fall back to
    # an auto-generated value.
    name="FFNN",
)

# Binary classification set-up; AUROC and AUPRC are tracked next to accuracy.
ffnn.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

ffnn.summary()
models.append(ffnn)

In [None]:
# CNN: stacked 2D convolutions over the one-hot sequence, followed by a small MLP.
from tensorflow.keras.layers import Conv2D, Reshape

cnn = Sequential(
    [
        Input(shape=(200, 4)),
        # Add a trailing channel axis so that Conv2D can be applied.
        Reshape((200, 4, 1)),
        # kernel_size is the size of the convolution that each neuron
        # performs (64 neurons in this case).
        Conv2D(64, kernel_size=(10, 2), activation="relu"),
        Conv2D(64, kernel_size=(10, 2), activation="relu"),
        Dropout(0.3),
        # Here the stride is used to reduce the dimensionality; max pooling
        # would be an alternative.
        Conv2D(32, kernel_size=(10, 2), strides=(2, 1), activation="relu"),
        Conv2D(32, kernel_size=(10, 1), activation="relu"),
        # Author's note: padding="same" could be added here, but keeping it
        # gives a very large number of parameters, whereas without it the
        # kernel compacts the input space. With a very complex model (such
        # as the outer encoding) it is better to use it.
        Conv2D(32, kernel_size=(10, 1), activation="relu"),
        Dropout(0.3),
        Flatten(),
        # What follows is an MLP appended at the tail of the network.
        Dense(32, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid"),
    ],
    # The name is passed by keyword: in Keras 3 the second positional
    # parameter of Sequential is `trainable`, so a positional name would be
    # lost and `model.name` (used to label the results) would fall back to
    # an auto-generated value.
    name="CNN",
)

# Binary classification set-up; AUROC and AUPRC are tracked next to accuracy.
cnn.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

cnn.summary()
models.append(cnn)

In [None]:
from tensorflow.keras.layers import LSTM  # for a GRU, just import GRU instead
# Long short-term memory.
# Built from LSTM cells: special layers that have a form of recurrence,
# meant for data with some sequential structure.
# Author's note: very slow to train, but good in terms of accuracy.

# Dictionary of LSTM parameters.
# Author's note: the LSTM is implemented in a particular way and can run on
# the GPU only with one specific set of parameters, which is this one.
cudnn_lstm = dict(
    activation="tanh",
    recurrent_activation="sigmoid",
    recurrent_dropout=0,
    unroll=False,
    use_bias=True,
)

lstm = Sequential(
    [
        # Author's note: when the network ingests this sequence the LSTM cell
        # is replicated 200 times, so it effectively becomes 200 layers deep,
        # which makes it very slow to train.
        Input(shape=(200, 4)),
        # Several LSTM layers can be chained.
        # Author's note: GRU layers are an alternative (a compromise between
        # accuracy and speed); LMU layers are another, with longer
        # convergence time and no large differences. They require importing
        # LMU_cell from GitHub, which is not worth the effort.
        LSTM(256, **cudnn_lstm),
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid"),
    ],
    # The name is passed by keyword: in Keras 3 the second positional
    # parameter of Sequential is `trainable`, so a positional name would be
    # lost and `model.name` (used to label the results) would fall back to
    # an auto-generated value.
    name="LSTM",
)

# Binary classification set-up; AUROC and AUPRC are tracked next to accuracy.
lstm.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ],
)

lstm.summary()
models.append(lstm)

In [None]:
from epigenomic_dataset import load_epigenomes

# Dataset selection parameters.
cell_line = "GM12878"
window_size = 200

# Load the epigenomic features together with their labels.
epigenomes, labels = load_epigenomes(
    cell_line=cell_line,
    dataset="fantom",
    regions="enhancers",
    window_size=window_size,
)

# Flatten the labels into a one-dimensional array.
labels = labels.values.ravel()
# Drop the second level of the column index.
epigenomes = epigenomes.droplevel(1, axis=1)

# Keep only the index columns, i.e. the BED coordinates of each region.
bed = epigenomes.reset_index()[epigenomes.index.names]

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Number of stratified train/test holdouts to generate.
splits = 2
# Each holdout reserves 20% of the samples for testing; the seed is fixed.
holdouts = StratifiedShuffleSplit(
    n_splits=splits,
    test_size=0.2,
    random_state=42,
)

In [None]:
from ucsc_genomes_downloader import Genome
from keras_bed_sequence import BedSequence
from keras_mixed_sequence import MixedSequence
from tensorflow.keras.utils import Sequence

# Genome object for "hg19"; it is handed to BedSequence to build the sequence inputs.
genome = Genome("hg19")

def get_holdout(
    train: np.ndarray,
    test: np.ndarray,
    bed: pd.DataFrame,
    labels: np.ndarray,
    genome: Genome,
    batch_size=1024,
) -> Tuple[Sequence, Sequence]:
    """Build the training and test sequences for a single holdout.

    Parameters
    ----------
    train:
        Positional indices of the training samples.
    test:
        Positional indices of the test samples.
    bed:
        Coordinates of every region; rows are selected with ``bed.iloc``.
    labels:
        Labels of every region, indexed with the same positions as ``bed``.
    genome:
        Genome object handed to ``BedSequence``.
    batch_size:
        Batch size used for both sequences; larger batches give faster
        results.

    Returns
    -------
    A ``(training_sequence, test_sequence)`` pair of ``MixedSequence`` objects.
    """

    def build_sequence(indices: np.ndarray) -> Sequence:
        # Pair the sequence input of the selected regions with their labels.
        return MixedSequence(
            x=BedSequence(genome, bed.iloc[indices], batch_size=batch_size),
            y=labels[indices],
            batch_size=batch_size,
        )

    return build_sequence(train), build_sequence(test)

In [None]:
def precomputed(results, model: str, holdout: int) -> bool:
    """Tell whether a result for ``model`` on ``holdout`` is already stored.

    Parameters
    ----------
    results:
        Result records in any form accepted by ``pd.DataFrame`` (in this
        notebook, a list of dictionaries with "model" and "holdout" keys).
    model:
        Name of the model to look for.
    holdout:
        Index of the holdout to look for.

    Returns
    -------
    ``True`` if at least one record matches both the model name and the
    holdout index, ``False`` otherwise (including when there are no records
    or the records lack the "model"/"holdout" columns).
    """
    df = pd.DataFrame(results)
    if df.empty or not {"model", "holdout"}.issubset(df.columns):
        return False
    matches = (df["model"] == model) & (df["holdout"] == holdout)
    # Series.any() yields a numpy boolean; convert it to honour the annotation.
    return bool(matches.any())

In [None]:
# Result caching is currently disabled: the commented-out lines below (and
# the commented-out dump at the end of the inner loop) would reuse and store
# the results in "sequence.json".
#if os.path.exists("sequence.json"):
 #   results = compress_json.load("sequence.json")
#else:
results = []

# Snapshot the weights each model has before any training. The same model
# objects are fitted on every holdout, so without restoring this snapshot a
# later holdout would start from weights already trained on samples that
# belong to its own test split, inflating the test scores.
# NOTE(review): this assumes the models have not been fitted yet when this
# cell runs, and it restores the model weights only, not the optimizer state.
initial_weights = [model.get_weights() for model in models]

for i, (train_index, test_index) in tqdm(
    enumerate(holdouts.split(bed, labels)),
    total=splits,
    desc="Computing holdouts",
    dynamic_ncols=True,
):
    train, test = get_holdout(train_index, test_index, bed, labels, genome)
    for model, weights in tqdm(
        zip(models, initial_weights),
        total=len(models),
        desc="Training models",
        leave=False,
        dynamic_ncols=True,
    ):
        # Skip the combinations that already have stored results.
        if precomputed(results, model.name, i):
            continue
        # Start every holdout from the same untrained state.
        model.set_weights(weights)
        history = model.fit(
            train,
            steps_per_epoch=train.steps_per_epoch,
            validation_data=test,
            validation_steps=test.steps_per_epoch,
            epochs=1000,
            shuffle=True,
            verbose=False,
            callbacks=[
                EarlyStopping(monitor="val_loss", mode="min", patience=50),
            ],
        ).history
        # Scores of the last epoch that was run.
        scores = pd.DataFrame(history).iloc[-1].to_dict()
        # Training metrics: every key without the "val_" prefix.
        results.append({
            "model": model.name,
            "run_type": "train",
            "holdout": i,
            **{
                key: value
                for key, value in scores.items()
                if not key.startswith("val_")
            },
        })
        # Test metrics: the "val_" keys, stored without the prefix so that
        # they share column names with the training metrics.
        results.append({
            "model": model.name,
            "run_type": "test",
            "holdout": i,
            **{
                key[len("val_"):]: value
                for key, value in scores.items()
                if key.startswith("val_")
            },
        })
      #  compress_json.local(results, "sequence.json")

In [None]:
# Tabulate the collected scores; the holdout index is dropped, presumably so
# that it is not treated as a metric when the table is plotted.
df = pd.DataFrame(results).drop(columns="holdout")

In [None]:
# Show the results table.
df

In [None]:
# Draw the bar plots from the results table, grouped by model and run type.
# The path is a template containing a "{feature}" placeholder.
barplots(
    df,
    groupby=["model", "run_type"],
    orientation="horizontal",
    height=5,
    show_legend=False,
    path="barplots/sequence/{feature}.png",
)

In [None]:
from PIL import Image
from glob import glob

# Show every saved bar plot inline. The paths are sorted so the display order
# is stable across runs, and each image file is closed as soon as it has been
# rendered instead of being left open.
for image_path in sorted(glob("barplots/sequence/*.png")):
    with Image.open(image_path) as image:
        display(image)

# Author's reading of the plots: the accuracy is good, but the AUROC is low,
# which points to class imbalance.

In [None]:
labels.mean()  # check for class imbalance: the mean is not 0.5,
# so the two classes (0, 1) are imbalanced.
