In [60]:
from Bio import SeqIO

import numpy as np
import pandas as pd

from tensorflow.keras.models import load_model

In [33]:
def oneHotEncodeSequence(sequence):
    """One-hot encode a nucleotide sequence as an integer matrix.

    Each position of ``sequence`` becomes one row with four columns ordered
    A, G, C, T (note: not alphabetical order). Matching is case-insensitive.
    Any symbol outside that alphabet (for example the "N" used for padding)
    is left as an all-zero row.

    Parameters
    ----------
    sequence : sized iterable of single-character strings
        The sequence to encode; must support ``len()`` and iteration.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequence), 4)`` with dtype ``'int'``.
    """
    column_for_base = {"A": 0, "G": 1, "C": 2, "T": 3}
    encoded = np.zeros((len(sequence), 4), dtype='int')
    for row, base in enumerate(sequence):
        column = column_for_base.get(base.upper())
        # Unknown symbols have no column and keep their all-zero row.
        if column is not None:
            encoded[row, column] = 1
    return encoded

In [34]:
# Number of leading bases skipped in every FASTA record before the enhancer
# insert starts (the literal below is 20 nt long; presumably a 5' adapter/primer
# sequence — TODO confirm).
extraSeqLenLeft = len("GAGTCTGAACCTGTGTGCTA")
# Insert length sliced out of 300-nt records by the loading cell.
mpraSeqLen = 227
# Insert length sliced out of 297-nt records by the loading cell.
panTissueControlLen = 224
# Total length each insert is N-padded to before one-hot encoding
# (presumably the input length the loaded models expect — TODO confirm).
modelSeqLen = 1000

In [58]:
# Build the model input: slice the enhancer insert out of every FASTA record,
# centre it in an all-"N" window of modelSeqLen bases, and one-hot encode it.
# The result is stacked into a single (n_records, modelSeqLen, 4) array, with
# record_ids holding the FASTA IDs in the same order.
fastaPath = "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ad_variants_processing/common_variants/AD_MPRA_2022.2.3.fasta"

one_hot_encoded_enhancer_sequences = []
record_ids = []

for record in SeqIO.parse(fastaPath, "fasta"):
    # Only two total record lengths are accepted; each implies its own insert length.
    if len(record.seq) == 300:
        insertLen = mpraSeqLen
    else:
        assert len(record.seq) == 297, (
            f"record {record.id!r} has unexpected length {len(record.seq)} (expected 300 or 297)"
        )
        insertLen = panTissueControlLen
    enhancer_seq = record.seq[extraSeqLenLeft:extraSeqLenLeft + insertLen]

    # Split the padding as evenly as possible; any odd base goes on the right.
    leftWindowLen = (modelSeqLen - len(enhancer_seq)) // 2
    rightWindowLen = modelSeqLen - len(enhancer_seq) - leftWindowLen
    # Convert to str so the concatenation is a plain string operation and does
    # not depend on Seq operator overloading. "N" encodes as an all-zero row.
    padded_enhancer_seq = "N" * leftWindowLen + str(enhancer_seq) + "N" * rightWindowLen
    one_hot_encoded_enhancer_seq = oneHotEncodeSequence(padded_enhancer_seq)
    # Check against modelSeqLen (not a literal) so the constant stays the single source of truth.
    assert one_hot_encoded_enhancer_seq.shape == (modelSeqLen, 4), (
        f"record {record.id!r} encoded to shape {one_hot_encoded_enhancer_seq.shape}, "
        f"expected {(modelSeqLen, 4)}"
    )
    one_hot_encoded_enhancer_sequences.append(one_hot_encoded_enhancer_seq)
    record_ids.append(record.id)

# np.stack fails with an unhelpful message on an empty list; report the real cause instead.
if not one_hot_encoded_enhancer_sequences:
    raise ValueError(f"no FASTA records were read from {fastaPath}")
one_hot_encoded_enhancer_sequences = np.stack(one_hot_encoded_enhancer_sequences)

In [57]:
# Inspect the stacked input array: (number of FASTA records, padded length, 4).
one_hot_encoded_enhancer_sequences.shape

(24000, 1000, 4)

In [59]:
# Saved model file for each cell type, keyed by the label used in the output columns.
model_paths = {
    "Microglia": "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_gosselin_microglia/model_7_tf.hdf5",
    "Neuron": "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_fullard_neun/model_2_tf.hdf5",
    "Monocyte": "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_monocyte/model_8_tf.hdf5",
}

# Load each model without compiling it; the models are only used for predict() below.
regression_models = {
    celltype: load_model(path, compile=False)
    for celltype, path in model_paths.items()
}

In [61]:
# Empty results table: a "record_id" column followed by one "<celltype>_score"
# column per loaded model, in the order the models appear in regression_models.
score_columns = [f"{celltype}_score" for celltype in regression_models]
scores_df = pd.DataFrame(columns=["record_id", *score_columns])

In [63]:
# Fill the ID column with the FASTA record IDs collected while encoding the
# sequences, so each row lines up with the matching row of the input array.
scores_df["record_id"] = record_ids

In [65]:
# Run every cell-type model over the full batch of padded, one-hot encoded
# sequences and store its output in the matching "<celltype>_score" column.
for celltype, model in regression_models.items():
    score_column = f"{celltype}_score"
    predictions = model.predict(one_hot_encoded_enhancer_sequences)
    scores_df[score_column] = predictions

In [70]:
# Write the score table as a tab-separated text file, omitting the DataFrame index.
scores_path = "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ad_variants_processing/common_variants/AD_MPRA_2022.2.3_bulk_model_scores.txt"
scores_df.to_csv(scores_path, sep="\t", index=False)