# Enformer human validation 

### Load pre-trained model

In [1]:
import gzip
import importlib.util
import os
import sys

import joblib
import kipoiseq
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyfaidx
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from kipoiseq import Interval
from tqdm import tqdm

import enformer

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

2022-01-26 09:53:47.387181: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-26 09:53:47.387207: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [13]:
# Load utils.py from the current working directory as the module "utils".
# The spec name has to match the name used by the star-import below, and the
# module must be registered in sys.modules *before* it is executed so that
# `from utils import *` reuses this exact module object instead of triggering
# a second, independent import through sys.path.
spec_utils = importlib.util.spec_from_file_location("utils", os.path.join(os.getcwd(), "utils.py"))
utils = importlib.util.module_from_spec(spec_utils)
sys.modules[spec_utils.name] = utils
spec_utils.loader.exec_module(utils)
from utils import *

In [86]:
# (Re)load enformer.py from the current working directory as the module "enformer".
# `import enformer` in the imports cell has already cached a module under this
# name, so the freshly built module is registered in sys.modules before it is
# executed; otherwise `from enformer import *` would take its names from the
# cached copy and silently ignore this load.
spec = importlib.util.spec_from_file_location("enformer", os.path.join(os.getcwd(), "enformer.py"))
enformer = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = enformer
spec.loader.exec_module(enformer)
from enformer import *

### Load files

In [88]:
# Remote resources: a transform pickle in a gs:// bucket and the TF-Hub URL of the model.
transform_path = 'gs://dm-enformer/models/enformer.finetuned.SAD.robustscaler-PCA500-robustscaler.transform.pkl'
model_path = 'https://tfhub.dev/deepmind/enformer/1'
# Local inputs; datadir is relative to the directory the notebook is run from.
datadir = "../../../../data/FED"
fasta_file = os.path.join(datadir, "hg38.fa")
human_sequences = os.path.join(datadir, "data_human_sequences.bed")
# The Faidx object is not stored, only displayed as the cell output.
# NOTE(review): presumably called so the FASTA index exists before the file is read — confirm.
pyfaidx.Faidx(fasta_file)

Faidx("../../../../data/FED/hg38.fa")

In [5]:
# Build the model object from the TF-Hub URL stored in model_path.
# NOTE(review): Enformer is presumably provided by the star-import of enformer.py — confirm.
model = Enformer(model_path)

2022-01-26 09:53:55.510835: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-26 09:53:55.510854: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-26 09:53:55.510868: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (luisasantus-HP-EliteDesk-800-G5-TWR): /proc/driver/nvidia/version does not exist
2022-01-26 09:53:55.511115: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Sequence extractor bound to the hg38 FASTA configured in fasta_file.
# NOTE(review): FastaStringExtractor is not defined in this notebook; presumably it comes from a star-import — confirm.
fasta_extractor = FastaStringExtractor(fasta_file)

In [90]:
# Scratch check: half of 393216, the value assigned to SEQUENCE_LENGHT further down.
393216/2

196608.0

### Check tracks

In [7]:
# Download the human target table of the Basenji2 dataset straight from the Basenji GitHub repository.
# Cite: Kelley et al Cross-species regulatory sequence activity prediction. PLoS Comput. Biol. 16, e1008050 (2020).
targets_txt = 'https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt'
# The file is tab-separated; the resulting frame is displayed as the cell output.
df_targets = pd.read_csv(targets_txt, sep='\t')
df_targets

Unnamed: 0,index,genome,identifier,file,clip,scale,sum_stat,description
0,0,0,ENCFF833POA,/home/drk/tillage/datasets/human/dnase/encode/...,32,2,mean,DNASE:cerebellum male adult (27 years) and mal...
1,1,0,ENCFF110QGM,/home/drk/tillage/datasets/human/dnase/encode/...,32,2,mean,DNASE:frontal cortex male adult (27 years) and...
2,2,0,ENCFF880MKD,/home/drk/tillage/datasets/human/dnase/encode/...,32,2,mean,DNASE:chorion
3,3,0,ENCFF463ZLQ,/home/drk/tillage/datasets/human/dnase/encode/...,32,2,mean,DNASE:Ishikawa treated with 0.02% dimethyl sul...
4,4,0,ENCFF890OGQ,/home/drk/tillage/datasets/human/dnase/encode/...,32,2,mean,DNASE:GM03348
...,...,...,...,...,...,...,...,...
5308,5308,0,CNhs14239,/home/drk/tillage/datasets/human/cage/fantom/C...,384,1,sum,CAGE:epithelioid sarcoma cell line:HS-ES-2R
5309,5309,0,CNhs14240,/home/drk/tillage/datasets/human/cage/fantom/C...,384,1,sum,CAGE:squamous cell lung carcinoma cell line:RE...
5310,5310,0,CNhs14241,/home/drk/tillage/datasets/human/cage/fantom/C...,384,1,sum,CAGE:gastric cancer cell line:GSS
5311,5311,0,CNhs14244,/home/drk/tillage/datasets/human/cage/fantom/C...,384,1,sum,CAGE:carcinoid cell line:NCI-H727


In [8]:
# Open the supplementary workbook and list its sheets.
suppl = pd.ExcelFile(os.path.join(datadir, "enformer_suppl.xlsx"))
print(suppl.sheet_names)
# The second and third sheets are read as the human and mouse tables; each one
# is tagged with its organism before both are stacked into one frame.
suppl_human = suppl.parse(suppl.sheet_names[1]).assign(organism="human")
suppl_mouse = suppl.parse(suppl.sheet_names[2]).assign(organism="mouse")
frames = [suppl_human, suppl_mouse]
suppl_df = pd.concat(frames)

['Supplementary Table 1', 'Supplementary Table 2', 'Supplementary Table 3']


## Example predict one sequence

In [145]:
def one_hot_encode(sequence):
    """One-hot encode `sequence` with kipoiseq's `one_hot_dna` and return the result as float32."""
    encoded = kipoiseq.transforms.functional.one_hot_dna(sequence)
    return encoded.astype(np.float32)

# Number of rows the one-hot input is padded to before it is handed to the model.
# The identifier is misspelled ("LENGHT"); it is left as is because other cells reference it.
SEQUENCE_LENGHT = 393216
# Earlier padding arithmetic, disabled; pad_one_hot appears to compute the same quantity now.
#REAL_SEQUENCE_LENGTH = SEQUENCE_LENGHT/2
#ADD_ENDS = int((SEQUENCE_LENGHT - REAL_SEQUENCE_LENGTH)/2)

In [173]:
def pad_one_hot(sequence_one_hot, NEW_SIZE):
    """Centre a one-hot encoded sequence inside a zero-padded window.

    All-zero rows are added before and after the sequence so that the result
    has exactly NEW_SIZE rows. (The notebook author uses all-zero rows as the
    encoding of N and notes that the model ignores them.)

    Args:
        sequence_one_hot: 2-D array of shape (length, channels).
        NEW_SIZE: number of rows of the padded result.

    Returns:
        Array of shape (NEW_SIZE, channels) with the dtype of the input. When
        the number of missing rows is odd, the extra row goes to the end.

    Raises:
        ValueError: if the sequence already has more than NEW_SIZE rows.
    """
    sequence_one_hot = np.asarray(sequence_one_hot)
    missing = int(NEW_SIZE) - sequence_one_hot.shape[0]
    if missing < 0:
        raise ValueError(
            "sequence has %d rows, which is more than NEW_SIZE=%d"
            % (sequence_one_hot.shape[0], int(NEW_SIZE))
        )
    # Split the padding explicitly so left + right == missing even when it is odd.
    left = missing // 2
    right = missing - left
    # np.pad keeps the input dtype and pads only along the sequence axis.
    return np.pad(sequence_one_hot, ((left, right), (0, 0)), mode="constant", constant_values=0)

(393216, 4)

### compute score (how well predicted)

In [None]:
# TODO
# 1 - retrieve the 197k sequence instead of the 131k one

In [None]:
# ('human', 'valid') dataset, one example per batch, repeated, so iteration must be stopped explicitly.
# NOTE(review): get_dataset is not defined in this notebook; presumably it comes from a star-import and returns a tf.data.Dataset — confirm.
human_dataset = get_dataset('human', 'valid').batch(1).repeat()

In [27]:
def evaluate_model_all_sequences(model, dataset, head, max_steps=None):
    """Accumulate the Pearson correlation between model predictions and targets.

    For every batch, the 'sequence' entry is zero-padded to SEQUENCE_LENGHT with
    pad_one_hot, passed to model.predict_on_batch, and the output selected by
    `head` is compared with the batch's 'target' entry.

    Args:
        model: object whose predict_on_batch(inputs) returns a mapping from
            head name to predictions.
        dataset: iterable of dicts with 'sequence' and 'target' entries. The
            batch axis of 'sequence' is squeezed before padding, so batches
            must contain exactly one example.
        head: key of the prediction head to evaluate (e.g. 'human').
        max_steps: maximum number of batches to evaluate; None evaluates the
            whole dataset.

    Returns:
        The result of the MetricDict, with the single entry 'PearsonR'
        (PearsonR with reduce_axis=(0, 1)).
    """
    metric = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    print("Metric dictionary created")

    # Deliberately not decorated with @tf.function: the padding is done in NumPy
    # and needs x.numpy(), which is only available on eager tensors.
    def predict(x):
        sequence = np.squeeze(x.numpy(), axis=0)
        padded_sequence = pad_one_hot(sequence, SEQUENCE_LENGHT)[np.newaxis]
        # Cast explicitly so the model receives float32 whatever pad_one_hot returns.
        padded_sequence = padded_sequence.astype(np.float32)
        return model.predict_on_batch(padded_sequence)[head]
    print("Predict funciton loaded")

    # tqdm reports progress; batches are not printed because each one holds
    # full sequence and target tensors.
    for i, batch in tqdm(enumerate(dataset)):
        # i is zero-based, so stopping at i >= max_steps evaluates exactly max_steps batches.
        if max_steps is not None and i >= max_steps:
            break
        metric.update_state(batch['target'], predict(batch['sequence']))

    return metric.result()

In [194]:
# Run the prediction for the current batch and display the returned array.
# NOTE(review): in the visible notebook `predict` only exists as a nested function inside
# evaluate_model_all_sequences, so this cell presumably relies on a definition from a cell
# that is no longer present — confirm it runs on a fresh kernel.
predict(batch["sequence"])

array([[[0.0829835 , 0.06613109, 0.0496952 , ..., 0.00328913,
         0.0118176 , 0.00945621],
        [0.08675303, 0.06505437, 0.045567  , ..., 0.00316417,
         0.01133935, 0.00850564],
        [0.11366496, 0.07915953, 0.05737301, ..., 0.01192076,
         0.04589322, 0.04694396],
        ...,
        [0.27100125, 0.19733842, 0.10441186, ..., 0.00362802,
         0.0153653 , 0.00943798],
        [0.24782476, 0.2126574 , 0.10856232, ..., 0.00323784,
         0.01595465, 0.01030887],
        [0.2845043 , 0.27189302, 0.13506007, ..., 0.0034502 ,
         0.01703565, 0.01312567]]], dtype=float32)

In [19]:
# Evaluate the model on the ('human', 'valid') dataset, one sequence per batch.
# max_steps limits the run to the first few batches so that the cell stays cheap.
metrics_human = evaluate_model_all_sequences(model,
                               dataset=get_dataset('human', 'valid').batch(1).prefetch(2),
                               head='human',
                               max_steps=1)
print('')
# Collapse each metric to its mean for a compact printout.
print({k: v.numpy().mean() for k, v in metrics_human.items()})

Metric dictionary created
Predict funciton loaded


0it [00:07, ?it/s]


AttributeError: in user code:

    File "/tmp/ipykernel_3930890/2071429785.py", line 8, in predict  *
        sequence_one_hot = one_hot_encode(fasta_extractor.extract(x.resize(393216)))

    AttributeError: 'Tensor' object has no attribute 'resize'


#### Distributions of pearson correlation coefficients per assay 

In [None]:
# Pair every Pearson value with an assay type so the distribution can be plotted per assay.
# Assumes metrics_human["PearsonR"] is ordered like the rows of suppl_df (TODO check).
# NOTE(review): suppl_df holds the human rows followed by the mouse rows and zip() stops at
# the shorter input — confirm that only human rows end up paired with the human metrics.
assay_list = list(suppl_df["assay_type"])
pearson_per_assay = list(metrics_human["PearsonR"].numpy())
data_tuples = list(zip(assay_list, pearson_per_assay))
df_pearson_assay = pd.DataFrame(data_tuples, columns=['assay', 'pearson'])
# Enforce the column dtypes on the new frame before aliasing it as `df`,
# the name the plotting cell reads.
df_pearson_assay = df_pearson_assay.astype({"assay": str, "pearson": float})
df = df_pearson_assay

In [None]:
# Ridge plot: one density of the Pearson values per assay, stacked vertically.
# Transparent axes background so that the stacked rows can overlap.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Initialize the FacetGrid object
g = sns.FacetGrid(df, row="assay", hue="assay", aspect=15,  height = .5, palette="mako")
# Draw the densities in a few steps
# First the filled density, then a white outline drawn on top of it.
g.map(sns.kdeplot, "pearson",
      bw_adjust=.5, clip_on=False,
      fill=True, alpha=1, linewidth=1.5)
g.map(sns.kdeplot, "pearson", clip_on=False, color="w", lw=2, bw_adjust=.5)
# Baseline at y=0 for every row.
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)

# NOTE(review): `label` is not defined in the visible notebook; presumably a plotting helper
# provided by a star-import or an earlier cell — confirm it exists on a fresh kernel.
g.map(label, "pearson")
# Remove the vertical gap between rows and strip titles, y ticks and spines.
g.figure.subplots_adjust(hspace=0)
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)
# The x axis is limited to [0, 1]; negative correlations would fall outside the plot.
g.set(xlim = (0,1))

# Tests

In [79]:
# Test setup: this uses the ('mouse', 'train') dataset, not the human validation data used above.
# The dataset is repeated, so the loop that consumes it has to stop via max_steps.
dataset = get_dataset('mouse', 'train').batch(1).repeat()
max_steps = 1

In [80]:
# Pull a few batches from `dataset` and print them for inspection.
for i, batch in tqdm(enumerate(dataset)):
        # Assigned before the stop check, so after the loop mybatch holds the last
        # batch that was fetched, including the one that triggered the break.
        mybatch = batch 
        # i is zero-based: batches 0..max_steps are printed before the loop stops.
        if max_steps is not None and i > max_steps:
            break
        print(i)
        print(batch)

1it [00:16, 16.11s/it]

0
{'sequence': <tf.Tensor: shape=(1, 131072, 4), dtype=float32, numpy=
array([[[0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        ...,
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.]]], dtype=float32)>, 'target': <tf.Tensor: shape=(1, 896, 1643), dtype=float32, numpy=
array([[[0.00774384, 0.06262207, 0.0579834 , ..., 1.1318359 ,
         0.        , 0.6035156 ],
        [0.02897644, 0.04907227, 0.12213135, ..., 0.00302887,
         1.9921875 , 0.09564209],
        [0.05709839, 0.08172607, 0.07495117, ..., 0.9848633 ,
         0.        , 0.        ],
        ...,
        [0.0199585 , 0.0397644 , 0.01802063, ..., 0.        ,
         0.        , 0.        ],
        [0.01231384, 0.0163269 , 0.01808167, ..., 0.        ,
         0.        , 0.        ],
        [0.0050621 , 0.06262207, 0.01885986, ..., 0.        ,
         0.        , 0.        ]]], dtype=float32)>}
1
{'sequence': <tf.Tensor: shape=(1, 131072, 4), dtype=float32, num

2it [00:16,  8.24s/it]


In [121]:
# Display the batch kept from the loop above ('sequence' and 'target' tensors).
mybatch

{'sequence': <tf.Tensor: shape=(1, 131072, 4), dtype=float32, numpy=
 array([[[0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         ...,
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.]]], dtype=float32)>,
 'target': <tf.Tensor: shape=(1, 896, 1643), dtype=float32, numpy=
 array([[[0.00814056, 0.02476501, 0.06027222, ..., 0.        ,
          0.9165039 , 1.0214844 ],
         [0.03747559, 0.03271484, 0.0949707 , ..., 1.4677734 ,
          0.6796875 , 1.9335938 ],
         [0.05706787, 0.07159424, 0.11590576, ..., 0.        ,
          1.0380859 , 1.4941406 ],
         ...,
         [0.05126953, 0.05950928, 0.07971191, ..., 0.59228516,
          0.26757812, 0.        ],
         [0.04214478, 0.11450195, 0.12561035, ..., 0.97314453,
          1.3828125 , 0.98339844],
         [0.17272949, 0.14013672, 0.07116699, ..., 0.        ,
          0.        , 0.99658203]]], dtype=float32)>}

In [None]:
# Shape of the target tensor of the kept batch.
mybatch["target"].numpy().shape

### Retrieve real values matched with sequences

In [123]:
# Display the kept batch again as the starting point for matching targets to sequences.
mybatch

{'sequence': <tf.Tensor: shape=(1, 131072, 4), dtype=float32, numpy=
 array([[[0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         ...,
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.]]], dtype=float32)>,
 'target': <tf.Tensor: shape=(1, 896, 1643), dtype=float32, numpy=
 array([[[0.00814056, 0.02476501, 0.06027222, ..., 0.        ,
          0.9165039 , 1.0214844 ],
         [0.03747559, 0.03271484, 0.0949707 , ..., 1.4677734 ,
          0.6796875 , 1.9335938 ],
         [0.05706787, 0.07159424, 0.11590576, ..., 0.        ,
          1.0380859 , 1.4941406 ],
         ...,
         [0.05126953, 0.05950928, 0.07971191, ..., 0.59228516,
          0.26757812, 0.        ],
         [0.04214478, 0.11450195, 0.12561035, ..., 0.97314453,
          1.3828125 , 0.98339844],
         [0.17272949, 0.14013672, 0.07116699, ..., 0.        ,
          0.        , 0.99658203]]], dtype=float32)>}

In [124]:
# Path of the BED file with the human sequences, as configured in the "Load files" section.
human_sequences

'../../../../data/FED/data_human_sequences.bed'