# Enformer human validation 

### Load  pre-trained model 

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import joblib
import gzip
import kipoiseq
from kipoiseq import Interval
import pyfaidx
import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import os
import enformer 
from tqdm import tqdm
import importlib.util
import inspect
from typing import Any, Callable, Dict, Optional, Text, Union, Iterable
import attention_module
import numpy as np
import sonnet as snt
import h5py


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

2022-01-28 11:56:45.833117: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-28 11:56:45.833139: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
# Load the local utils.py as a module object and expose its public names.
# The spec is named "utils" (it used to be "enformer", copied from the
# enformer.py loader) so the module's __name__ matches the file it comes from.
spec_utils = importlib.util.spec_from_file_location(
    "utils", os.path.join(os.getcwd(), "utils.py")
)
utils = importlib.util.module_from_spec(spec_utils)
spec_utils.loader.exec_module(utils)
# NOTE(review): the module object created above is not registered in
# sys.modules, so this star-import resolves utils.py through the regular
# import path rather than reusing that object.
from utils import *

### Load files

In [None]:
# Remote locations: a pickled transform on GCS and the TF-Hub model URL.
transform_path = 'gs://dm-enformer/models/enformer.finetuned.SAD.robustscaler-PCA500-robustscaler.transform.pkl'
model_path = 'https://tfhub.dev/deepmind/enformer/1'
# Local layout: every input and output path below is derived from `datadir`.
datadir = "../../../../data/FED"
outputdir = os.path.join(datadir, "hd5")
fasta_file = os.path.join(datadir, "hg38.fa")
human_sequences = os.path.join(datadir, "data_human_sequences.bed")
# Open the FASTA through pyfaidx; the returned object is not kept.
# NOTE(review): presumably run only to make sure the .fai index exists - confirm.
pyfaidx.Faidx(fasta_file)

In [None]:
model = Enformer(model_path)

In [None]:
# Load the local enformer.py (from the current working directory) as a module
# object, then star-import its public names into the notebook namespace.
spec = importlib.util.spec_from_file_location("enformer", os.path.join(os.getcwd() ,"enformer.py"))
# Rebinds the name `enformer`, which the import cell already bound via `import enformer`.
enformer = importlib.util.module_from_spec(spec)
spec.loader.exec_module(enformer)
from enformer import * 

In [None]:
fasta_extractor = FastaStringExtractor(fasta_file)

### Check tracks

In [None]:
# Download targets from Basenji2 dataset 
# Cite: Kelley et al Cross-species regulatory sequence activity prediction. PLoS Comput. Biol. 16, e1008050 (2020).
targets_txt = 'https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt'
df_targets = pd.read_csv(targets_txt, sep='\t')
df_targets

In [None]:
# Read the supplementary workbook and print its sheet names for inspection.
suppl = pd.ExcelFile(os.path.join(datadir, "enformer_suppl.xlsx"))
print(suppl.sheet_names)
# NOTE(review): sheets 1 and 2 are assumed to be the human and mouse tables -
# confirm against the printed sheet names.
suppl_human = suppl.parse(suppl.sheet_names[1])
suppl_mouse = suppl.parse(suppl.sheet_names[2])
# Tag every row with its organism so the two tables can be told apart after stacking.
suppl_human["organism"] = "human"
suppl_mouse["organism"] = "mouse"
frames = [suppl_human, suppl_mouse]
# Human rows come first, mouse rows second; each table keeps its own row index.
suppl_df = pd.concat(frames)

## Example predict one sequence

In [None]:
def one_hot_encode(sequence):
    """One-hot encode a DNA string with kipoiseq and return it as float32."""
    encoded = kipoiseq.transforms.functional.one_hot_dna(sequence)
    return encoded.astype(np.float32)

SEQUENCE_LENGHT = 393_216

In [None]:
## pad the sequence with Ns (anyways ignored by the model)
def pad_one_hot(sequence_one_hot, NEW_SIZE):
    """Centre a one-hot sequence inside `NEW_SIZE` rows by adding all-zero rows.

    Args:
      sequence_one_hot: 2-D array of shape (length, channels).
      NEW_SIZE: number of rows of the returned array.

    Returns:
      A float64 array of shape (NEW_SIZE, channels) with the input in the
      middle. When the number of missing rows is odd, the extra zero row goes
      on the right so the result is always exactly NEW_SIZE rows long.

    Raises:
      ValueError: if the input already has more than NEW_SIZE rows.
    """
    current_size, n_channels = sequence_one_hot.shape
    missing = NEW_SIZE - current_size
    if missing < 0:
        raise ValueError(
            f"sequence has {current_size} rows, cannot pad to NEW_SIZE={NEW_SIZE}"
        )
    left = missing // 2
    # Not simply `left` again: an odd `missing` would leave the result one row short.
    right = missing - left
    return np.concatenate(
        [
            np.zeros((left, n_channels)),
            sequence_one_hot,
            np.zeros((right, n_channels)),
        ],
        axis=0,
    )

### compute score (how well predicted)

In [None]:
# TODO
# 1 - retrieve the 197k sequence instead of the 131k one

In [None]:
# Human validation split, one example per batch. `.repeat()` without a count
# makes the dataset endless, so every loop over it needs its own stop condition.
human_dataset = get_dataset('human', 'valid').batch(1).repeat()

In [None]:
def evaluate_model_all_sequences(model, dataset, head, max_steps=None):
    """Score model predictions against targets with a PearsonR metric.

    Args:
      model: object whose `predict_on_batch(inputs)` returns a mapping indexed
        by head name.
      dataset: iterable of batches; each batch is a mapping with 'sequence' and
        'target' entries. 'sequence' must expose `.numpy()` and carry a leading
        batch axis of size 1 (it is squeezed before padding).
      head: key of the model output to score.
      max_steps: maximum number of batches to score; None scores every batch.

    Returns:
      The result of the MetricDict holding a PearsonR reduced over axes (0, 1).
    """
    metric = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    print("Metric dictionary created")

    def predict(x):
        # Pad the single sequence to SEQUENCE_LENGHT rows, then restore the batch axis.
        print("Beginning prediction")
        padded_sequence = pad_one_hot(np.squeeze(x.numpy(), axis=0), SEQUENCE_LENGHT)[np.newaxis]
        predictions = model.predict_on_batch(padded_sequence)[head]
        return tf.convert_to_tensor(predictions, dtype=tf.float32)
    print("Predict funciton loaded")

    for i, batch in tqdm(enumerate(dataset)):
        # `>=` so that exactly `max_steps` batches are scored; the previous `>`
        # let one extra batch through.
        if max_steps is not None and i >= max_steps:
            break
        prediction = predict(batch['sequence'])
        metric.update_state(batch['target'], prediction)

    return metric.result()

In [None]:
# Smoke test: score the human head on the first few validation batches only
# (bounded by max_steps) instead of the whole validation split.
metrics_human = evaluate_model_all_sequences(model,
                               dataset=get_dataset('human', 'valid').batch(1).prefetch(2),
                               head='human',
                               max_steps=2)
#print('')dataset_197k
#print({k: v.numpy().mean() for k, v in metrics_human.items()})

In [None]:
def evaluate_model_all_sequences(model, sequence_dict, head, max_steps=None):
    """Score model predictions against targets with a PearsonR metric.

    Args:
      model: object whose `predict_on_batch(inputs)` returns a mapping indexed
        by head name.
      sequence_dict: either one batch (a dict with 'sequence' and 'target'
        entries) or an iterable of such batches, e.g. the `dataset_197k` list.
        'sequence' must expose `.numpy()` and carry a leading batch axis of
        size 1.
      head: key of the model output to score.
      max_steps: maximum number of batches to score; None scores every batch.

    Returns:
      The result of the MetricDict holding a PearsonR reduced over axes (0, 1).
    """
    metric = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    print("Metric dictionary created")

    def predict(x):
        # Pad the single sequence to SEQUENCE_LENGHT rows, then restore the batch axis.
        print("Beginning prediction")
        padded_sequence = pad_one_hot(np.squeeze(x.numpy(), axis=0), SEQUENCE_LENGHT)[np.newaxis]
        predictions = model.predict_on_batch(padded_sequence)[head]
        return tf.convert_to_tensor(predictions, dtype=tf.float32)
    print("Predict funciton loaded")

    # The old loop walked the container's keys but always indexed the container
    # itself, so it could never advance through a list of batches. A lone batch
    # dict is still accepted and treated as a one-element collection.
    batches = [sequence_dict] if isinstance(sequence_dict, dict) else sequence_dict

    for i, batch in enumerate(batches):
        # `>=` so that exactly `max_steps` batches are scored.
        if max_steps is not None and i >= max_steps:
            break
        prediction = predict(batch['sequence'])
        metric.update_state(batch['target'], prediction)

    return metric.result()



# The keyword must match the parameter name of the definition above
# (`sequence_dict`); `dataset=` raised a TypeError.
metrics_human = evaluate_model_all_sequences(model,
                               sequence_dict=dataset_197k,
                               head='human',
                               max_steps=2)
#print('')dataset_197k

In [None]:
dataset_197k

In [None]:
metrics_human

#### Distributions of Pearson correlation coefficients per assay

In [1]:
# Pair each human PearsonR value with the assay type of its track.
# Assumes the human rows of the supplementary table are in the same order as
# the values in metrics_human["PearsonR"] (TODO check).
# Only the human rows are used: `suppl_df` also holds the mouse rows, and the
# previous version relied on zip() silently dropping them.
human_rows = suppl_df[suppl_df["organism"] == "human"]
assay_list = list(human_rows["assay_type"])
pearson_per_assay = list(metrics_human["PearsonR"].numpy())
if len(assay_list) != len(pearson_per_assay):
    # Fail loudly instead of pairing values with the wrong assays.
    raise ValueError(
        f"{len(assay_list)} human assay rows but "
        f"{len(pearson_per_assay)} PearsonR values"
    )
data_tuples = list(zip(assay_list, pearson_per_assay))
df_pearson_assay = pd.DataFrame(data_tuples, columns=['assay','pearson'])
df = df_pearson_assay.astype({"assay": str, "pearson": float})

NameError: name 'suppl_df' is not defined

In [None]:
df.groupby("assay").mean()

In [None]:
# Ridge plot: one KDE row per assay of the `pearson` column of `df`, with the
# rows overlapping vertically. Axes get a transparent background for the overlap.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Initialize 
g = sns.FacetGrid(df, row="assay", hue="assay", aspect=15, height=1, palette="mako")

# Draw the densities in a few steps
g.map(sns.kdeplot, "pearson",
      bw_adjust=.5, clip_on=False,
      fill=True, alpha=1, linewidth=1.5)
# Second pass: white outline drawn over the filled density
g.map(sns.kdeplot, "pearson", clip_on=False, color="w", lw=2, bw_adjust=.5)

# passing color=None to refline() uses the hue mapping
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)

# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    """Write `label` in bold, in `color`, at the left of the current axes."""
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)

g.map(label, "pearson")

# Set the subplots to overlap
g.figure.subplots_adjust(hspace=-.3)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

# Tests

## Check if the sequences are in order

In [None]:
# Load the tab-separated sequence table (no header row; columns are addressed
# by position: 0, 1, 2 feed the interval, 3 holds the split name).
df = pd.read_csv(human_sequences, memory_map=True, header=None, index_col=False, delimiter="\t")
# keep only validation intervals
validation_intervals = df[df[3] == "valid"]
#validation_intervals = validation_intervals.head()
# Build one kipoiseq.Interval per validation row. A comprehension replaces the
# earlier DataFrame.apply call that was used only for its list.append side
# effect (and left a column of None as the cell output).
interval_list = [
    kipoiseq.Interval(chrom, int(start), int(end))
    for chrom, start, end in zip(
        validation_intervals[0], validation_intervals[1], validation_intervals[2]
    )
]


In [None]:
# Create dictionary for search (can be improved! quite slow)
# Maps every validation interval to the one-hot encoding of the sequence that
# `fasta_extractor` returns for it; one extraction and encoding per interval.
human_validation_dict = {}
for interval in interval_list: 
    sequence = one_hot_encode(fasta_extractor.extract(interval))
    human_validation_dict[interval] = sequence

In [None]:
# Step 1
import pickle

enformer_dict_file = os.path.join(outputdir,'00_enformer_dict_seqs.h5')
# Make sure the target directory exists before opening the file for writing.
os.makedirs(outputdir, exist_ok=True)
# Step 2
# NOTE: despite the .h5 suffix this file is a pickle, not an HDF5 file.
with open(enformer_dict_file, 'wb') as config_dictionary_file:
    pickle.dump(human_validation_dict, config_dictionary_file)
    
# -------- read -------
# SECURITY: pickle.load executes arbitrary code from the file; only load files
# this notebook wrote itself.
with open(enformer_dict_file, 'rb') as config_dictionary_file:
    config_dictionary = pickle.load(config_dictionary_file)

print(config_dictionary)

In [None]:
def get_interval_from_sequence(sequence, human_validation_dict=human_validation_dict):
    """Return the interval whose stored one-hot array matches `sequence`.

    Args:
      sequence: one-hot query, anything `np.asarray` accepts (for example an
        eager tensor). A leading batch axis of size 1 is dropped.
      human_validation_dict: mapping from interval to one-hot array.

    Returns:
      The first key whose array has the query's shape and is `np.allclose` to
      it, or None when nothing matches.
    """
    query = np.asarray(sequence)
    if query.ndim == 3 and query.shape[0] == 1:
        query = query[0]
    # The previous loop reused the name `sequence` for the stored arrays (hiding
    # the argument) and compared against the global `first_dataset_entry`, so
    # the argument was never looked at.
    for interval, candidate in human_validation_dict.items():
        if np.shape(candidate) == query.shape and np.allclose(candidate, query):
            return interval
    return None

In [None]:
## Create new dataset
dataset_197k = []
NEW_SEQUENCE_LENGTH = 196_608
max_steps = 10

for i, batch in tqdm(enumerate(human_dataset)):
    # `human_dataset` repeats forever, so stop before doing any work once
    # `max_steps` entries exist. The old check ran after the append with `>`
    # and therefore stored max_steps + 2 entries.
    if max_steps is not None and i >= max_steps:
        break
    batch_197k = {}
    # 1 from the sequence 131k get the sequence 197k
    interval_test = get_interval_from_sequence(batch["sequence"])
    if interval_test is None:
        # Without this guard the failure surfaced as an AttributeError on `.resize`.
        raise ValueError(f"no validation interval matches dataset entry {i}")
    sequence_197k = one_hot_encode(fasta_extractor.extract(interval_test.resize(NEW_SEQUENCE_LENGTH)))
    batch_197k["sequence"] = tf.Variable(sequence_197k[np.newaxis])

    # add same real targets
    batch_197k["target"] = batch["target"]
    dataset_197k.append(batch_197k)


# ------ Save
# NOTE: despite the .h5 suffix this file is a pickle, not an HDF5 file.
file = os.path.join(outputdir,'new_dataset_197k_valid.h5')
# Step 2
with open(file, 'wb') as config_dictionary_file:
    pickle.dump(dataset_197k, config_dictionary_file)

In [None]:
# Reload the pickled list of batches from `file`.
# SECURITY: pickle.load runs code from the file; only load files you created.
with open(file, 'rb') as config_dictionary_file:
    dataset_197k = pickle.load(config_dictionary_file)

# Walk the list once; after the loop `mybatch` is bound to its last entry.
for i, batch in enumerate(dataset_197k): 
    mybatch = batch 

In [None]:
# try predictions
def evaluate_model_all_sequences_mod(model, dataset_list, head, max_steps=None):
    """Score model predictions over a list of batches with a PearsonR metric.

    Args:
      model: object whose `predict_on_batch(inputs)` returns a mapping indexed
        by head name.
      dataset_list: iterable of batches; each batch is a mapping with
        'sequence' and 'target' entries. 'sequence' must expose `.numpy()` and
        carry a leading batch axis of size 1.
      head: key of the model output to score.
      max_steps: maximum number of batches to score; None scores every batch.

    Returns:
      The result of the MetricDict holding a PearsonR reduced over axes (0, 1).
    """
    metric = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    print("Metric dictionary created")

    def predict(x):
        # Pad the single sequence to SEQUENCE_LENGHT rows, then restore the batch axis.
        padded_sequence = pad_one_hot(np.squeeze(x.numpy(), axis=0), SEQUENCE_LENGHT)[np.newaxis]
        predictions = model.predict_on_batch(padded_sequence)[head]
        return tf.convert_to_tensor(predictions, dtype=tf.float32)

    # Iterate the argument (the old loop read the global `dataset_197k`) and
    # index the current batch (the old body used the undefined `sequence_dict`).
    for i, batch in enumerate(dataset_list):
        # `>=` so that exactly `max_steps` batches are scored.
        if max_steps is not None and i >= max_steps:
            break
        prediction = predict(batch['sequence'])
        metric.update_state(batch['target'], prediction)

    return metric.result()



# Call the function defined in this cell, using its own parameter name.
metrics_human = evaluate_model_all_sequences_mod(model,
                               dataset_list=dataset_197k,
                               head='human',
                               max_steps=2)
#print('')dataset_197k

In [None]:
# Score the human head on the re-extracted `dataset_197k`, bounded by max_steps.
# NOTE(review): the `dataset=` keyword is only accepted by the definition of
# evaluate_model_all_sequences whose second parameter is named `dataset`;
# confirm which definition is live when this cell runs.
metrics_human = evaluate_model_all_sequences(model,
                               dataset=dataset_197k,
                               head='human',
                               max_steps=2)

## Create new TF record

In [None]:
# Update metadata: start from the human metadata and overwrite its
# 'seq_length' entry with NEW_SEQUENCE_LENGTH.
metadata_human_197k = get_metadata("human")
metadata_human_197k['seq_length'] = NEW_SEQUENCE_LENGTH

In [None]:
def predict(x, head):
    """Pad `x` to SEQUENCE_LENGHT rows, run the global `model`, return output `head`.

    The padded sequence gets a leading batch axis before `predict_on_batch`;
    the selected output is returned as a float32 tensor.
    """
    batched_input = np.expand_dims(pad_one_hot(x, SEQUENCE_LENGHT), axis=0)
    head_output = model.predict_on_batch(batched_input)[head]
    return tf.convert_to_tensor(head_output, dtype=tf.float32)

In [None]:
# Test entry: display one dataset entry and predict its human-head output.
# NOTE(review): `first_dataset_entry` is not assigned anywhere in the visible
# notebook - it must be defined before this cell can run.
#interval_test = get_interval_from_sequence(first_dataset_entry)
first_dataset_entry
target_one = predict(first_dataset_entry, "human")

In [None]:
# Boolean feature, encoded as False or True.
n_observations = 10
feature0 = np.random.choice([False, True], n_observations)
#feature0 =  (first_dataset_entry)
# Integer feature, random from 0 to 4.
feature1 = np.random.randint(0, 5, n_observations)
#feature1 = target_one

In [None]:
feature0

In [None]:
# Build a dataset whose elements are tuples sliced from the feature arrays.
# NOTE(review): `feature2` and `feature3` are not defined in the visible
# notebook, so this cell raises NameError as written.
features_dataset = tf.data.Dataset.from_tensor_slices((feature0, feature1, feature2, feature3))
features_dataset

In [None]:
features_dataset = tf.data.Dataset.from_tensor_slices((feature0, feature0))
features_dataset

In [None]:
# Write the `tf.train.Example` observations to the file.
tfrrecord_file = os.path.join(datadir, "tfr/validation_196k.tfr")
n_observations = 1
# Write with ZLIB compression: the TFRecordDataset readers in this notebook are
# opened with compression_type='ZLIB', and an uncompressed file cannot be read
# back through them.
with tf.io.TFRecordWriter(tfrrecord_file, options='ZLIB') as writer:
    for i in range(n_observations):
        # Each feature is stored as the raw bytes of one array element.
        example = serialize_example(feature0[i].tobytes(), feature1[i].tobytes())
        writer.write(example)

In [None]:
def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in an int64_list Feature."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
    """Wrap a single string / bytes value in a bytes_list Feature."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList cannot unpack a string held by an EagerTensor, so extract it first.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    wrapped = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=wrapped)


def serialize_example(feature0, feature1):
    """Serialize two byte strings as a tf.train.Example.

    `feature0` is stored under 'sequence' and `feature1` under 'target', both
    as bytes features. Returns the serialized protocol buffer.
    """
    features = tf.train.Features(feature={
        'sequence': _bytes_feature(feature0),
        'target': _bytes_feature(feature1),
    })
    return tf.train.Example(features=features).SerializeToString()

def deserialize(serialized_example, metadata):
    """Decode bytes stored in a TFRecord file into float32 tensors.

    Args:
      serialized_example: serialized example holding string features
        'sequence' and 'target'.
      metadata: mapping providing 'seq_length', 'target_length' and
        'num_targets', used to reshape the decoded buffers.

    Returns:
      A dict with 'sequence' of shape (seq_length, 4) and 'target' of shape
      (target_length, num_targets), both cast to float32.
    """
    schema = {
        name: tf.io.FixedLenFeature([], tf.string)
        for name in ('sequence', 'target')
    }
    parsed = tf.io.parse_example(serialized_example, schema)

    # The sequence buffer is decoded as bool, one value per (position, channel).
    seq_flat = tf.io.decode_raw(parsed['sequence'], tf.bool)
    seq_2d = tf.reshape(seq_flat, (metadata['seq_length'], 4))
    seq_float = tf.cast(seq_2d, tf.float32)

    # The target buffer is decoded as float16 and widened to float32.
    target_flat = tf.io.decode_raw(parsed['target'], tf.float16)
    target_2d = tf.reshape(
        target_flat, (metadata['target_length'], metadata['num_targets']))
    target_float = tf.cast(target_2d, tf.float32)

    return {'sequence': seq_float, 'target': target_float}

## Test get_dataset 

In [None]:
# Open the TFRecord file at `tfrrecord_file` for reading.
# The reader expects ZLIB-compressed records, so the file must have been
# written with the same compression.
dataset = tf.data.TFRecordDataset(tfrrecord_file,
                                        compression_type='ZLIB',
                                        num_parallel_reads=8)

In [None]:
# Decode every record with `deserialize`, bound to the 197k metadata.
# NOTE(review): `functools` is not imported in the notebook's import cell;
# presumably it arrives through one of the star-imports - confirm.
dataset = dataset.map(functools.partial(deserialize, metadata=metadata_human_197k),
                            num_parallel_calls=8)

In [None]:
human_dataset

In [None]:
d = dataset.batch(1).prefetch(2)
d

In [3]:
batch["sequence"]

NameError: name 'batch' is not defined

In [6]:
# Pull only the first batch of `human_dataset` into `mybatch`, then stop.
for i, batch in tqdm(enumerate(human_dataset)):
    print(i)
    mybatch = batch
    break

NameError: name 'human_dataset' is not defined

In [None]:
# Check that `dataset` can be iterated: print the index of each element read.
# With `i > max_steps` the loop prints indices 0 through max_steps inclusive.
max_steps = 1
for i, batch in tqdm(enumerate(dataset)):
    if max_steps is not None and i > max_steps:
        break
    print(i)

In [None]:
def get_dataset(organism, subset, num_threads=8):
    """Build a tf.data pipeline of deserialized examples for one organism/subset.

    Args:
      organism: passed to `get_metadata` and `tfrecord_files`.
      subset: passed to `tfrecord_files` to select the record files.
      num_threads: used both as the number of parallel file reads and as the
        number of parallel `deserialize` calls.

    Returns:
      A tf.data.Dataset of the dicts produced by `deserialize` (unbatched).
    """
    # Imported here because the notebook's import cell does not import
    # functools; this keeps the function usable on its own.
    import functools

    metadata = get_metadata(organism)

    # Records are read with ZLIB decompression.
    dataset = tf.data.TFRecordDataset(tfrecord_files(organism, subset),
                                        compression_type='ZLIB',
                                        num_parallel_reads=num_threads)
    dataset = dataset.map(functools.partial(deserialize, metadata=metadata),
                            num_parallel_calls=num_threads)
    return dataset

