# Enformer human validation (smaller dataset)

## Evaluate sequence-wise

In [None]:
import functools
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Root of the FED data tree, given relative to the notebook's working directory.
datadir = "../../../../data/FED"
# Sub-folder holding the pickled intermediate results that the next cells read.
outputdir = os.path.join(datadir, "hd5")

In [None]:
# Load the per-sequence evaluation results (indexed by sequence number further below).
# NOTE(review): despite the ".h5" suffix this file is read with pickle, not HDF5.
# pickle.load can execute arbitrary code -- only open files produced by this project.
file = os.path.join(outputdir,'dataset_197k_evaluation_50.h5')
with open(file, 'rb') as config_dictionary_file:
    dataset_197k_evaluation = pickle.load(config_dictionary_file)

In [None]:
# Load the summarized metrics object.
# NOTE(review): the ".h5" suffix is misleading, the file is a pickle; its structure is
# not visible in this notebook. Only unpickle files produced by this project.
file = os.path.join(outputdir,'summarized_metrics.h5')
with open(file, 'rb') as config_dictionary_file:
    summarized_metrics = pickle.load(config_dictionary_file)

In [None]:
# Download targets from the Basenji2 dataset (requires network access).
# Cite: Kelley et al Cross-species regulatory sequence activity prediction. PLoS Comput. Biol. 16, e1008050 (2020).
targets_txt = 'https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt'
df_targets = pd.read_csv(targets_txt, sep='\t')
# NOTE(review): a bare expression is only rendered when it is the last line of a cell;
# if this is mid-cell it displays nothing.
df_targets

# Read the Enformer supplementary workbook and tag each sheet with its organism.
suppl = pd.ExcelFile(os.path.join(datadir, "enformer_suppl.xlsx"))
print(suppl.sheet_names)
# Sheets are picked by position: assumes the 2nd sheet is human and the 3rd is mouse
# -- TODO confirm against the printed sheet names.
suppl_human = suppl.parse(suppl.sheet_names[1])
suppl_mouse = suppl.parse(suppl.sheet_names[2])
suppl_human["organism"] = "human"
suppl_mouse["organism"] = "mouse"
frames = [suppl_human, suppl_mouse]
# concat keeps each sheet's own row labels, so index values repeat across organisms.
suppl_df = pd.concat(frames)


# Persist the combined table (pickle format, despite the ".h5" suffix).
file = os.path.join(outputdir,'suppl_df.h5')
with open(file, 'wb') as config_dictionary_file:
    pickle.dump(suppl_df, config_dictionary_file)

# Plot sequences summary

In [None]:
# Select the human rows once, then pull out the two label columns in table order.
human_tracks = suppl_df[suppl_df["organism"] == "human"]
ordered_assays = human_tracks["assay_type"]       # short assay label per track
ordered_assays_full = human_tracks["target"]      # "target" column per track

### How many tracks per assay type? 

In [None]:
# Number of human tracks per assay type, counted on the non-null "index" column.
suppl_df.loc[suppl_df["organism"] == "human"].groupby("assay_type")[["index"]].count()

In [None]:
def get_sequence_evaluation_df(i, dataset_197k_evaluation, ordered_assays, ordered_assays_full=None):
    """Build a long-format table of per-track Pearson values for one sequence.

    Parameters
    ----------
    i
        Key/index of the sequence inside ``dataset_197k_evaluation``.
    dataset_197k_evaluation
        Indexable container; ``dataset_197k_evaluation[i]["PearsonR"]`` must hold
        one value per track, in the same order as ``ordered_assays``.
    ordered_assays
        Assay-type label of every track (list, array or Series).
    ordered_assays_full
        Full description of every track. When omitted, the notebook-level
        ``ordered_assays_full`` variable is used, as the original version did.

    Returns
    -------
    pandas.DataFrame
        Columns ``sequence``, ``assay``, ``full`` and ``pearson``, one row per track.

    Raises
    ------
    ValueError
        If the two label collections do not have the same length.
    """
    if ordered_assays_full is None:
        # Backward compatibility: fall back to the notebook global of the same name.
        ordered_assays_full = globals()["ordered_assays_full"]

    # Strip any pandas index so labels are matched to tracks by position. Assigning a
    # Series would align on its index labels and yield NaN whenever those labels are
    # not exactly 0..n-1 (e.g. after filtering or concatenating tables).
    assay_labels = np.asarray(ordered_assays)
    full_labels = np.asarray(ordered_assays_full)

    df = pd.DataFrame({
        "sequence": np.repeat(i, len(assay_labels)),
        "assay": assay_labels,
        "full": full_labels,
    })
    # Add pearson values
    df["pearson"] = dataset_197k_evaluation[i]["PearsonR"]
    return df

In [None]:
dataset_197k_evaluation

In [None]:
# Number of evaluated sequences to stack (the evaluation file name carries a "_50" suffix).
n_sequences = 50

# Build every per-sequence table first and concatenate once: growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration (quadratic) and
# starts from an empty frame, which newer pandas versions warn about.
per_sequence_frames = [
    get_sequence_evaluation_df(i, dataset_197k_evaluation, ordered_assays)
    for i in range(n_sequences)
]
final_df = pd.concat(per_sequence_frames)

In [None]:
# Keep only the rows whose assay label is exactly "DNASE".
df = final_df.loc[final_df["assay"] == "DNASE"]

In [None]:
# NOTE(review): `group` is neither one of the columns created above (sequence, assay,
# full, pearson) nor a DataFrame method, so this line is expected to raise
# AttributeError -- presumably an unfinished `groupby(...)`; confirm or remove.
df.group

In [None]:
# NOTE(review): looks like a leftover from the seaborn documentation example -- `tips`
# is not defined anywhere in the visible notebook, and "total_bill", "tip" and "time"
# are not columns of final_df. Confirm whether this cell can be removed.
sns.scatterplot(data=tips, x="total_bill", y="tip", hue="assay", style="time")


In [None]:
final_df

In [None]:
final_df[final_df["sequence"]==1]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="ticks", palette="pastel")

# Violin plot of the Pearson values in final_df, one violin per assay label.
sns.violinplot(x="assay", y="pearson",  palette="mako", data=final_df)

# Offset the axis spines and trim them to the tick range.
sns.despine(offset=10, trim=True)

In [None]:
# NOTE(review): seaborn example dataset (fetched over the network); `penguins` is not
# used anywhere else in the visible notebook -- presumably scratch code, confirm removal.
penguins = sns.load_dataset("penguins")

In [None]:
# NOTE(review): scratch cell unrelated to the analysis -- prints the current wall-clock
# time and its type; nothing else in the visible notebook reads `now`.
from datetime import datetime
now = datetime.now().time() # time object

print("now =", now)
print("type(now) =", type(now))

In [None]:
final_df

In [None]:
sns.ecdfplot(data=final_df, x="pearson")

In [None]:
# Load the previously saved validation dictionary.
# NOTE(review): the ".h5" file is a pickle; only unpickle files produced by this project,
# since pickle.load can execute arbitrary code.
enformer_dict_file = os.path.join(outputdir,'00_enformer_dict_seqs.h5')

with open(enformer_dict_file, 'rb') as config_dictionary_file:
    human_validation_dict = pickle.load(config_dictionary_file)

In [None]:
human_validation_dict

In [None]:
# ------------- OLD 

### PLOT: Distributions of pearson correlation coefficients per assay 

In [None]:
# Pair every human track's assay type with its Pearson value, by position.
# suppl_df holds human AND mouse rows, so restrict it to human before pairing; the
# original zip() over the full table would silently truncate or mislabel values.
# NOTE(review): still assumes metrics_human[0]["PearsonR"] follows the row order of the
# human supplementary sheet -- TODO check.
assay_list = list(suppl_df.loc[suppl_df["organism"] == "human", "assay_type"])
pearson_per_assay = list(metrics_human[0]["PearsonR"].numpy())
if len(assay_list) != len(pearson_per_assay):
    # Fail loudly instead of pairing labels with the wrong values.
    raise ValueError(
        "Number of human tracks (%d) does not match number of Pearson values (%d)"
        % (len(assay_list), len(pearson_per_assay))
    )
df_pearson_assay = pd.DataFrame({"assay": assay_list, "pearson": pearson_per_assay})
df = df_pearson_assay.astype({"assay": str, "pearson": float})
df

In [None]:
df.groupby("assay").mean()

In [None]:
df

sns.set_theme(style="ticks", palette="pastel")

# Violin plot of the Pearson values in df, one violin per assay label.
sns.violinplot(x="assay", y="pearson",  palette="mako", linewidth=1.5,
            data=df)
# Offset the axis spines and trim them to the tick range.
sns.despine(offset=10, trim=True)

In [None]:
# Ridge plot: one density row per assay label, rows overlapping vertically.
# Transparent axes background so the overlapping rows do not hide each other.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Initialize the grid: one row (and one hue) per assay
g = sns.FacetGrid(df, row="assay", hue="assay", aspect=15, height=1, palette="mako")

# Draw the densities in a few steps: filled curve first, then a white outline on top
g.map(sns.kdeplot, "pearson",
      bw_adjust=.5, clip_on=False,
      fill=True, alpha=1, linewidth=1.5)
g.map(sns.kdeplot, "pearson", clip_on=False, color="w", lw=2, bw_adjust=.5)

# passing color=None to refline() uses the hue mapping
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)

# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    """Write the facet's label in bold at the left of the current axes, in the given color.

    `x` is the data passed by FacetGrid.map and is not used.
    """
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)

g.map(label, "pearson")

# Set the subplots to overlap
g.figure.subplots_adjust(hspace=-.3)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

# OLD (do not delete) - prepare the dictionary of validation intervals

## Check if the sequences are in order

In [None]:
# Read the sequence table (tab-separated, no header: columns 0-2 are passed to Interval
# as chrom/start/end, column 3 is the split label).
# NOTE(review): `human_sequences` is not defined in the visible notebook -- it must be
# set before this cell runs.
df = pd.read_csv(human_sequences, memory_map=True, header=None, index_col=False, delimiter="\t")
# keep only validation intervals
validation_intervals = df[df[3] == "valid"]
#validation_intervals = validation_intervals.head()
# Build the interval list with a comprehension instead of calling DataFrame.apply purely
# for its append() side effect (which also produced a throw-away Series of None).
interval_list = [
    kipoiseq.Interval(chrom, start, end)
    for chrom, start, end in zip(validation_intervals[0],
                                 validation_intervals[1],
                                 validation_intervals[2])
]

In [None]:
# Create dictionary for search (can be improved! quite slow)
human_validation_dict = {}
for interval in interval_list: 
    sequence = one_hot_encode(fasta_extractor.extract(interval))
    human_validation_dict[interval] = sequence

In [None]:
# Step 1: pickle is used for both the write and the read-back below
import pickle

# Target file (pickle format, despite the ".h5" suffix)
enformer_dict_file = os.path.join(outputdir,'00_enformer_dict_seqs.h5')
# Step 2: write the interval -> sequence dictionary to disk
with open(enformer_dict_file, 'wb') as config_dictionary_file:
    pickle.dump(human_validation_dict, config_dictionary_file)
    
# -------- read -------
# Read the file back to check the round trip
with open(enformer_dict_file, 'rb') as config_dictionary_file:
    config_dictionary = pickle.load(config_dictionary_file)

# NOTE(review): prints the whole dictionary; consider printing only its length.
print(config_dictionary)

In [34]:
# Re-point the paths at the human Basenji folder and load the saved dictionary.
# NOTE(review): this rebinds `outputdir`, so cells run afterwards see the new folder
# rather than the "hd5" folder set at the top of the notebook.
datadir = "../../../../data/FED"
outputdir = os.path.join(datadir, "basenji/human/")
enformer_dict_file = os.path.join(outputdir,'00_enformer_dict_seqs_human.h5')
# -------- read -------
with open(enformer_dict_file, 'rb') as config_dictionary_file:
    config_dictionary = pickle.load(config_dictionary_file)

In [36]:
len(config_dictionary.keys())

2213

In [37]:
next(iter(config_dictionary))

Interval(chrom='chr6', start=165740202, end=165871274, name='', strand='.', ...)

In [38]:
config_dictionary[next(iter(config_dictionary))]


array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]], dtype=float32)

### Same with mouse

In [5]:
import pandas as pd
import kipoiseq
from kipoiseq import Interval
import pyfaidx
import numpy as np
import os 
import pickle

In [2]:
mouse_sequences = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/mouse/mouse_sequences.bed"

In [3]:
# NOTE(review): this section builds the mouse dictionary, yet the FASTA points at the
# human hg38 assembly -- presumably a mouse assembly is intended; confirm before reuse.
# Absolute, user-specific path: adjust when running on another machine.
fasta_file = "/home/luisasantus/Desktop/crg_cluster/data/FED/hg38.fa"

In [None]:
# Read the mouse sequence table (tab-separated, no header: columns 0-2 are passed to
# Interval as chrom/start/end, column 3 is the split label).
df = pd.read_csv(mouse_sequences, memory_map=True, header=None, index_col=False, delimiter="\t")
# keep only validation intervals
validation_intervals = df[df[3] == "valid"]
#validation_intervals = validation_intervals.head()
# Build the interval list with a comprehension instead of calling DataFrame.apply purely
# for its append() side effect (which also produced a throw-away Series of None).
interval_list = [
    kipoiseq.Interval(chrom, start, end)
    for chrom, start, end in zip(validation_intervals[0],
                                 validation_intervals[1],
                                 validation_intervals[2])
]

In [None]:
def one_hot_encode(sequence):
    """Return kipoiseq's one-hot encoding of a DNA string, cast to float32."""
    encoded = kipoiseq.transforms.functional.one_hot_dna(sequence)
    return encoded.astype(np.float32)

In [None]:
class FastaStringExtractor:
    """Extract upper-case sequence strings for genomic intervals from a FASTA file.

    Parts of an interval that fall outside the chromosome are returned as 'N'.
    """

    def __init__(self, fasta_file):
        """Open ``fasta_file`` with pyfaidx and record the length of every record."""
        self.fasta = pyfaidx.Fasta(fasta_file)
        sizes = {}
        for name, record in self.fasta.items():
            sizes[name] = len(record)
        self._chromosome_sizes = sizes

    def extract(self, interval: Interval, **kwargs) -> str:
        """Return the sequence covered by ``interval``, N-padded where it leaves the chromosome.

        Extra keyword arguments are accepted and ignored.
        """
        chrom_len = self._chromosome_sizes[interval.chrom]

        # Clip the requested window to [0, chromosome length].
        clipped = Interval(interval.chrom,
                           max(interval.start, 0),
                           min(interval.end, chrom_len),
                           )

        # pyfaidx wants a 1-based interval, hence start + 1.
        record = self.fasta.get_seq(clipped.chrom,
                                    clipped.start + 1,
                                    clipped.stop)
        bases = str(record.seq).upper()

        # Number of positions that were cut off on either side by the clipping.
        n_left = max(-interval.start, 0)
        n_right = max(interval.end - chrom_len, 0)
        return 'N' * n_left + bases + 'N' * n_right

    def close(self):
        """Close the underlying pyfaidx handle and return its result."""
        return self.fasta.close()


def get_metadata(metadata):
    """Open the file at path ``metadata`` through tf.io.gfile and return its parsed JSON."""
    with tf.io.gfile.GFile(metadata, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def get_dataset(tfr, metadata):
    """Build a tf.data dataset of deserialized examples from a ZLIB-compressed TFRecord.

    Parameters
    ----------
    tfr
        TFRecord file name(s), passed straight to ``tf.data.TFRecordDataset``.
    metadata
        Path of the JSON metadata file; its parsed content is forwarded to
        ``deserialize`` as the ``metadata`` keyword argument.

    Returns
    -------
    The mapped ``tf.data`` dataset.

    NOTE(review): ``deserialize`` is not defined in the visible part of the notebook and
    must be in scope before this function is called.
    """
    metadata = get_metadata(metadata)

    # Fix: the original read an undefined name `tfrecord` instead of the `tfr` parameter.
    dataset = tf.data.TFRecordDataset(tfr, compression_type='ZLIB')

    dataset = dataset.map(functools.partial(deserialize, metadata=metadata))

    return dataset



fasta_extractor = FastaStringExtractor(fasta_file)

In [None]:
# Create dictionary for search (can be improved! quite slow)
mouse_validation_dict = {}
for interval in interval_list: 
    sequence = one_hot_encode(fasta_extractor.extract(interval))
    mouse_validation_dict[interval] = sequence

In [None]:
# Absolute, user-specific output folder: adjust when running on another machine.
# NOTE(review): rebinds `outputdir` for every cell run after this one.
outputdir = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/mouse"

# Target file (pickle format, despite the ".h5" suffix)
enformer_dict_file = os.path.join(outputdir,'00_enformer_dict_seqs_mouse.h5')
# Step 2: write the interval -> sequence dictionary to disk
with open(enformer_dict_file, 'wb') as config_dictionary_file:
    pickle.dump(mouse_validation_dict, config_dictionary_file)
    

In [17]:
# Re-point the paths at the mouse Basenji folder (relative form) and load the saved dictionary.
# Only unpickle files produced by this project: pickle.load can execute arbitrary code.
datadir = "../../../../data/FED"
outputdir = os.path.join(datadir, "basenji/mouse/")
enformer_dict_file = os.path.join(outputdir,'00_enformer_dict_seqs_mouse.h5')
# -------- read -------
with open(enformer_dict_file, 'rb') as config_dictionary_file:
    config_dictionary = pickle.load(config_dictionary_file)

In [33]:
config_dictionary[next(iter(config_dictionary))].shape


(131072, 4)

## Quick test 

In [39]:
# Pickled predictions for one validation batch (absolute, user-specific path).
file = "/home/luisasantus/Desktop/crg_cluster/data/FED/enformer/human/pred_standard/valid-0-6_197k_pred.pkl"
with open(file, 'rb') as pred_file:
    pred = pickle.load(pred_file)

2022-02-07 15:51:55.270635: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-07 15:51:55.270655: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-02-07 15:51:57.701050: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-02-07 15:51:57.701084: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-02-07 15:51:57.701099: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (luisasantus-HP-EliteDesk-800-G5-TWR): /proc/driver/nvidia/version does not exist
2022-02-07 15:51:57.701475: I tensorflow/

In [43]:
pred[1]

{'sequence': <tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
 array([[[1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         ...,
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.]]], dtype=float32)>,
 'target': <tf.Tensor: shape=(896, 5313), dtype=float32, numpy=
 array([[0.0043602 , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00084496, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00675964, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.07727051, 0.03848267, 0.02961731, ..., 0.07946777, 0.16455078,
         0.1340332 ],
        [0.0302887 , 0.05227661, 0.0186615 , ..., 0.        , 0.81933594,
         0.        ],
        [0.02532959, 0.01405334, 0.00772476, ..., 0.        , 0.14233398,
         0.22802734]], dtype=float32)>,
 'interval': Interval(chrom='chrX', start=24088313, end=24219385,

In [107]:
# Load a per-track Pearson summary table exported as CSV (absolute, user-specific path).
# NOTE(review): this rebinds `df_targets`, which earlier held the Basenji2 target table.
df_targets = pd.read_csv("/home/luisasantus/Desktop/crg_cluster/data/FED/enformer/human/pred_standard/summarydf/valid-0-2_197k_pred_eval_df.csv")
df_targets

Unnamed: 0,sequence,assay,pearson
0,0,DNase/cerebellum male adult (27 years) and mal...,0.684329
1,0,DNase/frontal cortex male adult (27 years) and...,0.783626
2,0,DNase/chorion,0.622130
3,0,DNase/Ishikawa treated with 0.02% dimethyl sul...,0.764103
4,0,DNase/GM03348,0.885546
...,...,...,...
15934,2,CAGE/epithelioid sarcoma cell line:HS-ES-2R,-0.007207
15935,2,CAGE/squamous cell lung carcinoma cell line:RE...,0.024904
15936,2,CAGE/gastric cancer cell line:GSS,0.018539
15937,2,CAGE/carcinoid cell line:NCI-H727,0.136828
