In [15]:
import gzip
import kipoiseq
from kipoiseq import Interval
import pyfaidx
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import pickle
import sys
import tensorflow as tf
import importlib
import copy
import time


myDir = os.path.join(os.path.dirname(os.getcwd()), "bin")
model_path = "/home/luisasantus/Desktop/crg_cluster/data/FED/enformer/1"
sys.path.append(myDir)
from utils_full import *
model = Enformer(model_path)

from enformer import *


SEQUENCE_LENGHT = 393_216
## pad the sequence with Ns (anyways ignored by the model)
def pad_one_hot(sequence_one_hot, NEW_SIZE):
    ADD_ENDS = int((NEW_SIZE - sequence_one_hot.shape[0])/2)
    pad_zero = np.tile(np.array([0., 0., 0., 0.]), (ADD_ENDS, 1))
    padded_left = np.append(pad_zero,sequence_one_hot, axis=0)
    pad_sequence = np.append(padded_left,pad_zero, axis=0)
    return(pad_sequence)


In [38]:
import multiprocessing

In [None]:
dataset_197k_file = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/human/tfrecords_197k/valid-0-0_197k.pkl"
with open(dataset_197k_file, 'rb') as file:
    dataset_197k = pickle.load(file) 

In [23]:
test_ds = dataset_197k[1:2]

In [25]:
test =  "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/human/test_197.pkl"
with open(test, 'wb') as file:
    pickle.dump(test_ds, file) 

In [32]:
def evaluate_model_all_sequences_mod(model, dataset, head, dataset_197k_evaluation, max_steps=None):

    # Given a tensor with a one-encoded sequence, predicts head tracks
    def predict(x):
        padded_sequence = pad_one_hot(np.squeeze(x.numpy(), axis=0), SEQUENCE_LENGHT)[np.newaxis]
        predictions = model.predict_on_batch(padded_sequence)[head]
        return tf.convert_to_tensor(predictions, dtype=tf.float32)

    for i in range(len(dataset)):
        if max_steps is not None and i > max_steps:
            break
            
        batch = dataset[i]
        seq = batch['sequence']
        t0 = time.time()
        prediction = predict(seq)
        t1 = time.time()
        pred_time = t1-t0
        print("predtime "+ str(pred_time))
        metric_seq = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
        metric_seq.update_state(batch['target'][np.newaxis], prediction)
        t2 = time.time()
        metric_seq_time = t2-t1
        print("metric time  "+ str(metric_seq_time))
        
        pearson_seq = metric_seq.result()["PearsonR"].numpy()
        batch_validation = {"sequence": batch["sequence"],
                            "target": batch["target"],
                            "interval": batch["interval"],
                            "PearsonR": pearson_seq}
        dataset_197k_evaluation.append(batch_validation)
        
        
        t3 = time.time()
        object_time = t3-t2
        print("object time  "+ str(object_time))


    return dataset_197k_evaluation

In [11]:
dataset_197k[1]

{'sequence': <tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
 array([[[1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.]]], dtype=float32)>,
 'target': <tf.Tensor: shape=(896, 5313), dtype=float32, numpy=
 array([[0.        , 0.        , 0.        , ..., 0.        , 0.1184082 ,
         0.        ],
        [0.00741959, 0.00526428, 0.        , ..., 0.        , 0.02198792,
         0.00336266],
        [0.00865936, 0.02127075, 0.01722717, ..., 0.        , 0.        ,
         0.06109619],
        ...,
        [0.03768921, 0.04833984, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.04605103, 0.08355713, 0.01335907, ..., 0.        , 0.        ,
         0.11633301],
        [0.06158447, 0.08026123, 0.06018066, ..., 0.        , 0.        ,
         0.        ]], dtype=float32)>,
 'interval': Interval(chrom='chrX', start=55044496, end=55175568,

In [33]:
evaluate_model_all_sequences_mod(model,
                               dataset=test_ds,
                               head="human",
                               dataset_197k_evaluation = [], 
                               max_steps=1)

predtime 17.722840309143066
metric time  0.029781818389892578
object time  0.0013587474822998047


[{'sequence': <tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
  array([[[1., 0., 0., 0.],
          [0., 0., 1., 0.],
          [0., 0., 1., 0.],
          ...,
          [0., 0., 0., 1.],
          [0., 0., 0., 1.],
          [0., 0., 0., 1.]]], dtype=float32)>,
  'target': <tf.Tensor: shape=(896, 5313), dtype=float32, numpy=
  array([[0.        , 0.        , 0.        , ..., 0.        , 0.1184082 ,
          0.        ],
         [0.00741959, 0.00526428, 0.        , ..., 0.        , 0.02198792,
          0.00336266],
         [0.00865936, 0.02127075, 0.01722717, ..., 0.        , 0.        ,
          0.06109619],
         ...,
         [0.03768921, 0.04833984, 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.04605103, 0.08355713, 0.01335907, ..., 0.        , 0.        ,
          0.11633301],
         [0.06158447, 0.08026123, 0.06018066, ..., 0.        , 0.        ,
          0.        ]], dtype=float32)>,
  'interval': Interval(chrom='chrX', start=

In [39]:
def test(test_ds):
    return evaluate_model_all_sequences_mod(model,
                               dataset=test_ds,
                               head="human",
                               dataset_197k_evaluation = [], 
                               max_steps=1)
    

In [88]:
import multiprocessing as mp

# Step 1: Init multiprocessing.Pool()
pool = mp.Pool(2)



Process ForkPoolWorker-51:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 358, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'evaluate_model' on <module '__main__'>
Process ForkPoolWorker-50:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 356, in get
    res = 

In [71]:
def evaluate_model_seq(model, batch, head, max_steps=None):
     
    prediction = predict(batch['sequence'])
    metric_seq = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    metric_seq.update_state(batch['target'][np.newaxis], prediction)        
    pearson_seq = metric_seq.result()["PearsonR"].numpy()
    batch_validation = {"sequence": batch["sequence"],
                            "target": batch["target"],
                            "interval": batch["interval"],
                            "PearsonR": pearson_seq}

    return batch_validation

In [48]:
a = list([test_ds, test_ds, test_ds, test_ds])

In [77]:
a

[[{'sequence': <tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
   array([[[1., 0., 0., 0.],
           [0., 0., 1., 0.],
           [0., 0., 1., 0.],
           ...,
           [0., 0., 0., 1.],
           [0., 0., 0., 1.],
           [0., 0., 0., 1.]]], dtype=float32)>,
   'target': <tf.Tensor: shape=(896, 5313), dtype=float32, numpy=
   array([[0.        , 0.        , 0.        , ..., 0.        , 0.1184082 ,
           0.        ],
          [0.00741959, 0.00526428, 0.        , ..., 0.        , 0.02198792,
           0.00336266],
          [0.00865936, 0.02127075, 0.01722717, ..., 0.        , 0.        ,
           0.06109619],
          ...,
          [0.03768921, 0.04833984, 0.        , ..., 0.        , 0.        ,
           0.        ],
          [0.04605103, 0.08355713, 0.01335907, ..., 0.        , 0.        ,
           0.11633301],
          [0.06158447, 0.08026123, 0.06018066, ..., 0.        , 0.        ,
           0.        ]], dtype=float32)>,
   'interval': Interv

In [86]:
def arrt(inp):
    return(1)

In [98]:
def evaluate_model_seq(model, batch, head, max_steps=None):
    
    prediction = predict(batch['sequence'])
    metric_seq = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    metric_seq.update_state(batch['target'][np.newaxis], prediction)        
    pearson_seq = metric_seq.result()["PearsonR"].numpy()
    batch_validation = {"sequence": batch["sequence"],
                            "target": batch["target"],
                            "interval": batch["interval"],
                            "PearsonR": pearson_seq}

    return batch_validation

In [91]:
model

<utils_full.Enformer at 0x7f1c1c3bfb80>

In [122]:
def evaluate_model_seq(model, batch):
    
    def predict(x, model, head):
        padded_sequence = pad_one_hot(np.squeeze(x.numpy(), axis=0), SEQUENCE_LENGHT)[np.newaxis]
        predictions = model.predict_on_batch(padded_sequence)[head]
        return tf.convert_to_tensor(predictions, dtype=tf.float32)
    prediction = predict(batch['sequence'])
    metric_seq = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    metric_seq.update_state(batch['target'][np.newaxis], prediction)        
    pearson_seq = metric_seq.result()["PearsonR"].numpy()
    batch_validation = {"sequence": batch["sequence"],
                            "target": batch["target"],
                            "interval": batch["interval"],
                            "PearsonR": pearson_seq}

    return batch_validation

In [119]:
def evaluate_model_2(batch):
     
    prediction = batch['sequence']
    metric_seq = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
    metric_seq.update_state(batch['target'][np.newaxis], prediction)  
    
    return 1

In [125]:
# Step 2: `pool.apply` the `howmany_within_range()`
results = [pool.apply(evaluate_model_seq, args=(model, example, "human")) for example in a]


AttributeError: Can't pickle local object 'Loader._recreate_base_user_object.<locals>._UserObject'

In [121]:
results

[<tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
 array([[[1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
 array([[[1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
 array([[[1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 196608, 4), dtype=float32, numpy=
 array([[[1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.]]], dtype=flo

In [60]:

import multiprocessing as mp

# Step 1: Init multiprocessing.Pool()
pool = mp.Pool(mp.cpu_count())

# Step 2: `pool.apply` the `howmany_within_range()`
results = [pool.apply(howmany_within_range, args=(row, 4, 8)) for row in data]

# Step 3: Don't forget to close
pool.close()    

print(results[:10])
#> [3, 1, 4, 4, 4, 2, 1, 1, 3, 3]

[3, 2, 3, 3, 1, 3, 2, 3, 2, 1]


In [40]:
pool = multiprocessing.Pool()
pool.map(test, test_ds)

KeyError: 0