# Prep files 

In [20]:
import pyBigWig
import pandas as pd
import seaborn as sns
import os
import tensorflow as tf
import json
import functools
import tensorflow_hub as hub
from tqdm import tqdm
import sys
import numpy as np 
import scipy

In [2]:
myDir = "/home/luisasantus/Desktop/crg_cluster/projects/FED/src/01_enformer/bin"
sys.path.append(myDir)

In [3]:
fasta_file = "/home/luisasantus/Desktop/crg_cluster/data/FED/assemblies/bosTaurus/Bos_taurus.ARS-UCD1.2.dna.toplevel.fa"
bw_file = "/home/luisasantus/Desktop/crg_cluster/data/FED/raw/bosTaurus/bw/ARS-UCD1.2.ENA.brain_cerebellum.1.bam.bw"
outdir = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/bosTaurus/"
script = "/home/luisasantus/Desktop/crg_cluster/projects/FED/src/01_enformer/scripts/basenji_data.py"
model_path = "/home/luisasantus/Desktop/crg_cluster/data/FED/enformer/1"

In [4]:
lines = [['index','identifier','file','clip','sum_stat','description']]
lines.append(['0', 'bos1', bw_file, '384', 'sum', 'cerebellum'])
targets = os.path.join(outdir,'cer_bos_wigs.txt')
samples_out = open(targets, 'w')
for line in lines:
    print('\t'.join(line), file=samples_out)
samples_out.close()

In [5]:
bw = pyBigWig.open(bw_file)

In [6]:
chroms = list(bw.chroms().keys())
chroms_keep = list(filter(lambda x: not x.startswith('NK'), chroms))
textfile = open("/home/luisasantus/Desktop/crg_cluster/data/FED/assemblies/bosTaurus/chr_ids_keep.txt", "w")
for element in chroms_keep:
    textfile.write(element + " ")
textfile.close()

In [46]:
def deserialize(serialized_example, metadata):
    """Deserialize bytes stored in TFRecordFile."""
    feature_map = {
          'sequence': tf.io.FixedLenFeature([], tf.string),
          'target': tf.io.FixedLenFeature([], tf.string),
          'chr': tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_example(serialized_example, feature_map)
    sequence = tf.io.decode_raw(example['sequence'], tf.bool)
    sequence = tf.reshape(sequence, (metadata['seq_length'], 4))
    sequence = tf.cast(sequence, tf.float32)

    target = tf.io.decode_raw(example['target'], tf.float16)
    target = tf.reshape(target,
                          (metadata['target_length'], metadata['num_targets']))
    target = tf.cast(target, tf.float32)

    chrom = tf.io.decode_raw(example['chr'], tf.uint8)
    chrom = tf.cast(chrom, tf.float32)


    return {'sequence': sequence,
              'target': target,
              'chr': chrom}

# Load model and predict 

In [48]:
def evaluate_model_all_sequences_mod(model, dataset, head, dataset_197k_evaluation, max_steps=None):

    # Given a tensor with a one-encoded sequence, predicts head tracks
    def predict(x):
        padded_sequence = pad_one_hot(x.numpy(), SEQUENCE_LENGHT)[np.newaxis]
        predictions = model.predict_on_batch(padded_sequence)[head]
        return tf.convert_to_tensor(predictions, dtype=tf.float32)

    for i, batch in tqdm(enumerate(dataset)):
        if max_steps is not None and i > max_steps:
            break

        prediction = predict(batch['sequence'])
        with open('log.txt', 'a') as f:
            f.write(str(i))

        if(eval == "eval"):
            metric_seq = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
            metric_seq.update_state(batch['target'][np.newaxis], prediction)
            pearson_seq = metric_seq.result()["PearsonR"].numpy()
            batch_validation = {"sequence": batch["sequence"],
                                        "target": batch["target"],
                                        "interval": batch["interval"],
                                        "prediction": prediction,
                                        "PearsonR": pearson_seq}
        else:
            batch_validation = {"sequence": batch["sequence"],
                                                "target": batch["target"],
                                                "prediction": prediction,
                                                "chr": batch["chr"]
                                                }



        dataset_197k_evaluation.append(batch_validation)

    return dataset_197k_evaluation

In [51]:
metadata = get_metadata("/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/bosTaurus/basenji/statistics.json")
tfrecord = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/bosTaurus/basenji8/tfrecords/valid-0.tfr"
human_dataset = get_dataset(tfrecord, metadata)

In [52]:
dataset_197k_evaluation = []
metrics_human = evaluate_model_all_sequences_mod(model,
                               dataset=human_dataset,
                               head="human",
                               dataset_197k_evaluation = dataset_197k_evaluation, 
                            max_steps = 1)

2it [00:34, 17.39s/it]


In [53]:
metrics_human

[{'sequence': <tf.Tensor: shape=(196608, 4), dtype=float32, numpy=
  array([[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.],
         ...,
         [0., 0., 1., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.]], dtype=float32)>,
  'target': <tf.Tensor: shape=(896, 1), dtype=float32, numpy=
  array([[  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.],
         [  0.

In [13]:
model = Enformer(model_path)

In [44]:
metadata = get_metadata("/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/bosTaurus/basenji/statistics.json")
tf1 = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/bosTaurus/basenji8/tfrecords/valid-0.tfr"

In [47]:
dataset_test = get_dataset(tf1, metadata)
b = next(iter(dataset_test))
b.keys()

dict_keys(['sequence', 'target', 'chr'])

'tf.Tensor(\n[50.  0.  0.  0. 55.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 52.  0.  0.  0. 49.  0.\n  0.  0. 56.  0.  0.  0. 56.  0.  0.  0. 52.  0.  0.  0. 56.  0.  0.  0.\n 51.  0.  0.  0. 54.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0. 52.  0.  0.  0. 50.  0.  0.  0. 48.  0.  0.  0.\n 56.  0.  0.  0. 49.  0.  0.  0. 52.  0.  0.  0. 52.  0.  0.  0. 52.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n  0.  0.  0.  0.  0.  0. 

In [25]:
SEQUENCE_LENGHT = 393_216
## pad the sequence with Ns (anyways ignored by the model)
def pad_one_hot(sequence_one_hot, NEW_SIZE):
    ADD_ENDS = int((NEW_SIZE - sequence_one_hot.shape[0])/2)
    pad_zero = np.tile(np.array([0., 0., 0., 0.]), (ADD_ENDS, 1))
    padded_left = np.append(pad_zero,sequence_one_hot, axis=0)
    pad_sequence = np.append(padded_left,pad_zero, axis=0)
    return(pad_sequence)

def evaluate_model(model, dataset, head, max_steps=None, eval_flag = "eval"):

    dataset_197k_evaluation = []
    def predict(x):
        padded_sequence = pad_one_hot(x.numpy(), SEQUENCE_LENGHT)[np.newaxis]
        predictions = model.predict_on_batch(padded_sequence)[head]
        return tf.convert_to_tensor(predictions, dtype=tf.float32)

    for i, batch in tqdm(enumerate(dataset)):
        if max_steps is not None and i > max_steps:
            break
        
        prediction = predict(batch['sequence'])
        with open('log.txt', 'a') as f:
            f.write(str(i))

        if(eval_flag == "eval"):
            metric_seq = MetricDict({'PearsonR': PearsonR(reduce_axis=(0,1))})
            metric_seq.update_state(batch['target'][np.newaxis], prediction)
            pearson_seq = metric_seq.result()["PearsonR"].numpy()
            batch_validation = {"sequence": batch["sequence"],
                                        "target": batch["target"],
                                        "prediction": prediction,
                                        "PearsonR": pearson_seq}
        else:
            batch_validation = {"sequence": batch["sequence"],
                                                "target": batch["target"],
                                                "prediction": prediction
                                                }
        dataset_197k_evaluation.append(batch_validation)

    return dataset_197k_evaluation



In [286]:
metrics = evaluate_model(model, dataset_test, "human", 1, "no")

2it [00:36, 18.04s/it]


In [287]:
metrics[0].keys()

dict_keys(['sequence', 'target', 'prediction'])

In [254]:
train_tracks_ind = list([0])
pred_tracks_ind = list([4979, 5106, 5284])
# Swap axes so i can access to each track
def swap_dim(example):
    if example.shape[1] == 1: 
        example_tmp= (example.numpy())
    else: 
        example_tmp= (example.numpy().squeeze())
    example_swapped = np.swapaxes(example_tmp, 0,1)
    return(example_swapped)

def get_tracks(example_section, track_ids):
    example_swapped = swap_dim(example_section)
    tracks = []
    for track in track_ids:
        tracks.append(list([track,example_swapped[track]]))
    return(tracks)

def eval_sequence(example, train_tracks_ind, pred_tracks_ind):
    # Extract ground truth for each of the training tracks
    train_targets = get_tracks(example["target"], train_tracks_ind)
    prediction_values = get_tracks(example["prediction"], pred_tracks_ind)

    # Compare them
    targets = [];  preds = []; pearson = []
    for target in train_targets:
        for prediction in prediction_values:
            targets.append(target[0])
            preds.append(prediction[0])
            pearson.append(scipy.stats.pearsonr(target[1], prediction[1])[0])

    df = pd.DataFrame(list(zip(targets, preds, pearson)), columns = ["targets", "preds", "pearson"])
    return(df)

In [280]:
eval_sequence(metrics[0],train_tracks_ind, pred_tracks_ind)

Unnamed: 0,targets,preds,pearson
0,0,4979,0.012822
1,0,5106,-0.003439
2,0,5284,-0.005181


In [279]:
pred = metrics
prefix="file"
evaluation_df = pd.DataFrame()
for sequence, example in enumerate(pred):
    df = eval_sequence(example, train_tracks_ind, pred_tracks_ind)
    df["sequence"] = prefix+"_"+str(sequence)
    evaluation_df = pd.concat([evaluation_df, df], ignore_index = True, axis = 0)
evaluation_df

Unnamed: 0,targets,preds,pearson,sequence
0,0,4979,0.012822,file_0
1,0,5106,-0.003439,file_0
2,0,5284,-0.005181,file_0
3,0,4979,-0.016462,file_1
4,0,5106,-0.009132,file_1
5,0,5284,0.011522,file_1


In [None]:
    example_swapped = np.swapaxes(example_tmp, 0,1)
    return(example_swapped)

def get_tracks(example_section, track_ids):
    example_swapped = swap_dim(example_section)


In [235]:
example_swapped = np.swapaxes(example_tmp, 0,1)


AxisError: axis2: axis 1 is out of bounds for array of dimension 1

In [225]:
train_targets = get_tracks(example["target"], train_tracks_ind)

AxisError: axis2: axis 1 is out of bounds for array of dimension 1

In [None]:
    # Compare them
targets = [];  preds = []; pearson = []
for target in train_targets:
    for prediction in prediction_values:
        targets.append(target[0])
        preds.append(prediction[0])
        pearson.append(scipy.stats.pearsonr(target[1], prediction[1])[0])

df = pd.DataFrame(list(zip(targets, preds, pearson)), columns = ["targets", "preds", "pearson"])

In [201]:
# Older imports 

In [9]:
def _reduced_shape(shape, axis):
    if axis is None:
        return tf.TensorShape([])
    return tf.TensorShape([d for i, d in enumerate(shape) if i not in axis])

def get_dataset(tfs, metadata):
    dataset = tf.data.TFRecordDataset(tfs, compression_type= "ZLIB")
    dataset = dataset.map(functools.partial(deserialize, metadata=metadata))
    return dataset

def get_metadata(path):
    with tf.io.gfile.GFile(path, 'r') as f:
        return json.load(f)

class CorrelationStats(tf.keras.metrics.Metric):
    """Contains shared code for PearsonR and R2."""

    def __init__(self, reduce_axis=None, name='pearsonr'):
        """Pearson correlation coefficient.

        Args:
          reduce_axis: Specifies over which axis to compute the correlation (say
            (0, 1). If not specified, it will compute the correlation across the
            whole tensor.
          name: Metric name.
        """
        super(CorrelationStats, self).__init__(name=name)
        self._reduce_axis = reduce_axis
        self._shape = None  # Specified in _initialize.

    def _initialize(self, input_shape):
        # Remaining dimensions after reducing over self._reduce_axis.
        self._shape = _reduced_shape(input_shape, self._reduce_axis)

        weight_kwargs = dict(shape=self._shape, initializer='zeros')
        self._count = self.add_weight(name='count', **weight_kwargs)
        self._product_sum = self.add_weight(name='product_sum', **weight_kwargs)
        self._true_sum = self.add_weight(name='true_sum', **weight_kwargs)
        self._true_squared_sum = self.add_weight(name='true_squared_sum',
                                                 **weight_kwargs)
        self._pred_sum = self.add_weight(name='pred_sum', **weight_kwargs)
        self._pred_squared_sum = self.add_weight(name='pred_squared_sum',
                                                 **weight_kwargs)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Update the metric state.

        Args:
          y_true: Multi-dimensional float tensor [batch, ...] containing the ground
            truth values.
          y_pred: float tensor with the same shape as y_true containing predicted
            values.
          sample_weight: 1D tensor aligned with y_true batch dimension specifying
            the weight of individual observations.
        """
        if self._shape is None:
          # Explicit initialization check.
          self._initialize(y_true.shape)
        y_true.shape.assert_is_compatible_with(y_pred.shape)
        y_true = tf.cast(y_true, 'float32')
        y_pred = tf.cast(y_pred, 'float32')

        self._product_sum.assign_add(
            tf.reduce_sum(y_true * y_pred, axis=self._reduce_axis))

        self._true_sum.assign_add(
            tf.reduce_sum(y_true, axis=self._reduce_axis))

        self._true_squared_sum.assign_add(
            tf.reduce_sum(tf.math.square(y_true), axis=self._reduce_axis))

        self._pred_sum.assign_add(
            tf.reduce_sum(y_pred, axis=self._reduce_axis))

        self._pred_squared_sum.assign_add(
            tf.reduce_sum(tf.math.square(y_pred), axis=self._reduce_axis))

        self._count.assign_add(
            tf.reduce_sum(tf.ones_like(y_true), axis=self._reduce_axis))

    def result(self):
        raise NotImplementedError('Must be implemented in subclasses.')

    def reset_states(self):
        if self._shape is not None:
            tf.keras.backend.batch_set_value([(v, np.zeros(self._shape))
                                        for v in self.variables])


class PearsonR(CorrelationStats):
    """Pearson correlation coefficient.

          Computed as:
      ((x - x_avg) * (y - y_avg) / sqrt(Var[x] * Var[y])
      """

    def __init__(self, reduce_axis=(0,), name='pearsonr'):
        """Pearson correlation coefficient.

        Args:
          reduce_axis: Specifies over which axis to compute the correlation.
          name: Metric name.
        """
        super(PearsonR, self).__init__(reduce_axis=reduce_axis,
                                       name=name)

    def result(self):
        true_mean = self._true_sum / self._count
        pred_mean = self._pred_sum / self._count

        covariance = (self._product_sum
                      - true_mean * self._pred_sum
                      - pred_mean * self._true_sum
                      + self._count * true_mean * pred_mean)

        true_var = self._true_squared_sum - self._count * tf.math.square(true_mean)
        pred_var = self._pred_squared_sum - self._count * tf.math.square(pred_mean)
        tp_var = tf.math.sqrt(true_var) * tf.math.sqrt(pred_var)
        correlation = covariance / tp_var

        return correlation


class R2(CorrelationStats):
    """R-squared  (fraction of explained variance)."""

    def __init__(self, reduce_axis=None, name='R2'):
        """R-squared metric.

        Args:
          reduce_axis: Specifies over which axis to compute the correlation.
          name: Metric name.
        """
        super(R2, self).__init__(reduce_axis=reduce_axis,
                                 name=name)

    def result(self):
        true_mean = self._true_sum / self._count
        total = self._true_squared_sum - self._count * tf.math.square(true_mean)
        residuals = (self._pred_squared_sum - 2 * self._product_sum
                     + self._true_squared_sum)

        return tf.ones_like(residuals) - residuals / total


class MetricDict:
    def __init__(self, metrics):
        self._metrics = metrics

    def update_state(self, y_true, y_pred):
        for k, metric in self._metrics.items():
            metric.update_state(y_true, y_pred)

    def result(self):
        return {k: metric.result() for k, metric in self._metrics.items()}






def one_hot_encode(sequence):
    return kipoiseq.transforms.functional.one_hot_dna(sequence).astype(np.float32)

class Enformer:

    def __init__(self, tfhub_url):
        self._model = hub.load(tfhub_url).model

    def predict_on_batch(self, inputs):
        predictions = self._model.predict_on_batch(inputs)
        return {k: v.numpy() for k, v in predictions.items()}

    @tf.function
    def contribution_input_grad(self, input_sequence,
                                  target_mask, output_head='human'):
        input_sequence = input_sequence[tf.newaxis]

        target_mask_mass = tf.reduce_sum(target_mask)
        with tf.GradientTape() as tape:
            tape.watch(input_sequence)
            prediction = tf.reduce_sum(
              target_mask[tf.newaxis] *
              self._model.predict_on_batch(input_sequence)[output_head]) / target_mask_mass

        input_grad = tape.gradient(prediction, input_sequence) * input_sequence
        input_grad = tf.squeeze(input_grad, axis=0)
        return tf.reduce_sum(input_grad, axis=-1)
    
    
def deserialize(serialized_example, metadata):
    """Deserialize bytes stored in TFRecordFile."""
    feature_map = {
          'sequence': tf.io.FixedLenFeature([], tf.string),
          'target': tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_example(serialized_example, feature_map)
    sequence = tf.io.decode_raw(example['sequence'], tf.bool)
    sequence = tf.reshape(sequence, (metadata['seq_length'], 4))
    sequence = tf.cast(sequence, tf.float32)

    target = tf.io.decode_raw(example['target'], tf.float16)
    target = tf.reshape(target,
                          (metadata['target_length'], metadata['num_targets']))
    target = tf.cast(target, tf.float32)

    return {'sequence': sequence,
              'target': target}


## Liftover 

In [None]:
import pyranges as pr

### Load all files 

In [107]:
# Human 
path_human = ("/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/human/data_human_sequences.bed")
gr = pr.read_bed(path_human)
df_human = gr.df
df_human["width"] = df_human.End - df_human.Start
print(len(df_human))
df_human.head(1)

38171


Unnamed: 0,Chromosome,Start,End,Name,width
0,chr1,45428793,45559865,train,131072


In [116]:
df_human_valid = df_human[df_human.Name == "valid"][list(["Chromosome", "Start", "End"])]
df_human_valid["id"] = list(range(0,len(df_human_valid)))

df_human_valid.to_csv("/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/human/liftover/data_human_sequences_valid.bed", sep = "\t",header = False, index = False)  
print(len(df_human_valid))
df_human_valid.head(1)

2213


Unnamed: 0,Chromosome,Start,End,id
3239,chr1,247498484,247629556,0


In [149]:
# Human lifted 
path_human_lifted = ("/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/human/liftover/data_human_sequences_lifted.bed")
gr = pr.read_bed(path_human_lifted)
df_human_lifted = gr.df
df_human_lifted["width"] = df_human_lifted.End - df_human_lifted.Start
df_human_lifted.head(1)

Unnamed: 0,Chromosome,Start,End,Name,width
0,chr1,53948241,54062992,11,114751


In [89]:
# Mouse
path_mouse = ("/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/mouse/mouse_sequences.bed")
gr = pr.read_bed(path_mouse)
df_mouse = gr.df
df_mouse["width"] = df_mouse.End - df_mouse.Start

In [168]:
df_mouse_valid = df_mouse[df_mouse.Name =="valid"]
df_mouse_valid.head(2)
df_mouse_valid["id"] = list(range(0,len(df_mouse_valid)))
df_mouse_valid = df_mouse_valid[df_mouse_valid.Name == "valid"][list(["Chromosome", "Start", "End", "id"])]


path_mouse_valid = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/mouse/liftover/data_mouse_sequences_valid.bed"
df_mouse_valid.to_csv(path_mouse_valid, sep = "\t",header = False, index = False)  
print(len(df_mouse_valid))

df_mouse_valid.head(1)

2209


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mouse_valid["id"] = list(range(0,len(df_mouse_valid)))


Unnamed: 0,Chromosome,Start,End,id
1767,chr1,110559401,110690473,0


In [150]:
# match human and lifted over entries 
class Interval:
    def __init__(self, chrom, start, end):
        self.chrom = chrom
        self.start = start
        self.end = end
        
def addLine(human_sequences, line): 
    human_sequences[line.id] = Interval(line.Chromosome, line.Start, line.End)

In [151]:
# Dict: key is id, value is interval object 
human_sequences = {}
df_human_valid.apply(lambda line: addLine(human_sequences, line), axis=1)
len(human_sequences.keys())

2213

In [156]:
# Dict: key is id, value is interval object 
human_sequences_lifted = {}
df_human_lifted = df_human_lifted.rename(columns={'Name': 'id'})
df_human_lifted.apply(lambda line: addLine(human_sequences_lifted, line), axis=1)
len(human_sequences_lifted.keys())

1845

In [169]:
# Dict: key is id, value is interval object 
mouse_sequences = {}
df_mouse_valid.apply(lambda line: addLine(mouse_sequences, line), axis=1)
len(mouse_sequences.keys())

2209

In [162]:
# Main objects
human_sequences
human_sequences_lifted
mouse_sequences
print("")




In [176]:
path_bedintersect = "/home/luisasantus/Desktop/crg_cluster/data/FED/basenji/human/liftover/human_lifted_mouse_valid.bed"

In [217]:
f = (100000/131072)
f

0.762939453125

In [234]:
!bedtools intersect -a {path_human_lifted} -b {path_mouse_valid} -wa -wb -f {f} > {path_bedintersect}

In [235]:
list(["chrA", "startA", "endA", "idA", "chrB", "startB", "endB", "idB"])

['chrA', 'startA', 'endA', 'idA', 'chrB', 'startB', 'endB', 'idB']

In [236]:
gr = pr.read_bed(path_bedintersect)
df_intersect = gr.df
df_intersect = df_intersect.set_axis(list(["chrA", "startA", "endA", "idA", "chrB", "startB", "endB", "idB"]), axis = 1)
df_intersect.head(1)

Unnamed: 0,chrA,startA,endA,idA,chrB,startB,endB,idB
0,chr1,53948241,54062992,11,chr1,53973303,54104375,390


In [237]:
print(df_intersect.duplicated(subset=['idA']).value_counts())
print(df_intersect.duplicated(subset=['idB']).value_counts())

False    1084
dtype: int64
False    1044
True       40
dtype: int64


In [248]:
# For the moment just remove the sequences that have multiple matches
df_intersect = df_intersect.drop_duplicates(subset=['idB'], keep='first')
print(df_intersect.duplicated(subset=['idB']).value_counts())

False    1044
dtype: int64


In [249]:
pairs = []
def addPair(pairs, line):
    pair = tuple([line.idA, line.idB])
    pairs.append(pair)
    
df_intersect.apply(lambda line: addPair(pairs, line), axis=1)
len(pairs)

1044

In [250]:
pairs

[(11, 390),
 (12, 100),
 (14, 430),
 (17, 129),
 (20, 405),
 (21, 434),
 (22, 294),
 (25, 437),
 (28, 534),
 (29, 191),
 (31, 86),
 (32, 217),
 (36, 369),
 (37, 229),
 (39, 301),
 (40, 344),
 (41, 8),
 (43, 98),
 (44, 71),
 (48, 141),
 (49, 464),
 (52, 234),
 (54, 92),
 (55, 330),
 (57, 188),
 (58, 314),
 (60, 164),
 (62, 142),
 (64, 237),
 (69, 152),
 (72, 156),
 (74, 317),
 (75, 231),
 (76, 132),
 (79, 501),
 (81, 248),
 (82, 290),
 (83, 57),
 (85, 525),
 (87, 186),
 (91, 51),
 (93, 271),
 (94, 439),
 (95, 66),
 (96, 87),
 (97, 110),
 (98, 120),
 (99, 225),
 (100, 410),
 (101, 263),
 (106, 236),
 (107, 305),
 (108, 436),
 (109, 39),
 (114, 109),
 (116, 538),
 (117, 67),
 (118, 216),
 (119, 498),
 (120, 221),
 (122, 44),
 (123, 535),
 (124, 37),
 (125, 118),
 (126, 412),
 (128, 212),
 (132, 254),
 (133, 146),
 (136, 264),
 (137, 101),
 (143, 91),
 (146, 267),
 (147, 108),
 (149, 269),
 (150, 311),
 (151, 139),
 (152, 209),
 (153, 149),
 (157, 276),
 (159, 218),
 (162, 452),
 (163, 241