In [1]:
import glob
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import tensorflow as tf
from natsort import natsorted
from tqdm import tqdm

2024-03-29 15:29:15.460935: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Root directory of the pretrained Basenji2 human dataset. Every input path below is
# derived from it, so pointing the notebook at another copy is a one-line change.
DATA_DIR = "/clusterfs/nilah/richard/basenji2/data_pretrained_basenji2/human"

# Directory containing the TFRecord shards.
TFR_DIR = f"{DATA_DIR}/tfrecords"
# Tab-separated file with one line per sequence (read as chrom, start, end, split).
BED_PATH = f"{DATA_DIR}/sequences.bed"
# JSON file with dataset-level settings such as crop_bp, pool_width and target_length.
SEQUENCE_STATS_PATH = f"{DATA_DIR}/statistics.json"

In [3]:
# Load the dataset-level statistics. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open(SEQUENCE_STATS_PATH) as stats_file:
    seq_stats = json.load(stats_file)
seq_stats

{'num_targets': 5313,
 'train_seq': 34021,
 'valid_seqs': 2213,
 'test_seqs': 1937,
 'seq_length': 131072,
 'pool_width': 128,
 'crop_bp': 8192,
 'target_length': 896}

In [4]:
def load_bed_file(seq_stats: dict, bed_path=None) -> pd.DataFrame:
    """Read the sequence BED file and add the coordinates of each cropped target window.

    Args:
        seq_stats: Dataset statistics; must provide "crop_bp", "target_length" and
            "pool_width".
        bed_path: Path of a header-less, tab-separated file whose columns are read as
            chrom, start, end, split. Defaults to the module-level BED_PATH.

    Returns:
        One row per line of the file with columns chrom, start, end, split, plus
        target_start and target_end: the interval shrunk by crop_bp on each side.

    Raises:
        AssertionError: If any shrunk interval is not exactly
            target_length * pool_width wide.
    """
    if bed_path is None:
        bed_path = BED_PATH

    df = pd.read_csv(bed_path, sep="\t", names=["chrom", "start", "end", "split"])
    df["target_start"] = df["start"] + seq_stats["crop_bp"]
    df["target_end"] = df["end"] - seq_stats["crop_bp"]

    # Every cropped window must cover exactly target_length bins of pool_width each;
    # anything else means the BED file and the statistics describe different datasets.
    expected_width = seq_stats["target_length"] * seq_stats["pool_width"]
    widths_ok = (df["target_end"] - df["target_start"] == expected_width).all()
    assert widths_ok, (
        f"cropped intervals in {bed_path} are not all {expected_width} wide"
    )
    return df

In [5]:
# Parse the BED file once, then take an independent copy of the rows of each split.
bed_df = load_bed_file(seq_stats)
train_bed_df, valid_bed_df, test_bed_df = (
    bed_df.loc[bed_df["split"] == split_name].copy()
    for split_name in ("train", "valid", "test")
)

In [6]:
def file_to_records(filename):
    """Open one ZLIB-compressed TFRecord file as a dataset of its records."""
    records = tf.data.TFRecordDataset(filename, compression_type='ZLIB')
    return records

def parse_proto(example_proto):
    """Decode a serialized example into its raw sequence and target tensors.

    The example is expected to carry two scalar byte-string features, "sequence" and
    "target". The sequence bytes are decoded as uint8 and the target bytes as float16.

    Returns:
        A (sequence, targets) tuple of decoded tensors.
    """
    # Both features are stored as a single byte string each.
    feature_spec = {
        name: tf.io.FixedLenFeature([], tf.string) for name in ("sequence", "target")
    }
    example = tf.io.parse_single_example(example_proto, feature_spec)

    return (
        tf.io.decode_raw(example["sequence"], tf.uint8),
        tf.io.decode_raw(example["target"], tf.float16),
    )

def get_data(split: str, bed_df: pd.DataFrame, seq_stats: dict, track_idx: int = 5110):
    """Extract one target track for every sequence of a split as a long-format table.

    Records are read from the shards matching "{split}-0-*.tfr" under TFR_DIR in
    natural-sorted order and are paired with the rows of ``bed_df`` purely by position,
    so ``bed_df`` must list the sequences in the order the records were written.

    Args:
        split: Prefix of the shard file names to read (e.g. "train").
        bed_df: Rows for this split; must provide chrom, split and target_start.
        seq_stats: Dataset statistics; must provide "target_length" and "pool_width".
        track_idx: Column of the per-sequence target matrix to extract.

    Returns:
        DataFrame with one row per (sequence, bin) and columns chrom, start, end,
        split and track_{track_idx}. Each bin is pool_width wide.

    Raises:
        FileNotFoundError: If no shard matches the split.
        AssertionError: If the shards hold more records than ``bed_df`` has rows.
    """
    import warnings  # local import keeps this cell independent of the import cell

    # Create dataset
    pattern = os.path.join(TFR_DIR, f"{split}-0-*.tfr")
    tfr_files = natsorted(glob.glob(pattern))
    if not tfr_files:
        # Fail early with a clear message rather than deep inside the tf.data pipeline.
        raise FileNotFoundError(f"no TFRecord files match {pattern}")
    dataset = tf.data.Dataset.from_tensor_slices(tfr_files)
    dataset = dataset.flat_map(file_to_records)
    dataset = dataset.map(parse_proto)
    dataset = dataset.batch(1)

    n_rows = bed_df.shape[0]
    pool_width = seq_stats["pool_width"]
    track_col = f"track_{track_idx}"

    # Get data
    outputs = defaultdict(list)
    n_records = 0
    # Wrapping the dataset (not the enumerate) lets tqdm show progress against a total.
    for seq_idx, (_, targets) in enumerate(tqdm(dataset, total=n_rows)):
        # Valid positions are 0 .. n_rows - 1; a record beyond that has no BED row.
        assert seq_idx < n_rows, (
            f"split {split!r} has more records than the {n_rows} rows in bed_df"
        )
        # Flat target buffer -> (target_length, n_tracks); keep the requested track.
        targets = targets.numpy().reshape((seq_stats["target_length"], -1))
        targets = targets[:, track_idx].astype(np.float32)
        n_bins = targets.size

        row = bed_df.iloc[seq_idx]
        # Consecutive pool_width-wide bins starting at the cropped window start.
        starts = row["target_start"] + pool_width * np.arange(n_bins, dtype=np.int64)
        ends = starts + pool_width

        outputs["chrom"].extend([row["chrom"]] * n_bins)
        outputs["start"].extend(starts)
        outputs["end"].extend(ends)
        outputs["split"].extend([row["split"]] * n_bins)
        outputs[track_col].extend(targets)
        n_records = seq_idx + 1

    if n_records != n_rows:
        # Fewer records than rows: the table is still built, but it is incomplete.
        warnings.warn(
            f"split {split!r}: read {n_records} records but bed_df has {n_rows} rows",
            stacklevel=2,
        )

    output_df = pd.DataFrame(outputs)
    return output_df

In [7]:
# Build the per-bin track table for the training split.
train_output_df = get_data(split="train", bed_df=train_bed_df, seq_stats=seq_stats)

34021it [51:45, 10.96it/s]


In [8]:
# Build the per-bin track table for the validation split.
valid_output_df = get_data(split="valid", bed_df=valid_bed_df, seq_stats=seq_stats)

2213it [02:16, 16.26it/s]


In [11]:
# Build the per-bin track table for the test split.
test_output_df = get_data(split="test", bed_df=test_bed_df, seq_stats=seq_stats)

1937it [02:02, 15.76it/s]


In [13]:
# Stack the three per-split tables (train, valid, test, in that order) and write them
# to a single CSV in the current working directory.
# NOTE(review): each split keeps its own row index, so the index column written to the
# CSV restarts at 0 for every split - confirm whether downstream readers rely on it.
split_tables = [train_output_df, valid_output_df, test_output_df]
output_df = pd.concat(split_tables)
output_df.to_csv("track_5110.csv")