In [None]:
import numpy as np
import pandas as pd
import torch
import bioio

In [None]:
import struct
import tensorflow as tf
from tensorflow_datasets.core.features.features_dict import FeaturesDict
import bioio

import numpy as np
import pandas as pd
import torch
import bioio

# def numpy_to_torch(x):
#     if isinstance(x, str) or isinstance(x, bytes):
#         if isinstance(x, str):
#             x = x.encode('UTF-8')
#         x = np.frombuffer(x, dtype=np.uint8)
#     return torch.Tensor(x)

# def load_index(filepath):
#     return np.array(pd.read_csv(filepath, sep='\t', header=None)[0], dtype=np.int64)

class GFileTFRecord:
    """Reader for a single TFRecord file opened through ``tf.io.gfile``.

    Records can be read sequentially (``iter(reader)``), by byte offset
    (``reader(offset)``) or, when an offset index is available, by record
    number (``reader[i]``).

    Args:
        filepath: Path of the TFRecord file.
        features: Optional ``FeaturesDict`` or path to a features JSON file.
            Needed for deserialization; without it raw record bytes are returned.
        index: Optional ``np.ndarray`` of record byte offsets, or path to an
            index file. Needed for ``reader[i]`` and ``as_tf_dataset``.

    The reader owns an open file handle; call ``close()`` (or use the reader
    as a context manager) to release it.
    """

    # TFRecord framing: uint64 length, uint32 length-CRC, payload, uint32 payload-CRC.
    _LENGTH_BYTES = 8
    _CRC_BYTES = 4

    def __init__(self, filepath, features=None, index=None):
        self._gfile_tfrecord = tf.io.gfile.GFile(filepath, 'rb')
        self.features = self._read_features(features) if features is not None else None
        self.index = self._read_index(index) if index is not None else None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying file handle."""
        self._gfile_tfrecord.close()

    def __call__(self, offset, deserialize=True, to_numpy=False, to_torch=False, validate=False):
        """Read the record that starts at byte ``offset``.

        Returns the raw record bytes when no features are set or
        ``deserialize`` is False, otherwise the deserialized example.

        Raises:
            ValueError: No complete record could be read at ``offset``.
            NotImplementedError: ``validate=True`` (CRC validation is not implemented).
        """
        try:
            proto = self._read_proto(offset, validate)
        except NotImplementedError:
            # Not a problem with the record itself; do not disguise it as one.
            raise
        except Exception as err:
            raise ValueError(f'Invalid record at offset {offset}.') from err
        if proto is None:
            # Offset is at (or past) the end of the file: there is no record to return.
            raise ValueError(f'Invalid record at offset {offset}.')

        if (self.features is None) or (not deserialize):
            return proto
        return self.deserialize(proto, to_numpy, to_torch)

    def __getitem__(self, idx, **kwargs):
        """Read record number ``idx`` via the offset index; ``kwargs`` go to ``__call__``."""
        if self.index is None:
            raise ValueError('Index not specified.')
        return self(self.index[idx], **kwargs)

    def __iter__(self):
        """Yield the raw bytes of every record, always starting at the first record.

        The scan keeps its own position and seeks before each read, so random
        reads made between two ``next()`` calls do not disturb it. A truncated
        trailing record ends the iteration instead of raising.
        """
        position = 0
        while True:
            self._gfile_tfrecord.seek(position)
            try:
                proto = self._read_next_proto()
            except EOFError:
                # Best effort: stop at an incomplete record at the end of the file.
                break
            if proto is None:
                break
            position = self._gfile_tfrecord.tell()
            yield proto

    def __len__(self):
        """Number of records: ``len(index)`` if indexed, else a cached full scan."""
        if self.index is not None:
            return len(self.index)
        if getattr(self, '_len', None) is None:
            self._len = self._get_length()
        return self._len

    def _get_length(self):
        """Count the records by scanning the whole file."""
        return sum(1 for _ in self)

    @property
    def size(self):
        """Size of the file as reported by the underlying ``GFile``."""
        return self._gfile_tfrecord.size()

    def as_tf_dataset(self, cache=False, shuffle=False, deserialize=True):
        """Build a ``tf.data.Dataset`` that reads the records listed in the index.

        Args:
            cache: Cache the dataset.
            shuffle: Shuffle with a buffer as large as the dataset.
            deserialize: Map ``self.deserialize`` over the raw records.
        """
        assert self.index is not None, 'Index not specified. Provide index or use `bioio.load_tfrecord` instead.'
        dataset = tf.data.Dataset.from_tensor_slices(self.index)
        # Deliberately not parallel: every read seeks on the one shared file handle.
        dataset = dataset.map(self._read_proto_pyfunc)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        if deserialize:
            dataset = dataset.map(self.deserialize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        if cache:
            dataset = dataset.cache()
        if shuffle:
            dataset = dataset.shuffle(len(self))
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        return dataset

    def as_numpy_iterator(self, cache=False, shuffle=False):
        """Iterate over deserialized examples as numpy structures."""
        return self.as_tf_dataset(cache, shuffle, deserialize=True).as_numpy_iterator()

    def as_torch_iterator(self, cache=False, shuffle=False):
        """Iterate over deserialized examples with every leaf passed through ``numpy_to_torch``."""
        for example in self.as_numpy_iterator(cache, shuffle):
            yield tf.nest.map_structure(bioio.torch.utils.numpy_to_torch, example)

    def deserialize(self, proto, to_numpy=False, to_torch=False):
        """Deserialize raw record bytes with ``self.features``.

        Args:
            proto: Serialized example.
            to_numpy: Convert every leaf with ``.numpy()``.
            to_torch: Convert every leaf with ``numpy_to_torch(leaf.numpy())``.

        Raises:
            ValueError: No features were specified.
        """
        assert not (to_numpy and to_torch), 'Cannot convert to both numpy and torch.'

        if self.features is None:
            raise ValueError('Features not specified.')

        example = self.features.deserialize_example(proto)
        if to_numpy:
            example = tf.nest.map_structure(lambda x: x.numpy(), example)
        if to_torch:
            example = tf.nest.map_structure(lambda x: bioio.torch.utils.numpy_to_torch(x.numpy()), example)
        return example

    def _read_proto(self, offset, validate=False):
        """Seek to ``offset`` and read the record that starts there."""
        # `offset` may be a numpy integer (from the index) or a scalar tensor
        # (when called through tf.py_function); normalise to a plain int.
        self._gfile_tfrecord.seek(int(offset))
        return self._read_next_proto(validate)

    def _read_proto_pyfunc(self, offset):
        """Wrap ``_read_proto`` as a scalar ``tf.string`` op for use in ``Dataset.map``."""
        proto_bytes = tf.py_function(self._read_proto, inp=[offset], Tout=tf.string)
        proto_bytes.set_shape(shape=())
        return proto_bytes

    def _read_next_proto(self, validate=False):
        """Read one record at the current file position.

        Returns:
            The record payload as bytes, or None when the file position is at
            the end of the file.

        Raises:
            NotImplementedError: ``validate`` is True.
            EOFError: The file ends in the middle of a record.
        """
        if validate:
            raise NotImplementedError('CRC validation not implemented.')

        # Record length: little-endian unsigned 64-bit integer.
        proto_len_bytes = self._gfile_tfrecord.read(self._LENGTH_BYTES)
        if len(proto_len_bytes) == 0:
            return None
        if len(proto_len_bytes) < self._LENGTH_BYTES:
            raise EOFError('Truncated record length.')
        proto_len = struct.unpack('<Q', proto_len_bytes)[0]

        # The two CRC fields are read only to advance past them.
        proto_len_crc = self._gfile_tfrecord.read(self._CRC_BYTES)
        proto_bytes = self._gfile_tfrecord.read(proto_len)
        proto_bytes_crc = self._gfile_tfrecord.read(self._CRC_BYTES)

        if (len(proto_len_crc) < self._CRC_BYTES
                or len(proto_bytes) < proto_len
                or len(proto_bytes_crc) < self._CRC_BYTES):
            raise EOFError('Truncated record.')

        return proto_bytes

    def _read_features(self, features):
        """Return ``features`` as a ``FeaturesDict``, loading it from JSON if given a path."""
        if isinstance(features, FeaturesDict):
            return features
        elif isinstance(features, str):
            return bioio.tf.utils.features_from_json_file(features)
        else:
            raise ValueError(f'Invalid features type: {type(features)}')

    def _read_index(self, index):
        """Return ``index`` as an array of offsets, loading it from file if given a path."""
        if isinstance(index, np.ndarray):
            return index
        elif isinstance(index, str):
            return bioio.tf.index.load_index(index)
        else:
            raise ValueError(f'Invalid index type: {type(index)}')


In [None]:
# Open the example TFRecord together with its features spec and offset index.
# The index is passed because `as_torch_iterator()` below goes through
# `as_tf_dataset()`, which asserts that an index is present; `reader[i]`
# needs it as well.
gfile_tfrecord = GFileTFRecord(
    'examples/windows.chr13.4.data.matrix.filtered.tfrecord',
    'examples/windows.chr13.4.data.matrix.filtered.tfrecord.features.json',
    'examples/windows.chr13.4.data.matrix.filtered.tfrecord.idx',
)

print(len(gfile_tfrecord))

# d = gfile_tfrecord.as_tf_dataset()
# print(d)
# # next(iter(d))
# # next(iter(gfile_tfrecord.as_numpy_iterator()))
next(iter(gfile_tfrecord.as_torch_iterator()))


In [None]:
# Build a small tensor from a Python list and display it.
torch.tensor([1,2,3])

In [None]:
# idx_dataset = iter(bioio.tf.index.load_index_to_dataset('examples/windows.chr13.4.data.matrix.filtered.tfrecord.idx'))
# print(next(idx_dataset))
# print(next(idx_dataset))


In [None]:
# Display the offset index held by the reader (None when no index was passed).
gfile_tfrecord.index

In [None]:
# Index-based access: reads the record at the byte offset stored in index[42].
gfile_tfrecord[42]

In [None]:
# Called through the dunder because subscript syntax cannot carry the
# `to_torch` keyword that __getitem__ forwards to __call__.
gfile_tfrecord.__getitem__(42, to_torch=True)

In [None]:
import bioio
# Display the bioio.torch.utils module (GFileTFRecord uses its numpy_to_torch helper).
bioio.torch.utils

In [None]:
from bioio.tf.ops import features_from_json_file
from bioio.tf.data import load_index

class GFileTFRecordsDataset(torch.utils.data.Dataset):
    """Map-style dataset that presents several indexed TFRecord files as one sequence.

    Item ``i`` is looked up in the file whose range of global positions
    contains ``i``; files are laid out in the order given.

    Args:
        filepaths: Iterable of TFRecord paths. Each path must have
            ``<path>.features.json`` and ``<path>.idx`` files next to it.
    """

    def __init__(self, filepaths):
        self._gfile_tfrecords = [self._load_gfile_tfrecord(filepath) for filepath in filepaths]
        self._gfile_tfrecords_lengths = [len(gfile_tfrecord) for gfile_tfrecord in self._gfile_tfrecords]
        # Running totals of the per-file lengths. One integer per file replaces
        # a table with one (file, position) tuple per record.
        self._cumulative_lengths = np.cumsum(self._gfile_tfrecords_lengths, dtype=np.int64)

    def __getitem__(self, idx):
        """Return item ``idx``; negative indices count from the end.

        Raises:
            IndexError: ``idx`` is outside ``[-len(self), len(self))``.
        """
        # NOTE(review): items come back in whatever form GFileTFRecord.__getitem__
        # returns by default (no to_torch conversion) - confirm that is intended.
        file_idx, in_file_idx = self._locate(idx)
        return self._gfile_tfrecords[file_idx][in_file_idx]

    def __len__(self):
        return sum(self._gfile_tfrecords_lengths)

    def _load_gfile_tfrecord(self, filepath):
        """Open one TFRecord file with its features spec and offset index."""
        features = features_from_json_file(filepath + '.features.json')
        index = load_index(filepath + '.idx')
        return GFileTFRecord(filepath, features, index)

    def _locate(self, idx):
        """Translate a global index into ``(file_idx, in_file_idx)``."""
        total = len(self)
        position = idx + total if idx < 0 else idx
        if not 0 <= position < total:
            raise IndexError(f'Index {idx} out of range for dataset of length {total}.')
        # First file whose running total exceeds `position`; side='right'
        # also steps over files that contain no records.
        file_idx = int(np.searchsorted(self._cumulative_lengths, position, side='right'))
        start = int(self._cumulative_lengths[file_idx - 1]) if file_idx else 0
        return file_idx, int(position - start)


In [None]:
# The same file is listed twice, so the printed length is twice that file's
# record count (the dataset length is the sum of the per-file lengths).
dataset = GFileTFRecordsDataset(['examples/windows.chr13.4.data.matrix.filtered.tfrecord', 'examples/windows.chr13.4.data.matrix.filtered.tfrecord'])
print(len(dataset))

In [None]:
# One full pass over the dataset, discarding every item.
for _ in dataset:
    pass

In [None]:
import bioio

# Load the same example file through bioio's indexed-TFRecord loader and display the result.
tf_dataset = bioio.tf.load_indexed_tfrecord('examples/windows.chr13.4.data.matrix.filtered.tfrecord')
tf_dataset

In [None]:
def load_index(filepath):
    """Read the first column of a tab-separated, header-less file as int64.

    Args:
        filepath: Path of the index file.

    Returns:
        A one-dimensional ``np.int64`` array holding the first column.
    """
    table = pd.read_csv(filepath, sep='\t', header=None)
    first_column = table[0]
    return np.array(first_column, dtype=np.int64)

# Load the example index file and display the resulting array.
load_index('examples/windows.chr13.4.data.matrix.filtered.tfrecord.idx')

In [None]:
# Load the feature spec from the example's JSON file.
feat = bioio.tf.ops.features_from_json_file('examples/windows.chr13.4.data.matrix.filtered.tfrecord.features.json')

In [None]:
# Record count: len(index) when an index is set, otherwise counted by scanning the file.
len(gfile_tfrecord)

In [None]:
# First record as raw bytes: iterating the reader yields records without deserializing them.
next(iter(gfile_tfrecord))

In [None]:
# Read the record starting at byte offset 3401, deserialized and converted via to_torch.
x = gfile_tfrecord(3401, deserialize=True, to_torch=True)
x

In [None]:
# Inspect the Python type of the 'meta' entry of the example read above.
type(x['meta'])

In [None]:
# Convert the 'meta' tensor to uint8. `Tensor.byte()` already means
# "cast to torch.uint8" and accepts no `dtype` argument.
x['meta'].byte()

In [None]:
# One uint8 element per byte. `torch.tensor` (lower case) is the factory
# that accepts data plus a dtype; the `torch.Tensor` constructor does not.
torch.tensor(list(b'abc'), dtype=torch.uint8)

In [None]:
# Iterating a bytes object already yields ints (one per byte), so no
# int.from_bytes conversion is needed - or possible - per element.
list(b'abc')

In [None]:
# `bytes(..., encoding=...)` only accepts a str; encode the text first and
# then view the resulting buffer as uint8.
np.frombuffer(bytes('abc', encoding='UTF-8'), dtype=np.uint8)

In [None]:
# Display a bytes literal.
b'abc'

In [None]:
import bioio

In [None]:
# Load the records without deserializing them, add a cache, and make a first full pass.
dataset = bioio.tf.load_tfrecord('examples/windows.chr13.4.data.matrix.filtered.tfrecord', deserialize=False)
dataset = dataset.cache()
for _ in dataset:
    pass

In [None]:
# Second full pass over the same (cached) dataset.
for _ in dataset:
    pass

In [None]:
# Same experiment with deserialization enabled: load, cache, first full pass.
dataset_deser = bioio.tf.load_tfrecord('examples/windows.chr13.4.data.matrix.filtered.tfrecord', deserialize=True)
dataset_deser = dataset_deser.cache()
for _ in dataset_deser:
    pass

In [None]:
# Second full pass over the cached, deserialized dataset.
for _ in dataset_deser:
    pass

In [None]:
import bioio

# Open a BED file as a bioio dataspec source and display it.
# NOTE(review): absolute path into a personal home directory - this cell only
# runs on the machine that has this file.
bed = bioio.dataspec.sources.Bed('/home/marc/Downloads/er.head-100.bed')
bed

In [None]:
# Take the first element produced by iterating the BED source and display it.
row = next(iter(bed))
row

In [None]:
# Pure-Python reference: split column '6' of the first row on commas and parse each piece as int.
list(map(int, next(iter(bed))['6'].split(',')))

In [None]:
import tensorflow as tf

class BedColumnToSparseLabels:
    """Callable that parses one delimited string column into int64 label ids.

    Args:
        column: Key of the column to read from each example.
        sep: Separator between the label ids inside the column string.
    """

    def __init__(self, column, sep=','):
        self._column, self._sep = column, sep

    def __call__(self, example):
        """Split ``example[column]`` on ``sep``, parse as int32 and cast to int64."""
        pieces = tf.strings.split(example[self._column], sep=self._sep)
        as_int32 = tf.strings.to_number(pieces, tf.int32)
        return tf.cast(as_int32, tf.int64)

# Apply the transform for column '6' to the `row` taken from the BED source.
s = BedColumnToSparseLabels('6')
s(row)

In [None]:
def multi_hot(x, depth):
    """Encode a vector of label ids as a 0/1 indicator vector of length ``depth``.

    Args:
        x: Integer tensor of label ids, reduced along axis 0.
        depth: Number of classes, i.e. the length of the result.

    Returns:
        An int64 tensor that is 1 at every position whose id occurs in ``x``
        and 0 elsewhere. An id that occurs several times still yields 1.
    """
    counts = tf.reduce_sum(tf.one_hot(x, depth=depth, dtype=tf.int64), axis=0)
    # Summing one-hot rows gives occurrence counts; clamp them to 0/1. The sum
    # is kept (rather than a max) so an empty `x` still gives all zeros.
    return tf.minimum(counts, 1)

class BedColumnToMultihotLabels:
    """Callable that turns one delimited string column into a multi-hot vector.

    Args:
        column: Key of the column to read from each example.
        depth: Length of the multi-hot vector passed on to ``multi_hot``.
        sep: Separator between the label ids inside the column string.
    """

    def __init__(self, column, depth, sep=','):
        self._column, self._sep, self._depth = column, sep, depth

    def __call__(self, example):
        """Parse ``example[column]`` into int64 ids and encode them with ``multi_hot``."""
        pieces = tf.strings.split(example[self._column], sep=self._sep)
        ids = tf.cast(tf.strings.to_number(pieces, tf.int32), tf.int64)
        return multi_hot(ids, self._depth)


# Encode column '6' of `row` as a multi-hot vector of length 30.
m = BedColumnToMultihotLabels('6', 30)
m(row)

In [None]:
# Check the shape of the encoded vector.
m(row).shape

In [None]:
def f():
    """Placeholder that does nothing and returns None."""
    return None

In [None]:
import tensorflow as tf

In [None]:
# Inspect what kind of object tf.one_hot is.
type(tf.one_hot)

In [None]:
def args_func(fun):
    """Unfinished stub.

    The definition was left without a body, which is a syntax error. Until it
    is written, calling it raises instead of silently returning None.

    TODO: implement.
    """
    raise NotImplementedError('args_func is not implemented yet.')

In [None]:
from bioio.dataspec.transforms.bed import BedColumnSparseLabels

In [None]:
import tensorflow as tf

class BedColumnToSparseLabels:
    """Callable that parses one delimited string column into int64 label ids.

    Args:
        column: Key of the column to read from each example.
        sep: Separator between the label ids inside the column string.
    """

    # Declared output signature: a variable-length vector of int64 values.
    tensor_spec = tf.TensorSpec(shape=(None, ), dtype=tf.int64)

    def __init__(self, column, sep=','):
        self._column = column
        self._sep = sep

    def __call__(self, example):
        """Split ``example[column]`` on ``sep`` and parse the pieces as int64."""
        column_string = example[self._column]
        # Pin the static shape to scalar before splitting.
        column_string.set_shape(())
        pieces = tf.strings.split(column_string, sep=self._sep)
        # Parse straight to int64 so the result matches `tensor_spec`. Going
        # through the default float32 and casting to int32 disagreed with the
        # declared dtype and could not represent large ids exactly.
        return tf.strings.to_number(pieces, out_type=tf.int64)

# Transform instance for column '6', using the default ',' separator.
s = BedColumnToSparseLabels('6')

In [1]:
from bioio import load_biospec

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Move into the example directory. Guarded so that re-running this cell does
# not fail looking for a nested 'examples/basset/' inside the directory the
# kernel is already in.
if not os.path.normpath(os.getcwd()).endswith(os.path.normpath('examples/basset/')):
    os.chdir('examples/basset/')

In [3]:
# Build the dataset described by biospec.yml, then print it and its first element.
dataset = load_biospec('biospec.yml')
print(dataset)
print()
print(next(iter(dataset)))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
<ZipDataset element_spec={'inputs': {'sequence': TensorSpec(shape=(None, 4), dtype=tf.int8, name=None)}, 'outputs': {'labels': TensorSpec(shape=(20,), dtype=tf.int64, name=None)}}>

{'inputs': {'sequence': <tf.Tensor: shape=(600, 4), dtype=int8, numpy=
array([[0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int8)>}, 'outputs': {'labels': <tf.Tensor: shape=(20,), dtype=int64, numpy=array([0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0])>}}


In [None]:
# Keep only the 'meta' entry of every element.
# NOTE(review): the element_spec printed by the load_biospec cell has only the
# keys 'inputs' and 'outputs'; 'meta' presumably belongs to an earlier version
# of the spec - confirm before running.
# NOTE(review): rebinding `dataset` to a mapping of itself makes this cell
# non-idempotent when re-run.
dataset = dataset.map(lambda x: x['meta'])
dataset

In [None]:
# Take the first element of the mapped dataset and display it.
e = next(iter(dataset))
e

In [None]:
# Map the BedColumnToSparseLabels instance `s` over the dataset.
dataset_test = dataset.map(s)
dataset_test

In [None]:
# First element after the transform; note that this rebinds `e`.
e = next(iter(dataset_test))
e

In [None]:
# Split `e` on commas to inspect the raw string pieces.
tf.strings.split(e, sep=',')

In [None]:
# Apply the transform directly to `e`.
s(e)

In [None]:
# Display the first element of `dataset`.
next(iter(dataset))