In [1]:
from typing import Callable, Dict, List, Optional, Tuple, Union

import h5py
# не удалять! import hdf5plugin !
import hdf5plugin
import pandas as pd
import torch
from torch.utils.data import Dataset

---------------------------------------------------

In [2]:
meta_file = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/metadata.csv'
df_meta = pd.read_csv(meta_file, index_col='cell_id')

In [3]:
df_meta.head(10)

Unnamed: 0_level_0,day,donor,cell_type,technology
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e0dde41ed6f2,3,27678,MasP,citeseq
25b1de7f18f6,3,27678,MkP,citeseq
59e175749a4c,3,27678,MkP,citeseq
cc43f415f240,3,27678,NeuP,citeseq
cf6cb48a1aca,3,27678,HSC,citeseq
7d03cdc2150c,3,27678,EryP,citeseq
ed27b16f6b29,3,27678,NeuP,citeseq
20a5293b5a5f,3,27678,NeuP,citeseq
9c110ee995b5,3,27678,HSC,citeseq
655fb0bf81df,3,27678,HSC,citeseq


In [32]:
df_meta['day']['ed27b16f6b29']

3

------------------------------------------------------------------------------

In [5]:
eval_ids = pd.read_csv('/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/evaluation_ids.csv', index_col='cell_id')

In [6]:
mx = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_multi_inputs.h5'
my = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_multi_targets.h5'
cx = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_cite_inputs.h5'
cy = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_cite_targets.h5'

In [29]:
test_my = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/test_multi_inputs.h5'
test_cy = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/test_cite_inputs.h5'

------------------------------------------------------------------------------

## Изучение структуры

In [8]:
mi_flow = h5py.File(mx, 'r')
col_names = list(mi_flow[list(mi_flow.keys())[0]])

In [9]:
col_names

['axis0', 'axis1', 'block0_items', 'block0_values']

In [12]:
mi_flow[list(mi_flow.keys())[0]]['axis0'].shape

(228942,)

In [24]:
mi_flow[list(mi_flow.keys())[0]]['axis0'][145]

b'chr10:100653097-100653634'

In [13]:
mi_flow[list(mi_flow.keys())[0]]['axis1'].shape

(105942,)

In [17]:
mi_flow[list(mi_flow.keys())[0]]['axis1'][0]

b'56390cf1b95e'

In [14]:
mi_flow[list(mi_flow.keys())[0]]['block0_items'].shape

(228942,)

In [23]:
mi_flow[list(mi_flow.keys())[0]]['block0_items'][145]

b'chr10:100653097-100653634'

In [15]:
mi_flow[list(mi_flow.keys())[0]]['block0_values'].shape

(105942, 228942)

In [19]:
mi_flow[list(mi_flow.keys())[0]]['block0_values'][0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [20]:
mi_flow[list(mi_flow.keys())[0]]['block0_values'][0].shape

(228942,)

Итак, файл состоит из **названия участка генома, уникального идентификатора клетки, и большой numpy матрицы** являющейся ATAC данными.
Строчек в numpy массиве столько же сколько уникальных id клеток, таким образом строчка в матрице является вектором фичей конкретной клетки. Мы подразумеваем что порядок совпадает и сделан без ошибок. Так же у нас есть название для каждого столбца в матрице, указывающее на позицию ATAC фичи, но он полностью совпадает со списком позиций. Информация об участке на днк нам сейчас безполезна. Однако ее тоже можно обрабатывать.

**dna_pos:** mi_flow[list(mi_flow.keys())[0]]['axis0'] (список строк в битовом виде)

**cell_id:** mi_flow[list(mi_flow.keys())[0]]['axis1'] (список строк в битовом виде)

**atac_array:** mi_flow[list(mi_flow.keys())[0]]['block0_values']

In [25]:
list(mi_flow.keys())

['train_multi_inputs']

--------------------------------------------------------

In [33]:
my_flow = h5py.File(my, 'r')
print(list(my_flow.keys()))

['train_multi_targets']


In [36]:
col_names = list(my_flow['train_multi_targets'])

In [37]:
col_names

['axis0', 'axis1', 'block0_items', 'block0_values']

In [38]:
my_flow['train_multi_targets']['block0_items'][0]

b'ENSG00000121410'

In [41]:
my_flow['train_multi_targets']['axis0'][0]

b'ENSG00000121410'

--------------------------------------------------------

--------------------------------------------------------

# Test new dataset class

In [1]:
from typing import Callable, Dict, List, Optional, Tuple, Union

import h5py
# не удалять! import hdf5plugin !
import hdf5plugin
import pandas as pd
import torch
from torch.utils.data import Dataset

In [6]:
class SCCDataset(Dataset):
    reserved_names: List[str] = ['train_multi_inputs', 'train_multi_targets', 'train_cite_inputs', 'train_cite_targets',
                                 'test_multi_inputs', 'test_cite_inputs']
    meta_names: List[str] = ['day', 'donor', 'cell_type', 'technology']
    meta_keys: List[str] = ['cell_id', 'day', 'donor', 'cell_type', 'technology']

    pos_name: str = 'position'
    index_name: str = 'cell_id'
    target_name: str = 'gene_id'

    features_name: str = "block0_values"
    cell_id_name: str = "axis1"
    col_name: str = 'axis0'

    def __init__(self,
                 meta_file: str,
                 features_file: str,
                 targets_file: Optional[str] = None,
                 transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None):
        self.transform = transform
        self.target_transform = target_transform

        self.metadata = pd.read_csv(meta_file, index_col=self.index_name)
        self.features, self.features_shape = self.get_hdf5_flow(features_file)

        if targets_file:
            self.targets, self.targets_shape = self.get_hdf5_flow(targets_file)
            assert self.targets_shape[0] == self.features_shape[0], \
                AssertionError(f"Длины файлов фичей и таргетов не совпадают; "
                               f"features_file: {self.features_shape[0]}, targets_file: {self.targets_shape[0]};")
        else:
            self.targets = None

    def get_hdf5_flow(self, file_path: str):
        file_flow = h5py.File(file_path, 'r')

        file_keys = list(file_flow.keys())
        assert len(file_keys) == 1, AssertionError(f"Incorrect file format, '{file_path}' file have more than one "
                                                   f"group: {file_keys}.")

        file_name = file_keys[0]
        assert file_name in self.reserved_names, AssertionError(f"Incorrect file format, group name must be in "
                                                                f"{self.reserved_names}, but {file_name} was found.")

        datasets_names = list(file_flow[file_name])
        assert self.features_name in datasets_names, AssertionError(f"Incorrect file format, dataset name "
                                                                    f"{self.features_name} was not found in hdf5 file "
                                                                    f"datasets list.")
        assert self.cell_id_name in datasets_names, AssertionError(f"Incorrect file format, dataset name "
                                                                   f"{self.cell_id_name} was not found in hdf5 file "
                                                                   f"datasets list.")
        assert self.col_name in datasets_names, AssertionError(f"Incorrect file format, dataset name {self.col_name} "
                                                               f"was not found in hdf5 file datasets list.")

        lines, features_shape = file_flow[file_name][self.features_name].shape

        return file_flow[file_name], (lines, features_shape)

    def __len__(self):
        return len(self.features[self.cell_id_name])

    def __getitem__(self, item: int) -> Union[Tuple[torch.Tensor, Dict[str, str]],
                                              Tuple[torch.Tensor, torch.Tensor, Dict[str, str]]]:
        cell_id = self.features[self.cell_id_name][item].decode("utf-8")
        meta_data = {key: self.metadata[key][cell_id] for key in self.meta_names}
        meta_data[self.pos_name] = self.features[self.col_name][item].decode("utf-8")
        meta_data[self.index_name] = cell_id

        x = self.features[self.features_name][item]
        if self.transform:
            x = self.transform(x)

        if self.targets is not None:
            meta_data[self.target_name] = self.targets[self.col_name][item].decode("utf-8")
            y = self.targets[self.features_name][item]
            if self.target_transform:
                y = self.target_transform(y)

            return x, y, meta_data
        else:
            return x, meta_data


In [7]:
meta = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/metadata.csv'

mx = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_multi_inputs.h5'
my = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_multi_targets.h5'
test_my = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/test_multi_inputs.h5'

cx = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_cite_inputs.h5'
cy = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/train_cite_targets.h5'
test_cy = '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/test_cite_inputs.h5'

In [8]:
dataset = SCCDataset(meta_file=meta, features_file=mx, targets_file=my)

In [9]:
dataset[0]

(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.      , 0.      , 0.      , ..., 5.583255, 0.      , 4.893861],
       dtype=float32),
 {'day': 2,
  'donor': 32606,
  'cell_type': 'NeuP',
  'technology': 'multiome',
  'position': 'GL000194.1:114519-115365',
  'cell_id': '56390cf1b95e',
  'gene_id': 'ENSG00000121410'})

In [11]:
dataset[0][1].shape

(23418,)