In [1]:
import os
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple, Union

import h5py
# Do not remove: hdf5plugin must be imported even though the name is never used directly.
import hdf5plugin
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

In [2]:
class FSCCDataset(Dataset):
    """Map-style dataset over ``{mode}_{task}_{inputs|targets}.h5`` files plus ``metadata.csv``.

    Every item is a dict holding the cell id, the metadata labels of that cell
    (raw, index-encoded or one-hot encoded depending on ``meta_transform``),
    the ``'inputs'`` feature row and, when a targets file is open, the
    ``'targets'`` row.

    Args:
        dataset_path: folder containing the ``.h5`` files and ``metadata.csv``.
        task: key of ``dataflows`` (``'cite'`` or ``'multi'``).
        mode: ``'train'`` (inputs and targets are opened) or ``'test'`` (inputs only).
        meta_transform: ``None``, ``'index'`` or ``'one_hot'``.
        transform: optional callable applied to every inputs row.
        target_transform: optional callable applied to every targets row.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
        AssertionError: if an HDF5 file does not have the expected layout.
    """
    file_types = ['inputs', 'targets']
    h5_reserved_names: List[str] = ['train_multi_inputs', 'train_multi_targets', 'train_cite_inputs',
                                    'train_cite_targets', 'test_multi_inputs', 'test_cite_inputs']

    # Template of the supported task -> mode -> file-type slots. __init__ gives
    # every instance its own copy so that instances never share open handles.
    dataflows = {'cite': {'train': {'inputs': None, 'targets': None},
                          'test': {'inputs': None}},
                 'multi': {'train': {'inputs': None, 'targets': None},
                           'test': {'inputs': None}}}

    metadata = None
    # Class-level default only; each instance stores its own dict.
    meta_unique_vals: Dict = {}
    metadata_file: str = 'metadata.csv'
    # Metadata columns that 'index' / 'one_hot' encoding is applied to.
    meta_transform_names: List[str] = ['day', 'donor', 'cell_type']
    # Metadata columns returned raw when no meta_transform is set.
    meta_names: List[str] = ['day', 'donor', 'cell_type', 'technology']
    meta_keys: List[str] = ['cell_id', 'day', 'donor', 'cell_type', 'technology']

    # Names of the datasets expected inside each HDF5 group, and of the item keys.
    col_name: str = 'axis0'
    pos_name: str = 'position'
    index_name: str = 'cell_id'
    cell_id_name: str = "axis1"
    target_name: str = 'gene_id'
    features_name: str = "block0_values"

    def __init__(self,
                 dataset_path: Union[str, Path],
                 task: str, mode: str,
                 meta_transform: Optional[str] = None,
                 transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None):
        self.task = task
        self.mode = mode
        self.data_ids = None
        self.data_shapes = None
        self.dataset_path = dataset_path

        self.transform = transform
        self.target_transform = target_transform
        self.meta_transform = meta_transform

        # Per-instance state: previously these dicts lived on the class and were
        # mutated by every instance.
        self.dataflows = {task_name: {mode_name: dict.fromkeys(slots)
                                      for mode_name, slots in modes.items()}
                          for task_name, modes in type(self).dataflows.items()}
        self.meta_unique_vals = {}
        # cell_id -> row number inside the HDF5 datasets of the current inputs file.
        self._row_index = {}

        # init dataset
        self._read_task_dataset(dataset_path)

    def _read_metadata(self, path: str) -> pd.DataFrame:
        """Load the metadata CSV indexed by cell id and record the unique values per column."""
        df = pd.read_csv(path, index_col=self.index_name)
        self.meta_unique_vals = {key: list(df[key].unique()) for key in self.meta_names}
        return df

    def _transform_metalabels(self, meta_dict: Dict, cell_id: str) -> Dict:
        """Add the metadata labels of ``cell_id`` to ``meta_dict`` and return it.

        Without ``meta_transform`` the raw values of ``meta_names`` are added.
        With ``'index'`` / ``'one_hot'`` the columns in ``meta_transform_names``
        are encoded by their position in ``meta_unique_vals``.

        Raises:
            ValueError: if ``meta_transform`` is not ``None``, ``'index'`` or ``'one_hot'``.
        """
        if not self.meta_transform:
            # update() instead of rebinding, so the cell id already stored in
            # meta_dict is kept (it used to be dropped in this branch only).
            meta_dict.update({key: self.metadata[key][cell_id] for key in self.meta_names})
            return meta_dict

        if self.meta_transform not in ('index', 'one_hot'):
            raise ValueError(f"The argument 'meta_transform' can only take values from a list "
                             f"['index', 'one_hot', None], but '{self.meta_transform}' was found.")

        for key in self.meta_transform_names:
            unique_vals = self.meta_unique_vals[key]
            position = unique_vals.index(self.metadata[key][cell_id])
            if self.meta_transform == 'index':
                meta_dict[key] = position
            else:
                one_hot_vector = np.zeros((len(unique_vals),))
                one_hot_vector[position] = 1
                meta_dict[key] = one_hot_vector

        return meta_dict

    @staticmethod
    def _close_flow(flow) -> None:
        """Close the file behind an open flow; no-op for ``None`` or an already closed flow."""
        if not flow:
            return
        file_handle = getattr(flow, 'file', None)
        if file_handle is not None:
            file_handle.close()

    def close(self) -> None:
        """Close every HDF5 file this instance has opened and clear the slots."""
        for modes in self.dataflows.values():
            for slots in modes.values():
                for file_type, flow in slots.items():
                    self._close_flow(flow)
                    slots[file_type] = None

    def _get_task_flow(self, folder_path: Path, mode: str, task: str, file_type: str) -> None:
        """Open ``{mode}_{task}_{file_type}.h5`` and store its group and shape on the instance."""
        file_name = '_'.join([mode, task, file_type])
        print(f"[ Reading {file_name}.h5 file ... ]")
        f_path = str(Path(folder_path).joinpath(f"{file_name}.h5").absolute())
        flow, feature_shape = self.get_hdf5_flow(f_path)
        # Release the handle that is about to be replaced (e.g. on rebase()).
        self._close_flow(self.dataflows[task][mode].get(file_type))
        # write data in structure
        self.dataflows[task][mode][file_type] = flow
        self.data_shapes[task][mode][file_type] = feature_shape
        print(f"[ Reading {file_name}.h5 file is complete. ]")

    def _read_task_dataset(self, folder_path: Union[str, Path]) -> None:
        """Read the metadata and open the HDF5 files of the current task/mode.

        Raises:
            ValueError: if ``self.mode`` is neither ``'train'`` nor ``'test'``.
        """
        if self.mode not in ('train', 'test'):
            raise ValueError(f"Argument 'mode' can only take values from a list: ['train', 'test'], "
                             f"but {self.mode} was found.")

        self.data_shapes = {self.task: {self.mode: {s: None for s in self.file_types}}}
        folder_path = Path(folder_path)

        # read metadata file
        self.metadata = self._read_metadata(str(folder_path.joinpath(self.metadata_file)))

        # read all h5 files: inputs and targets for 'train', inputs only for 'test'
        wanted_types = self.file_types if self.mode == 'train' else self.file_types[:1]
        for file_type in wanted_types:
            self._get_task_flow(folder_path, self.mode, self.task, file_type)

        self.data_ids = self._set_data_ids()

    def _set_data_ids(self):
        """Return all cell ids of the current inputs file, in file order.

        Also rebuilds the cell-id -> row lookup used by ``__getitem__``.
        """
        feature_flow = self.dataflows[self.task][self.mode]['inputs']
        # [:] fetches the ids in one read instead of one read per element.
        ids = [raw.decode("utf-8") for raw in feature_flow[self.cell_id_name][:]]
        self._row_index = {cell_id: row for row, cell_id in enumerate(ids)}
        return ids

    def __len__(self):
        return len(self.data_ids)

    def __getitem__(self, item: int) -> Dict:
        """Return the item dict for the ``item``-th entry of ``data_ids``."""
        cell_id = self.data_ids[item]
        if not getattr(self, '_row_index', None):
            self._set_data_ids()
        # Look the row up by cell id: after reindex_dataset() the position in
        # data_ids no longer equals the row number inside the HDF5 datasets.
        row = self._row_index[cell_id]

        flows = self.dataflows[self.task][self.mode]
        meta_data = self._transform_metalabels({self.index_name: cell_id}, cell_id)

        x = flows['inputs'][self.features_name][row]
        if self.transform:
            x = self.transform(x)
        meta_data[self.file_types[0]] = x

        targets = flows.get('targets')
        if targets is not None:
            # NOTE(review): assumes target rows are stored in the same order as
            # input rows (the original code relied on this too) - confirm.
            y = targets[self.features_name][row]
            if self.target_transform:
                y = self.target_transform(y)
            meta_data[self.file_types[1]] = y

        return meta_data

    def get_hdf5_flow(self, file_path: str):
        """Open an HDF5 file read-only, validate its layout and return ``(group, shape)``.

        The file must contain exactly one group, named as one of
        ``h5_reserved_names``, holding the ``features_name``, ``cell_id_name``
        and ``col_name`` datasets. ``shape`` is the 2-tuple shape of the
        features dataset.

        Raises:
            AssertionError: if the layout check fails (the file is closed first).
        """
        file_flow = h5py.File(file_path, 'r')
        try:
            # Explicit raises instead of `assert`, so validation survives `python -O`.
            file_keys = list(file_flow.keys())
            if len(file_keys) != 1:
                raise AssertionError(f"Incorrect file format, '{file_path}' file have more than one "
                                     f"group: {file_keys}.")

            file_name = file_keys[0]
            if file_name not in self.h5_reserved_names:
                raise AssertionError(f"Incorrect file format, group name must be in {self.h5_reserved_names}, "
                                     f"but {file_name} was found.")

            group = file_flow[file_name]
            datasets_names = list(group)
            for required in (self.features_name, self.cell_id_name, self.col_name):
                if required not in datasets_names:
                    raise AssertionError(f"Incorrect file format, dataset name {required} "
                                         f"was not found in hdf5 file datasets list.")

            lines, features_shape = group[self.features_name].shape
        except Exception:
            # Do not leak the handle when the file turns out to be unusable.
            file_flow.close()
            raise

        return group, (lines, features_shape)

    def reindex_dataset(self,
                        day: Optional[Union[int, List[int]]] = None,
                        donor: Optional[Union[int, List[int]]] = None,
                        cell_type: Optional[Union[str, List[str]]] = None) -> None:
        """Restrict ``data_ids`` to cells whose metadata matches every given filter.

        Each filter is a single value (equality) or a list/tuple/set of accepted
        values. Filtering always starts from all cells of the inputs file, and
        the result keeps the file order. With no filter nothing changes.
        """
        final_cond = None
        for column, wanted in (('day', day), ('donor', donor), ('cell_type', cell_type)):
            if wanted is None:
                continue
            if isinstance(wanted, (list, tuple, set)):
                cond = self.metadata[column].isin(list(wanted))
            else:
                # Covers a single str cell type, which the old int-only check skipped.
                cond = self.metadata[column] == wanted
            final_cond = cond if final_cond is None else (final_cond & cond)

        if final_cond is None:
            return

        selected = set(self.metadata[final_cond].index)
        # Filter the ordered id list rather than intersecting sets, so the
        # resulting order is deterministic.
        self.data_ids = [cell_id for cell_id in self._set_data_ids() if cell_id in selected]

    def rebase(self, task: Optional[str] = None, mode: Optional[str] = None):
        """Switch to another task and/or mode and reload the corresponding files."""
        if task is not None:
            self.task = task
        if mode is not None:
            self.mode = mode

        # _read_task_dataset() already refreshes data_ids.
        self._read_task_dataset(self.dataset_path)

    def set_length(self, length: int) -> None:
        """Keep only the first ``length`` entries of ``data_ids``."""
        self.data_ids = self.data_ids[:length]


In [3]:
# Dataset location. Set the FSCC_DATASET_DIR environment variable to run the
# notebook on a machine where the default path below does not exist.
dataset_folder = os.environ.get(
    'FSCC_DATASET_DIR',
    '/home/mks/PycharmProjects/multimodal_single_cell_integration/dataset/')

# Считаем количество ненулевых элементов в ATAC фичах

In [4]:
# Open the 'multi' task datasets: train split first, then test split.
mtrain, mtest = (FSCCDataset(dataset_folder, 'multi', split) for split in ('train', 'test'))

[ Reading train_multi_inputs.h5 file ... ]
[ Reading train_multi_inputs.h5 file is complete. ]
[ Reading train_multi_targets.h5 file ... ]
[ Reading train_multi_targets.h5 file is complete. ]
[ Reading test_multi_inputs.h5 file ... ]
[ Reading test_multi_inputs.h5 file is complete. ]


In [7]:
# Show the unique metadata values recorded by each 'multi' dataset object.
for idx, dataset in enumerate((mtrain, mtest)):
    print(f'Dataset {idx}: {dataset.meta_unique_vals}', end='\n\n')

Dataset 0: {'day': [3, 4, 7, 2, 10], 'donor': [27678, 32606, 13176, 31800], 'cell_type': ['MasP', 'MkP', 'NeuP', 'HSC', 'EryP', 'MoP', 'BP', 'hidden'], 'technology': ['citeseq', 'multiome']}

Dataset 1: {'day': [3, 4, 7, 2, 10], 'donor': [27678, 32606, 13176, 31800], 'cell_type': ['MasP', 'MkP', 'NeuP', 'HSC', 'EryP', 'MoP', 'BP', 'hidden'], 'technology': ['citeseq', 'multiome']}



In [8]:
# Show the (rows, features) shapes of the HDF5 files opened by each 'multi' dataset.
for idx, dataset in enumerate((mtrain, mtest)):
    print(f'Dataset {idx}: {dataset.data_shapes}', end='\n\n')

Dataset 0: {'multi': {'train': {'inputs': (105942, 228942), 'targets': (105942, 23418)}}}

Dataset 1: {'multi': {'test': {'inputs': (55935, 228942), 'targets': None}}}



In [10]:
# Count the non-zero entries of every input row of the 'multi' train and test
# datasets, then summarise the distribution of those counts.
non_zero_array = []
for dataset in (mtrain, mtest):
    for ind in tqdm(range(len(dataset))):
        non_zero_array.append(np.count_nonzero(dataset[ind]['inputs']))

# The defaults reproduce the former -inf / inf sentinels when nothing was counted.
max_non_zero = max(non_zero_array, default=-np.inf)
min_non_zero = min(non_zero_array, default=np.inf)

print(f"Максимум: {max_non_zero}")
print(f"Минимум: {min_non_zero}")
print(f"Среднее: {np.mean(non_zero_array)}")
print(f"Стандартное отклонение: {np.std(non_zero_array)}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105942/105942 [03:26<00:00, 512.85it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55935/55935 [01:08<00:00, 822.21it/s]

Максимум: 35837
Минимум: 746
Среднее: 5935.829401335582
Стандартное отклонение: 4122.413955322037





In [13]:
# Largest non-zero count printed above (35837) as a fraction of the 228942 'multi' input features.
35837 / 228942

0.15653309571856627

In [14]:
# Smallest non-zero count printed above (746) as a fraction of the 228942 'multi' input features.
746 / 228942

0.0032584672100357294

In [23]:
# Largest non-zero count times the number of binary digits of 228942 (the 'multi' feature count).
# NOTE(review): presumably a bit budget for storing the column indices of the densest row - confirm intent.
35837 * len(np.base_repr(228942, base=2))

645066

# Считаем количество ненулевых элементов в GeneExp фичах

In [15]:
# Open the 'cite' task datasets: train split first, then test split.
ctrain, ctest = (FSCCDataset(dataset_folder, 'cite', split) for split in ('train', 'test'))

[ Reading train_cite_inputs.h5 file ... ]
[ Reading train_cite_inputs.h5 file is complete. ]
[ Reading train_cite_targets.h5 file ... ]
[ Reading train_cite_targets.h5 file is complete. ]
[ Reading test_cite_inputs.h5 file ... ]
[ Reading test_cite_inputs.h5 file is complete. ]


In [16]:
# Show the unique metadata values recorded by each 'cite' dataset object.
for idx, dataset in enumerate((ctrain, ctest)):
    print(f'Dataset {idx}: {dataset.meta_unique_vals}', end='\n\n')

Dataset 0: {'day': [3, 4, 7, 2, 10], 'donor': [27678, 32606, 13176, 31800], 'cell_type': ['MasP', 'MkP', 'NeuP', 'HSC', 'EryP', 'MoP', 'BP', 'hidden'], 'technology': ['citeseq', 'multiome']}

Dataset 1: {'day': [3, 4, 7, 2, 10], 'donor': [27678, 32606, 13176, 31800], 'cell_type': ['MasP', 'MkP', 'NeuP', 'HSC', 'EryP', 'MoP', 'BP', 'hidden'], 'technology': ['citeseq', 'multiome']}



In [17]:
# Show the (rows, features) shapes of the HDF5 files opened by each 'cite' dataset.
for idx, dataset in enumerate((ctrain, ctest)):
    print(f'Dataset {idx}: {dataset.data_shapes}', end='\n\n')

Dataset 0: {'cite': {'train': {'inputs': (70988, 22050), 'targets': (70988, 140)}}}

Dataset 1: {'cite': {'test': {'inputs': (48203, 22050), 'targets': None}}}



In [18]:
# Count the non-zero entries of every input row of the 'cite' train and test
# datasets, then summarise the distribution of those counts.
non_zero_array = []
for dataset in (ctrain, ctest):
    for ind in tqdm(range(len(dataset))):
        non_zero_array.append(np.count_nonzero(dataset[ind]['inputs']))

# The defaults reproduce the former -inf / inf sentinels when nothing was counted.
max_non_zero = max(non_zero_array, default=-np.inf)
min_non_zero = min(non_zero_array, default=np.inf)

print(f"Максимум: {max_non_zero}")
print(f"Минимум: {min_non_zero}")
print(f"Среднее: {np.mean(non_zero_array)}")
print(f"Стандартное отклонение: {np.std(non_zero_array)}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70988/70988 [01:16<00:00, 928.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48203/48203 [01:20<00:00, 596.55it/s]

Максимум: 8581
Минимум: 1682
Среднее: 4835.0436610146735
Стандартное отклонение: 1049.0076257383432





In [19]:
# Largest non-zero count printed above (8581) as a fraction of the 22050 'cite' input features.
8581 / 22050

0.3891609977324263

In [20]:
# Smallest non-zero count printed above (1682) as a fraction of the 22050 'cite' input features.
1682 / 22050

0.076281179138322

In [21]:
# Standard deviation divided by mean of the 'cite' non-zero counts, both truncated to integers from the output above.
1049 / 4835

0.21695966907962771

In [22]:
# Largest non-zero count times the number of binary digits of 22050 (the 'cite' feature count).
# NOTE(review): presumably a bit budget for storing the column indices of the densest row - confirm intent.
8581 * len(np.base_repr(22050, base=2))

128715

----------------------------------------------

In [24]:
# Inspect the 'inputs' vector of the item at position 10 of the 'multi' training dataset.
mtrain[10]['inputs']

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)