In [1]:
import itertools

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Read the complete text of matrix_list.txt into one string; it is split into
# lines in the next step.
with open('matrix_list.txt') as listing:
    content = listing.read()

In [3]:
# Break the listing into lines, skipping blank ones. A plain split("\n") keeps a
# final '' entry whenever the file ends with a newline, which would add a bogus
# one-element group at the end of grouped_list.
content_list = [line for line in content.splitlines() if line.strip()]

# Chunk consecutive lines into groups of three.
# NOTE(review): presumably one barcodes/features/matrix triple per sample --
# confirm against the layout of matrix_list.txt.
grouped_list = [content_list[i:i+3] for i in range(0, len(content_list), 3)]

In [4]:
# From every second group take the first entry and keep its second
# "_"-separated field.
# NOTE(review): this presumably yields one sample code per donor, assuming the
# listing alternates two groups per donor -- confirm against matrix_list.txt.
samples_prefixes = [
    first_entry.split("_")[1]
    for first_entry in (group[0] for group in grouped_list[::2])
]

In [5]:
# Build the file-name prefixes: all CSF prefixes first, then all PBMC prefixes.
#
# The sample codes are derived from grouped_list into their own name instead of
# being read back from samples_prefixes. Rebinding samples_prefixes from itself
# made this cell non-idempotent: running it a second time produced prefixes
# such as 'XXX_CSF__CSF_'.
sample_ids = [group[0].split("_")[1] for group in grouped_list[::2]]
samples_prefixes = (
    [f'{sample_id}_CSF_' for sample_id in sample_ids]
    + [f'{sample_id}_PBMC_' for sample_id in sample_ids]
)

In [6]:
# Load one AnnData object per prefix from the shared MTX directory and store it
# under that prefix. scanpy prepends `prefix` to the standard 10x file names,
# so the prefix alone selects a sample's files.
#
# The loop iterates over the prefixes directly: the previous zip() with
# grouped_list only unpacked three names that were never used, and that
# unpacking raised ValueError for any group that was not exactly three long.
adatas = {}
for prefix in samples_prefixes:
    adatas[prefix] = sc.read_10x_mtx(
        '../GSE194078/MTX',
        var_names='gene_symbols',
        gex_only=False,  # keep every feature type, not only gene expression
        cache=True,      # reuse scanpy's on-disk cache on re-runs
        prefix=prefix)


In [7]:
# Display the loaded AnnData objects, keyed by file-name prefix.
adatas

{'CYG_CSF_': AnnData object with n_obs × n_vars = 2641 × 36601
     var: 'gene_ids', 'feature_types',
 'JSB_CSF_': AnnData object with n_obs × n_vars = 1917 × 36601
     var: 'gene_ids', 'feature_types',
 'JYJ_CSF_': AnnData object with n_obs × n_vars = 3827 × 36601
     var: 'gene_ids', 'feature_types',
 'KHW_CSF_': AnnData object with n_obs × n_vars = 11138 × 36601
     var: 'gene_ids', 'feature_types',
 'KJS_CSF_': AnnData object with n_obs × n_vars = 10394 × 36601
     var: 'gene_ids', 'feature_types',
 'KSH_CSF_': AnnData object with n_obs × n_vars = 11289 × 36601
     var: 'gene_ids', 'feature_types',
 'KYO_CSF_': AnnData object with n_obs × n_vars = 395 × 36601
     var: 'gene_ids', 'feature_types',
 'SYG_CSF_': AnnData object with n_obs × n_vars = 1849 × 36601
     var: 'gene_ids', 'feature_types',
 'YYJ_CSF_': AnnData object with n_obs × n_vars = 1996 × 36601
     var: 'gene_ids', 'feature_types',
 'YYS_CSF_': AnnData object with n_obs × n_vars = 329 × 36601
     var: 'gene_id

In [13]:
# Split the loaded samples by tissue using the key suffix instead of position.
# The previous positional split (first 10 items, then items from index 11 on)
# left the entry at position 10 in neither dictionary and hard-coded the
# number of samples.
adatas_csf = {key: adata for key, adata in adatas.items() if key.endswith('_CSF_')}

adatas_pmcb = {key: adata for key, adata in adatas.items() if key.endswith('_PBMC_')}

# Every loaded sample must end up in exactly one of the two groups.
assert len(adatas_csf) + len(adatas_pmcb) == len(adatas), "sample with unexpected key suffix"

In [10]:
# Display the CSF subset to check which samples it contains.
adatas_csf

{'CYG_CSF_': AnnData object with n_obs × n_vars = 2641 × 36601
     var: 'gene_ids', 'feature_types',
 'JSB_CSF_': AnnData object with n_obs × n_vars = 1917 × 36601
     var: 'gene_ids', 'feature_types',
 'JYJ_CSF_': AnnData object with n_obs × n_vars = 3827 × 36601
     var: 'gene_ids', 'feature_types',
 'KHW_CSF_': AnnData object with n_obs × n_vars = 11138 × 36601
     var: 'gene_ids', 'feature_types',
 'KJS_CSF_': AnnData object with n_obs × n_vars = 10394 × 36601
     var: 'gene_ids', 'feature_types',
 'KSH_CSF_': AnnData object with n_obs × n_vars = 11289 × 36601
     var: 'gene_ids', 'feature_types',
 'KYO_CSF_': AnnData object with n_obs × n_vars = 395 × 36601
     var: 'gene_ids', 'feature_types',
 'SYG_CSF_': AnnData object with n_obs × n_vars = 1849 × 36601
     var: 'gene_ids', 'feature_types',
 'YYJ_CSF_': AnnData object with n_obs × n_vars = 1996 × 36601
     var: 'gene_ids', 'feature_types',
 'YYS_CSF_': AnnData object with n_obs × n_vars = 329 × 36601
     var: 'gene_id

In [14]:
# Display the PBMC subset to check which samples it contains.
# NOTE(review): the variable name reads "pmcb" while the keys say "PBMC" --
# looks like a typo; it is kept because other cells refer to this name.
adatas_pmcb

{'CYG_PBMC_': AnnData object with n_obs × n_vars = 8528 × 36601
     var: 'gene_ids', 'feature_types',
 'JSB_PBMC_': AnnData object with n_obs × n_vars = 9621 × 36601
     var: 'gene_ids', 'feature_types',
 'JYJ_PBMC_': AnnData object with n_obs × n_vars = 9748 × 36601
     var: 'gene_ids', 'feature_types',
 'KHW_PBMC_': AnnData object with n_obs × n_vars = 16133 × 36601
     var: 'gene_ids', 'feature_types',
 'KJS_PBMC_': AnnData object with n_obs × n_vars = 9178 × 36601
     var: 'gene_ids', 'feature_types',
 'KSH_PBMC_': AnnData object with n_obs × n_vars = 10319 × 36601
     var: 'gene_ids', 'feature_types',
 'KYO_PBMC_': AnnData object with n_obs × n_vars = 8963 × 36601
     var: 'gene_ids', 'feature_types',
 'SYG_PBMC_': AnnData object with n_obs × n_vars = 11823 × 36601
     var: 'gene_ids', 'feature_types',
 'YYJ_PBMC_': AnnData object with n_obs × n_vars = 7640 × 36601
     var: 'gene_ids', 'feature_types',
 'YYS_PBMC_': AnnData object with n_obs × n_vars = 10079 × 36601
     

In [22]:
# Peek at the per-observation table (.obs) of a single CSF sample.
adatas_csf['YYJ_CSF_'].obs

AAACCCACAGAGATGC-1
AAACGAACAACCCTCT-1
AAACGAAGTGCCTGAC-1
AAACGAATCAGAATAG-1
AAACGCTAGTTGCTCA-1
...
TTTGACTGTTCTCTAT-1
TTTGACTTCCGTAGTA-1
TTTGGAGCAAATGATG-1
TTTGGTTAGTAGTGCG-1
TTTGTTGGTATTCCTT-1


In [26]:
# Merge all CSF samples into one AnnData. Passing the mapping itself makes
# ad.concat use its keys as batch keys and its values as the objects to join.
# index_unique='-' appends '-<key>' to every observation name so they stay
# unique across samples; join='outer' keeps the union of variables.
adata_csf = ad.concat(
    adatas_csf,
    index_unique='-',
    join='outer',
)

In [27]:
# Merge all PBMC samples into one AnnData. Passing the mapping itself makes
# ad.concat use its keys as batch keys and its values as the objects to join.
# index_unique='-' appends '-<key>' to every observation name so they stay
# unique across samples; join='outer' keeps the union of variables.
adata_pbmc = ad.concat(
    adatas_pmcb,
    index_unique='-',
    join='outer',
)