In [8]:
import scanpy as sc
import pandas as pd
import numpy as np
import gzip

FILE_LIST_PATH = 'filelist.txt'
MTX_PATH = 'MTX/'

In [9]:
file_list = pd.read_csv(FILE_LIST_PATH, sep='\t')
file_list.head()

Unnamed: 0,#Archive/File,Name,Time,Size,Type
0,Archive,GSE138266_RAW.tar,12/10/2019 09:03:40,264058880,TAR
1,File,GSM4104122_MS19270_CSF_GRCh38_barcodes.tsv.gz,10/01/2019 13:51:33,17265,TSV
2,File,GSM4104122_MS19270_CSF_GRCh38_genes.tsv.gz,10/01/2019 13:51:33,264802,TSV
3,File,GSM4104122_MS19270_CSF_GRCh38_matrix.mtx.gz,10/01/2019 13:51:37,15161857,MTX
4,File,GSM4104123_MS58637_CSF_GRCh38_barcodes.tsv.gz,10/01/2019 13:51:37,10813,TSV


In [10]:
barcodes_list = file_list[file_list.index % 3 == 1]['Name']
genes_list = file_list[file_list.index % 3 == 2]['Name']
matrix_list = file_list[file_list.index % 3 == 0]['Name']
matrix_list = matrix_list.drop(matrix_list.index[0])

barcodes_list[:2], genes_list[:2], matrix_list[:2]

(1    GSM4104122_MS19270_CSF_GRCh38_barcodes.tsv.gz
 4    GSM4104123_MS58637_CSF_GRCh38_barcodes.tsv.gz
 Name: Name, dtype: object,
 2    GSM4104122_MS19270_CSF_GRCh38_genes.tsv.gz
 5    GSM4104123_MS58637_CSF_GRCh38_genes.tsv.gz
 Name: Name, dtype: object,
 3    GSM4104122_MS19270_CSF_GRCh38_matrix.mtx.gz
 6    GSM4104123_MS58637_CSF_GRCh38_matrix.mtx.gz
 Name: Name, dtype: object)

In [11]:
len(barcodes_list), len(genes_list), len(matrix_list)

(22, 22, 22)

In [12]:
control = True

for barcode, gene, matrix in zip(barcodes_list, genes_list, matrix_list):
    print(barcode, gene, matrix)
    
    adata_prov = sc.read_mtx(MTX_PATH+matrix).T

    genes_df = pd.read_csv(MTX_PATH+gene, sep='\t', header=None, names=['gene_symbols', 'gene_ids'])
    adata_prov.var['gene_symbols'] = genes_df['gene_symbols'].values
    adata_prov.var['gene_ids'] = genes_df['gene_ids'].values
    
    adata_prov.obs.index = pd.read_csv(MTX_PATH+barcode, header=None)[0].values

    sample = barcode.split('_')[0]
    adata_prov.obs['sample'] = sample
    patient = barcode.split('_')[1]
    adata_prov.obs['patient'] = patient
    adata_prov.obs['dataset'] = 'GSE138266'
    if 'MS' in patient:
        adata_prov.obs['disease'] = 'MS'
    else:
        adata_prov.obs['disease'] = patient[:3]
    if 'PBMC' in matrix:
        adata_prov.obs.index = adata_prov.obs.index + '-' + patient + '-PBMC'
        adata_prov.obs['cell_type'] = 'PBMC'
    else:
        adata_prov.obs.index = adata_prov.obs.index + '-' + patient + '-CSF'
        adata_prov.obs['cell_type'] = 'CSF'

    if control:
        adata = adata_prov
        control = False
    else:
        adata = adata.concatenate(adata_prov, join='inner', index_unique=None)

GSM4104122_MS19270_CSF_GRCh38_barcodes.tsv.gz GSM4104122_MS19270_CSF_GRCh38_genes.tsv.gz GSM4104122_MS19270_CSF_GRCh38_matrix.mtx.gz
GSM4104123_MS58637_CSF_GRCh38_barcodes.tsv.gz GSM4104123_MS58637_CSF_GRCh38_genes.tsv.gz GSM4104123_MS58637_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104124_MS71658_CSF_GRCh38_barcodes.tsv.gz GSM4104124_MS71658_CSF_GRCh38_genes.tsv.gz GSM4104124_MS71658_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104125_MS49131_CSF_GRCh38_barcodes.tsv.gz GSM4104125_MS49131_CSF_GRCh38_genes.tsv.gz GSM4104125_MS49131_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104126_MS60249_CSF_GRCh38_barcodes.tsv.gz GSM4104126_MS60249_CSF_GRCh38_genes.tsv.gz GSM4104126_MS60249_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104127_MS74594_CSF_GRCh38_barcodes.tsv.gz GSM4104127_MS74594_CSF_GRCh38_genes.tsv.gz GSM4104127_MS74594_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104128_PST83775_CSF_GRCh38_barcodes.tsv.gz GSM4104128_PST83775_CSF_GRCh38_genes.tsv.gz GSM4104128_PST83775_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104129_PTC32190_CSF_GRCh38_barcodes.tsv.gz GSM4104129_PTC32190_CSF_GRCh38_genes.tsv.gz GSM4104129_PTC32190_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104130_PST95809_CSF_GRCh38_barcodes.tsv.gz GSM4104130_PST95809_CSF_GRCh38_genes.tsv.gz GSM4104130_PST95809_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104131_PTC41540_CSF_GRCh38_barcodes.tsv.gz GSM4104131_PTC41540_CSF_GRCh38_genes.tsv.gz GSM4104131_PTC41540_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104132_PST45044_CSF_GRCh38_barcodes.tsv.gz GSM4104132_PST45044_CSF_GRCh38_genes.tsv.gz GSM4104132_PST45044_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104133_PTC85037_CSF_GRCh38_barcodes.tsv.gz GSM4104133_PTC85037_CSF_GRCh38_genes.tsv.gz GSM4104133_PTC85037_CSF_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104134_MS19270_PBMCs_GRCh38_barcodes.tsv.gz GSM4104134_MS19270_PBMCs_GRCh38_genes.tsv.gz GSM4104134_MS19270_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104135_MS71658_PBMCs_GRCh38_barcodes.tsv.gz GSM4104135_MS71658_PBMCs_GRCh38_genes.tsv.gz GSM4104135_MS71658_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104136_MS49131_PBMCs_GRCh38_barcodes.tsv.gz GSM4104136_MS49131_PBMCs_GRCh38_genes.tsv.gz GSM4104136_MS49131_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104137_MS60249_PBMCs_GRCh38_barcodes.tsv.gz GSM4104137_MS60249_PBMCs_GRCh38_genes.tsv.gz GSM4104137_MS60249_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104138_MS74594_PBMCs_GRCh38_barcodes.tsv.gz GSM4104138_MS74594_PBMCs_GRCh38_genes.tsv.gz GSM4104138_MS74594_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104139_PST83775_PBMCs_GRCh38_barcodes.tsv.gz GSM4104139_PST83775_PBMCs_GRCh38_genes.tsv.gz GSM4104139_PST83775_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104140_PTC32190_PBMCs_GRCh38_barcodes.tsv.gz GSM4104140_PTC32190_PBMCs_GRCh38_genes.tsv.gz GSM4104140_PTC32190_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104141_PST95809_PBMCs_GRCh38_barcodes.tsv.gz GSM4104141_PST95809_PBMCs_GRCh38_genes.tsv.gz GSM4104141_PST95809_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104142_PTC41540_PBMCs_GRCh38_barcodes.tsv.gz GSM4104142_PTC41540_PBMCs_GRCh38_genes.tsv.gz GSM4104142_PTC41540_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


GSM4104143_PTC85037_PBMCs_GRCh38_barcodes.tsv.gz GSM4104143_PTC85037_PBMCs_GRCh38_genes.tsv.gz GSM4104143_PTC85037_PBMCs_GRCh38_matrix.mtx.gz


  adata = adata.concatenate(adata_prov, join='inner', index_unique=None)


In [16]:
adata

AnnData object with n_obs × n_vars = 814177 × 33694
    obs: 'sample', 'patient', 'dataset', 'disease', 'cell_type', 'batch'
    var: 'gene_symbols', 'gene_ids'

In [14]:
adata.write('GSE138266.h5ad')