In [1]:
import glob

# Collect the annotation.h5ad file of every per-sample results directory.
# Note: glob does not sort its matches, so the order is whatever the file system returns.
annotation_pattern = "/data/cephfs-1/work/groups/cubi/users/cemo10_c/scRNA/scRNA_preprocessing_pipeline/results/preprocessing/*/annotation.h5ad"
sample_paths = glob.glob(annotation_pattern)

# preview the first two matches
sample_paths[:2]

['/data/cephfs-1/work/groups/cubi/users/cemo10_c/scRNA/scRNA_preprocessing_pipeline/results/preprocessing/CE_SC_5FU_OnOff_3/annotation.h5ad',
 '/data/cephfs-1/work/groups/cubi/users/cemo10_c/scRNA/scRNA_preprocessing_pipeline/results/preprocessing/CE_SC_5FU_OnOff_2/annotation.h5ad']

In [3]:
import re

# Extract the sample name (the "CE..." component) from every path.
# A path without a match raises instead of being skipped: skipping would leave
# sample_ids shorter than sample_paths, and the lists are later paired by position,
# so every following sample would be attached to the wrong file.
sample_ids = []
for path in sample_paths:
    match = re.search("CE[a-zA-Z0-9_]*", path)
    if match is None:
        raise ValueError(f"could not find a 'CE...' sample name in path: {path}")
    sample_ids.append(match.group())

# remove the "CE_SC_" prefix and the "5FU_" tag, leaving "<treatment>_<week>"
sample_ids = [re.sub("5FU_", "", re.sub("CE_SC_", "", name)) for name in sample_ids]

# Split each sample name by "_": the first element is the treatment, the second the
# week number. The short code "C" is spelled out as "Control".
sample_treatments = []
sample_weeks = []
for name in sample_ids:
    fields = name.split("_")
    if len(fields) < 2:
        raise ValueError(f"sample name {name!r} is not of the form '<treatment>_<week>'")
    sample_treatments.append("Control" if fields[0] == "C" else fields[0])
    sample_weeks.append(fields[1])
sample_treatments

['OnOff',
 'OnOff',
 'Control',
 'Control',
 'Conti',
 'Control',
 'Conti',
 'OnOff',
 'Control',
 'Conti',
 'OnOff',
 'OnOff',
 'Conti',
 'Control',
 'Conti']

In [5]:
from collections import Counter

import anndata as ad
import scanpy as sc

# Sanity-check the per-sample lists before pairing them up:
# - zip() stops at the shortest list, so a length mismatch would silently drop samples;
# - dict() keeps only the last entry per key, so a repeated sample ID (e.g. two
#   directories whose names become identical once the prefixes are stripped) would
#   silently lose a sample from the merged object.
list_lengths = {len(sample_ids), len(sample_paths), len(sample_treatments), len(sample_weeks)}
if len(list_lengths) != 1:
    raise ValueError(
        "sample_ids, sample_paths, sample_treatments and sample_weeks must have the same length"
    )
duplicate_ids = sorted(name for name, count in Counter(sample_ids).items() if count > 1)
if duplicate_ids:
    raise ValueError(f"duplicate sample IDs: {duplicate_ids}")

# create a samples dictionary: sample_id -> (path, treatment, week)
samples = dict(zip(sample_ids, zip(sample_paths, sample_treatments, sample_weeks)))

# Load every sample and annotate all of its cells with the sample's treatment and week.
adatas = {}

for sample_id, (sample_path, treatment, week) in samples.items():
    print(sample_id)
    sample_adata = sc.read_h5ad(sample_path)
    sample_adata.obs['treatment'] = treatment
    sample_adata.obs['week'] = week
    adatas[sample_id] = sample_adata

# Concatenate all samples; the dictionary keys are recorded in obs["sample"].
# NOTE(review): ad.concat is called with its default join — confirm that keeping only
# the variables shared by all samples is intended.
adata = ad.concat(adatas, label="sample")
# observation names may repeat across samples; make them unique in the merged object
adata.obs_names_make_unique()
print(adata.obs["sample"].value_counts())
adata

OnOff_3
OnOff_2
C_5
C_4
Conti_3
C_3
Conti_5
OnOff_1
C_2
Conti_2
OnOff_4
OnOff_5
Conti_1
C_1
Conti_4
sample
OnOff_2    9837
Conti_1    9559
OnOff_5    9342
C_1        9008
Conti_3    8800
OnOff_3    8668
C_4        8047
C_3        7809
Conti_4    7591
OnOff_1    7478
C_5        7075
C_2        6787
Conti_2    6668
OnOff_4    6030
Conti_5    1296
Name: count, dtype: int64


AnnData object with n_obs × n_vars = 113995 × 11346
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'scDblFinder_score', 'scDblFinder_class', 'size_factors', 'leiden', 'leiden_res0_25', 'leiden_res0_5', 'leiden_res1', 'leiden_1', 'leiden_2', 'CMS1_score', 'CMS2_score', 'CMS3_score', 'CMS4_score', 'CMS', 'celltypist_cell_label_coarse', 'celltypist_conf_score_coarse', 'celltypist_cell_label_fine', 'celltypist_conf_score_fine', 'treatment', 'week', 'sample'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    layers: 'analytic_pearson_residuals', 'counts', 'log1p_norm', 'scran_normalization', 'soupX_counts'

In [7]:
# write the merged AnnData object to a single .h5ad file in the preprocessing results directory
merged_path = '/data/cephfs-1/work/groups/cubi/users/cemo10_c/scRNA/scRNA_preprocessing_pipeline/results/preprocessing/merged.h5ad'
adata.write(merged_path)