In [1]:
import os
import platform
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
from src.functions import get_matrices_from_dfs, normalize_proportion_ratios
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import statsmodels.formula.api as smf

In [None]:
# Root directory holding one entry per sample. On macOS ("Darwin") the data is
# reached under /Volumes, on every other platform under /home.
target_path = (
    '/Volumes/mklein/FDA_project/data/Lx_Glioblastoma'
    if platform.system() == "Darwin"
    else '/home/mklein/FDA_project/data/Lx_Glioblastoma'
)


In [None]:
# Every sub-directory of target_path is treated as one sample.
# Only directories are kept: plain files such as ".DS_Store" or the combined
# "*batch_sm_matrix.h5ad" files that the final cell writes into target_path are
# not samples, and would make the per-sample reads below fail when the notebook
# is re-run from a fresh kernel.
# Sorting makes the sample order (and therefore the concatenation order further
# down) reproducible; os.listdir returns entries in arbitrary order.
samples = sorted(
    s for s in os.listdir(target_path)
    if s != ".DS_Store" and os.path.isdir(os.path.join(target_path, s))
)

# Name of the obs column that is tallied after concatenation; it is one of the
# 'dataset_<i>' columns created by split_dataset_info.
condition_name = 'dataset_3'

# File names expected inside each sample directory (identical for all samples,
# so defined once instead of on every loop iteration).
files = {
    'cell_sm_matrix': 'cells_spatiomolecular_adata.h5ad',
    'corr_cell_sm_matrix': 'cells_spatiomolecular_adata_corrected.h5ad',
}

# sample name -> AnnData read from the uncorrected / corrected file.
adata_dict = {}
adata_cor_dict = {}

for sample in samples:
    sample_path = os.path.join(target_path, sample)
    project_files = {k: os.path.join(sample_path, v) for k, v in files.items()}

    adata_dict[sample] = sc.read(project_files['cell_sm_matrix'])
    adata_cor_dict[sample] = sc.read(project_files['corr_cell_sm_matrix'])



In [16]:
def split_dataset_info(adata):
    """Expand the "_"-separated ``obs['dataset']`` string into separate columns.

    The values of ``adata.obs['dataset']`` are split on "_" and every resulting
    piece is stored in a new ``adata.obs`` column named ``dataset_<i>``, where
    ``<i>`` is the column label produced by the split (0, 1, 2, ...).

    ``adata.obs`` is modified in place and nothing is returned.
    """
    parts = adata.obs['dataset'].str.split("_", expand=True)
    for position in parts.columns:
        adata.obs[f'dataset_{position}'] = parts[position]

In [17]:
def concat_batches(adata_dict):
    """Concatenate per-sample AnnData objects into one annotated AnnData.

    Parameters
    ----------
    adata_dict
        Mapping from sample name to the AnnData object of that sample.

    Returns
    -------
    The result of ``ad.concat`` over all samples, after ``split_dataset_info``
    has added the ``dataset_<i>`` columns to its ``obs``.
    """
    # label='well' records the originating dict key in obs['well'];
    # index_unique="_" appends that key to each observation name;
    # merge="same" keeps only annotations that agree across all samples.
    # NOTE: no batch integration is run here (an earlier draft had PCA + BBKNN
    # on 'well' commented out).
    combined = ad.concat(
        adata_dict,
        label='well',
        index_unique="_",
        merge="same",
    )
    split_dataset_info(combined)
    return combined

# Combine the uncorrected and the corrected per-sample objects, in that order,
# into one AnnData each.
adata, adata_cor = map(concat_batches, (adata_dict, adata_cor_dict))


In [18]:
# Number of observations per value of the condition column.
# NOTE(review): the recorded output lists 'TMDCD96KO' next to 'TMDCD95KO' and a
# label 'TMDd\sM' next to 'TMDdM' / 'TMDsM' - possibly mislabelled samples;
# confirm against the sample metadata.
adata.obs.loc[:, condition_name].value_counts()

TMDdM        3499
NaiveWT      3149
TMDCD95WT    2479
TMDtM        1905
TMDsM        1676
TMDCD95KO    1208
TMDd\sM       410
TMDCD96KO     405
Name: dataset_3, dtype: int64

In [19]:
# Persist both combined objects as .h5ad files. They are written directly into
# target_path, i.e. next to the per-sample entries that the loading cell
# enumerates with os.listdir.
for out_name, combined in (
    ("batch_sm_matrix.h5ad", adata),
    ("corrected_batch_sm_matrix.h5ad", adata_cor),
):
    combined.write(os.path.join(target_path, out_name))