# Test spatial patterns for `scDRS` scores

In [1]:
%load_ext lab_black

import numpy as np
import pandas as pd
import scanpy as sc
from os.path import join
from scipy import stats
import statsmodels.api as sm
import os

In [2]:
# Directory holding the prepared `.h5ad` files (same for every dataset).
adata_dir = "../00_prepare_dataset/processed/"

# dataset name -> (h5ad file name, function that subsets the loaded AnnData
# to the cells kept for the analysis, based on that dataset's own annotation).
ca1_recipes = {
    "zeisel_2015": (
        "zeisel_2015.processed.h5ad",
        lambda a: a[a.obs["level2class"].isin(["CA1Pyr1", "CA1Pyr2"])],
    ),
    "zeisel_2018": (
        "zeisel_2018.raw.h5ad",
        lambda a: a[
            (a.obs["Tissue"] == "CA1")
            & (a.obs["Class"] == "Neurons")
            & (a.obs["Description"] == "Excitatory neurons, hippocampus CA1")
        ],
    ),
    "ayhan_2021": (
        "ayhan_2021.raw.h5ad",
        lambda a: a[a.obs["Cluster"].isin(["Pyr1", "Pyr2"])],
    ),
    "yao_2021": (
        "yao_2021.processed.h5ad",
        lambda a: a[a.obs["subclass_label"] == "CA1-ProS"],
    ),
    "zhong_2020": (
        "zhong_2020.processed.h5ad",
        lambda a: a[a.obs["leiden"].isin(["0", "11", "7"])],
    ),
    "habib_2016": (
        "habib_2016.processed.h5ad",
        lambda a: a[a.obs["CLUSTER"] == "CA1"],
    ),
    "habib_2017": (
        "habib_2017.processed.h5ad",
        lambda a: a[a.obs["Cluster"] == "3"],
    ),
}

# Sanity check: report (n_cells, n_genes) of the selected subset per dataset.
for dataset, (h5ad_name, select_cells) in ca1_recipes.items():
    adata = select_cells(sc.read_h5ad(join(adata_dir, h5ad_name)))
    print(dataset, adata.shape)

  res = method(*args, **kwargs)


zeisel_2015 (827, 14538)
zeisel_2018 (304, 15831)
ayhan_2021 (5454, 17180)
yao_2021 (1701, 15206)
zhong_2020 (5972, 17743)
habib_2016 (155, 25392)
habib_2017 (421, 32111)


In [6]:
# Datasets for which a CA1 cell-selection rule is defined below.
_CA1_DATASETS = (
    "zeisel_2015",
    "zeisel_2018",
    "ayhan_2021",
    "yao_2021",
    "zhong_2020",
    "habib_2016",
    "habib_2017",
)

# Spatial signatures whose score files are read from `score_file/<dataset>/`.
_SPATIAL_SIGNATURES = (
    "spatial_ventral",
    "spatial_dorsal",
    "spatial_distal",
    "spatial_proximal",
    "spatial_deep",
    "spatial_superficial",
)

_NORM_SCORE_SUFFIX = ".norm_score"


def _load_ca1_obs(dataset, adata_dir="../00_prepare_dataset/processed/"):
    """Load `dataset`, keep its selected CA1 cells, and return a copy of `.obs`.

    A standalone copy is returned (rather than the `.obs` of the AnnData view)
    so that columns can be added without modifying a view in place.

    Raises
    ------
    ValueError
        If `dataset` has no selection rule.
    """
    if dataset == "zeisel_2015":
        adata = sc.read_h5ad(join(adata_dir, "zeisel_2015.processed.h5ad"))
        adata = adata[adata.obs["level2class"].isin(["CA1Pyr1", "CA1Pyr2"])]
    elif dataset == "zeisel_2018":
        adata = sc.read_h5ad(join(adata_dir, "zeisel_2018.raw.h5ad"))
        adata = adata[
            (adata.obs.Tissue == "CA1")
            & (adata.obs.Class == "Neurons")
            & (adata.obs.Description == "Excitatory neurons, hippocampus CA1")
        ]
    elif dataset == "ayhan_2021":
        adata = sc.read_h5ad(join(adata_dir, "ayhan_2021.raw.h5ad"))
        adata = adata[adata.obs["Cluster"].isin(["Pyr1", "Pyr2"])]
    elif dataset == "yao_2021":
        adata = sc.read_h5ad(join(adata_dir, "yao_2021.processed.h5ad"))
        adata = adata[adata.obs.subclass_label == "CA1-ProS"]
    elif dataset == "zhong_2020":
        adata = sc.read_h5ad(join(adata_dir, "zhong_2020.processed.h5ad"))
        adata = adata[adata.obs.leiden.isin(["0", "11", "7"])]
    elif dataset == "habib_2016":
        adata = sc.read_h5ad(join(adata_dir, "habib_2016.processed.h5ad"))
        adata = adata[adata.obs.CLUSTER == "CA1"]
    elif dataset == "habib_2017":
        adata = sc.read_h5ad(join(adata_dir, "habib_2017.processed.h5ad"))
        adata = adata[adata.obs["Cluster"] == "3"]
    else:
        raise ValueError(
            f"unknown dataset {dataset!r}; expected one of {list(_CA1_DATASETS)}"
        )
    return adata.obs.copy()


def _load_spatial_scores(dataset, signatures=_SPATIAL_SIGNATURES):
    """Read the per-signature score files and return their `norm_score` columns.

    Each file `score_file/<dataset>/<signature>.score.gz` is read as a
    tab-separated table indexed by its first column.  Columns are prefixed with
    the signature name, only those ending in `.norm_score` are kept, and that
    suffix is then stripped, so a file column `norm_score` ends up named after
    its signature.
    """
    frames = []
    for signature in signatures:
        df_score = pd.read_csv(
            join("score_file", dataset, f"{signature}.score.gz"),
            sep="\t",
            index_col=0,
        )
        df_score.columns = [f"{signature}.{col}" for col in df_score.columns]
        frames.append(df_score)

    df_all = pd.concat(frames, axis=1)
    keep = [col for col in df_all.columns if col.endswith(_NORM_SCORE_SUFFIX)]
    return df_all[keep].rename(columns=lambda c: c[: -len(_NORM_SCORE_SUFFIX)])


def test_ca1_spatial(dataset, trait, n_ctrl=1000):
    """Regress a trait's scores on each spatial score and write the statistics.

    For every `spatial_*` column, `norm_score` of `trait` is regressed (OLS with
    intercept) on that column and the slope t-statistic is recorded.  The same
    regression is repeated for each of the `n_ctrl` control score columns to
    obtain a null distribution, from which a z-score and a one-sided
    (upper-tail) p-value, `(#{ctrl >= observed} + 1) / (n_ctrl + 1)`, are
    derived.

    Parameters
    ----------
    dataset : str
        One of the supported dataset names (see `_CA1_DATASETS`).
    trait : str
        Trait name; `score_file/<dataset>/<trait>.full_score.gz` must contain
        the columns `norm_score` and `ctrl_norm_score_0` ...
        `ctrl_norm_score_{n_ctrl - 1}`.
    n_ctrl : int, default 1000
        Number of control score columns to use.

    Returns
    -------
    None
        Results are written to `regression_file/<dataset>/<trait>.csv` as a
        single row indexed by `trait`, with columns `<spatial>.tstats`,
        `<spatial>.ctrl_zscore` and `<spatial>.ctrl_pval`.

    Raises
    ------
    ValueError
        If `dataset` is not supported, or if no cell has complete scores.
    """
    obs = _load_ca1_obs(dataset)

    # attach spatial scores, aligned to the selected cells
    df_spatial_score = _load_spatial_scores(dataset)
    for spatial_col in df_spatial_score.columns:
        obs[spatial_col] = df_spatial_score[spatial_col].reindex(obs.index)
    spatial_cols = [col for col in obs.columns if col.startswith("spatial_")]

    # attach trait score and its control scores
    df_trait_score = pd.read_csv(
        join("score_file", dataset, f"{trait}.full_score.gz"), sep="\t", index_col=0
    )
    ctrl_cols = [f"ctrl_norm_score_{i}" for i in range(n_ctrl)]
    for col in ["norm_score"] + ctrl_cols:
        obs[col] = df_trait_score[col].reindex(obs.index)

    # Mask cells with any missing score.  The spatial columns are included
    # explicitly: their `.norm_score` suffix was stripped on load, so matching
    # on "norm_score" alone would let NaN regressors reach OLS.
    na_cols = [col for col in obs.columns if "norm_score" in col] + spatial_cols
    cell_mask = ~obs[na_cols].isna().any(axis=1)
    if not cell_mask.any():
        raise ValueError(
            f"no cell with complete scores for dataset={dataset!r}, trait={trait!r}"
        )
    df_reg = obs.loc[cell_mask]

    dict_tstats = dict()
    dict_ctrl_tstats = {col: np.zeros(n_ctrl) for col in spatial_cols}

    for spatial_col in spatial_cols:
        # the design matrix is shared by the trait fit and all control fits
        design = sm.add_constant(df_reg[spatial_col])

        # regression results for `spatial_col`
        rls = sm.OLS(df_reg["norm_score"], design).fit()
        dict_tstats[spatial_col] = rls.tvalues[spatial_col]

        # calculate control statistics
        for i_ctrl, ctrl_col in enumerate(ctrl_cols):
            rls = sm.OLS(df_reg[ctrl_col], design).fit()
            dict_ctrl_tstats[spatial_col][i_ctrl] = rls.tvalues[spatial_col]

    dict_rls = dict()
    for spatial_col in spatial_cols:
        tstats = dict_tstats[spatial_col]
        ctrl_tstats = dict_ctrl_tstats[spatial_col]
        dict_rls[spatial_col + ".tstats"] = tstats
        dict_rls[spatial_col + ".ctrl_zscore"] = (
            tstats - np.mean(ctrl_tstats)
        ) / np.std(ctrl_tstats)
        # +1 in numerator and denominator keeps the empirical p-value above zero
        dict_rls[spatial_col + ".ctrl_pval"] = (np.sum(tstats <= ctrl_tstats) + 1) / (
            n_ctrl + 1
        )

    df_rls = pd.DataFrame(dict_rls, index=[trait])

    os.makedirs(f"regression_file/{dataset}", exist_ok=True)

    df_rls.to_csv(f"regression_file/{dataset}/{trait}.csv")

In [7]:
# Tab-separated gene-set table; its `TRAIT` column supplies the trait names.
df_gs = pd.read_csv("gs_file/human.gs", sep="\t")
trait_list = df_gs["TRAIT"].values

In [19]:
import submitit

# Executor writing its logs to `submitit_log/`; resource limits apply per job.
executor = submitit.AutoExecutor(folder="submitit_log/")
executor.update_parameters(timeout_min=15, mem_gb=40, slurm_partition="serial_requeue")

# For each dataset, map `test_ca1_spatial` over all traits.
dict_jobs = {
    dataset: executor.map_array(
        # Bind `dataset` as a default argument so every callable carries its
        # own value instead of looking up the comprehension variable later
        # (late-binding closure).
        lambda trait, dataset=dataset: test_ca1_spatial(dataset, trait),
        trait_list,
    )
    for dataset in [
        "zeisel_2015",
        "zeisel_2018",
        "ayhan_2021",
        "yao_2021",
        "zhong_2020",
        "habib_2016",
        "habib_2017",
    ]
}