In [2]:
%load_ext lab_black
import os, sys

%load_ext autoreload
%autoreload 2
import pandas as pd
from os.path import join
import scanpy as sc
import numpy as np
from statsmodels.stats.multitest import multipletests

import matplotlib.pyplot as plt

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Root directory of the scDRS data files read by the cells below.
# Set the SCDRS_DATA_PATH environment variable to run the notebook on a machine
# other than the original cluster; the original location stays the default.
DATA_PATH = os.environ.get(
    "SCDRS_DATA_PATH", "/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data"
)

In [8]:
# Homolog table: tab-separated file whose first two columns are paired gene symbols.
df_hom = pd.read_csv(
    join(DATA_PATH, "gene_annotation/", "mouse_human_homologs.txt"),
    sep="\t",
)
# Map second column -> first column. Columns are selected by position with
# .iloc rather than `row[1]` / `row[0]`, because integer keys on a
# string-labelled Series are deprecated positional access in recent pandas.
# When a key occurs on several rows the last row wins, as in a dict comprehension.
# NOTE(review): presumably human symbol -> mouse symbol (it is applied to the
# MAGMA gene sets below) -- confirm the column order against the file header.
dict_hom = dict(zip(df_hom.iloc[:, 1], df_hom.iloc[:, 0]))

In [10]:
# Supplementary tables workbook; its first sheet is read as the trait table.
URL_SUPP_TABLE = "https://www.dropbox.com/s/qojbzu5zln33j7f/supp_tables.xlsx?dl=1"
df_trait_info = pd.read_excel(URL_SUPP_TABLE, sheet_name=0)

# Traits of interest: every trait in the "brain" category, plus height.
is_brain_trait = df_trait_info.Category == "brain"
list_trait = df_trait_info.loc[is_brain_trait, "Trait_Identifier"].tolist()
list_trait.append("UKB_460K.body_HEIGHTz")

# MAGMA gene sets, restricted to the selected traits.
gs_path = join(DATA_PATH, "gs_file", "magma_10kb_1000.gs")
magma_all = pd.read_csv(gs_path, sep="\t")
df_magma_gs = magma_all.loc[magma_all.TRAIT.isin(list_trait)].reset_index(drop=True)

# Translate every comma-separated gene list through dict_hom; genes without an
# entry in the mapping are dropped.
df_magma_gs["GENESET"] = df_magma_gs["GENESET"].map(
    lambda geneset: ",".join(
        dict_hom[gene] for gene in geneset.split(",") if gene in dict_hom
    )
)

In [12]:
# Build one "spatial_<region>" gene set per region from the pairwise
# differential-expression table.

# Thresholds used to call a gene differentially expressed between two regions.
FOLD_CHANGE_THRES = 2  # minimum fold change, in either direction
MIN_EXPRESSION = 10  # value_1 or value_2 must exceed this
MAX_Q_VALUE = 0.05  # q_value must be below this


def _filter_region_pair(diff_table, sample_1, sample_2, fold_change_thres):
    """Return rows of the sample_1-vs-sample_2 contrast with |log2 FC| above log2(threshold)."""
    pair = diff_table[
        (diff_table.sample_1 == sample_1) & (diff_table.sample_2 == sample_2)
    ]
    return pair[np.abs(pair["log2(fold_change)"]) > np.log2(fold_change_thres)]


diff_expr = pd.read_csv("data/GSE67403_gene_exp.diff", sep="\t")
regions = [
    "dorsal",
    "intermediate",
    "ventral",
    "proximal",
    "distal",
    "superficial",
    "deep",
]

# Keep contrasts between the listed regions, then require sufficient expression
# in at least one of the two samples and a significant q-value.
diff_expr = diff_expr[
    diff_expr.sample_1.isin(regions) & diff_expr.sample_2.isin(regions)
]
diff_expr = diff_expr[
    ((diff_expr.value_1 > MIN_EXPRESSION) | (diff_expr.value_2 > MIN_EXPRESSION))
    & (diff_expr.q_value < MAX_Q_VALUE)
]

# One contrast per axis, written as (sample_1, sample_2).
diff_long = _filter_region_pair(diff_expr, "dorsal", "ventral", FOLD_CHANGE_THRES)
diff_transverse = _filter_region_pair(
    diff_expr, "proximal", "distal", FOLD_CHANGE_THRES
)
diff_radial = _filter_region_pair(diff_expr, "superficial", "deep", FOLD_CHANGE_THRES)

# Genes with a positive test statistic are assigned to sample_2, genes with a
# negative one to sample_1.
# NOTE(review): presumably test_stat > 0 means higher expression in sample_2 -- confirm.
dict_diff_genes = dict()
for axis_diff, (region_1, region_2) in (
    (diff_long, ("dorsal", "ventral")),
    (diff_transverse, ("proximal", "distal")),
    (diff_radial, ("superficial", "deep")),
):
    dict_diff_genes[region_2] = axis_diff[axis_diff.test_stat > 0].gene.values
    dict_diff_genes[region_1] = axis_diff[axis_diff.test_stat < 0].gene.values

# Keep only genes that appear among the values of dict_hom. The values are
# collected into a set once so each membership test is O(1), not a list scan.
# This relies on dict_hom holding the mapping whose values are the gene symbols
# used in this table.
genes_with_homolog = set(dict_hom.values())
df_spatial_gs = pd.DataFrame(
    {
        "TRAIT": ["spatial_" + region for region in dict_diff_genes],
        "GENESET": [
            ",".join(g for g in genes if g in genes_with_homolog)
            for genes in dict_diff_genes.values()
        ],
    }
)

# Spatial geneset

In [19]:
# Stack the MAGMA and spatial gene sets into one table. ignore_index=True gives
# the result a fresh 0..n-1 index instead of repeating both inputs' row labels.
df_gs = pd.concat([df_magma_gs, df_spatial_gs], ignore_index=True)

In [20]:
# Write the gene sets as-is (mouse gene symbols), then a human-symbol copy.
os.makedirs("gs_file", exist_ok=True)  # output directory may not exist yet

df_mouse_gs = df_gs.copy()
df_mouse_gs.to_csv("gs_file/mouse.gs", sep="\t", index=False)

# mouse to human
# Built under its own name rather than rebinding dict_hom, so that cells which
# rely on dict_hom in its original direction give the same result when re-run.
# Columns are taken by position (first -> second) with .iloc, because integer
# keys on a string-labelled row Series are deprecated positional access.
dict_mouse_to_human = dict(zip(df_hom.iloc[:, 0], df_hom.iloc[:, 1]))

df_human_gs = df_mouse_gs.copy()
df_human_gs["GENESET"] = df_mouse_gs["GENESET"].apply(
    # `if g` skips the single "" that "".split(",") yields for an empty gene
    # set, which would otherwise raise KeyError.
    lambda gs: ",".join(dict_mouse_to_human[g] for g in gs.split(",") if g)
)
df_human_gs.to_csv("gs_file/human.gs", sep="\t", index=False)

In [21]:
# divide the gene set into several pieces for parallel submission to the cluster
def divide_gs(df_gs, out_dir, batch_size=1):
    """Split a gene-set table into batch files ``batch0.gs``, ``batch1.gs``, ...

    The rows are cut into ``ceil(n_rows / batch_size)`` contiguous, near-equal
    pieces (earlier pieces get the extra rows), so no piece exceeds
    ``batch_size`` rows. Each piece is written to ``out_dir`` as a
    tab-separated file without the index.

    Parameters
    ----------
    df_gs : pandas.DataFrame
        Table to split; rows are kept in their current order.
    out_dir : str
        Directory to create and write into. If it already exists, a message is
        printed and nothing is written.
    batch_size : int, default 1
        Maximum number of rows per batch file.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if os.path.exists(out_dir):
        print(f"{out_dir} already exists. Clean up or use another directory")
        return
    os.makedirs(out_dir)

    n_rows = df_gs.shape[0]
    if n_rows == 0:
        # Nothing to write; leave the (empty) output directory in place.
        return

    # Same piece boundaries np.array_split would produce, computed by hand so
    # the DataFrame is sliced with .iloc (np.array_split on a DataFrame is
    # deprecated in recent pandas and fails when asked for zero sections).
    n_batches = int(np.ceil(n_rows / batch_size))
    base_size, n_larger = divmod(n_rows, n_batches)
    start = 0
    for batch_i in range(n_batches):
        stop = start + base_size + (1 if batch_i < n_larger else 0)
        df_gs.iloc[start:stop].to_csv(
            join(out_dir, f"batch{batch_i}.gs"), sep="\t", index=False
        )
        start = stop

In [22]:
# Write one single-row batch file per gene set, first for the mouse table and
# then for the human table.
for species_gs, batch_dir in (
    (df_mouse_gs, "gs_file/mouse.gs.batch"),
    (df_human_gs, "gs_file/human.gs.batch"),
):
    divide_gs(species_gs, batch_dir)