# Create sbatch scripts for various data analysis pipelines

In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import schicluster

In [2]:
# Directory of the installed schicluster package; the Snakefile templates read by
# later cells are looked up underneath it.
PKG_DIR = schicluster.__path__[0]

In [3]:
# Path of the gzipped CSV metadata table; it is read with pandas in the next cell
# and used to select and label cells throughout this notebook.
meta_file = "/tscc/projects/ps-renlab2/sel041/scmethylhic/human_hippocampus/concat/human_aging_final_metadata.csv.gz"

## Load Data

In [4]:
# index_col=1: the second CSV column becomes the index. Later cells use this index
# to select rows of the per-cell tables (presumably it holds the cell IDs -- confirm).
metadata = pd.read_csv(meta_file, header=0, index_col=1)

In [5]:
def prep_dir(output_dir, chunk_df, template, params):
    """Create a Snakemake working directory for one group of cells.

    Two files are written into ``output_dir`` (created if missing):

    * ``cell_table.csv`` -- ``chunk_df`` including its index, without a header row.
    * ``Snakefile_master`` -- one ``key = value`` line per entry of ``params``
      followed by ``template``.

    Parameters
    ----------
    output_dir : str
        Directory to create and populate.
    chunk_df : pandas.DataFrame
        Table written to ``cell_table.csv``.
    template : str
        Snakefile body appended after the parameter lines.
    params : dict
        Values are written verbatim, so string values must already carry
        their own quotes.

    Returns
    -------
    None
    """
    os.makedirs(output_dir, exist_ok=True)
    chunk_df.to_csv(os.path.join(output_dir, "cell_table.csv"), header=False, index=True)

    params_str = "\n".join(f"{k} = {v}" for k, v in params.items())
    # Guarantee a line break between the last parameter and the template so that
    # a template that does not start with a newline cannot be glued onto the
    # final `key = value` line. Templates that already start with a newline are
    # written exactly as before.
    if params_str and not template.startswith("\n"):
        params_str += "\n"

    with open(os.path.join(output_dir, "Snakefile_master"), "w") as f:
        f.write(params_str + template)

In [6]:
# Project locations on the cluster file system. `root` keeps its trailing slash
# because later glob patterns are appended to it by string concatenation.
parent_root = "/tscc/projects/ps-epigen/users/biy022/scmethylhic"
root = "/tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/"
output_dir = os.path.join(root, "Combined", "hic")
# Both genome reference files live in the same hg38 folder.
chrom_size_path, black_list_path = (
    os.path.join(parent_root, "genome", "genome_hg38", file_name)
    for file_name in ("hg38.autosomal.chrom.sizes", "hg38_blacklist.v2.bed.gz")
)

## Raw contact

In [13]:
# Table of raw contact files: a headerless two-column TSV read as
# index `cell_id` -> column `tsv_path`.
cell_table = pd.read_csv(
    os.path.join(output_dir, "contact_table_all.tsv"),
    sep="\t",
    header=None,
    index_col=0,
    names=["cell_id", "tsv_path"],
)
# Keep only the rows listed in the metadata index, in metadata order
# (raises KeyError if a metadata entry has no contact file).
cell_table = cell_table.loc[metadata.index]
cell_table["final_cluster"] = metadata["final_cluster"]
cell_table["final_subclass"] = metadata["final_subclass"]
# Merge the Endo and VLMC clusters into one group. A single .loc assignment is
# used because chained indexing (`df.col[mask] = value`) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
cell_table.loc[cell_table["final_cluster"].isin(["Endo", "VLMC"]), "final_cluster"] = "Endo_VLMC"

In [14]:
leg = {}
chunksize = 200
raw_output_dir = os.path.join(output_dir, "raw")


def _write_raw_cell_tables(group, group_df, base_dir, chunk_size):
    """Write the contact-file tables of one group and return its chunk names.

    Creates ``<base_dir>/<group>/cell_table.tsv`` listing every row of
    ``group_df`` and ``<base_dir>/<group>_chunk<i>/cell_table.tsv`` for
    consecutive slices of at most ``chunk_size`` rows. Every table is
    tab-separated, has no header, and holds the index plus ``tsv_path``.

    Returns the list of ``<group>_chunk<i>`` directory names, in order.
    """
    tsv_paths = group_df["tsv_path"]
    chunk_names = []
    for i, start in enumerate(range(0, tsv_paths.shape[0], chunk_size)):
        chunk_name = "{}_chunk{}".format(group, i)
        chunk_dir = os.path.join(base_dir, chunk_name)
        os.makedirs(chunk_dir, exist_ok=True)
        tsv_paths.iloc[start:start + chunk_size].to_csv(
            os.path.join(chunk_dir, "cell_table.tsv"), sep="\t", header=False, index=True
        )
        chunk_names.append(chunk_name)

    group_dir = os.path.join(base_dir, group)
    os.makedirs(group_dir, exist_ok=True)
    tsv_paths.to_csv(os.path.join(group_dir, "cell_table.tsv"), sep="\t", header=False, index=True)
    return chunk_names


# `leg` maps every group name to the names of its chunk directories; the next
# cell turns it into merge commands.
for cluster, sub_df in cell_table.groupby("final_cluster"):
    leg[cluster] = _write_raw_cell_tables(cluster, sub_df, raw_output_dir, chunksize)
    print(cluster, sub_df.shape[0])

# Additionally build the two pooled subclasses.
for cluster, sub_df in cell_table.groupby("final_subclass"):
    if cluster not in ("Astro", "Micro"):
        continue
    leg[cluster] = _write_raw_cell_tables(cluster, sub_df, raw_output_dir, chunksize)
    print(cluster, sub_df.shape[0])

Astro1 1338
Astro2 1159
CA 131
DG 310
Endo_VLMC 240
Micro1 1329
Micro2 1275
NR2F2-LAMP5 173
OPC 1118
Oligo 13188
PVALB 180
SST 195
SUB 1298
VIP 306
Astro 2497
Micro 2604


In [15]:
# Step 1: one `hicluster merge-cell-raw` command per chunk directory.
# Step 2: one command per group that produces <group>/<group>.raw.cool, either
#         by copying the only chunk (rsync) or by merging all chunks (cooler).
os.makedirs(os.path.join(raw_output_dir, "commands"), exist_ok=True)
merge_cell_cmd = (
    "hicluster merge-cell-raw "
    "--cell_table {} "
    "--chrom_size_path {} "
    "--output_file {}"
)

# Context managers make sure both command files are flushed and closed even if
# an exception is raised while the commands are being generated.
with open(os.path.join(raw_output_dir, "commands/snakemake_cmd_step1.txt"), "w") as f1, \
        open(os.path.join(raw_output_dir, "commands/snakemake_cmd_step2.txt"), "w") as f2:
    for cluster, groups in leg.items():
        chunk_cools = []
        for group in groups:
            chunk_cool = os.path.join(raw_output_dir, "{}/raw.cool".format(group))
            cmd = merge_cell_cmd.format(
                os.path.join(raw_output_dir, "{}/cell_table.tsv".format(group)),
                chrom_size_path,
                chunk_cool,
            )
            f1.write(cmd + "\n")
            chunk_cools.append(chunk_cool)

        merged_cool = os.path.join(raw_output_dir, "{0}/{0}.raw.cool".format(cluster))
        if len(chunk_cools) < 2:
            # A single chunk already is the group-level matrix: just copy it.
            cmd = "rsync -arv {} {}".format(chunk_cools[0], merged_cool)
        else:
            cmd = "cooler merge {}".format(" ".join([merged_cool] + chunk_cools))
        f2.write(cmd + "\n")

In [16]:
# Slurm batch-script preamble. Placeholders: {0} output sub-folder / job tag,
# {1} step number, {2} batch index, {3} node count, {4} CPUs, {5} memory in GB,
# {6} wall time in hours, {7} directory to `cd` into before running commands.
sbatch_header = "\n".join([
    "#! /bin/bash",
    "#SBATCH -p condo",
    "#SBATCH -q condo",
    "#SBATCH -A csd772",
    "#SBATCH -J hh_{0}_s{1}_{2}",
    "#SBATCH -N {3}",
    "#SBATCH -c {4}",
    "#SBATCH --mem {5}G",
    "#SBATCH -t {6}:00:00",
    "#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out",
    "#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err",
    "#SBATCH --mail-user biy022@health.ucsd.edu",
    "#SBATCH --mail-type FAIL",
    "",
    "source ~/.bashrc",
    "conda activate schicluster",
    "cd {7}",
    "",  # keeps the trailing newline after the `cd` line
])

In [17]:
# Split the step-1 command list into batches of `chunksize` commands; each batch
# gets its own command file plus an sbatch script that runs it with bash.
chunksize = 40
raw_commands_dir = os.path.join(raw_output_dir, "commands")
with open(os.path.join(raw_commands_dir, "snakemake_cmd_step1.txt"), "r") as finput:
    cmds = finput.readlines()

index = 0
for i in range(0, len(cmds), chunksize):
    batch_file = f"snakemake_cmd_step1_{index}.txt"
    with open(os.path.join(raw_commands_dir, batch_file), "w") as f:
        f.writelines(f"{line.strip()}\n" for line in cmds[i:i + chunksize])
    header = sbatch_header.format("raw", 1, index, 1, 16, 100, 4, raw_output_dir + "/commands")
    with open(os.path.join(raw_commands_dir, f"step1_{index}.sbatch"), "w") as f:
        f.write(f"{header}\nbash {batch_file}\n")
    index += 1

## Loop

In [18]:
# Collect the files matching "*cool" in the 10kb imputation chunk folders of every
# "*deep" directory. The pattern is appended to `root` by plain string
# concatenation, so it relies on `root` ending with "/".
cool_list = glob(root + "*deep/hic/impute/10kb/chunk*/*cool")

In [19]:
# Index every .cool file by its file name without the ".cool" suffix, then keep
# only the rows listed in the metadata index (in metadata order) and attach the
# cluster / subclass labels.
cool_ids = [os.path.basename(cool_path).replace(".cool", "") for cool_path in cool_list]
cell_table = pd.DataFrame(cool_list, index=cool_ids, columns=["cool_path"]).loc[metadata.index]
for label_column in ("final_cluster", "final_subclass"):
    cell_table[label_column] = metadata[label_column].copy()

In [20]:
# Merge the Endo and VLMC clusters into one group. A single .loc assignment is
# used because chained indexing (`df.col[mask] = value`) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
cell_table.loc[cell_table["final_cluster"].isin(["Endo", "VLMC"]), "final_cluster"] = "Endo_VLMC"

In [21]:
# Everything generated by the cells of this section goes under <output_dir>/loop.
loop_output_dir = os.path.join(output_dir, "loop")

In [22]:
# Parameters emitted as `key = value` lines at the top of each Snakefile_master.
# Values are written verbatim, hence the explicit double quotes around paths.
params = dict(
    cpu=128,
    resolution=10000,
    chrom_size_path=f'"{chrom_size_path}"',
    black_list_path=f'"{black_list_path}"',
)

In [23]:
# Snakefile template for this section, shipped inside the schicluster package.
GENERATE_MATRIX_CHUNK_TEMPLATE = Path(PKG_DIR, "loop", "snakemake_template_loop.txt").read_text()

In [24]:
# One working directory per cluster; the cell table keeps only the first two
# columns of `cell_table` (positions 0 and 1).
for cluster, sub_df in cell_table.groupby("final_cluster"):
    cluster_dir = os.path.join(loop_output_dir, cluster)
    tmp = sub_df.iloc[:, :2].copy()
    prep_dir(cluster_dir, tmp, GENERATE_MATRIX_CHUNK_TEMPLATE, params)

In [25]:
# Same as the per-cluster preparation, but only for the pooled Astro and Micro
# subclasses; the cell table keeps columns at positions 0 and 2.
for cluster, sub_df in cell_table.groupby("final_subclass"):
    if cluster in ("Astro", "Micro"):
        tmp = sub_df.iloc[:, [0, 2]].copy()
        subclass_dir = os.path.join(loop_output_dir, cluster)
        prep_dir(subclass_dir, tmp, GENERATE_MATRIX_CHUNK_TEMPLATE, params)

In [26]:
# One snakemake invocation per prepared directory: every cluster first, then the
# pooled Astro / Micro subclasses (groupby yields the names in sorted order).
os.makedirs(os.path.join(loop_output_dir, "commands"), exist_ok=True)
loop_groups = [name for name, _ in cell_table.groupby("final_cluster")]
loop_groups += [
    name for name, _ in cell_table.groupby("final_subclass") if name in ("Astro", "Micro")
]
with open(os.path.join(loop_output_dir, "commands/snakemake_cmds.txt"), "w") as f:
    for cluster in loop_groups:
        cluster_dir = os.path.join(loop_output_dir, cluster)
        f.write(
            "snakemake -d {0} -s {0}/Snakefile_master -j {1}\n".format(cluster_dir, params["cpu"])
        )

In [27]:
# Slurm batch-script preamble. Placeholders: {0} output sub-folder / job tag,
# {1} step number, {2} batch index, {3} node count, {4} CPUs, {5} memory in GB,
# {6} wall time in hours, {7} directory to `cd` into before running commands.
sbatch_header = "\n".join([
    "#! /bin/bash",
    "#SBATCH -p condo",
    "#SBATCH -q condo",
    "#SBATCH -A csd772",
    "#SBATCH -J hh_{0}_s{1}_{2}",
    "#SBATCH -N {3}",
    "#SBATCH -c {4}",
    "#SBATCH --mem {5}G",
    "#SBATCH -t {6}:00:00",
    "#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out",
    "#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err",
    "#SBATCH --mail-user biy022@health.ucsd.edu",
    "#SBATCH --mail-type FAIL",
    "",
    "source ~/.bashrc",
    "conda activate schicluster",
    "cd {7}",
    "",  # keeps the trailing newline after the `cd` line
])

In [28]:
# One sbatch script per snakemake command of this section.
loop_commands_dir = os.path.join(loop_output_dir, "commands")
with open(os.path.join(loop_commands_dir, "snakemake_cmds.txt"), "r") as f:
    cmds = f.readlines()

for index, cmd in enumerate(cmds):
    header = sbatch_header.format("loop", 1, index, 4, 32, 60, 24, loop_commands_dir)
    with open(os.path.join(loop_commands_dir, f"step1_{index}.sbatch"), "w") as f:
        # `cmd` still carries the newline it was read with, so the script ends
        # with one extra blank line.
        f.write(header + cmd + "\n")

## Domain (aggregate bulk)

In [29]:
# Collect the files matching "*cool" in the 25kb imputation chunk folders of every
# "*deep" directory. The pattern is appended to `root` by plain string
# concatenation, so it relies on `root` ending with "/".
cool_list = glob(root + "*deep/hic/impute/25kb/chunk*/*cool")

In [30]:
# Index every .cool file by its file name without the ".cool" suffix, then keep
# only the rows listed in the metadata index (in metadata order) and attach the
# cluster / subclass labels.
cool_ids = [os.path.basename(cool_path).replace(".cool", "") for cool_path in cool_list]
cell_table = pd.DataFrame(cool_list, index=cool_ids, columns=["cool_path"]).loc[metadata.index]
for label_column in ("final_cluster", "final_subclass"):
    cell_table[label_column] = metadata[label_column].copy()

In [31]:
# Merge the Endo and VLMC clusters into one group. A single .loc assignment is
# used because chained indexing (`df.col[mask] = value`) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
cell_table.loc[cell_table["final_cluster"].isin(["Endo", "VLMC"]), "final_cluster"] = "Endo_VLMC"

In [32]:
# Everything generated by the cells of this section goes under <output_dir>/domain.
domain_output_dir = os.path.join(output_dir, "domain")

In [33]:
# Write the complete (unchunked) cell table of every cluster and report its size.
for cluster, sub_df in cell_table.groupby("final_cluster"):
    cluster_dir = os.path.join(domain_output_dir, cluster)
    os.makedirs(cluster_dir, exist_ok=True)
    sub_df.to_csv(os.path.join(cluster_dir, "cell_table.csv"), header=False, index=True)
    print(cluster, len(sub_df))

Astro1 1338
Astro2 1159
CA 131
DG 310
Endo_VLMC 240
Micro1 1329
Micro2 1275
NR2F2-LAMP5 173
OPC 1118
Oligo 13188
PVALB 180
SST 195
SUB 1298
VIP 306


In [34]:
# Same as the per-cluster tables, restricted to the pooled Astro / Micro subclasses.
for cluster, sub_df in cell_table.groupby("final_subclass"):
    if cluster in ("Astro", "Micro"):
        subclass_dir = os.path.join(domain_output_dir, cluster)
        os.makedirs(subclass_dir, exist_ok=True)
        sub_df.to_csv(os.path.join(subclass_dir, "cell_table.csv"), header=False, index=True)
        print(cluster, len(sub_df))

Astro 2497
Micro 2604


In [35]:
# Parameters emitted as `key = value` lines at the top of the generated Snakefiles.
# Values are written verbatim, hence the explicit double quotes around the path.
params = dict(
    resolution=25000,
    chrom_size_path=f'"{chrom_size_path}"',
)

In [36]:
chunksize = 200
resolution = 25000
total_chunk_dirs = []
group_chunks = {}

with open("{}/cool/Snakefile_chunk_template".format(PKG_DIR)) as f:
    GENERATE_MATRIX_CHUNK_TEMPLATE = f.read()


def _prep_group_chunks(group, group_df, base_dir, chunk_size, template, params):
    """Prepare the ``<group>_chunk<i>`` Snakemake directories of one group.

    Groups with at most ``chunk_size`` rows become a single ``_chunk0``
    directory; larger groups are cut into consecutive slices of ``chunk_size``
    rows. ``params["cell_table_path"]`` is overwritten for every directory
    before it is written through ``prep_dir``.

    Returns the list of created directory paths, in chunk order.
    """
    if group_df.shape[0] <= chunk_size:
        chunks = [(0, group_df)]
    else:
        # assign() builds a new frame instead of writing a column into the
        # groupby slice, which avoids pandas' SettingWithCopyWarning.
        # NOTE(review): as before, the helper "chunk" column is part of the
        # cell table written for chunked groups only -- confirm the template
        # tolerates the extra column.
        numbered = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunk_size)
        chunks = numbered.groupby("chunk")

    chunk_dirs = []
    for chunk, chunk_df in chunks:
        curr_dir = os.path.join(base_dir, "{}_chunk{}".format(group, chunk))
        params["cell_table_path"] = '"{}"'.format(os.path.join(curr_dir, "cell_table.csv"))
        prep_dir(curr_dir, chunk_df, template, params)
        chunk_dirs.append(curr_dir)
    return chunk_dirs


for group, group_df in cell_table.groupby("final_cluster"):
    group_chunks[group] = _prep_group_chunks(
        group, group_df, domain_output_dir, chunksize, GENERATE_MATRIX_CHUNK_TEMPLATE, params
    )
    total_chunk_dirs.extend(group_chunks[group])

# Additionally prepare the two pooled subclasses.
for group, group_df in cell_table.groupby("final_subclass"):
    if group not in ("Astro", "Micro"):
        continue
    group_chunks[group] = _prep_group_chunks(
        group, group_df, domain_output_dir, chunksize, GENERATE_MATRIX_CHUNK_TEMPLATE, params
    )
    total_chunk_dirs.extend(group_chunks[group])

In [37]:
# Step 1: one snakemake invocation per chunk directory.
domain_commands_dir = os.path.join(domain_output_dir, "commands")
os.makedirs(domain_commands_dir, exist_ok=True)
with open(os.path.join(domain_commands_dir, "snakemake_cmd_step1.txt"), "w") as f:
    f.writelines(
        f"snakemake -d {chunk_dir} --snakefile {chunk_dir}/Snakefile_master -j 5 --rerun-incomplete\n"
        for chunk_dir in total_chunk_dirs
    )

In [38]:
# Step 2: a single group-level Snakefile run from the domain output directory.
# The per-chunk `cell_table_path` entry is dropped, `output_dir` is added.
params.pop("cell_table_path", None)
params["output_dir"] = f'"{domain_output_dir}"'
params_str = "\n".join(f"{k} = {v}" for k, v in params.items())

with open(f"{PKG_DIR}/cool/Snakefile_group_template", "r") as f:
    GENERATE_MATRIX_GROUP_TEMPLATE = f.read()

group_snakefile = os.path.join(domain_output_dir, "commands", "Snakefile")
with open(group_snakefile, "w") as f:
    f.write(f"{params_str}\n{GENERATE_MATRIX_GROUP_TEMPLATE}")

with open(os.path.join(domain_output_dir, "commands", "snakemake_cmd_step2.txt"), "w") as f:
    cmd = f"snakemake -d {domain_output_dir} --snakefile {group_snakefile} -j 10 --rerun-incomplete"
    f.write(cmd + "\n")

In [39]:
# Slurm batch-script preamble. Placeholders: {0} output sub-folder / job tag,
# {1} step number, {2} batch index, {3} node count, {4} CPUs, {5} memory in GB,
# {6} wall time in hours, {7} directory to `cd` into before running commands.
sbatch_header = "\n".join([
    "#! /bin/bash",
    "#SBATCH -p condo",
    "#SBATCH -q condo",
    "#SBATCH -A csd772",
    "#SBATCH -J hh_{0}_s{1}_{2}",
    "#SBATCH -N {3}",
    "#SBATCH -c {4}",
    "#SBATCH --mem {5}G",
    "#SBATCH -t {6}:00:00",
    "#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out",
    "#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err",
    "#SBATCH --mail-user biy022@health.ucsd.edu",
    "#SBATCH --mail-type FAIL",
    "",
    "source ~/.bashrc",
    "conda activate schicluster",
    "cd {7}",
    "",  # keeps the trailing newline after the `cd` line
])

In [40]:
# Split the step-1 command list into batches of `chunksize` commands; each batch
# gets its own command file plus an sbatch script that runs it with bash.
chunksize = 40
domain_commands_dir = os.path.join(domain_output_dir, "commands")
with open(os.path.join(domain_commands_dir, "snakemake_cmd_step1.txt"), "r") as finput:
    cmds = finput.readlines()

index = 0
for i in range(0, len(cmds), chunksize):
    batch_file = f"snakemake_cmd_step1_{index}.txt"
    with open(os.path.join(domain_commands_dir, batch_file), "w") as f:
        f.writelines(f"{line.strip()}\n" for line in cmds[i:i + chunksize])
    header = sbatch_header.format("domain", 1, index, 1, 16, 100, 4, domain_output_dir + "/commands")
    with open(os.path.join(domain_commands_dir, f"step1_{index}.sbatch"), "w") as f:
        f.write(f"{header}\nbash {batch_file}\n")
    index += 1

## Domain (subclass x donor), add Exc and Inh neurons

In [7]:
from pathlib import Path

In [8]:
# Project locations on the cluster file system, as pathlib objects.
parent_root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic")
root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/")
output_dir = root.joinpath("Combined", "hic")
# Both genome reference files live in the same hg38 folder.
chrom_size_path, black_list_path = (
    parent_root.joinpath("genome", "genome_hg38", file_name)
    for file_name in ("hg38.autosomal.chrom.sizes", "hg38_blacklist.v2.bed.gz")
)

In [9]:
# Collect the files matching "*cool" in the 25kb imputation chunk folders of every
# "*_deep" directory directly under `root`.
cool_list = list(root.glob("*_deep/hic/impute/25kb/chunk*/*cool"))

In [10]:
# One row per .cool file, indexed by the file stem; keep only the rows listed in
# the metadata index (in metadata order).
cell_table = pd.DataFrame(
    [str(xx) for xx in cool_list],
    index=[str(xx.stem) for xx in cool_list],
    columns=["cool_path"],
)
cell_table = cell_table.loc[metadata.index]
cell_table["final_cluster"] = metadata["final_cluster"]

# Pool fine clusters into broader groups. Each relabelling is one .loc
# assignment: chained indexing (`df[col][mask] = value`) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
pooled_clusters = {
    "Astro": ["Astro1", "Astro2"],
    "Micro": ["Micro1", "Micro2"],
    "Endo_VLMC": ["VLMC", "Endo"],
    "ExcNeurons": ["CA", "SUB", "DG"],
    "InhNeurons": ["VIP", "SST", "PVALB", "NR2F2-LAMP5"],
}
for pooled_label, members in pooled_clusters.items():
    cell_table.loc[cell_table["final_cluster"].isin(members), "final_cluster"] = pooled_label

cell_table["donor"] = metadata["donor"]
# Group key "<cluster>_<donor>"; later cells split it again on the last "_".
cell_table["group"] = cell_table[["final_cluster", "donor"]].agg("_".join, axis=1)

In [11]:
# Keep only the pooled neuron groups; from here on only the combined `group`
# key (plus the cool path) is needed.
is_neuron = cell_table["final_cluster"].isin(["ExcNeurons", "InhNeurons"])
cell_table = cell_table.loc[is_neuron].drop(columns=["final_cluster", "donor"])

In [12]:
# Output folder of this section. mkdir(exist_ok=True) already tolerates an
# existing directory, so no separate exists() check is needed; parents=True also
# creates <output_dir> itself when it is missing (as the per-group mkdir calls
# in the following cell do).
domain_output_dir = output_dir / "domain_subclass_donor"
domain_output_dir.mkdir(parents=True, exist_ok=True)

In [13]:
# Write the complete cell table of every group to <subclass>/<donor>/. Group keys
# are split on the last underscore only, so subclass names may contain "_".
for cluster, sub_df in cell_table.groupby("group"):
    subclass, donor = cluster.rsplit("_", 1)
    cluster_output_dir = domain_output_dir.joinpath(subclass, donor)
    cluster_output_dir.mkdir(parents=True, exist_ok=True)
    sub_df.to_csv(cluster_output_dir.joinpath("cell_table.csv"), header=False, index=True)

In [14]:
# Parameters emitted as `key = value` lines at the top of the generated Snakefiles.
# Values are written verbatim, hence the explicit double quotes around the path.
params = dict(
    resolution=25000,
    chrom_size_path=f'"{chrom_size_path}"',
)

In [15]:
chunksize = 200
resolution = 25000
total_chunk_dirs = []
group_chunks = {}

with open("{}/cool/Snakefile_chunk_template".format(PKG_DIR)) as f:
    GENERATE_MATRIX_CHUNK_TEMPLATE = f.read()


def _prep_donor_chunks(group, group_df, base_dir, chunk_size, template, params):
    """Prepare the ``<subclass>/<donor>_chunk<i>`` directories of one group.

    ``group`` is a ``"<subclass>_<donor>"`` key (split on the last underscore).
    Groups with at most ``chunk_size`` rows become a single ``_chunk0``
    directory; larger groups are cut into consecutive slices of ``chunk_size``
    rows. ``params["cell_table_path"]`` is overwritten for every directory
    before it is written through ``prep_dir``.

    Returns the list of created directory paths, in chunk order.
    """
    subclass, donor = group.rsplit("_", 1)
    if group_df.shape[0] <= chunk_size:
        chunks = [(0, group_df)]
    else:
        # assign() builds a new frame instead of writing a column into the
        # groupby slice, which avoids pandas' SettingWithCopyWarning.
        # NOTE(review): as before, the helper "chunk" column is part of the
        # cell table written for chunked groups only -- confirm the template
        # tolerates the extra column.
        numbered = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunk_size)
        chunks = numbered.groupby("chunk")

    chunk_dirs = []
    for chunk, chunk_df in chunks:
        curr_dir = base_dir / subclass / f"{donor}_chunk{chunk}"
        params["cell_table_path"] = '"{}"'.format(curr_dir / "cell_table.csv")
        prep_dir(str(curr_dir), chunk_df, template, params)
        chunk_dirs.append(curr_dir)
    return chunk_dirs


for group, group_df in cell_table.groupby("group"):
    group_chunks[group] = _prep_donor_chunks(
        group, group_df, domain_output_dir, chunksize, GENERATE_MATRIX_CHUNK_TEMPLATE, params
    )
    total_chunk_dirs.extend(group_chunks[group])

In [16]:
# Step 1: one snakemake invocation per chunk directory.
command_dir = domain_output_dir.joinpath("commands")
command_dir.mkdir(exist_ok=True)
with open(command_dir / "snakemake_cmd_step1.txt", "w") as f:
    f.writelines(
        f"snakemake -d {chunk_dir} --snakefile {chunk_dir}/Snakefile_master -j 5 --rerun-incomplete\n"
        for chunk_dir in total_chunk_dirs
    )

In [17]:
# Step 2: one group-level Snakefile and one snakemake command file per subclass.
# The template is the same for every subclass, so it is read once.
with open("{}/cool/Snakefile_group_template".format(PKG_DIR), "r") as f:
    GENERATE_MATRIX_GROUP_TEMPLATE = f.read()

# `cell_table_path` only belongs to the per-chunk Snakefiles.
params.pop("cell_table_path", None)

# Group keys are "<subclass>_<donor>". The generated files depend on the subclass
# only, so each subclass is handled once instead of once per donor.
subclasses = dict.fromkeys(
    group.rsplit("_", 1)[0] for group in cell_table.groupby("group").groups
)

for subclass in subclasses:
    params["output_dir"] = '"{}"'.format(domain_output_dir / subclass)
    params_str = "\n".join("{} = {}".format(k, v) for k, v in params.items())

    snakefile_path = domain_output_dir / "commands" / f"Snakefile_{subclass}"
    with open(snakefile_path, "w") as f:
        f.write(params_str + "\n" + GENERATE_MATRIX_GROUP_TEMPLATE)

    with open(domain_output_dir / "commands" / f"snakemake_cmd_step2_{subclass}.txt", "w") as f:
        cmd = "snakemake -d {} --snakefile {} -j 10 --rerun-incomplete".format(
            domain_output_dir / subclass,
            snakefile_path,
        )
        f.write(cmd + "\n")

In [18]:
# Slurm batch-script preamble. Placeholders: {0} output sub-folder / job tag,
# {1} step number, {2} batch index, {3} node count, {4} CPUs, {5} memory in GB,
# {6} wall time in hours, {7} directory to `cd` into before running commands.
sbatch_header = "\n".join([
    "#! /bin/bash",
    "#SBATCH -p condo",
    "#SBATCH -q condo",
    "#SBATCH -A csd772",
    "#SBATCH -J hh_{0}_s{1}_{2}",
    "#SBATCH -N {3}",
    "#SBATCH -c {4}",
    "#SBATCH --mem {5}G",
    "#SBATCH -t {6}:00:00",
    "#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out",
    "#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err",
    "#SBATCH --mail-user biy022@health.ucsd.edu",
    "#SBATCH --mail-type FAIL",
    "",
    "source ~/.bashrc",
    "conda activate schicluster",
    "cd {7}",
    "",  # keeps the trailing newline after the `cd` line
])

In [19]:
# Split the step-1 command list into batches of `chunksize` commands; each batch
# gets its own command file plus an sbatch script that runs it with bash.
chunksize = 40
step1_dir = domain_output_dir / "commands"
with open(step1_dir / "snakemake_cmd_step1.txt", "r") as finput:
    cmds = finput.readlines()

index = 0
for i in range(0, len(cmds), chunksize):
    batch_file = f"snakemake_cmd_step1_{index}.txt"
    with open(step1_dir / batch_file, "w") as f:
        f.writelines(f"{line.strip()}\n" for line in cmds[i:i + chunksize])
    header = sbatch_header.format("domain_subclass_donor", 1, index, 1, 16, 100, 4, step1_dir)
    with open(step1_dir / f"step1_{index}.sbatch", "w") as f:
        f.write(f"{header}\nbash {batch_file}\n")
    index += 1

## Domain (subclass x donor)

In [41]:
from pathlib import Path

In [42]:
# Project locations on the cluster file system, as pathlib objects.
parent_root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic")
root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/")
output_dir = root.joinpath("Combined", "hic")
# Both genome reference files live in the same hg38 folder.
chrom_size_path, black_list_path = (
    parent_root.joinpath("genome", "genome_hg38", file_name)
    for file_name in ("hg38.autosomal.chrom.sizes", "hg38_blacklist.v2.bed.gz")
)

In [43]:
# Collect the files matching "*cool" in the 25kb imputation chunk folders of every
# "*_deep" directory directly under `root`.
cool_list = list(root.glob("*_deep/hic/impute/25kb/chunk*/*cool"))

In [44]:
# One row per .cool file, indexed by the file stem; keep only the rows listed in
# the metadata index (in metadata order).
cell_table = pd.DataFrame(
    [str(xx) for xx in cool_list],
    index=[str(xx.stem) for xx in cool_list],
    columns=["cool_path"],
)
cell_table = cell_table.loc[metadata.index]
cell_table["final_cluster"] = metadata["final_cluster"]

# Pool fine clusters into broader groups. Each relabelling is one .loc
# assignment: chained indexing (`df[col][mask] = value`) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
pooled_clusters = {
    "Astro": ["Astro1", "Astro2"],
    "Micro": ["Micro1", "Micro2"],
    "Endo_VLMC": ["VLMC", "Endo"],
}
for pooled_label, members in pooled_clusters.items():
    cell_table.loc[cell_table["final_cluster"].isin(members), "final_cluster"] = pooled_label

cell_table["donor"] = metadata["donor"]
# Group key "<cluster>_<donor>"; later cells split it again on the last "_".
cell_table["group"] = cell_table[["final_cluster", "donor"]].agg("_".join, axis=1)
cell_table = cell_table.drop(["final_cluster", "donor"], axis=1)

In [45]:
# Output folder of this section. mkdir(exist_ok=True) already tolerates an
# existing directory, so no separate exists() check is needed; parents=True also
# creates <output_dir> itself when it is missing (as the per-group mkdir calls
# in the following cell do).
domain_output_dir = output_dir / "domain_subclass_donor"
domain_output_dir.mkdir(parents=True, exist_ok=True)

In [46]:
# Write the complete cell table of every group to <subclass>/<donor>/. Group keys
# are split on the last underscore only, so subclass names may contain "_".
for cluster, sub_df in cell_table.groupby("group"):
    subclass, donor = cluster.rsplit("_", 1)
    cluster_output_dir = domain_output_dir.joinpath(subclass, donor)
    cluster_output_dir.mkdir(parents=True, exist_ok=True)
    sub_df.to_csv(cluster_output_dir.joinpath("cell_table.csv"), header=False, index=True)

In [47]:
# Parameters emitted as `key = value` lines at the top of the generated Snakefiles.
# Values are written verbatim, hence the explicit double quotes around the path.
params = dict(
    resolution=25000,
    chrom_size_path=f'"{chrom_size_path}"',
)

In [48]:
chunksize = 200
resolution = 25000
total_chunk_dirs = []
group_chunks = {}

with open("{}/cool/Snakefile_chunk_template".format(PKG_DIR)) as f:
    GENERATE_MATRIX_CHUNK_TEMPLATE = f.read()


def _prep_donor_chunks(group, group_df, base_dir, chunk_size, template, params):
    """Prepare the ``<subclass>/<donor>_chunk<i>`` directories of one group.

    ``group`` is a ``"<subclass>_<donor>"`` key (split on the last underscore).
    Groups with at most ``chunk_size`` rows become a single ``_chunk0``
    directory; larger groups are cut into consecutive slices of ``chunk_size``
    rows. ``params["cell_table_path"]`` is overwritten for every directory
    before it is written through ``prep_dir``.

    Returns the list of created directory paths, in chunk order.
    """
    subclass, donor = group.rsplit("_", 1)
    if group_df.shape[0] <= chunk_size:
        chunks = [(0, group_df)]
    else:
        # assign() builds a new frame instead of writing a column into the
        # groupby slice, which avoids pandas' SettingWithCopyWarning.
        # NOTE(review): as before, the helper "chunk" column is part of the
        # cell table written for chunked groups only -- confirm the template
        # tolerates the extra column.
        numbered = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunk_size)
        chunks = numbered.groupby("chunk")

    chunk_dirs = []
    for chunk, chunk_df in chunks:
        curr_dir = base_dir / subclass / f"{donor}_chunk{chunk}"
        params["cell_table_path"] = '"{}"'.format(curr_dir / "cell_table.csv")
        prep_dir(str(curr_dir), chunk_df, template, params)
        chunk_dirs.append(curr_dir)
    return chunk_dirs


for group, group_df in cell_table.groupby("group"):
    group_chunks[group] = _prep_donor_chunks(
        group, group_df, domain_output_dir, chunksize, GENERATE_MATRIX_CHUNK_TEMPLATE, params
    )
    total_chunk_dirs.extend(group_chunks[group])

In [49]:
# Step 1: one snakemake invocation per chunk directory.
command_dir = domain_output_dir.joinpath("commands")
command_dir.mkdir(exist_ok=True)
with open(command_dir / "snakemake_cmd_step1.txt", "w") as f:
    f.writelines(
        f"snakemake -d {chunk_dir} --snakefile {chunk_dir}/Snakefile_master -j 5 --rerun-incomplete\n"
        for chunk_dir in total_chunk_dirs
    )

In [50]:
# Step 2: one group-level Snakefile and one snakemake command file per subclass.
# The template is the same for every subclass, so it is read once.
with open("{}/cool/Snakefile_group_template".format(PKG_DIR), "r") as f:
    GENERATE_MATRIX_GROUP_TEMPLATE = f.read()

# `cell_table_path` only belongs to the per-chunk Snakefiles.
params.pop("cell_table_path", None)

# Group keys are "<subclass>_<donor>". The generated files depend on the subclass
# only, so each subclass is handled once instead of once per donor.
subclasses = dict.fromkeys(
    group.rsplit("_", 1)[0] for group in cell_table.groupby("group").groups
)

for subclass in subclasses:
    params["output_dir"] = '"{}"'.format(domain_output_dir / subclass)
    params_str = "\n".join("{} = {}".format(k, v) for k, v in params.items())

    snakefile_path = domain_output_dir / "commands" / f"Snakefile_{subclass}"
    with open(snakefile_path, "w") as f:
        f.write(params_str + "\n" + GENERATE_MATRIX_GROUP_TEMPLATE)

    with open(domain_output_dir / "commands" / f"snakemake_cmd_step2_{subclass}.txt", "w") as f:
        cmd = "snakemake -d {} --snakefile {} -j 10 --rerun-incomplete".format(
            domain_output_dir / subclass,
            snakefile_path,
        )
        f.write(cmd + "\n")

In [51]:
# Slurm batch-script preamble. Placeholders: {0} output sub-folder / job tag,
# {1} step number, {2} batch index, {3} node count, {4} CPUs, {5} memory in GB,
# {6} wall time in hours, {7} directory to `cd` into before running commands.
sbatch_header = "\n".join([
    "#! /bin/bash",
    "#SBATCH -p condo",
    "#SBATCH -q condo",
    "#SBATCH -A csd772",
    "#SBATCH -J hh_{0}_s{1}_{2}",
    "#SBATCH -N {3}",
    "#SBATCH -c {4}",
    "#SBATCH --mem {5}G",
    "#SBATCH -t {6}:00:00",
    "#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out",
    "#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err",
    "#SBATCH --mail-user biy022@health.ucsd.edu",
    "#SBATCH --mail-type FAIL",
    "",
    "source ~/.bashrc",
    "conda activate schicluster",
    "cd {7}",
    "",  # keeps the trailing newline after the `cd` line
])

In [52]:
# Split the step-1 command list into batches of `chunksize` commands; each batch
# gets its own command file plus an sbatch script that runs it with bash.
chunksize = 40
step1_dir = domain_output_dir / "commands"
with open(step1_dir / "snakemake_cmd_step1.txt", "r") as finput:
    cmds = finput.readlines()

index = 0
for i in range(0, len(cmds), chunksize):
    batch_file = f"snakemake_cmd_step1_{index}.txt"
    with open(step1_dir / batch_file, "w") as f:
        f.writelines(f"{line.strip()}\n" for line in cmds[i:i + chunksize])
    header = sbatch_header.format("domain_subclass_donor", 1, index, 1, 16, 100, 4, step1_dir)
    with open(step1_dir / f"step1_{index}.sbatch", "w") as f:
        f.write(f"{header}\nbash {batch_file}\n")
    index += 1

## CGN from DMR

In [53]:
# Project locations for the methylation outputs, as pathlib objects.
parent_root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic")
root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/")
output_dir = root.joinpath("Combined", "methylation")
# Both genome reference files live in the same hg38 folder.
chrom_size_path, black_list_path = (
    parent_root.joinpath("genome", "genome_hg38", file_name)
    for file_name in ("hg38.autosomal.chrom.sizes", "hg38_blacklist.v2.bed.gz")
)

In [54]:
# Per-row ALLC file path and cluster label, on an independent copy so the
# relabelling below never touches `metadata`.
allc_table = metadata[["AllcPath", "final_cluster"]].copy()

# Pool fine clusters into broader groups. Each relabelling is one .loc
# assignment: chained indexing (`df.col[mask] = value`) may write to a
# temporary copy and is a no-op under pandas Copy-on-Write.
pooled_clusters = {
    "Astro": ["Astro1", "Astro2"],
    "Micro": ["Micro1", "Micro2"],
    "Endo_VLMC": ["VLMC", "Endo"],
}
for pooled_label, members in pooled_clusters.items():
    allc_table.loc[allc_table["final_cluster"].isin(members), "final_cluster"] = pooled_label

# Prefix every stored path with the "/tscc" mount point.
allc_table["AllcPath"] = "/tscc" + allc_table["AllcPath"]

In [55]:
# One folder per cluster under <output_dir>/CGN, each holding a headerless
# two-column TSV (index, AllcPath). mkdir(exist_ok=True) already tolerates an
# existing directory, so no separate exists() check is needed; parents=True also
# creates <output_dir> itself when it is missing.
cgn_output_dir = output_dir / "CGN"
cgn_output_dir.mkdir(parents=True, exist_ok=True)
for cluster, sub_df in allc_table.groupby("final_cluster"):
    curr_dir = cgn_output_dir / cluster
    curr_dir.mkdir(exist_ok=True)

    sub_df[["AllcPath"]].to_csv(curr_dir / "allc_table.tsv", sep="\t", index=True, header=False)

In [56]:
# Shell command template, one option per line joined with backslash line
# continuations. Placeholders: {0} ALLC table, {1} output path,
# {2} chromosome sizes file, {3} region file used for the "DMR" region set.
allc_cmd = " \\\n".join([
    "allcools generate-dataset",
    "--allc_table {0}",
    "--output_path {1}",
    "--chrom_size_path {2}",
    "--obs_dim cell",
    "--cpu 16",
    "--chunk_size 10",
    "--regions DMR {3}",
    "--quantifiers DMR count CGN",
    "--quantifiers DMR hypo-score CGN cutoff=0.9",
    "--quantifiers DMR hyper-score CGN cutoff=0.9",
]) + "\n"

In [57]:
# Slurm batch-script preamble for the methylation jobs. Placeholders:
# {0} output sub-folder / job tag, {1} job name suffix (also the log file name),
# {2} node count, {3} CPUs, {4} memory in GB, {5} wall time in hours,
# {6} directory to `cd` into before running commands.
sbatch_header = "\n".join([
    "#! /bin/bash",
    "#SBATCH -p condo",
    "#SBATCH -q condo",
    "#SBATCH -A csd772",
    "#SBATCH -J hh_{0}_{1}",
    "#SBATCH -N {2}",
    "#SBATCH -c {3}",
    "#SBATCH --mem {4}G",
    "#SBATCH -t {5}:00:00",
    "#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/methylation/{0}/commands/{1}.out",
    "#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/methylation/{0}/commands/{1}.err",
    "#SBATCH --mail-user biy022@health.ucsd.edu",
    "#SBATCH --mail-type FAIL",
    "",
    "source ~/.bashrc",
    "conda activate allcools",
    "cd {6}",
    "",  # keeps the trailing newline after the `cd` line
])

In [58]:
# BED file passed to `--regions DMR` in the allcools command template.
dmr_path = "/".join([
    "/tscc/projects/ps-epigen/users/biy022/scmethylhic",
    "human_hippocampus/snm3c/Combined/methylation/CGN/commands/union_dmr_filtered.bed",
])

In [59]:
# One sbatch script per cluster: the Slurm preamble (working directory = the
# cluster folder) followed by the allcools command for that cluster.
# mkdir(exist_ok=True) already tolerates an existing directory, so no separate
# exists() check is needed; parents=True also creates missing parent folders.
cmd_dir = cgn_output_dir / "commands"
cmd_dir.mkdir(parents=True, exist_ok=True)

# Only the group names are needed here, not the per-group rows.
for cluster, _ in allc_table.groupby("final_cluster"):
    cluster_dir = cgn_output_dir / cluster
    curr_header = sbatch_header.format("CGN", cluster, 1, 16, 100, 8, cluster_dir)
    curr_allc_cmd = allc_cmd.format(
        cluster_dir / "allc_table.tsv",
        cluster_dir / f"{cluster}_CGN.mcds",
        chrom_size_path,
        dmr_path,
    )
    with open(cmd_dir / f"{cluster}.sbatch", "w") as f:
        f.write(curr_header + "\n" + curr_allc_cmd)

## Domain (cluster)

In [7]:
from pathlib import Path

In [8]:
# Project directories and genome reference files, as Path objects.
parent_root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic")
root = parent_root / "human_hippocampus" / "snm3c"
output_dir = root.joinpath("Combined", "hic")
chrom_size_path = parent_root.joinpath("genome", "genome_hg38", "hg38.autosomal.chrom.sizes")
# The blacklist lives next to the chrom sizes file.
black_list_path = chrom_size_path.with_name("hg38_blacklist.v2.bed.gz")

In [9]:
# Gather every file matching the impute/25kb chunk layout under the *_deep directories.
cool_list = [*root.glob("*_deep/hic/impute/25kb/chunk*/*cool")]

In [10]:
# Per-cell table of cool file paths, indexed by file stem. The stem is looked
# up against metadata.index below, so it has to match the metadata cell ids.
cell_table = pd.DataFrame(
    [str(xx) for xx in cool_list],
    index=[str(xx.stem) for xx in cool_list],
    columns=["cool_path"]
)
# Select the rows for the cells listed in metadata, in metadata order
# (raises KeyError if a metadata cell has no cool file).
cell_table = cell_table.loc[metadata.index]
cell_table["final_cluster"] = metadata["final_cluster"].copy()

# Merge fine-grained clusters into coarser groups. A single .loc assignment
# is used per group instead of chained indexing
# (cell_table["final_cluster"][mask] = ...), which emits
# SettingWithCopyWarning and does nothing when pandas Copy-on-Write is active.
merged_clusters = {
    "Astro": ["Astro1", "Astro2"],
    "Micro": ["Micro1", "Micro2"],
    "Endo_VLMC": ["VLMC", "Endo"],
    "ExcNeurons": ["CA", "SUB", "DG"],
    "InhNeurons": ["VIP", "SST", "PVALB", "NR2F2-LAMP5"],
}
for merged_name, members in merged_clusters.items():
    cell_table.loc[cell_table["final_cluster"].isin(members), "final_cluster"] = merged_name

In [11]:
# Remove the merged Endo_VLMC cells from the table (in place).
endo_vlmc_cells = cell_table.index[cell_table["final_cluster"] == "Endo_VLMC"]
cell_table.drop(index=endo_vlmc_cells, inplace=True)

In [12]:
# Directory that receives the per-cluster cell tables, chunk directories and commands.
domain_output_dir = output_dir / "domain_cluster"
# exist_ok=True makes a prior exists() check redundant; parents=True matches
# how the per-cluster sub-directories are created.
domain_output_dir.mkdir(parents=True, exist_ok=True)

In [13]:
# Write a headerless cell_table.csv (index first) into one directory per cluster.
for cluster_name, cluster_cells in cell_table.groupby("final_cluster"):
    target_dir = domain_output_dir.joinpath(cluster_name)
    target_dir.mkdir(parents=True, exist_ok=True)
    table_path = target_dir / "cell_table.csv"
    cluster_cells.to_csv(table_path, index=True, header=False)

In [14]:
# Settings emitted as `key = value` lines ahead of the Snakefile templates;
# the path is wrapped in double quotes so it is written as a quoted string.
params = dict(
    resolution=25000,
    chrom_size_path=f'"{chrom_size_path}"',
)

In [15]:
# Split every cluster into chunks of at most `chunksize` cells and prepare one
# snakemake working directory (<cluster>_chunk<i>) per chunk via prep_dir.
chunksize = 200
resolution = 25000
total_chunk_dirs = []   # every chunk directory, in creation order
group_chunks = {}       # cluster name -> list of its chunk directories

with open("{}/cool/Snakefile_chunk_template".format(PKG_DIR)) as f:
    GENERATE_MATRIX_CHUNK_TEMPLATE = f.read()

for group, group_df in cell_table.groupby("final_cluster"):
    group_chunks[group] = []

    if group_df.shape[0] <= chunksize:
        # Small cluster: a single chunk holding the cluster table unchanged.
        chunk_frames = [(0, group_df)]
    else:
        # assign() returns a new frame, so the groupby slice is never written
        # to (the in-place column write raised SettingWithCopyWarning).
        chunked_df = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunksize)
        # NOTE(review): for multi-chunk clusters the helper "chunk" column is
        # still written to cell_table.csv as a trailing column, unlike
        # single-chunk clusters - confirm the Snakefile template tolerates it.
        chunk_frames = chunked_df.groupby("chunk")

    for chunk, chunk_df in chunk_frames:
        curr_dir = domain_output_dir / f"{group}_chunk{chunk}"
        params["cell_table_path"] = '"{}"'.format(curr_dir / "cell_table.csv")
        prep_dir(str(curr_dir), chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
        total_chunk_dirs.append(curr_dir)
        group_chunks[group].append(curr_dir)

In [16]:
# List one snakemake invocation per chunk directory, one command per line.
command_dir = domain_output_dir / "commands"
command_dir.mkdir(exist_ok=True)
step1_lines = [
    f"snakemake -d {chunk_dir} --snakefile {chunk_dir}/Snakefile_master -j 5 --rerun-incomplete\n"
    for chunk_dir in total_chunk_dirs
]
with open(command_dir / "snakemake_cmd_step1.txt", "w") as f:
    f.writelines(step1_lines)

In [17]:
# Build the group-level Snakefile: drop the per-chunk setting, add output_dir,
# then write the settings followed by the packaged group template.
params.pop("cell_table_path", None)
params["output_dir"] = f'"{domain_output_dir}"'
params_str = "\n".join(f"{k} = {v}" for k, v in params.items())

with open(f"{PKG_DIR}/cool/Snakefile_group_template", "r") as f:
    GENERATE_MATRIX_GROUP_TEMPLATE = f.read()

group_snakefile = domain_output_dir / "commands" / "Snakefile"
with open(group_snakefile, "w") as f:
    f.write(params_str + "\n" + GENERATE_MATRIX_GROUP_TEMPLATE)

# Single step-2 command that runs the Snakefile written above.
cmd = f"snakemake -d {domain_output_dir} --snakefile {group_snakefile} -j 10 --rerun-incomplete"
with open(domain_output_dir / "commands" / "snakemake_cmd_step2.txt", "w") as f:
    f.write(cmd + "\n")

In [18]:
# SLURM preamble template for the Hi-C domain jobs. Format fields:
# {0} sub-directory under Combined/hic, {1} step number, {2} batch index,
# {3} node count, {4} CPU count, {5} memory in GB, {6} wall time in hours,
# {7} directory the job changes into before running.
sbatch_header = """\
#! /bin/bash
#SBATCH -p condo
#SBATCH -q condo
#SBATCH -A csd772
#SBATCH -J hh_{0}_s{1}_{2}
#SBATCH -N {3}
#SBATCH -c {4}
#SBATCH --mem {5}G
#SBATCH -t {6}:00:00
#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out
#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err
#SBATCH --mail-user biy022@health.ucsd.edu
#SBATCH --mail-type FAIL

source ~/.bashrc
conda activate schicluster
cd {7}
"""

In [19]:
# Split the step-1 command list into batches of `chunksize` commands. Each
# batch gets its own command file plus an sbatch script that runs it with bash.
chunksize = 40
commands_path = os.path.join(domain_output_dir, "commands")
with open(os.path.join(commands_path, "snakemake_cmd_step1.txt"), "r") as finput:
    cmds = finput.readlines()

index = 0
while index * chunksize < len(cmds):
    batch = cmds[index * chunksize:(index + 1) * chunksize]
    with open(os.path.join(commands_path, f"snakemake_cmd_step1_{index}.txt"), "w") as f:
        f.write("\n".join(line.strip() for line in batch) + "\n")
    with open(os.path.join(commands_path, f"step1_{index}.sbatch"), "w") as f:
        # Requested resources: 1 node, 16 CPUs, 100 GB memory, 4 hours.
        f.write(sbatch_header.format("domain_cluster", 1, index, 1, 16, 100, 4, commands_path) + "\n")
        f.write(f"bash snakemake_cmd_step1_{index}.txt\n")
    index += 1

## Domain (cluster x age)

In [20]:
from pathlib import Path

In [21]:
# Project directories and genome reference files, as Path objects.
parent_root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic")
root = parent_root / "human_hippocampus" / "snm3c"
output_dir = root.joinpath("Combined", "hic")
chrom_size_path = parent_root.joinpath("genome", "genome_hg38", "hg38.autosomal.chrom.sizes")
# The blacklist lives next to the chrom sizes file.
black_list_path = chrom_size_path.with_name("hg38_blacklist.v2.bed.gz")

In [22]:
# Gather every file matching the impute/25kb chunk layout under the *_deep directories.
cool_list = [*root.glob("*_deep/hic/impute/25kb/chunk*/*cool")]

In [23]:
# Per-cell table of cool file paths, indexed by file stem. The stem is looked
# up against metadata.index below, so it has to match the metadata cell ids.
cell_table = pd.DataFrame(
    [str(xx) for xx in cool_list],
    index=[str(xx.stem) for xx in cool_list],
    columns=["cool_path"]
)
# Select the rows for the cells listed in metadata, in metadata order
# (raises KeyError if a metadata cell has no cool file).
cell_table = cell_table.loc[metadata.index]
cell_table["final_cluster"] = metadata["final_cluster"].copy()

# Merge fine-grained clusters into coarser groups. A single .loc assignment
# is used per group instead of chained indexing
# (cell_table["final_cluster"][mask] = ...), which emits
# SettingWithCopyWarning and does nothing when pandas Copy-on-Write is active.
merged_clusters = {
    "Astro": ["Astro1", "Astro2"],
    "Micro": ["Micro1", "Micro2"],
    "Endo_VLMC": ["VLMC", "Endo"],
    "ExcNeurons": ["CA", "SUB", "DG"],
    "InhNeurons": ["VIP", "SST", "PVALB", "NR2F2-LAMP5"],
}
for merged_name, members in merged_clusters.items():
    cell_table.loc[cell_table["final_cluster"].isin(members), "final_cluster"] = merged_name

# Remove the merged Endo_VLMC cells.
cell_table.drop(cell_table.index[cell_table.final_cluster == "Endo_VLMC"], inplace=True)

# Turn the metadata group label "a-b" into "Ageab", then combine it with the
# cluster into a single "<cluster>_<age>" key and drop the two source columns.
cell_table["age_group"] = metadata["group"].copy()
cell_table["age_group"] = cell_table.age_group.map(lambda x: "Age{0}{1}".format(*(x.split("-"))))
cell_table["group"] = cell_table[["final_cluster", "age_group"]].agg("_".join, axis=1)
cell_table = cell_table.drop(["final_cluster", "age_group"], axis=1)

In [24]:
# Directory that receives the per-(cluster, age) cell tables, chunk directories and commands.
domain_output_dir = output_dir / "domain_cluster_age"
# exist_ok=True makes a prior exists() check redundant; parents=True matches
# how the per-group sub-directories are created.
domain_output_dir.mkdir(parents=True, exist_ok=True)

In [25]:
# Write a headerless cell_table.csv (index first) into <subclass>/<age>/ for every group.
for group_name, group_cells in cell_table.groupby("group"):
    # The age label is the part after the last underscore of the group key.
    subclass, age = group_name.rsplit("_", 1)
    age_dir = domain_output_dir.joinpath(subclass, age)
    age_dir.mkdir(parents=True, exist_ok=True)
    group_cells.to_csv(age_dir / "cell_table.csv", index=True, header=False)

In [26]:
# Settings emitted as `key = value` lines ahead of the Snakefile templates;
# the path is wrapped in double quotes so it is written as a quoted string.
params = dict(
    resolution=25000,
    chrom_size_path=f'"{chrom_size_path}"',
)

In [27]:
# Split every (cluster, age) group into chunks of at most `chunksize` cells and
# prepare one snakemake working directory (<subclass>/<age>_chunk<i>) per chunk.
chunksize = 200
resolution = 25000
total_chunk_dirs = []   # every chunk directory, in creation order
group_chunks = {}       # group key -> list of its chunk directories

with open("{}/cool/Snakefile_chunk_template".format(PKG_DIR)) as f:
    GENERATE_MATRIX_CHUNK_TEMPLATE = f.read()

for group, group_df in cell_table.groupby("group"):
    group_chunks[group] = []
    subclass, age = group.rsplit("_", 1)

    if group_df.shape[0] <= chunksize:
        # Small group: a single chunk holding the group table unchanged.
        chunk_frames = [(0, group_df)]
    else:
        # assign() returns a new frame, so the groupby slice is never written
        # to (the in-place column write raised SettingWithCopyWarning).
        chunked_df = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunksize)
        # NOTE(review): for multi-chunk groups the helper "chunk" column is
        # still written to cell_table.csv as a trailing column, unlike
        # single-chunk groups - confirm the Snakefile template tolerates it.
        chunk_frames = chunked_df.groupby("chunk")

    for chunk, chunk_df in chunk_frames:
        curr_dir = domain_output_dir / subclass / f"{age}_chunk{chunk}"
        params["cell_table_path"] = '"{}"'.format(curr_dir / "cell_table.csv")
        prep_dir(str(curr_dir), chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
        total_chunk_dirs.append(curr_dir)
        group_chunks[group].append(curr_dir)

In [28]:
# List one snakemake invocation per chunk directory, one command per line.
command_dir = domain_output_dir / "commands"
command_dir.mkdir(exist_ok=True)
step1_lines = [
    f"snakemake -d {chunk_dir} --snakefile {chunk_dir}/Snakefile_master -j 5 --rerun-incomplete\n"
    for chunk_dir in total_chunk_dirs
]
with open(command_dir / "snakemake_cmd_step1.txt", "w") as f:
    f.writelines(step1_lines)

In [29]:
# Write, for every subclass, a group-level Snakefile and the snakemake command
# that runs it inside <domain_output_dir>/<subclass>.

# The template is the same for every subclass, so read it once.
with open("{}/cool/Snakefile_group_template".format(PKG_DIR), "r") as f:
    GENERATE_MATRIX_GROUP_TEMPLATE = f.read()

# The per-chunk setting does not belong in the group-level Snakefile.
params.pop("cell_table_path", None)

# Output files are keyed by subclass only, so visit each subclass once rather
# than rewriting identical files for every age group. Group keys are sorted to
# keep the same visiting order as groupby().
subclasses = dict.fromkeys(
    group.rsplit("_", 1)[0] for group in sorted(cell_table["group"].unique())
)

for subclass in subclasses:
    params["output_dir"] = '"{}"'.format(domain_output_dir / subclass)
    params_str = "\n".join("{} = {}".format(k, v) for k, v in params.items())
    snakefile_path = domain_output_dir / "commands" / f"Snakefile_{subclass}"

    with open(snakefile_path, "w") as f:
        f.write(params_str + "\n" + GENERATE_MATRIX_GROUP_TEMPLATE)

    cmd = "snakemake -d {} --snakefile {} -j 10 --rerun-incomplete".format(
        domain_output_dir / subclass,
        snakefile_path
    )
    with open(domain_output_dir / "commands" / f"snakemake_cmd_step2_{subclass}.txt", "w") as f:
        f.write(cmd + "\n")

In [30]:
# SLURM preamble template for the Hi-C domain jobs. Format fields:
# {0} sub-directory under Combined/hic, {1} step number, {2} batch index,
# {3} node count, {4} CPU count, {5} memory in GB, {6} wall time in hours,
# {7} directory the job changes into before running.
sbatch_header = """\
#! /bin/bash
#SBATCH -p condo
#SBATCH -q condo
#SBATCH -A csd772
#SBATCH -J hh_{0}_s{1}_{2}
#SBATCH -N {3}
#SBATCH -c {4}
#SBATCH --mem {5}G
#SBATCH -t {6}:00:00
#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out
#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err
#SBATCH --mail-user biy022@health.ucsd.edu
#SBATCH --mail-type FAIL

source ~/.bashrc
conda activate schicluster
cd {7}
"""

In [31]:
# Split the step-1 command list into batches of `chunksize` commands. Each
# batch gets its own command file plus an sbatch script that runs it with bash.
chunksize = 40
commands_path = domain_output_dir / "commands"
with open(commands_path / "snakemake_cmd_step1.txt", "r") as finput:
    cmds = finput.readlines()

index = 0
while index * chunksize < len(cmds):
    batch = cmds[index * chunksize:(index + 1) * chunksize]
    with open(commands_path / f"snakemake_cmd_step1_{index}.txt", "w") as f:
        f.write("\n".join(line.strip() for line in batch) + "\n")
    with open(commands_path / f"step1_{index}.sbatch", "w") as f:
        # Requested resources: 1 node, 16 CPUs, 100 GB memory, 4 hours.
        header = sbatch_header.format(
            "domain_cluster_age", 1, index, 1, 16, 100, 4, commands_path)
        f.write(header + "\n")
        f.write(f"bash snakemake_cmd_step1_{index}.txt\n")
    index += 1

## Domain (cluster x age), add neuronal subclasses

In [20]:
from pathlib import Path

In [21]:
# Project directories and genome reference files, as Path objects.
parent_root = Path("/tscc/projects/ps-epigen/users/biy022/scmethylhic")
root = parent_root / "human_hippocampus" / "snm3c"
output_dir = root.joinpath("Combined", "hic")
chrom_size_path = parent_root.joinpath("genome", "genome_hg38", "hg38.autosomal.chrom.sizes")
# The blacklist lives next to the chrom sizes file.
black_list_path = chrom_size_path.with_name("hg38_blacklist.v2.bed.gz")

In [22]:
# Gather every file matching the impute/25kb chunk layout under the *_deep directories.
cool_list = [*root.glob("*_deep/hic/impute/25kb/chunk*/*cool")]

In [23]:
# Per-cell table of cool file paths, indexed by file stem. The stem is looked
# up against metadata.index below, so it has to match the metadata cell ids.
cell_table = pd.DataFrame(
    [str(xx) for xx in cool_list],
    index=[str(xx.stem) for xx in cool_list],
    columns=["cool_path"]
)
# Select the rows for the cells listed in metadata, in metadata order
# (raises KeyError if a metadata cell has no cool file).
cell_table = cell_table.loc[metadata.index]
cell_table["final_cluster"] = metadata["final_cluster"].copy()

# Merge the listed clusters. A single .loc assignment is used per group
# instead of chained indexing (cell_table["final_cluster"][mask] = ...), which
# emits SettingWithCopyWarning and does nothing when pandas Copy-on-Write is
# active.
merged_clusters = {
    "Astro": ["Astro1", "Astro2"],
    "Micro": ["Micro1", "Micro2"],
    "Endo_VLMC": ["VLMC", "Endo"],
}
for merged_name, members in merged_clusters.items():
    cell_table.loc[cell_table["final_cluster"].isin(members), "final_cluster"] = merged_name

# Drop the Astro, Micro, OPC and Oligo cells; every other cluster keeps its
# original label.
cell_table.drop(cell_table.index[cell_table.final_cluster.isin(["Astro", "Micro", "OPC", "Oligo"])], inplace=True)

# Turn the metadata group label "a-b" into "Ageab", then combine it with the
# cluster into a single "<cluster>_<age>" key and drop the two source columns.
cell_table["age_group"] = metadata["group"].copy()
cell_table["age_group"] = cell_table.age_group.map(lambda x: "Age{0}{1}".format(*(x.split("-"))))
cell_table["group"] = cell_table[["final_cluster", "age_group"]].agg("_".join, axis=1)
cell_table = cell_table.drop(["final_cluster", "age_group"], axis=1)

In [26]:
# Directory that receives the per-(cluster, age) cell tables, chunk directories and commands.
domain_output_dir = output_dir / "domain_cluster_age"
# exist_ok=True makes a prior exists() check redundant; parents=True matches
# how the per-group sub-directories are created.
domain_output_dir.mkdir(parents=True, exist_ok=True)

In [27]:
# Write a headerless cell_table.csv (index first) into <subclass>/<age>/ for every group.
for group_name, group_cells in cell_table.groupby("group"):
    # The age label is the part after the last underscore of the group key.
    subclass, age = group_name.rsplit("_", 1)
    age_dir = domain_output_dir.joinpath(subclass, age)
    age_dir.mkdir(parents=True, exist_ok=True)
    group_cells.to_csv(age_dir / "cell_table.csv", index=True, header=False)

In [28]:
# Settings emitted as `key = value` lines ahead of the Snakefile templates;
# the path is wrapped in double quotes so it is written as a quoted string.
params = dict(
    resolution=25000,
    chrom_size_path=f'"{chrom_size_path}"',
)

In [29]:
# Split every (cluster, age) group into chunks of at most `chunksize` cells and
# prepare one snakemake working directory (<subclass>/<age>_chunk<i>) per chunk.
chunksize = 200
resolution = 25000
total_chunk_dirs = []   # every chunk directory, in creation order
group_chunks = {}       # group key -> list of its chunk directories

with open("{}/cool/Snakefile_chunk_template".format(PKG_DIR)) as f:
    GENERATE_MATRIX_CHUNK_TEMPLATE = f.read()

for group, group_df in cell_table.groupby("group"):
    group_chunks[group] = []
    subclass, age = group.rsplit("_", 1)

    if group_df.shape[0] <= chunksize:
        # Small group: a single chunk holding the group table unchanged.
        chunk_frames = [(0, group_df)]
    else:
        # assign() returns a new frame, so the groupby slice is never written
        # to (the in-place column write raised SettingWithCopyWarning).
        chunked_df = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunksize)
        # NOTE(review): for multi-chunk groups the helper "chunk" column is
        # still written to cell_table.csv as a trailing column, unlike
        # single-chunk groups - confirm the Snakefile template tolerates it.
        chunk_frames = chunked_df.groupby("chunk")

    for chunk, chunk_df in chunk_frames:
        curr_dir = domain_output_dir / subclass / f"{age}_chunk{chunk}"
        params["cell_table_path"] = '"{}"'.format(curr_dir / "cell_table.csv")
        prep_dir(str(curr_dir), chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
        total_chunk_dirs.append(curr_dir)
        group_chunks[group].append(curr_dir)

In [30]:
# List one snakemake invocation per chunk directory, one command per line.
# NOTE(review): this targets the same domain_cluster_age/commands file as the
# previous section, so it replaces that section's command list - confirm intended.
command_dir = domain_output_dir / "commands"
command_dir.mkdir(exist_ok=True)
step1_lines = [
    f"snakemake -d {chunk_dir} --snakefile {chunk_dir}/Snakefile_master -j 5 --rerun-incomplete\n"
    for chunk_dir in total_chunk_dirs
]
with open(command_dir / "snakemake_cmd_step1.txt", "w") as f:
    f.writelines(step1_lines)

In [31]:
# Write, for every subclass, a group-level Snakefile and the snakemake command
# that runs it inside <domain_output_dir>/<subclass>.

# The template is the same for every subclass, so read it once.
with open("{}/cool/Snakefile_group_template".format(PKG_DIR), "r") as f:
    GENERATE_MATRIX_GROUP_TEMPLATE = f.read()

# The per-chunk setting does not belong in the group-level Snakefile.
params.pop("cell_table_path", None)

# Output files are keyed by subclass only, so visit each subclass once rather
# than rewriting identical files for every age group. Group keys are sorted to
# keep the same visiting order as groupby().
subclasses = dict.fromkeys(
    group.rsplit("_", 1)[0] for group in sorted(cell_table["group"].unique())
)

for subclass in subclasses:
    params["output_dir"] = '"{}"'.format(domain_output_dir / subclass)
    params_str = "\n".join("{} = {}".format(k, v) for k, v in params.items())
    snakefile_path = domain_output_dir / "commands" / f"Snakefile_{subclass}"

    with open(snakefile_path, "w") as f:
        f.write(params_str + "\n" + GENERATE_MATRIX_GROUP_TEMPLATE)

    cmd = "snakemake -d {} --snakefile {} -j 10 --rerun-incomplete".format(
        domain_output_dir / subclass,
        snakefile_path
    )
    with open(domain_output_dir / "commands" / f"snakemake_cmd_step2_{subclass}.txt", "w") as f:
        f.write(cmd + "\n")

In [32]:
# SLURM preamble template for the Hi-C domain jobs. Format fields:
# {0} sub-directory under Combined/hic, {1} step number, {2} batch index,
# {3} node count, {4} CPU count, {5} memory in GB, {6} wall time in hours,
# {7} directory the job changes into before running.
sbatch_header = """\
#! /bin/bash
#SBATCH -p condo
#SBATCH -q condo
#SBATCH -A csd772
#SBATCH -J hh_{0}_s{1}_{2}
#SBATCH -N {3}
#SBATCH -c {4}
#SBATCH --mem {5}G
#SBATCH -t {6}:00:00
#SBATCH -o /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.out
#SBATCH -e /tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/Combined/hic/{0}/commands/step{1}_{2}.err
#SBATCH --mail-user biy022@health.ucsd.edu
#SBATCH --mail-type FAIL

source ~/.bashrc
conda activate schicluster
cd {7}
"""

In [33]:
# Split the step-1 command list into batches of `chunksize` commands. Each
# batch gets its own command file plus an sbatch script that runs it with bash.
chunksize = 40
commands_path = domain_output_dir / "commands"
with open(commands_path / "snakemake_cmd_step1.txt", "r") as finput:
    cmds = finput.readlines()

index = 0
while index * chunksize < len(cmds):
    batch = cmds[index * chunksize:(index + 1) * chunksize]
    with open(commands_path / f"snakemake_cmd_step1_{index}.txt", "w") as f:
        f.write("\n".join(line.strip() for line in batch) + "\n")
    with open(commands_path / f"step1_{index}.sbatch", "w") as f:
        # Requested resources: 1 node, 16 CPUs, 100 GB memory, 4 hours.
        header = sbatch_header.format(
            "domain_cluster_age", 1, index, 1, 16, 100, 4, commands_path)
        f.write(header + "\n")
        f.write(f"bash snakemake_cmd_step1_{index}.txt\n")
    index += 1