# Create the ALLC path table for MCDS for the DMR classifier analysis

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import anndata
import scanpy as sc

from ALLCools.clustering import \
    tsne, \
    significant_pc_test, \
    filter_regions, \
    remove_black_list_region, \
    lsi, \
    binarize_matrix
from ALLCools.plot import *
from ALLCools.mcds import MCDS
from pathlib import Path

In [2]:
import os
# Switch into the analysis directory; the relative paths used by later cells
# (e.g. "DMR_classifier/...") are resolved against it.
work_dir = "/tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/hh_micro_mono/"
os.chdir(work_dir)

In [3]:
# Read the human hippocampus AnnData object from its .h5ad file.
hh_h5ad_file = "/tscc/projects/ps-renlab2/sel041/scmethylhic/human_hippocampus/concat/human_aging_5kb-CGN-hypo_cluster_bbknn_filtered_annotated_revision4.h5ad"
hh_adata = sc.read_h5ad(hh_h5ad_file)

In [4]:
# Work on an independent copy of the .obs table so that columns added to
# `metadata` below do not modify hh_adata.obs.
metadata = hh_adata.obs.copy(deep=True)

In [5]:
def get_allc_path(index, upper_dir):
    """Build the ALLC file path that corresponds to one cell index.

    ``index`` must consist of exactly five "-"-separated fields:
    ``<sample>-<f1>-<f2>-<f3>-<well>``. The returned path is

        <upper_dir>/<sample>_deep/fastq_demultiplex/<f1>-<f2>-<f3>/allc/<f1>-<f2>-<f3>-<well>.allc.tsv.gz

    Parameters
    ----------
    index : str
        Cell identifier with five "-"-separated fields.
    upper_dir : str
        Directory that contains the per-sample ``<sample>_deep`` folders.

    Returns
    -------
    str
        ``upper_dir`` joined with the relative ALLC path.

    Raises
    ------
    ValueError
        If ``index`` does not split into exactly five fields.
    """
    sample, field_1, field_2, field_3, well = index.split("-")
    position = f"{field_1}-{field_2}-{field_3}"
    allc_name = f"{position}-{well}.allc.tsv.gz"
    relative_path = f"{sample}_deep/fastq_demultiplex/{position}/allc/{allc_name}"
    return os.path.join(upper_dir, relative_path)

In [6]:
# Root directory passed to get_allc_path() for every cell.
upper_dir = "/tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/"

def _allc_path_for_cell(cell_index):
    """Return the ALLC path of one cell, located under `upper_dir`."""
    return get_allc_path(cell_index, upper_dir)

# Derive one ALLC path per row from the row's index label.
metadata["allc_path"] = metadata.index.map(_allc_path_for_cell)

In [7]:
## Human Hippocampus paths
# Keep only the ALLC path and the subcluster label, store the label as plain
# strings, and expose it under the shared column name "celltype".
hh_paths = (
    metadata[["allc_path", "subcluster"]]
    .astype({"subcluster": str})
    .rename(columns={"subcluster": "celltype"})
)

In [8]:
## Human Atlas paths
# Every "<celltype>-AllcPaths.tsv" file is read as a headerless, single-column
# list of ALLC paths belonging to one cell type.
ha_df_list = []
# Sort the matches: glob yields files in filesystem-dependent order, and sorting
# keeps the row order of the resulting table reproducible between runs.
for file in sorted(Path("HumanCellEpigenomAtlas_Mono_2/immune_subtypes/").glob("*-AllcPaths.tsv")):
    file_suffix = file.name
    # Split on the LAST "-" only, so cell type names that themselves contain
    # hyphens are kept whole instead of raising on tuple unpacking.
    celltype = file_suffix.rsplit("-", 1)[0]
    df = pd.read_csv(file, sep="\t", header=None)
    df.columns = ["allc_path"]
    df["celltype"] = celltype
    # Row label = ALLC file name up to its first ".".
    df["index"] = df["allc_path"].map(lambda x: Path(x).name.split(".")[0])
    df.set_index("index", inplace=True)
    ha_df_list.append(df)
# NOTE: pd.concat raises ValueError if no file matched the pattern above.
ha_paths = pd.concat(ha_df_list, axis=0)

In [9]:
## Infant MGC-1 path
# Collect every "*.tsv.gz" file in the infant MGC-1 directory. The matches are
# sorted because glob order is filesystem-dependent; sorting keeps the table
# reproducible between runs.
infant_files = sorted(Path("DevelopingHumanBrain_MGC/data/infant_MGC-1/").glob("*.tsv.gz"))
# Row label = file name up to its first ".".
indices = [file.name.split(".")[0] for file in infant_files]
# Store absolute paths as plain strings so the "allc_path" column has the same
# type as in the other path tables (which hold str, not Path objects).
allc_paths = [str(file.absolute()) for file in infant_files]
infant_paths = pd.DataFrame({
    "index": indices,
    "allc_path": allc_paths
})
# All files in this directory are labelled with the same cell type.
infant_paths["celltype"] = "infant_MGC-1"
infant_paths.set_index("index", inplace=True)

In [10]:
# Stack the three per-dataset path tables row-wise into a single table.
path_tables = [hh_paths, ha_paths, infant_paths]
all_paths = pd.concat(path_tables)

In [11]:
# Write the full table (index, allc_path, celltype) as a tab-separated file;
# the header row and the index column are written by default.
all_paths.to_csv("DMR_classifier/allc_meta.tsv", sep="\t")

In [13]:
# Write a headerless two-column TSV: the row index followed by its ALLC path.
all_paths.to_csv(
    "DMR_classifier/AllcPaths.tsv",
    sep="\t",
    columns=["allc_path"],
    header=False,
)