# Pre-processing data

In [1]:
import pandas as pd
import os
from pathlib import Path
import scanpy as sc
from ALLCools.mcds.utilities import calculate_posterior_mc_frac
from tqdm import tqdm

In [2]:
# Switch to the project directory; the cells below open files by relative path.
project_dir = "/tscc/projects/ps-epigen/users/biy022/scmethylhic/human_hippocampus/snm3c/hh_micro_mono/"
os.chdir(project_dir)

In [3]:
# Descend into the dataset sub-directory (path is relative to the current working directory).
dataset_dir = "DevelopingHumanBrain_MGC/"
os.chdir(dataset_dir)

In [4]:
# Read the tab-separated annotation table and key its rows by the "Cell_Id" column.
metadata = (
    pd.read_csv("meta.tsv", sep="\t", header=0)
    .set_index("Cell_Id")
)

In [5]:
# Keep only the rows that satisfy all three conditions:
#   - the "L3" label starts with "MGC"
#   - the "Age_groups" value is anything other than "adult"
#   - the "Region" value is exactly "HPC"
is_mgc = metadata["L3"].str.startswith("MGC")
not_adult = metadata["Age_groups"] != "adult"
in_hpc = metadata["Region"] == "HPC"
metadata = metadata.loc[is_mgc & not_adult & in_hpc].copy()

In [6]:
# Display the (rows, columns) shape of the metadata table after filtering.
metadata.shape

(519, 13)

In [7]:
def rename_bin(col: str, bin_size: int = 100000) -> str:
    """Turn a bin column label into a compact ``<name>_<bin index>`` label.

    The label is split on ``"_"``. The second field is kept verbatim
    (presumably the chromosome/contig name -- it must not itself contain
    an underscore), and the third field, of the form ``<start>-<end>``,
    supplies the start coordinate. The bin index is ``start // bin_size``.

    Args:
        col: Column label with at least three ``"_"``-separated fields,
            e.g. ``"bin_chr1_200000-300000"``.
        bin_size: Width of one bin, in the same units as ``start``.
            Defaults to 100000, the value that used to be hard-coded.

    Returns:
        ``"<second field>_<start // bin_size>"``, e.g. ``"chr1_2"``.

    Raises:
        IndexError: If ``col`` has fewer than three ``"_"``-separated fields.
        ValueError: If the third field is not exactly ``<start>-<end>`` or
            ``start`` is not an integer literal.
    """
    fields = col.split("_")
    # Unpacking into two names keeps the check that the range contains
    # exactly one "-"; the end coordinate itself is not needed.
    start, _ = fields[2].split("-")
    # Floor division stays in integer arithmetic, avoiding the float
    # round-trip (and its precision loss on very large coordinates).
    return "{}_{}".format(fields[1], int(start) // bin_size)

In [8]:
def _read_bin_table(prefix: str, context: str, kind: str) -> pd.DataFrame:
    """Load one per-sample bin table as a cell-by-bin DataFrame.

    Args:
        prefix: Sample prefix used in the file name.
        context: Methylation context tag, ``"CG"`` or ``"CH"``.
        kind: ``"coverage"`` or ``"counts"``.

    Returns:
        A frame indexed by cleaned cell identifier, restricted to the columns
        whose label starts with ``"bin"`` and relabelled through
        ``rename_bin``.
    """
    table = pd.read_csv(
        "data/{}_bm{}_{}.tsv.gz".format(prefix, context, kind),
        header=0, sep="\t", index_col=0,
    )
    # After the first file column is consumed as the index, the next column
    # carries the cell identifiers; promote it to the index.
    table = table.set_index(table.columns[0])
    # Strip the "allc_" prefix and the "_indexed_bin_m<context>" suffix so the
    # identifiers can be matched against metadata.index.
    table.index = table.index.str.replace(
        r"^allc_|_indexed_bin_m{}$".format(context), "", regex=True)
    return table.filter(regex=r"^bin").rename(columns=rename_bin)


mcg_frac_list = []
mch_frac_list = []
cells = []
# A sample prefix is the first two "_"-separated fields of the file name.
# Sorting makes the sample order (and hence the row order of the final
# tables) reproducible; Path.glob yields files in arbitrary order.
samples = sorted(
    "_".join(path.name.split("_")[:2])
    for path in Path("data").glob("*_bmCG_coverage.tsv.gz")
)

with tqdm(samples, desc="Processing") as pbar:
    for prefix in pbar:
        pbar.set_postfix(current=prefix)
        # NOTE(review): counts and coverage are combined positionally via
        # to_numpy(); this assumes both files list cells and bins in the
        # same order -- confirm against the data files.

        # mCG
        mcg_cov = _read_bin_table(prefix, "CG", "coverage")
        mcg_count = _read_bin_table(prefix, "CG", "counts")
        # Cells of this sample that survived the metadata filtering.
        selected_cells = mcg_cov.index[mcg_cov.index.isin(metadata.index)].to_list()
        # Per-cell normalisation is switched off here.
        mcg_frac = calculate_posterior_mc_frac(
            mcg_count.to_numpy(), mcg_cov.to_numpy(), normalize_per_cell=False)
        mcg_frac = pd.DataFrame(mcg_frac, index=mcg_cov.index, columns=mcg_cov.columns)
        mcg_frac_list.append(mcg_frac.loc[selected_cells, :].copy())

        # mCH
        mch_cov = _read_bin_table(prefix, "CH", "coverage")
        mch_count = _read_bin_table(prefix, "CH", "counts")
        mch_frac = calculate_posterior_mc_frac(
            mch_count.to_numpy(), mch_cov.to_numpy(), normalize_per_cell=False)
        mch_frac = pd.DataFrame(mch_frac, index=mch_cov.index, columns=mch_cov.columns)
        # The cell selection derived from the mCG table is reused so both
        # outputs carry the same cells in the same order.
        mch_frac_list.append(mch_frac.loc[selected_cells, :].copy())

        cells.extend(selected_cells)

Processing: 100%|██████████| 13/13 [12:35<00:00, 58.13s/it, current=GSM6596816_211130-4267-HPC]     


In [9]:
# Stack the per-sample frames row-wise into a single table per methylation context.
mcg_frac, mch_frac = (
    pd.concat(per_sample, axis="index")
    for per_sample in (mcg_frac_list, mch_frac_list)
)

In [10]:
# Wrap each combined fraction table in an AnnData object.
mcg_adata, mch_adata = (sc.AnnData(frac) for frac in (mcg_frac, mch_frac))

In [13]:
# Create the output directory first so the writes below cannot fail on a
# missing folder after the long processing step.
os.makedirs("processed", exist_ok=True)
mcg_adata.write_h5ad("processed/mCG_raw.mcad")
mch_adata.write_h5ad("processed/mCH_raw.mcad")