In [1]:
import pandas as pd
import bioframe as bf
import higlass as hg
import cooler
import os
import tempfile
from clodius.cli import aggregate
from typing import Dict
import pyBigWig
import numpy as np
import numpy.typing as npt

In [2]:
def compute_domain(coords, clr: cooler.Cooler, padding=2.0):
    """Derive a padded ``(start, end)`` position pair for a region of a cooler.

    The pixels overlapping ``coords`` are fetched and the smallest and largest
    ``bin1_id`` among them are turned into positions by multiplying with the
    cooler's bin size.

    Args:
        coords: Region string handed unchanged to ``clr.matrix(...).fetch``.
        clr: Cooler to query; must expose a fixed ``binsize``.
        padding: Factor by which the interval width is rescaled, keeping it
            centred. ``0`` disables rescaling altogether (``1.0`` is a no-op).

    Returns:
        Tuple ``(start, end)``; ``start`` is clamped so it is never negative.

    NOTE(review): the positions come from bin IDs, which in cooler index the
    genome-wide bin table, so they look like absolute offsets rather than
    chromosome-relative coordinates — confirm this is what the caller wants.
    NOTE(review): only ``bin1_id`` is inspected and the upper bound is
    ``max_id * binsize`` (not ``max_id + 1``) — verify the last bin is meant to
    be excluded before padding.
    """
    first_bins = clr.matrix(as_pixels=True).fetch(coords)["bin1_id"]
    bin_size = clr.binsize

    lower = first_bins.min() * bin_size
    upper = first_bins.max() * bin_size

    # A padding of exactly zero means "leave the interval untouched".
    if padding == 0.0:
        return max(0, lower), upper

    width = upper - lower
    # Half of the extra width is added on each side of the interval.
    margin = (int(padding * width) - width) // 2

    return max(0, lower - margin), upper + margin

def compute_max_tad_clique_size(cliques: pd.DataFrame) -> pd.DataFrame:
    """Find, for every TAD, the largest clique it takes part in.

    Args:
        cliques: Table with a ``tad_ids`` column (comma-separated TAD
            identifiers) and a ``size`` column.

    Returns:
        One row per TAD with columns ``tad`` (identifier as a string),
        ``tad_ids`` (member list of the largest clique containing that TAD)
        and ``size``, ordered by decreasing ``size``. When several cliques tie
        for a TAD, the one encountered first in ``cliques`` is kept.
    """
    # tad -> (member list, size) of the best clique seen so far. Updating an
    # existing key keeps its original position, so TADs stay in first-seen order.
    best = {}

    for members, size in zip(cliques["tad_ids"].tolist(), cliques["size"].tolist()):
        for tad in members.split(","):
            current = best.get(tad)
            if current is None or size > current[1]:
                best[tad] = (members, size)

    table = pd.DataFrame(
        {
            "tad": list(best),
            "tad_ids": [members for members, _ in best.values()],
            "size": [size for _, size in best.values()],
        }
    )
    return table.sort_values("size", ascending=False)


def prepare_cliques(path_to_doms, outdir):
    """Aggregate a BEDPE file into a ``.beddb`` file with clodius.

    Args:
        path_to_doms: Path to the input BEDPE file. A trailing ``.bedpe`` is
            stripped from its base name to form the output name.
        outdir: Directory in which ``<name>.beddb`` is written.

    Returns:
        Path of the ``.beddb`` file passed to clodius.
    """
    stem = os.path.basename(path_to_doms).removesuffix(".bedpe")
    beddb_path = os.path.join(outdir, f"{stem}.beddb")

    # Column positions of the two anchors in the BEDPE file
    # (presumably 1-based, as in the clodius CLI — TODO confirm).
    anchor_columns = {
        "chr1_col": 1,
        "from1_col": 2,
        "to1_col": 3,
        "chr2_col": 4,
        "from2_col": 5,
        "to2_col": 6,
    }

    aggregate._bedpe(path_to_doms, beddb_path, assembly="hg38", **anchor_columns)
    return beddb_path


def write_bigwig(outname: str, df: pd.DataFrame, chrom_sizes: Dict[str, int]):
    """Write interval values from a data frame to a bigWig file.

    Args:
        outname: Path of the bigWig file to create.
        df: Table with ``chrom``, ``start``, ``end`` and ``value`` columns.
        chrom_sizes: Mapping of chromosome name to size; written as the file
            header in the mapping's iteration order.

    Returns:
        ``outname``, unchanged.
    """
    with pyBigWig.open(outname, "w") as bw:
        bw.addHeader(list(chrom_sizes.items()))

        chroms, starts, ends, values = (
            df[column].tolist() for column in ("chrom", "start", "end", "value")
        )
        bw.addEntries(chroms, starts, ends=ends, values=values)

    return outname

def compute_clique_span(cliques: pd.DataFrame, domains: pd.DataFrame) -> npt.NDArray:
    """Compute the span covered by the member TADs of each clique.

    For every row of ``cliques`` the span is ``max(end) - min(start)`` taken
    over all rows of ``domains`` whose ``id`` appears in that clique's
    ``tad_ids``.

    Args:
        cliques: Table with a ``tad_ids`` column holding comma-separated
            integer TAD identifiers.
        domains: Table with ``id``, ``start`` and ``end`` columns.

    Returns:
        Array with one span per row of ``cliques``, in row order. Identifiers
        that are absent from ``domains`` are ignored regardless of where they
        appear in ``tad_ids``; a clique with no resolvable identifier yields
        NaN.
    """
    # Reduce the domain table to one (min start, max end) pair per ID up front,
    # so each clique needs a single index lookup instead of one full scan of
    # ``domains`` per member TAD.
    bounds = domains.groupby("id").agg(start=("start", "min"), end=("end", "max"))

    spans = []
    for tad_ids in cliques["tad_ids"]:
        ids = [int(tid) for tid in tad_ids.split(",")]
        # ``isin`` skips unknown IDs, so the result no longer depends on the
        # position of a missing ID within ``tad_ids``.
        members = bounds[bounds.index.isin(ids)]
        spans.append(members["end"].max() - members["start"].min())

    return np.array(spans)


In [3]:
tmpdir = "/tmp/cliques_tmp/"
os.makedirs(tmpdir, exist_ok=True)

# Selection parameters, named so they can be tuned in one place.
MIN_CLIQUE_SIZE = 3  # lower bound on the clique `size` column
SPAN_RANGE = (10_000_000, 15_000_000)  # inclusive bounds on the clique span
CLIQUE_ROW = 2  # row of the span-sorted table to visualise

chrom_sizes = (
    pd.read_table("../../data/input/hg38/hg38.filtered.chrom.sizes", names=["chrom", "size"])
    .set_index("chrom")["size"]
    .to_dict()
)
cliques = (
    pd.read_table("../../data/output/cliques/repl/cliques/hg38_001_MCF10A_WT_REP1_cis_cliques.tsv.gz")
    .sort_values("size", ascending=False)
    .reset_index(drop=True)
)
domains = pd.read_table(
    "../../data/output/cliques/repl/cliques/hg38_001_MCF10A_WT_REP1_cis_domains.bed.gz",
    names=["chrom", "start", "end", "id"],
)
compartments = (
    pd.read_table("../../data/output/compartment_analysis/10000/MCF10A_WT_T1_C1_10000.pcQnm.bedGraph.gz")[
        ["chr", "start", "end", "MCF10A_WT"]
    ]
    .rename(columns={"MCF10A_WT": "value", "chr": "chrom"})
)
# Order the intervals by the chromosome order of `chrom_sizes` before they are
# written to bigWig further down.
compartments = bf.sort_bedframe(compartments, chrom_sizes)

cliques = cliques[cliques["size"] >= MIN_CLIQUE_SIZE]

# One row per TAD, annotated with the largest clique that TAD belongs to, so
# the same clique shows up once for each of its member TADs.
max_cliques = compute_max_tad_clique_size(cliques)
max_cliques["span"] = compute_clique_span(max_cliques, domains)

max_cliques = max_cliques[max_cliques["span"].between(*SPAN_RANGE)]
max_cliques = max_cliques.sort_values("span").reset_index(drop=True)

# Fail with a readable message instead of a bare IndexError from `.iloc`.
if len(max_cliques) <= CLIQUE_ROW:
    raise ValueError(
        f"only {len(max_cliques)} row(s) have a span within {SPAN_RANGE}; "
        f"cannot select row {CLIQUE_ROW}"
    )

# Domain records of the selected clique, in the order listed in `tad_ids`.
dfs = [
    domains[domains["id"] == int(tid)]
    for tid in max_cliques.iloc[CLIQUE_ROW]["tad_ids"].split(",")
]

# Pair every domain with every domain (itself included), then keep one
# orientation of each pair so the result is the upper triangle.
clique_domains = pd.concat(dfs).drop(columns="id")
clique_domains = clique_domains.merge(clique_domains, how="cross", suffixes=("1", "2")).sort_values(
    ["start1", "start2"]
)
clique_domains = clique_domains[clique_domains["start2"] >= clique_domains["start1"]]

# `tmpdir` already ends with a separator; os.path.join avoids a doubled "/".
bedpe_path = os.path.join(tmpdir, "domains.bedpe")
clique_domains.to_csv(bedpe_path, sep="\t", header=False, index=False)
prepare_cliques(bedpe_path, tmpdir)
write_bigwig(os.path.join(tmpdir, "compartment_pc1.bigWig"), compartments, chrom_sizes)

clique_domains

Unnamed: 0,chrom1,start1,end1,chrom2,start2,end2
0,chr20,41700000,43400000,chr20,41700000,43400000
1,chr20,41700000,43400000,chr20,52200000,52750000
2,chr20,41700000,43400000,chr20,53200000,53550000
4,chr20,52200000,52750000,chr20,52200000,52750000
5,chr20,52200000,52750000,chr20,53200000,53550000
8,chr20,53200000,53550000,chr20,53200000,53550000


In [4]:
# Build a HiGlass view of the selected clique: the Hi-C heatmap with the
# clique's domain rectangles overlaid, plus chromosome labels on the top and
# left edges.

# Remote tileset served by higlass.io; it is not referenced again in this cell.
gene_annotation_tile = hg.remote(
    uid="P0PLbQMwTYGy-5uPIQid7A",
    server="https://higlass.io/api/v1/",
    name="hg38 Gene annotation",
)

cooler_file = "../../data/output/nfcore_hic/mcools/hg38_MCF10A_WT_merged.mcool"

# NOTE: `domains` and `compartments` are rebound here from the DataFrames
# loaded earlier to the paths of the files written from them into `tmpdir`.
domains = f"{tmpdir}/domains.beddb"
compartments = f"{tmpdir}/compartment_pc1.bigWig"

domains_track = hg.bed2ddb(domains).track("2d-rectangle-domains")
compartments_track = hg.bigwig(compartments).track("line")

# Heatmap and domain rectangles combined into a single 500-high track.
heatmap_track = hg.combine(hg.cooler(cooler_file).track("heatmap", height=500), domains_track, height=500)

# `chrom_sizes_track` and `left_track` are built but never passed to `hg.view`
# below, which creates its own chromosome-label tracks.
chrom_sizes_track = hg.cooler(cooler_file).track("chromosome-labels")

left_track = hg.combine(compartments_track, chrom_sizes_track, width=200)

# cooler.Cooler needs the URI of a single cooler; for a multi-resolution file
# use the first group reported by list_coolers.
# NOTE(review): which resolution comes first depends on list_coolers' ordering
# — confirm it is the intended one.
uri = cooler_file
if cooler.fileops.is_multires_file(uri):
    grps = cooler.fileops.list_coolers(uri)
    uri = f"{uri}::{grps[0]}"

# Region covering every domain of the clique. The chromosome is taken from the
# first row only — assumes all domains share one chromosome, TODO confirm.
chrom = clique_domains["chrom1"].iloc[0]
start = clique_domains["start1"].min()
end = clique_domains["end1"].max()
# compute_domain is called with its default padding, which widens the region.
domain = compute_domain(f"{chrom}:{start}-{end}", cooler.Cooler(uri))

width = 12

# The same `domain` is applied to both axes of the view.
view = hg.view(
    (hg.cooler(cooler_file).track("chromosome-labels"), "left"),
    # (compartments_track, "top"),
    (hg.cooler(cooler_file).track("chromosome-labels"), "top"),
    (heatmap_track, "center"),
    width=width).domain(x=domain, y=domain)

# Last expression of the cell: renders the view.
view