In [1]:
import lzma
import pathlib
import pickle
from functools import reduce
from typing import Tuple

import hdbscan
import pandas as pd

In [2]:
def import_clusters(
    path_to_tsv: pathlib.Path, path_to_picke: pathlib.Path
) -> Tuple[pd.DataFrame, "hdbscan.HDBSCAN"]:
    """Load a cluster table and the pickled object stored next to it.

    Parameters
    ----------
    path_to_tsv
        Tab-separated table, read with ``pd.read_table``. Plain ``str`` paths
        are accepted as well.
    path_to_picke
        Pickle file, either uncompressed or xz-compressed. Compression is
        detected from the file's leading bytes, not from its name.
        (The parameter name is kept as-is for backward compatibility.)

    Returns
    -------
    tuple
        ``(table, unpickled_object)``.

    Raises
    ------
    OSError
        If either file cannot be opened.
    pickle.UnpicklingError
        If the pickle payload is not a valid pickle stream.
    """
    # Signature that every .xz stream starts with.
    xz_magic = b"\xfd7zXZ\x00"

    df = pd.read_table(path_to_tsv)

    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only point this function at pickles produced by a trusted pipeline.
    with open(path_to_picke, "rb") as raw:
        is_xz = raw.read(len(xz_magic)) == xz_magic
        raw.seek(0)  # rewind so the loader sees the stream from its start
        if is_xz:
            # A plain open() would hand compressed bytes to pickle and fail.
            with lzma.open(raw, "rb") as unpacked:
                clusterer = pickle.load(unpacked)
        else:
            clusterer = pickle.load(raw)

    return df, clusterer

In [3]:
# Directory that holds both clustering outputs loaded below; kept in one place
# so the two file paths cannot drift apart.
CLUSTERS_DIR = pathlib.Path(
    "../data/output/comparative_analysis/annotated_domains/10000/clusters"
)

# df       -> table read from the clusters TSV
# clusters -> object unpickled from the clusterer pickle
df, clusters = import_clusters(
    CLUSTERS_DIR / "clique.clusters.tsv.gz",
    CLUSTERS_DIR / "clique.clusterer.pickle.xz",
)

In [4]:
# Count the rows of every (label, cluster) pair and lay the counts out as one
# row per cluster with one column per label.
#
# Pairs that never occur are reported as 0. Building the table by chaining
# inner merges on "cluster" would instead drop the entire row of any cluster
# that is missing from at least one label.
df1 = (
    df.groupby(["label", "cluster"])
    .size()
    .unstack("label", fill_value=0)
    .rename_axis(columns=None)  # drop the "label" header name left by unstack
    .reset_index()
    .astype({"cluster": str})  # cluster ids are stored as strings in the table
)

df1.to_csv("/tmp/clique_cluster_sizes.tsv", sep="\t", index=False)
df1

Unnamed: 0,cluster,001_MCF10A_REP1,002_MCF10A_REP2,003_MCF10AT1_REP1,004_MCF10AT1_REP2,005_MCF10CA1a_REP1,006_MCF10CA1a_REP2
0,-1,847,811,526,597,595,563
1,0,135,127,173,176,182,170
2,1,97,85,56,65,74,74
3,2,210,195,219,234,274,252
4,3,41,37,23,24,56,53
5,4,323,301,316,330,341,334
6,5,165,154,80,81,130,117
7,6,534,520,260,296,521,483
8,7,348,340,168,177,289,274
9,8,44,39,25,30,54,49
