In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import fdrcorrection

In [2]:
def compute_max_tad_clique_size(cliques: pd.DataFrame) -> pd.DataFrame:
    """Map every TAD to the size of the largest clique that contains it.

    Parameters
    ----------
    cliques
        Table with a ``tad_ids`` column (comma-separated TAD identifiers, one
        string per clique) and a ``size`` column (the size of that clique).
        Any other columns are ignored.

    Returns
    -------
    pd.DataFrame
        One row per distinct TAD identifier, in order of first appearance,
        with columns ``tad`` and ``size`` (the maximum ``size`` over all
        cliques that list the TAD).
    """
    largest_by_tad = {}

    for _, row in cliques[["tad_ids", "size"]].iterrows():
        tad_list, clique_size = row
        for tad_id in tad_list.split(","):
            if tad_id not in largest_by_tad:
                # First clique seen for this TAD.
                largest_by_tad[tad_id] = clique_size
                continue
            largest_by_tad[tad_id] = max(largest_by_tad[tad_id], clique_size)

    tad_ids = list(largest_by_tad)
    max_sizes = list(largest_by_tad.values())
    return pd.DataFrame({"tad": tad_ids, "size": max_sizes})

In [3]:
# Clique tables for the MCF10A WT, T1 and C1 samples (two replicates each),
# listed in the order in which they are bound to df1..df6 below.
clique_tables = (
    "../data/output/cliques/repl/cliques/hg38_001_MCF10A_WT_REP1_cis_cliques.tsv.gz",
    "../data/output/cliques/repl/cliques/hg38_002_MCF10A_WT_REP2_cis_cliques.tsv.gz",
    "../data/output/cliques/repl/cliques/hg38_003_MCF10A_T1_REP1_cis_cliques.tsv.gz",
    "../data/output/cliques/repl/cliques/hg38_004_MCF10A_T1_REP2_cis_cliques.tsv.gz",
    "../data/output/cliques/repl/cliques/hg38_005_MCF10A_C1_REP1_cis_cliques.tsv.gz",
    "../data/output/cliques/repl/cliques/hg38_006_MCF10A_C1_REP2_cis_cliques.tsv.gz",
)

# One frame per sample: each TAD paired with the size of its largest clique.
df1, df2, df3, df4, df5, df6 = (
    compute_max_tad_clique_size(pd.read_table(path)) for path in clique_tables
)

# Largest clique size observed in any of the six samples.
max_clique_size = max(
    frame["size"].max() for frame in (df1, df2, df3, df4, df5, df6)
)
max_clique_size

11

In [4]:
# 2x2 table of TAD counts for one clique size at a time:
# rows are replicates (REP1, REP2), columns are conditions (WT, T1).
m = np.zeros([2, 2], dtype=int)

data = []
# Iterate up to the largest clique size actually observed in the loaded samples
# rather than a hard-coded bound, so no size is silently skipped (or padded
# with empty tables) when the input data change.
for size in range(1, int(max_clique_size) + 1):
    # Number of TADs whose largest clique has exactly this size, per sample.
    m[0, 0] = len(df1[df1["size"] == size])  # WT REP1
    m[0, 1] = len(df3[df3["size"] == size])  # T1 REP1
    m[1, 0] = len(df2[df2["size"] == size])  # WT REP2
    m[1, 1] = len(df4[df4["size"] == size])  # T1 REP2

    # NOTE(review): mcnemar() only evaluates the off-diagonal cells
    # (m[0, 1] and m[1, 0]); with this layout that compares T1 REP1 against
    # WT REP2 and ignores WT REP1 and T1 REP2. With exact=True the reported
    # statistic is min(m[0, 1], m[1, 0]). Confirm this table layout is the
    # intended design for comparing conditions.
    res = mcnemar(m, exact=True)
    data.append(["WT", "T1", size, res.statistic, res.pvalue])

dff1 = pd.DataFrame(
    data, columns=["cond1", "cond2", "clique_size", "mcnemar_stat", "mcnemar_pval"]
)

dff1

Unnamed: 0,cond1,cond2,clique_size,mcnemar_stat,mcnemar_pval
0,WT,T1,1,789.0,2.15535e-09
1,WT,T1,2,690.0,1.286264e-31
2,WT,T1,3,807.0,0.02294875
3,WT,T1,4,477.0,8.590977e-11
4,WT,T1,5,241.0,4.731121e-23
5,WT,T1,6,158.0,9.645272e-18
6,WT,T1,7,60.0,2.736843e-06
7,WT,T1,8,8.0,5.021389e-15
8,WT,T1,9,0.0,4.547474e-13
9,WT,T1,10,0.0,1.0


In [5]:
# 2x2 table of TAD counts for one clique size at a time:
# rows are replicates (REP1, REP2), columns are conditions (WT, C1).
m = np.zeros([2, 2], dtype=int)

data = []
# Iterate up to the largest clique size actually observed in the loaded samples
# rather than a hard-coded bound, so no size is silently skipped (or padded
# with empty tables) when the input data change.
for size in range(1, int(max_clique_size) + 1):
    # Number of TADs whose largest clique has exactly this size, per sample.
    m[0, 0] = len(df1[df1["size"] == size])  # WT REP1
    m[0, 1] = len(df5[df5["size"] == size])  # C1 REP1
    m[1, 0] = len(df2[df2["size"] == size])  # WT REP2
    m[1, 1] = len(df6[df6["size"] == size])  # C1 REP2

    # NOTE(review): mcnemar() only evaluates the off-diagonal cells
    # (m[0, 1] and m[1, 0]); with this layout that compares C1 REP1 against
    # WT REP2 and ignores WT REP1 and C1 REP2. With exact=True the reported
    # statistic is min(m[0, 1], m[1, 0]). Confirm this table layout is the
    # intended design for comparing conditions.
    res = mcnemar(m, exact=True)
    data.append(["WT", "C1", size, res.statistic, res.pvalue])

dff2 = pd.DataFrame(
    data, columns=["cond1", "cond2", "clique_size", "mcnemar_stat", "mcnemar_pval"]
)

dff2

Unnamed: 0,cond1,cond2,clique_size,mcnemar_stat,mcnemar_pval
0,WT,C1,1,770.0,0.648491
1,WT,C1,2,690.0,0.004042217
2,WT,C1,3,807.0,7.314546e-08
3,WT,C1,4,700.0,0.1354671
4,WT,C1,5,315.0,1.158624e-11
5,WT,C1,6,225.0,2.094141e-07
6,WT,C1,7,90.0,0.02385522
7,WT,C1,8,66.0,0.450208
8,WT,C1,9,23.0,0.02481197
9,WT,C1,10,0.0,1.0


In [6]:
# Stack the WT-vs-T1 and WT-vs-C1 results and correct all p-values together.
# The index is not reset, so row labels repeat across the two comparisons.
# fdrcorrection returns (rejected, corrected p-values); only the corrected
# p-values are stored.
df = pd.concat([dff1, dff2]).assign(
    mcnemar_pval_adj=lambda table: fdrcorrection(table["mcnemar_pval"])[1]
)

# Export as a tab-separated file without the (repeating) index column.
df.to_csv("/tmp/clique_mcnemar.tsv", sep="\t", index=False)
df

Unnamed: 0,cond1,cond2,clique_size,mcnemar_stat,mcnemar_pval,mcnemar_pval_adj
0,WT,T1,1,789.0,2.15535e-09,5.927211e-09
1,WT,T1,2,690.0,1.286264e-31,2.82978e-30
2,WT,T1,3,807.0,0.02294875,0.0363909
3,WT,T1,4,477.0,8.590977e-11,2.700021e-10
4,WT,T1,5,241.0,4.731121e-23,5.204233e-22
5,WT,T1,6,158.0,9.645272e-18,7.0732e-17
6,WT,T1,7,60.0,2.736843e-06,5.473686e-06
7,WT,T1,8,8.0,5.021389e-15,2.761764e-14
8,WT,T1,9,0.0,4.547474e-13,2.000888e-12
9,WT,T1,10,0.0,1.0,1.0
