In [20]:
# Analysis Community Modules
import pandas as pd
import gseapy as gp 
from pathlib import Path
import plotly.graph_objects as go
from scipy.stats import fisher_exact, hypergeom
import numpy as np
from scipy.stats import false_discovery_control


In [19]:
# Load the vertex table: one row per node, giving its id and its name.
path_vertices = Path(
    "/Users/paulinestaiber/Documents/Network/Pauline_Praktikum/results/multiplex_clustering_explicit_new_vertices.tsv"
)
# header=0 replaces the file's own header row with the column labels below.
vertex_columns = ["node:id", "node_name"]
vertices_df = pd.read_csv(path_vertices, sep="\t", header=0, names=vertex_columns)
vertices_df

Unnamed: 0,node:id,node_name
0,1,C19orf48
1,2,PIEZO2
2,3,FLYWCH1
3,4,ZCCHC10
4,5,NCR1
...,...,...
15256,15257,ALDH7A1
15257,15258,REG1A
15258,15259,SEPTIN4
15259,15260,PEX11B


In [21]:
# Load the community-detection result (.clu file).
path = Path(
    "/Users/paulinestaiber/Documents/Network/Pauline_Praktikum/cluster_output_infomap_new/multiplex_clustering_explicit_new.clu"
)
# The file has no header row; fields are space-separated and anything after
# a "#" is ignored as a comment.
clu_columns = ["node_id", "module_merged", "flow"]
agg = pd.read_csv(path, sep=" ", comment="#", header=None, names=clu_columns)
# Keep only the node-to-module assignment; the flow column is not used here.
agg_small = agg.loc[:, ["node_id", "module_merged"]]
agg_small



Unnamed: 0,node_id,module_merged
0,327,1
1,2737,1
2,3315,1
3,3601,1
4,4369,1
...,...,...
15372,12390,129
15373,9019,130
15374,13712,130
15375,2680,131


In [22]:
# Drop repeated (node_id, module_merged) rows, keeping the first occurrence,
# then show how many assignments remain.
agg_small = agg_small.drop_duplicates(subset=["node_id", "module_merged"], keep="first")
agg_small.shape[0]

15371

In [23]:
# Attach the node name to every (node_id, module) assignment.
# A left join keeps every assignment row even if its id has no entry in
# vertices_df; the redundant join key from the right-hand table is dropped.
merge_df = (
    agg_small
    .merge(vertices_df, how="left", left_on="node_id", right_on="node:id")
    .drop(columns="node:id")
)
merge_df




Unnamed: 0,node_id,module_merged,node_name
0,327,1,RPL3L
1,2737,1,EIF4A2
2,3315,1,RPS18
3,3601,1,RPS4Y2
4,4369,1,RPL28
...,...,...,...
15366,12390,129,NAIF1
15367,9019,130,HMX2
15368,13712,130,HMX3
15369,2680,131,CLC


In [24]:
# Save the merged node/module/name table as a tab-separated file (index not written).
merged_out_path = "/Users/paulinestaiber/Documents/Network/Pauline_Praktikum/results/merged_dataframe_new.tsv"
merge_df.to_csv(merged_out_path, sep="\t", index=False)

In [25]:
# Which modules overlap, i.e. share at least one gene?

def _sorted_unique(module_ids):
    """Return the distinct module ids of one gene in ascending order."""
    return sorted(set(module_ids))


# Step 1: gene names that are assigned to more than one distinct module.
modules_per_gene = merge_df.groupby("node_name")["module_merged"].nunique()
multi_module_genes = modules_per_gene.loc[modules_per_gene > 1].index

# Step 2: restrict the merged table to those genes ...
is_multi_module = merge_df["node_name"].isin(multi_module_genes)
df_multi = merge_df.loc[is_multi_module]

# ... and collapse it to one row per gene holding its sorted list of modules.
multiple_df = (
    df_multi
    .groupby("node_name")["module_merged"]
    .apply(_sorted_unique)
    .reset_index()
)

multiple_df

Unnamed: 0,node_name,module_merged
0,ADCY7,"[5, 8]"
1,AGER,"[3, 6]"
2,AGO4,"[3, 7]"
3,AKT1,"[2, 3]"
4,ANAPC16,"[1, 3]"
...,...,...
104,TUBB,"[2, 3]"
105,UCHL1,"[4, 8]"
106,WDR34,"[2, 6]"
107,YBX3,"[7, 10]"


In [26]:
# ------------------------------------
# Step 2: expand every multi-module gene into the module pairs it links
# ------------------------------------
from itertools import combinations

# One (module_1, module_2, gene) record per unordered pair of modules a gene
# belongs to. Each pair is sorted so that (1, 3) and (3, 1) share one key.
pairs = [
    (*sorted(module_pair), gene)
    for gene, mods in zip(multiple_df["node_name"], multiple_df["module_merged"])
    if len(mods) > 1
    for module_pair in combinations(mods, 2)
]

pairs_df = pd.DataFrame(pairs, columns=["module_1", "module_2", "node_name"])

# Per module pair: how many genes connect it, and which ones (the gene list
# is used as hover text in the Sankey diagram). sort=False keeps the pairs
# in order of first appearance.
flow_counts = (
    pairs_df
    .groupby(["module_1", "module_2"], sort=False)
    .agg(
        count=pd.NamedAgg(column="node_name", aggfunc="size"),
        genes=pd.NamedAgg(column="node_name", aggfunc=list),
    )
    .reset_index()
)

In [27]:
# ------------------------------------
# Step 3: draw the module overlap as a Sankey diagram
# ------------------------------------

# Every module id that occurs on either side of a pair, in first-seen order.
module_columns = flow_counts[["module_1", "module_2"]]
all_modules = pd.unique(module_columns.to_numpy().ravel())

# Node labels ("Merged M1", "Merged M3", ...) and the module id -> node index lookup.
module_labels = [f"Merged M{module_id}" for module_id in all_modules]
label_to_index = dict(zip(all_modules, range(len(all_modules))))

# Link endpoints (as node indices) and link widths (number of shared genes).
sources = flow_counts["module_1"].map(label_to_index).tolist()
targets = flow_counts["module_2"].map(label_to_index).tolist()
values = flow_counts["count"].tolist()

# Hover text per link: the shared genes, one per line (HTML <br> breaks).
link_customdata = [
    "<br>".join(str(gene) for gene in genes_list)
    for genes_list in flow_counts["genes"]
]

hover_template = (
    "From: %{source.label}<br>"
    "To: %{target.label}<br>"
    "Count: %{value}<br><br>"
    "Genes:<br>%{customdata}"
    "<extra></extra>"
)

sankey_trace = go.Sankey(
    node={
        "label": module_labels,
        "pad": 15,
        "thickness": 20,
    },
    link={
        "source": sources,
        "target": targets,
        "value": values,
        "customdata": link_customdata,  # gene lists shown on hover
        "hovertemplate": hover_template,
    },
)

fig = go.Figure(data=[sankey_trace])

fig.update_layout(
    title_text="Overlap Between Merged Modules (shared multi-module genes)",
    font={"size": 12},
    width=1200,
    height=800,
)

fig.show()


In [None]:
# Load the disease gene list.
# Calculate in which modules the disease genes are enriched.
# Select modules for over-representation analysis and perform it.

In [28]:
# Read the disease gene list from a text file (expected: one gene name per line).
with open("/Users/paulinestaiber/Documents/Network/Pauline_Praktikum/disease_genes.txt", "r") as f:
    # Strip surrounding whitespace and skip blank lines so that stray spaces
    # or a trailing empty line do not become "genes": len(gene_list) is used
    # downstream as the number of disease genes. dict.fromkeys removes
    # repeated names while keeping first-seen order.
    gene_list = list(dict.fromkeys(
        name for name in (line.strip() for line in f.read().splitlines()) if name
    ))

In [29]:
# Disease-gene enrichment per module (one-sided hypergeometric test).

# 1. Background: all unique gene names in the network, and the disease genes
#    among them. Disease genes that are absent from the network can never be
#    drawn into a module, so they must not be counted as "successes" in the
#    population; blank or duplicated list entries must not be counted either.
disease_gene_set = {gene.strip() for gene in gene_list if gene.strip()}
background_genes = set(merge_df["node_name"].dropna())
disease_genes_in_network = sorted(disease_gene_set & background_genes)

total_genes = merge_df["node_name"].nunique()
total_disease_genes = len(disease_genes_in_network)

# 2. Count distinct disease genes per module.
disease_in_modules = (
    merge_df[merge_df["node_name"].isin(disease_genes_in_network)]
    .groupby("module_merged")["node_name"]
    .nunique()
    .reset_index()
)
disease_in_modules.columns = ["module_merged", "disease_genes_count"]

# 3. Count all distinct genes per module.
total_in_modules = merge_df.groupby("module_merged")["node_name"].nunique().reset_index()
total_in_modules.columns = ["module_merged", "total_genes_count"]

# 4. Merge both counts.
# NOTE(review): this is an inner merge, so modules without any disease gene
# are not part of module_stats and are therefore not seen by any later
# multiple-testing correction.
module_stats = disease_in_modules.merge(total_in_modules, on="module_merged")

# 5. Hypergeometric test for every module at once.
#    scipy parametrisation: hypergeom.sf(k, M, n, N) with
#      M = population size        (all genes in the network)
#      n = successes in population (disease genes in the network)
#      N = number of draws         (genes in the module)
#    sf(k - 1) = P(X >= k): probability of seeing k or more disease genes.
p_values = hypergeom.sf(
    module_stats["disease_genes_count"].to_numpy() - 1,
    total_genes,
    total_disease_genes,
    module_stats["total_genes_count"].to_numpy(),
)

module_stats["p_value"] = p_values
# Fold enrichment: disease-gene fraction in the module relative to the
# disease-gene fraction in the whole network.
module_stats["enrichment"] = (
    (module_stats["disease_genes_count"] / module_stats["total_genes_count"])
    / (total_disease_genes / total_genes)
)

module_stats = module_stats.sort_values("p_value")
module_stats

Unnamed: 0,module_merged,disease_genes_count,total_genes_count,p_value,enrichment
6,12,11,250,5.39763e-12,19.185257
1,2,5,624,0.0132209,3.493819
7,15,3,282,0.02643395,4.638602
5,8,3,390,0.05937488,3.354066
2,4,4,675,0.06724212,2.583873
9,28,1,56,0.1208608,7.786224
8,22,1,144,0.2826515,3.027976
4,6,3,1441,0.6548171,0.907762
3,5,1,1106,0.9283666,0.394239
0,1,3,3768,0.9962874,0.347157


In [38]:
# Benjamini-Hochberg correction (FDR; less conservative - recommended).
# The adjustment is computed over the p-values of the rows currently in
# module_stats; the table is then ordered by the adjusted p-value.
fdr_adjusted = false_discovery_control(module_stats["p_value"])

module_stats = (
    module_stats
    .assign(p_value_fdr=fdr_adjusted)
    .sort_values("p_value_fdr")
)
module_stats



Unnamed: 0,module_merged,disease_genes_count,total_genes_count,p_value,enrichment,p_value_fdr
6,12,11,250,5.39763e-12,19.185257,5.39763e-11
1,2,5,624,0.0132209,3.493819,0.0661045
7,15,3,282,0.02643395,4.638602,0.08811316
5,8,3,390,0.05937488,3.354066,0.1344842
2,4,4,675,0.06724212,2.583873,0.1344842
9,28,1,56,0.1208608,7.786224,0.2014346
8,22,1,144,0.2826515,3.027976,0.4037879
4,6,3,1441,0.6548171,0.907762,0.8185213
3,5,1,1106,0.9283666,0.394239,0.9962874
0,1,3,3768,0.9962874,0.347157,0.9962874


In [None]:
# Keep only the modules significantly enriched for disease genes (FDR < 0.05).
fdr_threshold = 0.05
is_significant = module_stats["p_value_fdr"].lt(fdr_threshold)
modules_sig_enriched = module_stats.loc[is_significant]
modules_sig_enriched


Unnamed: 0,module_merged,disease_genes_count,total_genes_count,p_value,enrichment,p_value_fdr
6,12,11,250,5.39763e-12,19.185257,5.39763e-11


In [42]:
# Export the significantly enriched modules as CSV; the DataFrame index is
# written as the first column.
sig_stats_path = "/Users/paulinestaiber/Documents/Network/Pauline_Praktikum/sig_enriched_modules_stats.csv"
modules_sig_enriched.to_csv(sig_stats_path, index=True)