This notebook extracts the largest connected component of each interactome graph in `treeoflife.interactomes` and writes it out as an edge list, one output file per input file.

In [None]:
import os
import numpy as np
from tqdm.auto import tqdm
import networkx as nx

def get_max_cc_edgelist(in_path: str, out_path: str) -> "tuple[int, int]":
    """Write the largest connected component of an edge-list file to ``out_path``.

    The input is parsed with ``nx.read_edgelist`` (``#`` starts a comment, node
    labels are read as strings, edge data is not loaded). The connected
    component with the most nodes is then written with ``nx.write_edgelist``,
    again without edge data.

    Args:
        in_path: Path of the edge-list file to read.
        out_path: Path of the edge-list file to write.

    Returns:
        ``(number_of_nodes, number_of_edges)`` of the written component, or
        ``(0, 0)`` when the input graph has no nodes (an empty output file is
        still created in that case).
    """
    G = nx.read_edgelist(in_path, comments="#", nodetype=str, data=False)
    if G.number_of_nodes() == 0:
        # Create the empty output file, and return a pair so callers that
        # unpack ``num_nodes, num_edges`` do not fail on ``None``.
        with open(out_path, "w"):
            pass
        return 0, 0

    # Largest connected component by node count.
    cc_nodes = max(nx.connected_components(G), key=len)
    H = G.subgraph(cc_nodes).copy()
    nx.write_edgelist(H, out_path, data=False)

    return H.number_of_nodes(), H.number_of_edges()

# Input directory with the raw interactome edge lists, and the directory that
# receives one max-connected-component edge list per input file.
txt_dir = '../data/treeoflife.interactomes'
save_dir = '../data/treeoflife.interactomes_max_cc'

# Keep only the ``.txt`` entries of the input directory, in listing order.
interactome_list = [name for name in os.listdir(txt_dir) if name.endswith('.txt')]
print(f'Found {len(interactome_list)} interactome files.')

# Make sure the output directory exists before any file is written.
os.makedirs(save_dir, exist_ok=True)

# Every output file keeps the name of the input file it was derived from.
for interactome in tqdm(interactome_list):
    in_path, out_path = (
        os.path.join(base_dir, interactome) for base_dir in (txt_dir, save_dir)
    )
    num_nodes, num_edges = get_max_cc_edgelist(in_path, out_path)


Found 1840 interactome files.


  0%|          | 0/1840 [00:00<?, ?it/s]

In [None]:
import os
import numpy as np
from tqdm.auto import tqdm
import networkx as nx

def get_max_cc_edgelist(in_path: str, out_path: str) -> "tuple[int, int]":
    """Filter an edge-list file down to its largest connected component.

    The file is first parsed with ``nx.read_edgelist`` (``#`` starts a comment,
    node labels are read as strings, edge data is not loaded) to find the
    connected component with the most nodes. The input is then re-read line by
    line, and every line whose first two fields are both in that component is
    copied verbatim to ``out_path``, so the original direction, duplicate lines
    and extra columns are kept.

    Args:
        in_path: Path of the edge-list file to read.
        out_path: Path of the filtered edge-list file to write.

    Returns:
        ``(number_of_nodes, number_of_edges)`` of the largest component as
        counted by NetworkX (not the number of lines written), or ``(0, 0)``
        when the input graph has no nodes (an empty output file is written).
    """
    # The parsed graph is only used to find connected components; the output
    # itself comes from filtering the input lines below.
    G = nx.read_edgelist(in_path, comments="#", nodetype=str, data=False)
    if G.number_of_nodes() == 0:
        # Empty graph: create an empty output file and return a safe pair.
        with open(out_path, "w", encoding="utf-8"):
            pass
        return 0, 0

    # Node set of the largest connected component.
    cc_nodes = set(max(nx.connected_components(G), key=len))
    # A subgraph view is enough for counting; no copy is needed.
    H = G.subgraph(cc_nodes)

    # Line-level filter. UTF-8 matches the default encoding of
    # ``nx.read_edgelist``, so both passes decode the file the same way.
    with open(in_path, "r", encoding="utf-8") as fin, \
            open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            # Discard everything from the first "#" before splitting, as
            # NetworkX does for ``comments="#"``; otherwise an endpoint with a
            # trailing inline comment ("B#note") would never match a node.
            parts = line.split("#", 1)[0].split()
            if len(parts) < 2:
                # Blank line, comment-only line, or a line with one field.
                continue
            u, v = parts[0], parts[1]
            if u in cc_nodes and v in cc_nodes:
                # Copy the line unchanged, terminating a final unterminated line.
                fout.write(line if line.endswith("\n") else line + "\n")

    # Counts are NetworkX's undirected, de-duplicated node and edge counts.
    return H.number_of_nodes(), H.number_of_edges()

# ===== Batch processing: run the extraction over every interactome file =====
txt_dir = '../data/treeoflife.interactomes'
save_dir = '../data/treeoflife.interactomes.max_cc'

# Keep only the ``.txt`` entries of the input directory, in listing order.
interactome_list = list(
    filter(lambda name: name.endswith('.txt'), os.listdir(txt_dir))
)
print(f'Found {len(interactome_list)} interactome files.')

# Make sure the output directory exists before any file is written.
os.makedirs(save_dir, exist_ok=True)

# Every output file keeps the name of the input file it was derived from.
for interactome in tqdm(interactome_list):
    in_path = os.path.join(txt_dir, interactome)
    out_path = os.path.join(save_dir, interactome)
    num_nodes, num_edges = get_max_cc_edgelist(in_path, out_path)


Found 1840 interactome files.


  0%|          | 0/1840 [00:00<?, ?it/s]