This notebook extracts the largest connected component of each interactome graph in `treeoflife.interactomes` and writes it, as an edge list, to `treeoflife.interactomes_max_cc`.

In [5]:
import os
import numpy as np
from tqdm.auto import tqdm
import networkx as nx

def get_max_cc_edgelist(in_path: str, out_path: str) -> "tuple[int, int]":
    """Write the largest connected component of an edge-list graph to disk.

    The graph is read from ``in_path`` with ``nx.read_edgelist`` using
    ``comments="#"``, string node labels and no edge data. The connected
    component with the most nodes is then written to ``out_path`` as an edge
    list without edge data.

    Args:
        in_path: Path of the edge-list file to read.
        out_path: Path of the edge-list file to write (overwritten if present).

    Returns:
        ``(number_of_nodes, number_of_edges)`` of the component that was
        written. ``(0, 0)`` when the input graph has no nodes, in which case
        an empty file is created at ``out_path``.
    """
    G = nx.read_edgelist(in_path, comments="#", nodetype=str, data=False)
    if G.number_of_nodes() == 0:
        # Nothing to select from: still create an (empty) output file, and
        # return a tuple so callers that unpack the result keep working.
        with open(out_path, "w"):
            pass
        return 0, 0

    # Largest connected component by node count; on ties `max` keeps the
    # first one yielded by `nx.connected_components`.
    cc_nodes = max(nx.connected_components(G), key=len)
    # `.copy()` turns the subgraph view into an independent graph.
    H = G.subgraph(cc_nodes).copy()
    nx.write_edgelist(H, out_path, data=False)

    return H.number_of_nodes(), H.number_of_edges()

# Input directory with the raw interactome edge lists, and the directory
# that receives the largest connected component of each one.
txt_dir = '../data/treeoflife.interactomes'
save_dir = '../data/treeoflife.interactomes_max_cc'

# Keep only the `.txt` entries of the input directory.
interactome_list = []
for file in os.listdir(txt_dir):
    if not file.endswith('.txt'):
        continue
    interactome_list += [file]
print(f'Found {len(interactome_list)} interactome files.')

os.makedirs(save_dir, exist_ok=True)
for interactome in tqdm(interactome_list):
    # The output keeps the input file name, just under `save_dir`.
    in_path, out_path = (
        os.path.join(txt_dir, interactome),
        os.path.join(save_dir, interactome),
    )
    # The returned sizes are kept in case per-file logging is wanted.
    num_nodes, num_edges = get_max_cc_edgelist(in_path, out_path)

Found 1840 interactome files.


  0%|          | 0/1840 [00:00<?, ?it/s]