In [None]:
import os
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns
import json

def read_edge_list_as_adjacency(filename):
    """
    Reads an edge list file into a NumPy adjacency matrix.

    Each non-blank line must contain exactly two node IDs separated by
    whitespace. Blank (or whitespace-only) lines are skipped, so stray empty
    lines in the file do not abort the read.

    Every edge ``u v`` sets only ``adj[index(u), index(v)]``; the matrix is
    NOT symmetrized here. Duplicate edges collapse to a single 1 entry.

    Args:
        filename (str): path to the edge list file

    Returns:
        adj_matrix (np.ndarray): NxN integer adjacency matrix with 0/1 entries
        nodes (list): list of node names in the same order as matrix indices.
            Node IDs are kept as strings and sorted lexicographically
            (so "10" comes before "2").

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields
    """
    edges = []
    nodes = set()

    # read file and collect nodes + edges
    with open(filename, "r") as f:
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # blank / whitespace-only line: nothing to parse
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}:{lineno}: expected 2 node IDs, "
                    f"got {len(fields)}: {line.strip()!r}"
                )
            u, v = fields
            edges.append((u, v))
            nodes.update(fields)

    # assign index to each node (sorted so the index mapping is deterministic)
    nodes = sorted(nodes)
    node2idx = {node: i for i, node in enumerate(nodes)}

    # build adjacency matrix
    n = len(nodes)
    adj = np.zeros((n, n), dtype=int)
    for u, v in edges:
        adj[node2idx[u], node2idx[v]] = 1

    return adj, nodes

# Collect every raw edge-list file. The directory listing is sorted because
# os.listdir returns entries in arbitrary order, which would make the
# processing order (and the order of `num_nodes`) differ between runs.
data_dir = '../data/treeoflife.interactomes'
interactome_list = []
for file in sorted(os.listdir(data_dir)):
    if file.endswith('.txt'):
        interactome_list.append(os.path.join(data_dir, file))
print(f"Found {len(interactome_list)} interactome files.")

# Convert each edge list to a compressed adjacency matrix plus a node-name
# file, recording the node count of every interactome along the way.
num_nodes = []
save_dir = '../data/treeoflife.interactomes_adj'
node_save_dir = '../data/treeoflife.interactomes_nodes'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(node_save_dir, exist_ok=True)
for interactome in tqdm(interactome_list):
    adj, nodes = read_edge_list_as_adjacency(interactome)
    num_nodes.append(len(nodes))
    # Strip only the trailing extension; str.replace would also rewrite any
    # other '.txt' occurring earlier in the file name.
    stem = os.path.splitext(os.path.basename(interactome))[0]
    np.savez_compressed(os.path.join(save_dir, stem + '.npz'), adj=adj)
    np.savetxt(os.path.join(node_save_dir, stem + '_nodes.txt'), nodes, fmt='%s')

if num_nodes:
    print(f'mean: {np.mean(num_nodes)}, max: {np.max(num_nodes)}, min: {np.min(num_nodes)}')

    # plot the distribution of number of nodes
    fig, ax = plt.subplots(figsize=(8, 6), dpi=200)
    sns.histplot(num_nodes, bins=30, kde=True, ax=ax)
    ax.set_title('Distribution of Number of Nodes in Interactomes')
    ax.set_xlabel('Number of Nodes')
    ax.set_ylabel('Frequency')
    plt.tight_layout()
    plt.show()
else:
    # np.max / np.min raise on an empty sequence, so report instead of crashing.
    print('No interactome files found; skipping summary statistics and plot.')

In [None]:
import numpy as np
import os

# Scan the saved adjacency matrices and report the interactomes whose node
# count exceeds 10000.
data_dir = '../data/treeoflife.interactomes_adj'
interactome_list = []
# Sorted so the report order is stable; os.listdir order is arbitrary.
for file in sorted(os.listdir(data_dir)):
    if file.endswith('.npz'):
        interactome_list.append(os.path.join(data_dir, file))
print(f"Found {len(interactome_list)} interactome files.")
for interactome in interactome_list:
    # np.load on an .npz returns an NpzFile holding an open file handle;
    # the context manager closes it so handles do not pile up across the loop.
    with np.load(interactome) as data:
        adj = data['adj']
    if len(adj) > 10000:
        print(f'{interactome} {len(adj)}')

In [None]:
import os
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

def visualize_network(G, title="Network Graph"):
    """
    Draw the graph described by an adjacency matrix.

    Args:
        G: adjacency matrix, passed straight to ``nx.from_numpy_array``
        title (str): title shown above the drawing

    Returns:
        None. The figure is created as a side effect and left open for the
        notebook backend (or a later ``plt.show()``) to render.
    """
    # G is a numpy adjacency matrix
    graph = nx.from_numpy_array(G)
    # Draw on an explicit Axes so the title can be attached to the same plot.
    fig, ax = plt.subplots(figsize=(6, 6), dpi=200)
    nx.draw(graph, ax=ax, node_color='blue', edge_color='gray', node_size=20, alpha=0.7)
    # Previously the `title` argument was accepted but never applied.
    ax.set_title(title)

# Load one saved adjacency matrix and draw it.
data_file = '../data/treeoflife.interactomes_adj/10090.npz'
# The NpzFile returned by np.load keeps the archive open; the context manager
# closes it once the array has been read into memory.
with np.load(data_file) as data:
    adj = data['adj']
print(f'Adjacency matrix shape: {adj.shape}')
visualize_network(adj, title="Interactome Network Visualization")