# Script for converting a tsv edge list into a networkx graph

In [61]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import os

# Locations of the TSV graph files, relative to this script's working directory.
path = "../../data/gfa-tsv"
small = f"{path}/small.tsv"
medium = f"{path}/medium.tsv"
big = f"{path}/big.tsv"

In [62]:
def tsv_to_graph(file_path) -> nx.DiGraph:
    """Build a directed graph from the edge section of a TSV graph file.

    The file is read as: a first line holding the node count ``n``, then
    ``n`` node lines, then one whitespace-separated ``u v`` pair per line.
    The node lines are skipped, so only nodes that take part in at least one
    edge end up in the returned graph.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A ``nx.DiGraph`` containing the edge ``int(u) -> int(v)`` for every
        edge line.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the first line is not an integer, or a non-blank edge
            line does not consist of exactly two integer fields.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    G = nx.DiGraph()
    with open(file_path, 'r') as file:
        n = int(file.readline())  # number of nodes

        # Skip the node section; node ids are recovered from the edges below.
        for _ in range(n):
            file.readline()

        for line in file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # failing with an opaque tuple-unpacking error.
                continue
            u, v = fields
            G.add_edge(int(u), int(v))

    return G

def get_dict(file_path):
    """Map every node id in the file to a ``Counter`` of its label's characters.

    The first line of the file gives the number of node lines that follow;
    each node line holds exactly two whitespace-separated fields, the integer
    node id and the node label. An empty ``Counter`` is stored under id 0 for
    the root node, replacing any entry the file itself defined for id 0.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A dict from integer node id to ``Counter`` of label characters.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the node count or a node id is not an integer, or a
            node line does not have exactly two fields.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    with open(file_path, 'r') as handle:
        node_count = int(handle.readline())
        # Lazily pull exactly `node_count` lines so rows are parsed one by one.
        rows = (handle.readline().strip().split() for _ in range(node_count))
        counters = {int(ident): Counter(label) for ident, label in rows}

    # The root node (id 0) carries no label characters.
    counters[0] = Counter()
    return counters

def get_alphabet(file_path):
    """Collect the set of characters used by the node labels of a TSV file.

    The first line of the file gives the number of node lines that follow;
    each node line holds exactly two whitespace-separated fields, of which
    the second is the label.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A set with every distinct character found in the node labels.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the node count is not an integer or a node line does
            not have exactly two fields.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    with open(file_path, 'r') as handle:
        node_count = int(handle.readline())
        # Lazily pull exactly `node_count` lines; the node id field is ignored.
        rows = (handle.readline().strip().split() for _ in range(node_count))
        return {symbol for _node_id, label in rows for symbol in label}

def add_root_and_dummies(G):
    """Attach a synthetic root (node id 0) above every source node of ``G``.

    Every node that has in-degree 0 when the function is called receives an
    incoming edge from node 0. The root itself gets ``weight`` 0 and
    ``out`` ``{0}``. ``G`` is modified in place and nothing is returned.
    Despite the function name, no dummy nodes are created here.

    The root is added explicitly, so a graph without any in-degree-0 node
    (for example an empty graph) still ends up with an initialised root
    instead of raising ``KeyError``.

    NOTE(review): id 0 is presumably never used by a real node in the input;
    if it were, that node would be treated as the root -- confirm.

    Args:
        G: The directed graph to extend.
    """
    new_root_id = 0

    # Collect the sources before the root exists; otherwise the root itself
    # would be reported as a source and get an edge to itself.
    roots = [node for node in G.nodes() if G.in_degree(node) == 0]

    # Create (or update) the root with its initial attributes.
    G.add_node(new_root_id, out={0}, weight=0)

    for root in roots:
        G.add_edge(new_root_id, root)

def compute_node_out(G, node_id):
    """Compute the ``out`` set of a node from the ``out`` sets of its predecessors.

    The result is the union of the ``out`` attribute of every predecessor of
    ``node_id``, with the node's own ``weight`` added to each element. A node
    without predecessors yields an empty set.

    Args:
        G: Directed graph whose nodes carry a ``weight`` attribute and whose
            predecessors of ``node_id`` already carry an ``out`` attribute.
        node_id: The node to compute the set for.

    Returns:
        A new set; the graph is not modified.
    """
    own_weight = G.nodes[node_id]['weight']

    # Merge all predecessor sets in one go.
    inherited = set().union(
        *(G.nodes[parent]['out'] for parent in G.predecessors(node_id))
    )

    return {value + own_weight for value in inherited}

def build_graph(file_path, char):
    """Load a TSV graph and annotate its nodes with counts of ``char``.

    Steps, in order:
      1. Parse the edges with ``tsv_to_graph`` and drop isolated nodes.
      2. Set each node's ``weight`` to the number of occurrences of ``char``
         in its label (0 when the character is absent).
      3. Add the synthetic root via ``add_root_and_dummies``.
      4. Walk the nodes in topological order, skipping the first entry
         (expected to be the root, whose ``out`` is already set), and store
         the result of ``compute_node_out`` in each node's ``out`` attribute.

    Args:
        file_path: Path of the TSV file to read.
        char: The label character to count.

    Returns:
        The annotated directed graph.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``char`` does not occur in any node label of the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    graph = tsv_to_graph(file_path)
    isolated_nodes = list(nx.isolates(graph))
    graph.remove_nodes_from(isolated_nodes)

    label_counters = get_dict(file_path)
    known_symbols = get_alphabet(file_path)
    if char not in known_symbols:
        raise ValueError(f"Character {char} not in alphabet")

    # Weight of a node = how many times `char` appears in its label.
    for node in graph:
        graph.nodes[node]['weight'] = label_counters[node].get(char, 0)

    add_root_and_dummies(graph)

    # Predecessors come first in topological order, so their `out` sets are
    # ready by the time a node is processed.
    topo_order = list(nx.topological_sort(graph))
    for node in topo_order[1:]:
        graph.nodes[node]['out'] = compute_node_out(graph, node)

    return graph

In [69]:
# Build the annotated graph from the `big` TSV file, counting the character 'A'.
# Alternative run kept for reference (the `medium` file, character 'G'):
# G_medium_A = build_graph(medium, 'G')
G_big_A = build_graph(big, 'A')

In [70]:
# Print the id and in-degree of every node whose in-degree is greater than 2.
# NOTE(review): the previous comment mentioned the out-degree, but the code
# filters on the in-degree -- confirm which one was intended.
for node, in_degree in G_big_A.in_degree():
    if in_degree > 2:
        print(node, in_degree)