In [27]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import os

# Location of the TSV graph files, relative to this notebook, and the three
# data sets (by size) stored there.
path = "../../data/gfa-tsv"
small = f"{path}/small.tsv"
medium = f"{path}/medium.tsv"
big = f"{path}/big.tsv"

In [28]:
def tsv_to_graph(file_path) -> nx.DiGraph:
    """Build a directed graph from the edge section of a node/edge TSV file.

    The file is read as: a first line holding an integer ``n``, then ``n``
    node lines, then one edge per line written as two whitespace-separated
    node ids ``u v``.

    Only the edge section is used. The ``n`` node lines are skipped, so a
    node that appears in no edge is not part of the returned graph.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``networkx.DiGraph`` with one edge ``u -> v`` per edge line.
        Node ids are kept as the strings read from the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the first line is not an integer, or a non-blank
            edge line does not contain exactly two fields.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    with open(file_path, 'r') as file:
        n = int(file.readline())
        G = nx.DiGraph()

        # ignore the first n lines and just add the edges
        for _ in range(n):
            file.readline()

        # Line 1 is the count and lines 2..n+1 are nodes, so edges start at n + 2.
        for line_no, line in enumerate(file, start=n + 2):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no edge.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected an edge 'u v', got {line!r}"
                )
            u, v = fields
            G.add_edge(u, v)

    return G

def get_dict(file_path):
    """Map each node id to a character count of its label.

    The first line of the file holds an integer ``n``; each of the next
    ``n`` lines must consist of exactly two whitespace-separated fields,
    a node id and a node label.

    Args:
        file_path: Path of the file to read.

    Returns:
        A dict ``{node_id: Counter(node_label)}`` with string keys. If an id
        occurs more than once, the last occurrence wins.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the first line is not an integer or a node line does
            not split into exactly two fields.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    with open(file_path, 'r') as handle:
        node_count = int(handle.readline())
        # Lazily pull exactly node_count lines; the comprehension below
        # consumes them while the file is still open.
        records = (handle.readline().strip().split() for _ in range(node_count))
        return {node_id: Counter(label) for node_id, label in records}

def get_alphabet(file_path):
    """Collect every character that occurs in any node label of the file.

    The first line of the file holds an integer ``n``; each of the next
    ``n`` lines must consist of exactly two whitespace-separated fields,
    a node id and a node label. Lines after those are not read.

    Args:
        file_path: Path of the file to read.

    Returns:
        A set of the single characters found across all node labels.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the first line is not an integer or a node line does
            not split into exactly two fields.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    symbols = set()
    with open(file_path, 'r') as handle:
        remaining = int(handle.readline())
        while remaining > 0:
            # The node id is irrelevant here; only the label contributes.
            _, label = handle.readline().strip().split()
            symbols |= set(label)
            remaining -= 1

    return symbols

def build_graph(file_path, char):
    """Build the graph of a TSV file with per-node counts of one character.

    The graph comes from ``tsv_to_graph`` and the per-node label counters
    from ``get_dict``. Every node of the graph gets a ``"weight"`` attribute
    holding how many times ``char`` occurs in that node's label.

    Args:
        file_path: Path of the file to read.
        char: Character to count; it must occur in at least one node label
            returned by ``get_dict``.

    Returns:
        The ``networkx.DiGraph`` with the ``"weight"`` attribute set on
        every node. A node that has no entry in the label dictionary, or
        whose label lacks ``char``, gets weight 0.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``char`` occurs in none of the node labels.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No file found at {file_path}")

    G = tsv_to_graph(file_path)
    dict_counter = get_dict(file_path)

    # The alphabet is the set of characters seen in the node labels, which is
    # exactly the union of the counters' keys. Checking the counters directly
    # avoids parsing the node section of the file a second time.
    if not any(char in counts for counts in dict_counter.values()):
        raise ValueError(f"Character {char} not in alphabet")

    # add a label to each node with the counter of the character, or 0 if it doesn't exist
    for node in G.nodes:
        G.nodes[node]["weight"] = dict_counter.get(node, {}).get(char, 0)

    return G

In [29]:
# One weighted copy of the medium graph per character, built in the order
# 'A', 'C', 'G', 'T'.
G_medium_A, G_medium_C, G_medium_G, G_medium_T = (
    build_graph(medium, base) for base in "ACGT"
)

In [30]:
# Print each node id next to its "weight" attribute.
for node, attrs in G_medium_A.nodes.items():
    print(node, attrs["weight"])

698 1
699 0
700 0
1181 0
1182 0
1082 0
1083 0
1084 1
320 5
321 0
322 1
142 4
143 0
144 9
948 7
949 0
950 0
1008 0
1009 11
1064 0
1065 4
343 19
344 0
345 0
859 0
860 16
349 11
350 0
351 1
761 0
762 73
721 9
722 0
723 1
601 78
602 0
603 7
174 21
175 0
176 0
656 0
657 0
658 1
1190 59
1191 0
1192 0
469 5
470 0
471 1
1169 0
1170 2
504 21
505 0
506 1
185 0
186 4
713 41
714 0
715 0
1198 0
1199 50
24 98
25 0
26 18
630 4
631 0
632 0
279 6
280 0
281 0
573 0
574 44
751 14
752 0
753 0
974 16
975 0
976 0
524 19
525 0
526 0
257 0
258 29
1216 156
1217 0
1218 351
1140 0
1141 23
404 6
405 0
406 1
424 1
425 0
426 0
282 4
815 0
816 1
1247 0
1248 55
217 5
218 0
219 0
1117 1
1118 0
1119 0
707 16
708 0
709 0
284 0
285 21
655 1
697 0
183 46
184 0
767 1
768 0
93 2
94 0
95 0
918 15
919 0
920 0
296 1
297 2
37 1
38 16
608 0
609 20
769 0
770 0
1193 5
1194 0
1195 1
302 0
303 17
317 3
318 0
319 0
409 0
410 61
1091 2
1092 0
1093 4
1183 0
1184 2
894 0
895 22
300 9
301 0
1209 0
1210 93
5 9
6 3
963 5
964 0
965 0
763 0
