In [1]:
import os
import re
import shlex

import networkx as nx

In [2]:
# Parser state for the line-by-line reader in the next cell. Possible values:
#   'NOTHING'                  - outside any section; waiting for an opening tag
#   'NODETYPES' / 'EDGETYPES'  - reading the declared node / edge type names
#   'NODEFIELDS' / 'EDGEFIELDS' - the next line is the header row with the field names
#   'NODE' / 'EDGE'            - reading one node / edge record per line
current_st = 'NOTHING'

In [3]:
# Parse the NLG export into a directed graph.
#
# The file is read line by line with a small state machine (`current_st`):
#   * an opening tag is honoured only while outside any section ('NOTHING');
#   * a closing tag is honoured only while in the state it terminates;
#   * every other line is interpreted according to the current state.
# Tokens inside a line are split with shlex, so quoted values may contain spaces.

# Reset the state here so that re-running this cell always starts from scratch,
# regardless of what an earlier (possibly interrupted) run left behind.
current_st = 'NOTHING'

graph = nx.DiGraph()
node_list = []  # node type names declared in the "<NodesTypes>" section
edge_list = []  # edge type names declared in the "<EdgesTypes>" section
fields = []     # field names taken from the header row of the current section

# Opening tag -> state entered when the tag is seen outside any section.
section_openers = {
    '"<NodesTypes>"': 'NODETYPES',
    '"<EdgesTypes>"': 'EDGETYPES',
    '"<Nodes>"': 'NODEFIELDS',
    '"<Edges>"': 'EDGEFIELDS',
}
# Closing tag -> the only state in which that tag closes the section.
section_closers = {
    '"<EndNodesTypes>"': 'NODETYPES',
    '"<EndEdgesTypes>"': 'EDGETYPES',
    '"<EndNodes>"': 'NODE',
    '"<EndEdges>"': 'EDGE',
}

path_db = 'data/BulliCompleto.nlg'
with open(path_db) as f:
    for line in f:
        stripped_line = line.strip()
        if not stripped_line:
            # Blank lines carry no data; skipping them avoids an IndexError on
            # `values[0]` below when one appears inside a section.
            continue

        if current_st == 'NOTHING':
            # Outside a section only an opening tag has any effect.
            current_st = section_openers.get(stripped_line, 'NOTHING')
        elif section_closers.get(stripped_line) == current_st:
            current_st = 'NOTHING'
        elif current_st == 'NODETYPES':
            # Only the first token of the line (the type name) is kept.
            values = shlex.split(stripped_line)
            node_type = values[0]
            node_list.append(node_type)
        elif current_st == 'EDGETYPES':
            values = shlex.split(stripped_line)
            edge_type = values[0]
            edge_list.append(edge_type)
        elif current_st == 'NODEFIELDS':
            # Header row: drop the last three characters of every token and
            # lower-case the rest (presumably a type suffix - TODO confirm
            # against the file format).
            fields = shlex.split(stripped_line)
            fields = [field[:-3].lower() for field in fields]
            current_st = 'NODE'
        elif current_st == 'EDGEFIELDS':
            fields = shlex.split(stripped_line)
            fields = [field[:-3].lower() for field in fields]
            current_st = 'EDGE'
        elif current_st == 'NODE':
            # First token is the node id; the remaining tokens are paired with
            # the remaining header fields.
            values = shlex.split(stripped_line)
            ide = values[0]
            attr_dict = dict(zip(fields[1:], values[1:]))
            # Pass attributes as keywords: networkx >= 2.0 no longer accepts a
            # positional attribute dict, while 1.x accepts keywords as well.
            graph.add_node(ide, **attr_dict)
        elif current_st == 'EDGE':
            # The two endpoint ids are written as a bracketed list, e.g. [a b].
            # NOTE(review): the pattern is greedy, so a ']' in a later field
            # would extend the match and trip the assertion below.
            id_match = re.search(r'\[.+\]', stripped_line)
            if id_match is None:
                raise ValueError('edge line without [source target] ids: %r'
                                 % stripped_line)
            id_list_str = id_match.group(0)
            id_str = id_list_str[1:-1]
            id_list = shlex.split(id_str)
            assert len(id_list) == 2, 'expected 2 ids, got %r' % (id_list,)
            # Everything except the bracketed ids holds the edge's other values.
            # Cutting the match out by position also works when nothing follows
            # the bracket (no trailing space to replace).
            other_values_str = (stripped_line[:id_match.start()]
                                + stripped_line[id_match.end():])
            other_values = shlex.split(other_values_str)
            attr_dict = dict(zip(fields[1:], other_values))
            graph.add_edge(id_list[0], id_list[1], **attr_dict)

In [4]:
# Distinct 'nodetype' values actually carried by the nodes of the graph.
node_set = {attrs['nodetype'] for _, attrs in graph.nodes(data=True)}

In [5]:
# Distinct 'edgetype' values actually carried by the edges of the graph.
edge_set = {attrs['edgetype'] for _, _, attrs in graph.edges(data=True)}

In [6]:
# Sanity check: the node types used by the nodes match the declared node types.
node_set == set(node_list)

True

In [7]:
# Sanity check: the edge types used by the edges match the declared edge types.
edge_set == set(edge_list)

True

In [8]:
# Number of nodes in the parsed graph.
graph.number_of_nodes()

9764

In [9]:
# Number of directed edges in the parsed graph.
graph.number_of_edges()

33252

In [10]:
# Export the parsed graph, including node and edge attributes, as a GEXF file.
path_gexf = 'out/elbulli_nlg.gexf'

# Create the output directory first so the export does not fail on a fresh
# checkout where 'out/' does not exist yet.
out_dir = os.path.dirname(path_gexf)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

nx.write_gexf(graph, path_gexf)