In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from itertools import islice

import jsonlines
import matplotlib
import orjson
import numpy as np
from graph_tool.all import Graph, sfdp_layout, graph_draw

In [None]:
def users_from_path(path: Path):
    """Yield one edge array per crawl file found directly under ``path``.

    Every ``*.json`` file in ``path`` is read as JSON Lines. The part of the
    file name before the first underscore is used as the user id, and it is
    paired with the ``"id"`` field of every record in that file.

    Args:
        path: Directory that contains the ``<user_id>_*.json`` crawl files.

    Yields:
        ``np.ndarray`` of dtype ``uint64`` and shape ``(n_records, 2)`` whose
        rows are ``(user_id, record["id"])``. A file without records yields an
        array of shape ``(0, 2)``.

    Note:
        Errors raised while reading or converting the records of a file are
        printed together with the file path and that file is skipped, so one
        bad file does not abort the whole run.
    """
    # Glob the directory that was passed in (not a notebook global), and give
    # the loop variable its own name so it does not shadow the argument.
    for file_path in path.glob("*.json"):
        user_id = file_path.name.split("_")[0]
        with jsonlines.open(file_path, "r", loads=orjson.loads) as reader:
            try:
                source = int(user_id)
                pairs = [(source, data["id"]) for data in reader]
                # reshape keeps the two-column layout even for an empty file.
                edges = np.array(pairs, dtype=np.uint64).reshape(-1, 2)
            except Exception as e:
                print(e, file_path)
                continue
        # Yield only after the file has been closed, so no handle stays open
        # while the consumer works on the batch.
        yield edges

            
def load_graph(path: Path, limit:int|None=None) -> Graph:
    """Build a directed graph from the crawl files under ``path``.

    Args:
        path: Directory handed to ``users_from_path``.
        limit: Maximum number of batches (one per crawl file) to consume from
            ``users_from_path``; ``None`` consumes all of them.

    Returns:
        A directed ``Graph`` containing one edge per ``(user_id, id)`` row of
        the consumed batches. The graph is empty when there are no edges.
    """
    graph = Graph(directed=True)
    # Batches without any rows are dropped: they add no edges and may not have
    # the two-column shape that concatenation needs.
    batches = [
        edges
        for edges in islice(users_from_path(path), limit)
        if edges.size
    ]
    if batches:
        # All edges go in through a single hashed call. NOTE(review): per the
        # graph-tool documentation the value -> vertex mapping appears to be
        # built per add_edge_list call, so adding batch by batch would create
        # a separate vertex for an id each time it shows up in another batch
        # - confirm against the installed graph-tool version.
        graph.add_edge_list(np.concatenate(batches), hashed=True)
    return graph

In [None]:
%%time
# Timing label: numpy array initialised and then populated.
# NOTE(review): this cell is textually identical to the other limit=1000 timing
# cell; the label presumably records which implementation of the loader was
# active when the cell was timed - confirm.
crawl_path = Path("data/ned_first_run/")
# Only the first 1000 batches are consumed, to keep the timed run short.
graph = load_graph(crawl_path, limit=1000)

In [None]:
%%time
# Timing label: numpy array built from a list comprehension.
# NOTE(review): this cell is textually identical to the other limit=1000 timing
# cell; the label presumably records which implementation of the loader was
# active when the cell was timed - confirm.
crawl_path = Path("data/ned_first_run/")
# Only the first 1000 batches are consumed, to keep the timed run short.
graph = load_graph(crawl_path, limit=1000)

In [None]:
%%time
# Full run: no limit, so every batch produced for the crawl directory is loaded.
# This rebinds `graph`, which the save/layout/draw cells below operate on.
crawl_path = Path("data/ned_first_run/")
graph = load_graph(crawl_path)

In [None]:
# Persist the loaded graph next to the notebook.
# NOTE(review): the file format is presumably inferred from the ".xml.gz"
# extension - confirm against the graph-tool documentation.
graph.save("ned_tweeps.xml.gz")

In [None]:
# TODO: filter nodes below inbound edge count threshold

In [None]:
# Compute the layout positions that are passed to graph_draw below.
pos = sfdp_layout(graph)

In [None]:
# Render the graph with the precomputed layout and write it to "ned_tweeps.pdf".
# NOTE(review): vertex_color is presumably RGBA with alpha 0 (invisible vertex
# outline), and vcmap presumably has no visible effect while the vertex colours
# are constants - confirm against the graph_draw documentation.
graph_draw(
    graph,
    pos,
    output_size=(1000, 1000),
    vertex_color=[1,1,1,0],
    vertex_size=1,
    edge_pen_width=1.2,
    vcmap=matplotlib.cm.gist_heat_r, 
    output="ned_tweeps.pdf"
)