In [None]:
# Reload edited modules automatically before each cell executes, so changes to
# the project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Enable the %sql / %%sql magics and point them at an in-memory DuckDB database.
%load_ext sql
%sql duckdb:///:memory:
# Keep query results as SQL result sets rather than auto-converting to pandas;
# cells that need a DataFrame convert explicitly.
%config SqlMagic.autopandas = False
# Silence the per-query feedback messages and the connection-string echo.
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

import json

import duckdb
import pandas as pd
import plotly.express as px

from birbnet import config
from birbnet.data_utils import RunDataset

In [None]:
# Name of the run whose dataset every cell below reads from.
RUN_NAME = "ned_first_run"
DATASET = RunDataset(RUN_NAME)

## Analysing Crawl Stats

In [None]:
# Read the crawl statistics parquet file for the selected run into pandas.
crawl_stats_df = pd.read_parquet(path=DATASET.crawl_stats_path)

In [None]:
# Running total of the `nodes_counts` column, in row order of the crawl stats.
# Title and y label make the figure readable on its own; the trailing `;`
# suppresses the Axes repr.
crawl_stats_df["nodes_counts"].cumsum().plot(
    figsize=(10, 5),
    title="Cumulative node count",
    ylabel="nodes (cumulative)",
);

In [None]:
# Running total of the `edge_counts` column, in row order of the crawl stats.
# Title and y label make the figure readable on its own; the trailing `;`
# suppresses the Axes repr.
crawl_stats_df["edge_counts"].cumsum().plot(
    figsize=(10, 5),
    title="Cumulative edge count",
    ylabel="edges (cumulative)",
);

In [None]:
# Glob pattern for the `*.json` files under the dataset's users path; it is
# interpolated into the DuckDB queries in the following cells.
# NOTE(review): presumably `users_path` is a pathlib.Path, so this renders as a plain path string — confirm.
json_files_glob = DATASET.users_path / "*.json"

In [None]:
%%sql json_structure << 
-- Combined JSON structure of the first 1000 objects read from the user files
SELECT json_group_structure(json) as schema
FROM (
  SELECT *
  FROM read_ndjson_objects('{{json_files_glob}}')
  LIMIT 1000
)

In [None]:
# The query returns a single row whose `schema` column holds the inferred
# structure as a JSON string; decode it, then pretty-print it for reading.
schema_json = json_structure[0].schema
data = json.loads(schema_json)
print(json.dumps(data, indent=4))

In [None]:
%%time
%%sql
-- Total number of rows across all matched user JSON files
SELECT COUNT(*) FROM read_ndjson_auto('{{json_files_glob}}')

In [None]:
%%time
%%sql users <<
-- Load only the id and name fields with explicit column types into the users result
SELECT * FROM read_ndjson_auto('{{json_files_glob}}', columns={id:UBIGINT, name:VARCHAR})

In [None]:
%%time
# Convert the SQL result set to a pandas DataFrame, timed separately from the query itself.
users_df = users.DataFrame()

The jupysql way of doing this query seems to be about 7x slower than going straight to DuckDB, though :(

Looks like it's not parallelising the load, which DuckDB does do for this query.

Maybe due to having to go through SQLAlchemy?

*TODO:* Submit bug report

In [None]:
%%time
# Query the user files directly through the duckdb Python API (bypassing the
# %%sql magic), supplying an explicit column schema to read_ndjson.
# In the f-string, {json_files_glob} is interpolated, while the doubled braces
# are escapes that emit the literal braces of the `columns` struct.
sql = f"""
SELECT id, 
       name,
       public_metrics.following_count AS following,
       public_metrics.followers_count AS followers
FROM read_ndjson('{json_files_glob}', columns={{id: UBIGINT, name: VARCHAR, public_metrics: 'STRUCT(following_count INTEGER, followers_count INTEGER)'}})
"""
result = duckdb.sql(sql)
# Materialise the query result as a pandas DataFrame.
df = result.to_df()

In [None]:
# Preview the first rows of the DataFrame loaded directly through duckdb.
df.head()

In [None]:
%%time
sql = f"""
SELECT id, 
       name,
       public_metrics.following_count AS following,
       public_metrics.followers_count AS followers
FROM read_ndjson_auto('{json_files_glob}', sample_size=1000000000)
"""
result = duckdb.sql(sql)
df = result.to_df()