In [1]:
import sqlite3 as sqlite
from pathlib import Path
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from omegaconf import OmegaConf
from tqdm import tqdm

In [3]:
# Load the notebook's settings from config.yaml (resolved relative to the
# kernel's working directory). The trailing expression displays the configured
# SQLite database path as a quick sanity check.
config = OmegaConf.load("config.yaml")
config.db.path

'/mnt/ssd/AcademicGraph/AcademicGraph.sqlite'

In [4]:
# Show the keys of the `db.views` config section; each key is later used as a
# source name for the export.
config.db.views.keys()

dict_keys(['authors', 'papers', 'affiliations', 'collab', 'citing'])

In [5]:
# Open a sqlite3 connection to the database file named in the config.
con = sqlite.connect(config.db.path)

In [6]:
# Execute every SQL statement configured under `db.views` on the open
# connection. The key (`name`) is not used here.
# NOTE(review): presumably each statement creates a view called `v_<name>`,
# which db_to_file later selects from - confirm against config.yaml.
for name, query in config.db.views.items():
    con.execute(query)

In [8]:
def db_to_file(
    con,
    sql_source,
    destination,
    chunksize=10_000,
    schema=None,
    view_prefix="v_",
    max_chunks=7,
):
    """Stream the rows of a database view into a single Parquet file.

    The view ``{view_prefix}{sql_source}`` is read with ``pd.read_sql`` in
    chunks of ``chunksize`` rows; every chunk is converted to an Arrow table
    and appended to ``destination`` through one ``pq.ParquetWriter``.

    Parameters
    ----------
    con
        Open database connection, passed straight to ``pd.read_sql``.
    sql_source
        Name of the source without its prefix. It is interpolated directly
        into the SQL text, so only pass trusted identifiers.
    destination
        Where the Parquet file is written (passed to ``pq.ParquetWriter``).
    chunksize
        Number of rows fetched per chunk.
    schema
        Optional schema description accepted by ``pa.schema`` (for example a
        list of ``(column, type)`` pairs). When omitted, the schema is taken
        from the first chunk.
    view_prefix
        Prefix prepended to ``sql_source`` to form the queried name.
    max_chunks
        Positive upper bound on the number of chunks written. The default of
        7 reproduces the previously hard-coded ``if i > 5: break`` sampling
        limit; pass ``None`` to export the complete view.

    Returns
    -------
    None
        If the query yields no chunk at all, no file is created.
    """
    if schema is not None:
        schema = pa.schema(schema)

    # Identifiers cannot be bound as SQL parameters, hence the f-string.
    query = f"select * from {view_prefix}{sql_source}"
    chunks = pd.read_sql(query, con, chunksize=chunksize)

    pqwriter = None
    try:
        for i, chunk in tqdm(enumerate(chunks), desc=f"Processing {sql_source}"):
            table = pa.Table.from_pandas(chunk, schema=schema)

            # Create the writer lazily so that, without an explicit schema,
            # the schema of the first chunk defines the file.
            if pqwriter is None:
                pqwriter = pq.ParquetWriter(destination, table.schema)

            pqwriter.write_table(table)

            # Stop once the requested number of chunks has been written.
            if max_chunks is not None and i + 1 >= max_chunks:
                break
    finally:
        # Always close the writer, even when reading or converting a chunk
        # fails midway, so the file handle is not leaked.
        if pqwriter is not None:
            pqwriter.close()

In [9]:
# Collect the configured "papers" column specification as (name, type) pairs
# and display the Arrow schema built from it.
myschema = list(config.output.schema["papers"].items())
pa.schema(myschema)

PaperId: int64
AuthorId: int64
DocType: string
PaperTitle: string
Date: string
JournalId: int64
ConferenceSeriesId: int64
AuthorCount: int64
CitationCount_y10: int64

In [10]:
# Export every configured view to <output.path>/<view name><output.extension>,
# passing an explicit column schema whenever the config provides one for that
# view and letting db_to_file infer it otherwise.
schema_dict = config.output.schema
for sql_source in config.db.views.keys():
    print(sql_source)
    outfile = (Path(config.output.path) / sql_source).with_suffix(config.output.extension)
    schema = list(schema_dict[sql_source].items()) if sql_source in schema_dict else None
    db_to_file(con, sql_source, outfile, schema=schema)

authors


Processing authors: 6it [00:00,  9.47it/s]


papers


Processing papers: 6it [02:42, 27.07s/it]


affiliations


Processing affiliations: 6it [00:08,  1.43s/it]


collab


Processing collab: 6it [00:00, 20.92it/s]


citing


Processing citing: 6it [00:05,  1.06it/s]


In [20]:
# Close the SQLite connection opened earlier in the notebook.
con.close()