In [74]:
'''
- Extract the official Cora dataset from the MariaDB server.
- Prepare the official data for processing in Neo4j.
- Export the data in CSV format, then upload it to AWS S3 buckets.
'''

import sys, os
import concurrent.futures

import pandas as pd
import numpy as np
import sqlalchemy
import mariadb

# Connect to the CORA database twice over: `conn` is a raw MariaDB connector
# handle, `engine` is the SQLAlchemy engine handed to pandas by the cells
# visible below. A connector-level failure is reported and ends the run with
# exit status 1.
try:
    conn = mariadb.connect(
        host="relational.fit.cvut.cz",
        port=3306,
        database="CORA",
        user="guest",
        password="relational",
    )
    # The engine may hold up to pool_size + max_overflow pooled connections.
    engine = sqlalchemy.create_engine(
        "mariadb://guest:relational@relational.fit.cvut.cz:3306/CORA",
        pool_size=10,
        max_overflow=20,
    )
except mariadb.Error as err:
    print(f"Error connecting to MariaDB Platform: {err}")
    sys.exit(1)


In [None]:
# Make sure the output directory exists before anything is written into it.
# The default mode is used on purpose: a directory created with mode 0o666 has
# no execute/search bit, so on POSIX systems files cannot be created inside it.
# exist_ok=True also removes the check-then-create race and keeps the cell
# re-runnable.
os.makedirs("dataset", exist_ok=True)

# generate edges.csv: dump every row of the `cites` table unchanged
df_edges = pd.read_sql("SELECT * FROM cites", engine)
df_edges.to_csv("dataset/edges.csv", index=False)

In [None]:
# paper_id must be unique for each call
def extract(paper_id, word_count, gm):
    """Build the 0/1 word-indicator row for one paper and store it in ``gm``.

    Queries the ``content`` table for the ``word_cited_id`` values of
    ``paper_id`` through the module-level ``engine``, then sets position ``k``
    of the row to 1 for every id of the form ``"word<k>"``.

    Args:
        paper_id: Paper whose words are looked up; also the key written in ``gm``.
        word_count: Length of the indicator row.
        gm: Mapping that receives ``gm[paper_id] = row`` (mutated in place).

    Returns:
        The status string ``"Done: <paper_id>"``.

    Raises:
        IndexError: If a parsed word index is not a valid position in the row.
    """
    word_rows = pd.read_sql_query(
        "select word_cited_id from content where paper_id = %s",
        engine,
        params=(paper_id,),
    )

    # Publish the zero-filled row first, then flip the cited positions in place.
    indicator = [0] * word_count
    gm[paper_id] = indicator
    for token in word_rows['word_cited_id']:
        # "word123" -> 123: the text after the "word" prefix is the index.
        position = int(token.split("word")[1])
        indicator[position] = 1

    return f"Done: {paper_id}"

# paper_id -> 0/1 word-indicator row; filled in place by extract() workers.
m = {}
# Length of every indicator row (passed to extract as word_count).
# NOTE(review): presumably sized so the largest "word<k>" index fits - confirm.
supportedWordCount = 1434
df_unique_content = pd.read_sql("select distinct paper_id from content", engine)

# Cap the workers at what the SQLAlchemy engine can serve at once
# (pool_size=10 + max_overflow=20 = 30 connections). One thread per paper
# only leaves the extra threads queuing for a pooled connection. The lower
# bound of 1 keeps ThreadPoolExecutor from raising ValueError when the query
# returns no rows.
max_workers = max(1, min(30, len(df_unique_content)))

with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
    # Submit each paper exactly once; every call writes a distinct key of `m`.
    futures = [
        executor.submit(extract, pid, supportedWordCount, m)
        for pid in df_unique_content['paper_id']
    ]
    # result() re-raises any exception from the worker, so failures surface here.
    for future in concurrent.futures.as_completed(futures):
        print(future.result())


In [None]:
# generate nodes.csv: one row per paper with its class label and the word
# indicator row collected in `m` (a paper missing from `m` raises KeyError).
df_paper = pd.read_sql("SELECT * FROM paper", engine)

# (paper_id, class_label) pairs in table order
paper = list(zip(df_paper['paper_id'], df_paper['class_label']))

data = [
    {'paper_id': pid, 'subject': label, 'words': m[pid]}
    for pid, label in paper
]

node_columns = ['paper_id', 'subject', 'words']
df_nodes = pd.DataFrame(data, columns=node_columns)
df_nodes.to_csv("dataset/nodes.csv", index=False)
