In [2]:
import os
from ogb.nodeproppred import PygNodePropPredDataset
import torch_geometric.transforms as T
import pandas as pd
from tqdm import tqdm
import json
import pickle
import os
import gzip
import re

# Run everything relative to the project root. The location is machine-specific,
# so it can be overridden with the DS_GRAPH_CLF_ROOT environment variable; the
# default is the path this notebook was originally run from.
os.chdir(os.environ.get("DS_GRAPH_CLF_ROOT", "/Users/lyk/ds_graph_clf_diverse_experiments"))

### Description ###

Process additional metadata for the `ogbn-arxiv` dataset, using the dataset's provided mapping from node IDs to MAG IDs. Since MAG has been taken down, we use a July 2020 snapshot of MAG, made available by the Open Academic Graph project and hosted on AMiner [(link)](https://www.aminer.cn/oag-2-1). The ~240M MAG papers are split into 17 zipped chunks (~10GB each), each containing 3 text files (~10GB each) of records adhering to the schema listed under **Data Description**. All chunks were downloaded beforehand.  
  
We also use the raw texts of titles and abstracts linked under the `ogbn-arxiv` description [on OGB](https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv).  

### Output ###
Gzip-compressed Parquet file (`ogbnarxiv_mag_metadata.parquet.gzip`) containing a DataFrame with paper MAG IDs (ordered by their node IDs), titles, abstracts and the features used to define the studied edge types: authors, publication venues and fields of study. 

In [44]:
# Load the ogbn-arxiv dataset object with data/ as its root directory.
# NOTE(review): presumably this also places the mapping file read below under
# data/ogbn_arxiv/ — confirm.
dataset = PygNodePropPredDataset("ogbn-arxiv", root="data/")

# Raw titles and abstracts; the TSV is read with explicit column names.
titleabs = pd.read_csv(
    "data/tables/titleabs.tsv",
    sep="\t",
    names=["aid", "title", "abstract"],
)

# Node index -> MAG paper ID mapping, indexed by node index.
with gzip.open("data/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz", "rb") as mapping_file:
    mapping = pd.read_csv(mapping_file).set_index("node idx")

In [22]:
# Index the title/abstract table by MAG paper ID ("aid") and reorder its rows to
# follow the node-index order of `mapping`.
# The guard keeps this cell re-runnable: after the first run "aid" is already the
# index, and calling set_index("aid") again would raise a KeyError.
if titleabs.index.name != "aid":
    titleabs = titleabs.set_index("aid")
titleabs = titleabs.loc[mapping["paper id"].values]
titleabs.head()

Unnamed: 0_level_0,title,abstract
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
9657784,evasion attacks against machine learning at te...,"In security-sensitive applications, the succes..."
39886162,how hard is computing parity with noisy commun...,We show a tight lower bound of $\Omega(N \log\...
116214155,on the absence of the rip in real world applic...,The purpose of this paper is twofold. The firs...
121432379,a promise theory perspective on data networks,Networking is undergoing a transformation thro...
231147053,analysis of asymptotically optimal sampling ba...,Over the last 20 years significant effort has ...


In [29]:
# Attach titles and abstracts to the node-index -> MAG-ID mapping by matching
# the mapping's "paper id" against the title/abstract table's "aid".
joined = pd.merge(mapping, titleabs, left_on="paper id", right_on="aid")

In [None]:
def process_mag_folder(folder, ids):
    """Filters all text files in a MAG chunk, saves out the records corresponding to IDs in `ogbn-arxiv`.

    Every regular, non-hidden file under `data/mag/{folder}` is read line by line.
    Each non-blank line is parsed as one JSON record, and records whose `"id"` is
    in `ids` are kept. The kept records are also pickled to
    `data/mag/in_ogbn_arxiv/results_{folder}.pkl` (the directory is created if needed).

    Args:
        folder (str): name of chunk folder i.e. mag_papers_{0-16}.
        ids (set): set of MAG IDs in `ogbn-arxiv`.

    Returns:
        List[dict]: list of relevant records, in file-name then line order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no `"id"` field.
    """
    chunk_dir = f"data/mag/{folder}"
    out_dir = "data/mag/in_ogbn_arxiv"

    # Keep only regular, non-hidden files: stray entries such as `.DS_Store` or
    # sub-directories are not JSON-lines files and would abort the whole scan.
    # Sorting makes the order of the returned records reproducible.
    paths = []
    for name in sorted(os.listdir(chunk_dir)):
        path = os.path.join(chunk_dir, name)
        if not name.startswith(".") and os.path.isfile(path):
            paths.append(path)

    results = []
    for path in tqdm(paths):
        with open(path, "rb") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) carry no record.
                if not line.strip():
                    continue
                res = json.loads(line)
                # NOTE(review): assumes `ids` holds the same type as the records' "id" — confirm.
                if res["id"] in ids:
                    results.append(res)

    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, f"results_{folder}.pkl"), "wb") as f:
        pickle.dump(results, f)
    return results

In [None]:
# The MAG chunks are too large to all be unzipped at once, so each chunk is unzipped
# and passed through `process_mag_folder` one at a time.
# Edit `chunk_folder` for each chunk and re-run this cell.

chunk_folder = "mag_papers_16"
arxiv_mag_ids = set(joined["paper id"])
results = process_mag_folder(chunk_folder, arxiv_mag_ids)

In [None]:
# Concatenate the per-chunk results of `process_mag_folder` into one list.

results_dir = "data/mag/in_ogbn_arxiv"
# Only pickle files are loaded, so stray directory entries (e.g. `.DS_Store`) cannot
# break the loop; sorting makes the concatenation order reproducible.
files = sorted(name for name in os.listdir(results_dir) if name.endswith(".pkl"))
in_ogbn_arxiv = []
for name in tqdm(files):
    # The `with` block closes the file on exit, so no explicit close() is needed.
    # NOTE: pickle.load can execute arbitrary code — only load files written by this notebook.
    with open(os.path.join(results_dir, name), "rb") as f:
        in_ogbn_arxiv.extend(pickle.load(f))

In [None]:
# Left-join the filtered MAG records onto the node-ordered title/abstract table
# ("paper id" against the MAG "id"), then drop the duplicated key and the MAG
# title so that only the title from `joined` remains.
df_ogbn = (
    joined
    .merge(
        pd.DataFrame.from_records(in_ogbn_arxiv),
        left_on="paper id",
        right_on="id",
        how="left",
    )
    .drop(columns=["id", "title_y"])
    .rename(columns={"title_x": "title"})
)
df_ogbn.head()

In [35]:
# Write the merged metadata table to a gzip-compressed Parquet file in the working directory.
metadata_path = "ogbnarxiv_mag_metadata.parquet.gzip"
df_ogbn.to_parquet(metadata_path, compression="gzip")