In [None]:
from pathlib import Path


# The project root is taken to be the parent of the current working directory.
# It is kept as a plain string because later cells interpolate it into
# f-strings and shell commands.
_working_dir = Path.cwd()
PROJECT_ROOT_DIR = str(_working_dir.parent)
PROJECT_ROOT_DIR

# Intro

# Loading graph dataset

In [None]:
from ogb.nodeproppred import NodePropPredDataset

# Instantiate the OGB node-property-prediction dataset named "ogbn-arxiv".
dataset = NodePropPredDataset(name="ogbn-arxiv")
# Item 0 unpacks into the graph object and the node labels; later cells read
# graph["node_year"] and attach `labels` as a column of the feature table.
graph, labels = dataset[0]

# Loading nodes textual features

OGB provides a file with all text features related to a paper (title and abstract): https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv

In [None]:
# Download the file from https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz and put it into PROJECT_ROOT_DIR
import requests
import os


# Download the gzip-compressed title/abstract table once and keep it next to
# the project root. `text_features_filename` is the path of the *decompressed*
# TSV; the archive lives at the same path with a ".gz" suffix.
url = "https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz"
text_features_filename = Path(PROJECT_ROOT_DIR) / "titleabs.tsv"
text_features_archive = str(text_features_filename) + ".gz"

if not os.path.isfile(text_features_archive):
    # Write to a temporary ".part" file and rename it only after the transfer
    # finished, so an interrupted download is never mistaken for a complete
    # archive on the next run.
    partial_archive = text_features_archive + ".part"

    # stream=True avoids holding the whole archive in memory; the timeout keeps
    # the cell from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            # Fail here instead of printing and letting the decompression cell
            # crash later with a less helpful FileNotFoundError.
            raise RuntimeError(
                f"Failed to download file. HTTP status {response.status_code} for {url}"
            )

        with open(partial_archive, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    os.replace(partial_archive, text_features_archive)
else:
    print("File already downloaded.")

In [4]:
import gzip
import shutil


# Decompress "<text_features_filename>.gz" into text_features_filename,
# streaming the bytes from the archive to the output file.
with gzip.open(str(text_features_filename) + ".gz", "rb") as compressed, open(
    text_features_filename, "wb"
) as extracted:
    shutil.copyfileobj(compressed, extracted)

In [5]:
# Path (relative to the working directory) of the node-index -> paper-id
# mapping CSV; the compressed copy is expected at the same path plus ".gz".
node_idx_2_paper_id_f_name = "./dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv"

# Extract the CSV only when it is not already present on disk.
if not os.path.isfile(node_idx_2_paper_id_f_name):
    with gzip.open(node_idx_2_paper_id_f_name + ".gz", "rb") as compressed, open(
        node_idx_2_paper_id_f_name, "wb"
    ) as extracted:
        shutil.copyfileobj(compressed, extracted)

In [None]:
import pandas as pd

# Read the tab-separated title/abstract table. The first line of the file is
# skipped and column names are supplied explicitly; rows with any missing
# field are dropped right away.
text_features = pd.read_csv(
    text_features_filename,
    sep="\t",
    skiprows=1,
    header=None,
    names=["paper_id", "title", "abstract"],
    index_col="paper_id",
).dropna()

# Convert every paper_id in the index to an int so it can be matched against
# the paper ids of the node mapping.
text_features.index = text_features.index.map(int)
text_features

In [None]:
# Load the node-index -> paper-id mapping. The first line of the file is
# skipped, the two columns are named explicitly, and the first column
# (node_idx) becomes the index.
node_idx_to_paper_id = pd.read_csv(
    node_idx_2_paper_id_f_name,
    skiprows=1,
    header=None,
    names=["node_idx", "paper_id"],
    index_col=0,
)
node_idx_to_paper_id

In [8]:
# Attach title/abstract to every node by looking up its `paper_id` in
# `text_features` (which is indexed by paper id).
#
# `DataFrame.join(..., on=...)` matches a column of the calling frame against
# the index of the other frame and keeps the calling frame's index. The result
# is therefore indexed by the real node index from the mapping file, even when
# the inner join drops nodes that have no text features. A column-on-array
# `pd.merge` would instead renumber the rows 0..n-1, and labelling that
# positional index "node_idx" is only correct when no row is dropped.
node_text_features = node_idx_to_paper_id.join(
    text_features,
    on="paper_id",
    how="inner",
)
node_text_features.index.names = ["node_idx"]

In [None]:
# Add the graph's "node_year" values as the `publishing_year` column.
# The values are assigned positionally (row i receives entry i).
node_text_features = node_text_features.assign(publishing_year=graph["node_year"])
node_text_features

In [None]:
# Add the dataset labels as the `label` column (assigned positionally).
node_text_features = node_text_features.assign(label=labels)
node_text_features

In [None]:
# Preview only the nodes whose publishing_year equals 2018.
node_text_features.loc[lambda frame: frame["publishing_year"] == 2018]

# Loading graph instructions

In [12]:
from datasets import load_dataset


# Load the mixed instruction file and keep only the samples whose id starts
# with "arxiv_train".
train_instructions = pd.read_json(
    "hf://datasets/Jiabin99/Arxiv-PubMed-mix-NC-LP/arxiv_pub_node_st_cot_link_mix.json"
)
# .copy() makes the filtered frame independent of the full one, so adding a
# column below is not a chained assignment (no SettingWithCopyWarning).
train_instructions = train_instructions[
    train_instructions["id"].str.startswith("arxiv_train")
].copy()

# The node index is the text after the last underscore of the sample id.
# The column keeps it as a string; the frame index holds the same value as an
# int so the frame can be joined on index with the node features.
train_instructions["node_idx"] = (
    train_instructions["id"].str.rsplit("_", n=1).str[-1]
)
train_instructions.index = train_instructions["node_idx"]
train_instructions.index = train_instructions.index.map(int)

In [None]:
from datasets import load_dataset

# Load the "test" split of the evaluation instructions from the single JSON
# file and materialise it as a pandas DataFrame.
test_instructions = pd.DataFrame(
    load_dataset(
        "Jiabin99/GraphGPT-eval-instruction",
        data_files={"test": "arxiv_test_instruct_std.json"},
        split="test",
    )
)

# The node index is the text after the last underscore of the sample id; it is
# kept as a string column and also used (as int) for the frame index.
test_instructions["node_idx"] = test_instructions["id"].apply(
    lambda sample_id: sample_id.split("_")[-1]
)
test_instructions.index = test_instructions["node_idx"]
test_instructions.index = test_instructions.index.map(int)

# Merging text features and instructions

In [None]:
# Inner-join the node text features with the training instructions, matching
# rows by the index of both frames.
train_data = node_text_features.merge(
    train_instructions,
    how="inner",
    left_index=True,
    right_index=True,
)
train_data

In [None]:
# Show the (earliest, latest) publishing year present in the training data.
tuple(reducer(train_data["publishing_year"]) for reducer in (min, max))

In [None]:
# Inner-join the node text features with the test instructions, matching rows
# by the index of both frames.
test_data = node_text_features.merge(
    test_instructions,
    how="inner",
    left_index=True,
    right_index=True,
)
test_data

In [None]:
# Show the (earliest, latest) publishing year present in the test data.
tuple(reducer(test_data["publishing_year"]) for reducer in (min, max))

# Generating full set of instructions

In [18]:
def replace_graph_in_conversations(conversation, graph_data):
    """Return a two-turn conversation with the graph inlined into the prompt.

    Every occurrence of the literal ``"<graph>"`` in the first turn's
    ``"value"`` is replaced by ``str(graph_data)``.

    Args:
        conversation: Indexable sequence of turns. The first turn must be a
            mapping with a ``"value"`` string; the second turn is passed
            through untouched.
        graph_data: Any object; its ``str()`` form is substituted for the
            placeholder.

    Returns:
        A new list ``[human_turn, conversation[1]]`` where ``human_turn`` is a
        fresh dict with ``"from": "human"`` and the substituted text. The
        input conversation is not modified.
    """
    prompt = conversation[0]["value"].replace("<graph>", str(graph_data))
    human_turn = {"from": "human", "value": prompt}
    return [human_turn, conversation[1]]

In [19]:
# For every training row, inline that row's graph into its conversation and
# store the result in a new `formatted_conversations` column.
train_data["formatted_conversations"] = train_data.apply(
    func=lambda record: replace_graph_in_conversations(
        conversation=record["conversations"],
        graph_data=record["graph"],
    ),
    axis="columns",
)

In [20]:
# For every test row, inline that row's graph into its conversation and store
# the result in a new `formatted_conversations` column.
test_data["formatted_conversations"] = test_data.apply(
    func=lambda record: replace_graph_in_conversations(
        conversation=record["conversations"],
        graph_data=record["graph"],
    ),
    axis="columns",
)

In [None]:
# Display the test frame, now including the `formatted_conversations` column.
test_data

# Splitting eval data from train data

In [None]:
# Hold out every training sample published in 2017 as the evaluation set.
eval_data = train_data.loc[lambda frame: frame["publishing_year"] == 2017]
eval_data

In [None]:
# Keep only the samples that are not from 2017 in the training set, so it does
# not overlap with the evaluation rows selected by year.
train_data = train_data.loc[lambda frame: frame["publishing_year"] != 2017]
train_data

# Saving data to disk

In [24]:
# Create <PROJECT_ROOT_DIR>/dataset if it does not exist yet. Using pathlib
# instead of a shell `mkdir -p` works on every platform and is not affected by
# spaces or shell metacharacters in the project path.
Path(PROJECT_ROOT_DIR, "dataset").mkdir(parents=True, exist_ok=True)

In [25]:
# Persist the training split as a parquet file under the project's dataset folder.
train_data.to_parquet(path=f"{PROJECT_ROOT_DIR}/dataset/train.parquet")

In [26]:
# Persist the evaluation split as a parquet file under the project's dataset folder.
eval_data.to_parquet(path=f"{PROJECT_ROOT_DIR}/dataset/eval.parquet")

In [27]:
# Persist the test split as a parquet file under the project's dataset folder.
test_data.to_parquet(path=f"{PROJECT_ROOT_DIR}/dataset/test.parquet")