# Loading graph

In [6]:
from ogb.nodeproppred import NodePropPredDataset

In [None]:
# Instantiate the OGB node-property-prediction dataset named "ogbn-arxiv".
# NOTE(review): presumably this populates ./dataset/ogbn_arxiv on first use
# (the mapping CSV is read from that folder further down) — confirm.
dataset = NodePropPredDataset(name="ogbn-arxiv")

In [8]:
# The first dataset item unpacks into the graph (indexed below with the keys
# "node_feat" and "node_year") and the per-node labels.
graph, labels = dataset[0]

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import random
from umap.umap_ import UMAP
from matplotlib.colors import ListedColormap


def reduce_dimensions(vectors):
    """Project feature vectors onto two dimensions with a default UMAP.

    Args:
        vectors: Array-like of feature vectors, one row per sample, in a
            form accepted by ``UMAP.fit_transform``.

    Returns:
        A ``(x_vals, y_vals)`` tuple of two lists holding the first and the
        second embedding coordinate of every sample, in input order.

    NOTE(review): the reducer is created without a ``random_state``, so the
    layout is presumably not reproducible across runs — confirm whether a
    fixed seed is wanted.
    """
    # fit_transform fits the reducer and returns the training embedding in one
    # step; the previous fit() + transform() pair made a redundant second pass
    # over the very same data.
    embedding = UMAP().fit_transform(vectors)

    # Column slices replace the per-row Python loops; list() keeps the
    # original return type (lists of scalar coordinates).
    x_vals = list(embedding[:, 0])
    y_vals = list(embedding[:, 1])

    return x_vals, y_vals


def plot_embeddings(x_vals, y_vals, labels):
    """Draw a 2-D scatter plot of embeddings coloured by class label.

    Args:
        x_vals: Horizontal coordinate of every point.
        y_vals: Vertical coordinate of every point.
        labels: Per-point class values passed to matplotlib as the colour
            argument.

    Side effects:
        Seeds the global ``random`` module with 0, opens a 12x12 inch figure
        and shows it.
    """
    random.seed(0)

    # The 20 tab20 colours repeated twice give a 40-entry palette, so the
    # second half reuses the colours of the first half.
    tab20_colors = plt.cm.tab20(np.linspace(0, 1, 20)).tolist()
    palette = ListedColormap(tab20_colors * 2)

    plt.figure(figsize=(12, 12))
    points = plt.scatter(
        x_vals,
        y_vals,
        c=labels,
        cmap=palette,
        s=10,  # marker size
        alpha=0.8,  # slight transparency so overlapping points stay visible
    )

    # One colour-bar tick per integer in 0..39.
    # NOTE(review): assumes labels span exactly 40 classes — confirm.
    class_ticks = np.arange(0, 40, step=1)
    plt.colorbar(points, ticks=class_ticks, label="Classes")

    plt.title("2D Embedding Visualization with Class Colors")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()

In [None]:
# Reduce the graph's "node_feat" entries to two coordinate lists via UMAP.
x, y = reduce_dimensions(graph["node_feat"])

In [None]:
# Scatter-plot the 2-D coordinates, coloured by the node labels.
plot_embeddings(x, y, labels)

# Loading text features

OGB provides a file with the text features (title and abstract) of each paper: https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv

In [None]:
import pandas as pd

# Read the tab-separated title/abstract file: skip its first line, name the
# three columns explicitly, index by paper_id, and drop every row that has a
# missing value.
text_features = pd.read_csv(
    "../titleabs.tsv",
    sep="\t",
    header=None,
    skiprows=1,
    names=["paper_id", "title", "abstract"],
    index_col="paper_id",
).dropna()

# Convert the paper ids in the index to Python ints.
text_features.index = text_features.index.map(int)
text_features

In [None]:
# Load the node-index -> paper-id mapping. The file's first line is skipped
# and the columns are named explicitly; the first column ("node_idx") becomes
# the index.
node_idx_to_paper_id = pd.read_csv(
    "./dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv",
    index_col=0,
    names=["node_idx", "paper_id"],
    header=None,
    skiprows=1,
)
node_idx_to_paper_id

In [9]:
# Attach title/abstract to every node by matching the mapping's "paper_id"
# column against the index of `text_features` (inner join by default).
#
# right_index=True makes the result keep the left frame's index, i.e. the
# real node_idx values. Merging against an array of keys instead yields a
# fresh 0..n-1 index, which only coincides with node_idx when no row is
# dropped by the join.
node_text_features = pd.merge(
    node_idx_to_paper_id,
    text_features,
    left_on="paper_id",
    right_index=True,
)
node_text_features.index.names = ["node_idx"]

In [None]:
# Add the graph's "node_year" values as a new column. The values are assigned
# by position, so this relies on the rows being in node order.
# NOTE(review): assumes one year per row of node_text_features — confirm.
node_text_features["publishing_year"] = graph["node_year"]
node_text_features

In [None]:
# Add the labels as a new column; like the year column, this is a positional
# assignment.
# NOTE(review): assumes one label per row of node_text_features — confirm.
node_text_features["label"] = labels
node_text_features

In [None]:
# Show only the rows of node_text_features whose publishing_year equals 2018.
node_text_features.loc[lambda frame: frame["publishing_year"] == 2018]

# Loading graph instructions

In [None]:
import pandas as pd

# Load the mixed instruction file from the Hugging Face hub.
train_instructions = pd.read_json(
    "hf://datasets/Jiabin99/Arxiv-PubMed-mix-NC-LP/arxiv_pub_node_st_cot_link_mix.json"
)

# Keep only the rows whose id starts with "arxiv_train". The explicit .copy()
# makes the filtered frame independent, so the column assignment below is not
# a chained assignment on a slice (which triggers SettingWithCopyWarning).
is_arxiv_train = train_instructions["id"].str.startswith("arxiv_train")
train_instructions = train_instructions[is_arxiv_train].copy()

# The text after the last "_" of the id is stored as "node_idx" (as a string)
# and, converted to int, becomes the index.
train_instructions["node_idx"] = train_instructions["id"].apply(lambda x: x.split("_")[-1])
train_instructions.index = train_instructions["node_idx"]
train_instructions.index = train_instructions.index.map(int)

In [15]:
from datasets import load_dataset

# Load the "test" split built from arxiv_test_instruct_std.json and turn it
# into a DataFrame.
eval_instructions = pd.DataFrame(
    load_dataset(
        "Jiabin99/GraphGPT-eval-instruction",
        split="test",
        data_files={"test": "arxiv_test_instruct_std.json"},
    )
)

# The text after the last "_" of the id is kept as a string column and,
# converted to int, used as the index.
eval_instructions["node_idx"] = eval_instructions["id"].apply(lambda x: x.split("_")[-1])
eval_instructions.index = pd.Index(eval_instructions["node_idx"]).map(int)

# Merging text features and instructions

In [None]:
# Inner join (merge's default) of `node_text_features` and
# `train_instructions` on the indexes of both dataframes.
train_data = node_text_features.merge(
    train_instructions,
    left_index=True,
    right_index=True,
)
train_data

In [None]:
# Earliest and latest publishing year among the training rows.
min(train_data["publishing_year"]), max(train_data["publishing_year"])

In [None]:
# Inner join (merge's default) of `node_text_features` and
# `eval_instructions` on the indexes of both dataframes.
eval_data = node_text_features.merge(
    eval_instructions,
    left_index=True,
    right_index=True,
)
eval_data

In [None]:
# Earliest and latest publishing year among the evaluation rows.
min(eval_data["publishing_year"]), max(eval_data["publishing_year"])

# Generating full set of instructions

In [120]:
def replace_graph_in_conversations(conversation, graph_data):
    """Return a two-turn conversation with ``<graph>`` filled in.

    Args:
        conversation: Sequence of turn dicts. The first turn must have a
            "value" string; the second turn is passed through untouched.
        graph_data: Object whose ``str()`` form replaces every ``<graph>``
            occurrence in the first turn's text.

    Returns:
        A new two-element list: a fresh ``{"from": "human", "value": ...}``
        dict holding the substituted text, followed by the original second
        turn (same object). Further turns and any extra keys of the first
        turn are not carried over. The input is not modified.
    """
    prompt = conversation[0]["value"].replace("<graph>", str(graph_data))
    human_turn = {"from": "human", "value": prompt}
    return [human_turn, conversation[1]]


In [None]:
# For every training row, substitute that row's graph into its conversation
# and store the result in a new column, aligned on the frame's index.
train_data["formatted_conversations"] = pd.Series(
    [
        replace_graph_in_conversations(conversation, graph_data)
        for conversation, graph_data in zip(train_data["conversations"], train_data["graph"])
    ],
    index=train_data.index,
)

In [122]:
# For every evaluation row, substitute that row's graph into its conversation
# and store the result in a new column, aligned on the frame's index.
eval_data["formatted_conversations"] = pd.Series(
    [
        replace_graph_in_conversations(conversation, graph_data)
        for conversation, graph_data in zip(eval_data["conversations"], eval_data["graph"])
    ],
    index=eval_data.index,
)

In [None]:
# Display the training frame, now including "formatted_conversations".
train_data

In [None]:
# Display the evaluation frame, now including "formatted_conversations".
eval_data

# Download Gemma 2B-it model

In [None]:
import kagglehub

# Fetch the "google/gemma/pyTorch/2b-it" model through kagglehub; the call
# returns the location of the model files, which is printed below.
# NOTE(review): presumably requires Kaggle credentials and accepted model
# terms — confirm.
path = kagglehub.model_download("google/gemma/pyTorch/2b-it")

print("Path to model files:", path)