# Downloading text features for graph

In [1]:
from pathlib import Path


# Project root = parent of the current working directory, stored as a plain string.
_working_dir = Path.cwd()
PROJECT_ROOT_DIR = str(_working_dir.parent)
PROJECT_ROOT_DIR

'/home/marcovinha/llm_graph_embedder'

In [None]:
# Download https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz and put it into PROJECT_ROOT_DIR
import requests


url = "https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz"
filename = Path(PROJECT_ROOT_DIR) / "titleabs.tsv.gz"

# Stream the archive to disk chunk by chunk instead of buffering the whole body
# in memory, and set a timeout so a stalled connection cannot hang the cell.
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    else:
        print("Failed to download file.")

In [5]:
import gzip
import shutil


# Drop only the trailing ".gz" suffix. str.replace(".gz", "") would remove every
# ".gz" occurring anywhere in the path (e.g. inside a parent directory name).
decompressed_path = Path(filename).with_suffix("")

# Decompress the archive next to itself, streaming so it never sits fully in memory.
with gzip.open(filename, "rb") as f_in, open(decompressed_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# Loading graph

In [1]:
from ogb.nodeproppred import NodePropPredDataset

In [2]:
# Instantiate the OGB node-property-prediction dataset named "ogbn-arxiv".
dataset = NodePropPredDataset(name="ogbn-arxiv")

  loaded_dict = torch.load(pre_processed_file_path)


In [3]:
# The first dataset item unpacks into the graph object and its labels.
graph, labels = dataset[0]

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import random
from umap.umap_ import UMAP
from matplotlib.colors import ListedColormap


def reduce_dimensions(vectors, random_state=None):
    """Embed `vectors` with UMAP and return the first two embedding coordinates.

    Args:
        vectors: Array-like accepted by `UMAP.fit_transform`, one row per sample.
        random_state: Optional seed forwarded to `UMAP` so the embedding can be
            reproduced. The default (None) keeps the unseeded behaviour.

    Returns:
        A tuple `(x_vals, y_vals)` of two lists holding, for every input row,
        the first and second embedding coordinate respectively.
    """
    reducer = UMAP(random_state=random_state)
    # fit_transform returns the embedding learned for the training data itself;
    # a separate fit() followed by transform() on the same data is redundant.
    embedding = np.asarray(reducer.fit_transform(vectors))

    # Column slicing replaces the per-row Python loops.
    return embedding[:, 0].tolist(), embedding[:, 1].tolist()


def plot_embeddings(x_vals, y_vals, labels):
    """Draw a scatter plot of the points, coloured by `labels`, with a colorbar.

    Args:
        x_vals: Horizontal coordinate of every point.
        y_vals: Vertical coordinate of every point.
        labels: Per-point values passed to matplotlib as the colour argument.
    """
    random.seed(0)

    # The 20 tab20 colours repeated twice give a 40-entry palette,
    # matching the 40 colorbar ticks below.
    tab20_colors = plt.cm.tab20(np.linspace(0, 1, 20)).tolist()
    class_cmap = ListedColormap(tab20_colors * 2)

    plt.figure(figsize=(12, 12))
    # s is the marker size, alpha the marker transparency.
    points = plt.scatter(x_vals, y_vals, c=labels, cmap=class_cmap, s=10, alpha=0.8)

    plt.colorbar(points, ticks=np.arange(0, 40, step=1), label="Classes")
    plt.title("2D Embedding Visualization with Class Colors")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()

KeyboardInterrupt: 

In [5]:
# x, y = reduce_dimensions(graph["node_feat"])

In [6]:
# plot_embeddings(x, y, labels)

# Loading text features

OGB provides a file with all text features related to a paper (title and abstract): https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv

In [None]:
import pandas as pd

# Read the tab-separated title/abstract file: the first line is skipped, the
# columns are named explicitly, and rows with any missing value are dropped.
text_features = (
    pd.read_csv(
        "../titleabs.tsv",
        sep="\t",
        header=None,
        skiprows=1,
        names=["paper_id", "title", "abstract"],
        index_col="paper_id",
    )
    .dropna()
)

# Convert every index label (paper id) to int.
text_features.index = text_features.index.map(int)
text_features

In [None]:
# Mapping from node index to paper id. The file's first line is skipped
# (presumably its own header row) and replaced by the explicit column names;
# the first column becomes the index.
mapping_csv_path = "./dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv"
node_idx_to_paper_id = pd.read_csv(
    mapping_csv_path,
    header=None,
    skiprows=1,
    names=["node_idx", "paper_id"],
    index_col=0,
)
node_idx_to_paper_id

In [9]:
# Attach title/abstract to every node. `DataFrame.join(..., on=...)` matches the
# caller's "paper_id" column against the index of `text_features` and keeps the
# caller's index, so every row stays labelled with its real node index.
# (A column-vs-array `pd.merge` returns a fresh 0..n-1 index instead, which only
# coincides with the node index when no paper is missing from `text_features`.)
node_text_features = node_idx_to_paper_id.join(text_features, on="paper_id", how="inner")
node_text_features.index.name = "node_idx"

In [None]:
# Add the publication year of every node as a new column.
# NOTE(review): graph["node_year"] is presumably a NumPy array, in which case the
# values are assigned by position — this assumes one entry per row, in row order.
node_text_features["publishing_year"] = graph["node_year"]
node_text_features

In [None]:
# Add the label of every node as a new column.
# NOTE(review): `labels` is presumably a NumPy array, in which case the values
# are assigned by position — this assumes one entry per row, in row order.
node_text_features["label"] = labels
node_text_features

In [None]:
# Keep only the rows of node_text_features whose publishing_year equals 2018
published_in_2018 = node_text_features["publishing_year"].eq(2018)
node_text_features.loc[published_in_2018]

# Loading graph instructions

In [13]:
import pandas as pd

train_instructions = pd.read_json("hf://datasets/Jiabin99/Arxiv-PubMed-mix-NC-LP/arxiv_pub_node_st_cot_link_mix.json")

# Keep only the samples whose id starts with "arxiv_train". The explicit .copy()
# makes the filtered frame independent of the full one, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
train_instructions = train_instructions[
    train_instructions["id"].str.startswith("arxiv_train")
].copy()

# The node index is the text after the last "_" of the sample id.
train_instructions["node_idx"] = train_instructions["id"].str.rsplit("_", n=1).str[-1]

# Index the frame by the integer node index (the "node_idx" column stays a string).
train_instructions.index = train_instructions["node_idx"].map(int)

In [14]:
from datasets import load_dataset

# Load the "test" split backed by arxiv_test_instruct_std.json and turn it into a DataFrame.
eval_instructions = pd.DataFrame(
    load_dataset(
        "Jiabin99/GraphGPT-eval-instruction",
        split="test",
        data_files={"test": "arxiv_test_instruct_std.json"},
    )
)

# The node index is the text after the last "_" of the sample id.
eval_instructions["node_idx"] = [
    sample_id.split("_")[-1] for sample_id in eval_instructions["id"]
]

# Index the frame by the integer node index (the "node_idx" column stays a string).
eval_instructions.index = eval_instructions["node_idx"].map(int)

# Merging text features and instructions

In [None]:
# Inner join of `node_text_features` and `train_instructions`, matching rows on the index of both dataframes
train_data = node_text_features.merge(
    train_instructions,
    how="inner",
    left_index=True,
    right_index=True,
)
train_data

In [None]:
# Earliest and latest publishing year in the training data. The Series
# reductions are vectorised and skip NaN, whereas the builtin min()/max()
# iterate element by element and give order-dependent results with NaN.
train_years = train_data["publishing_year"]
train_years.min(), train_years.max()

In [None]:
# Inner join of `node_text_features` and `eval_instructions`, matching rows on the index of both dataframes
eval_data = node_text_features.merge(
    eval_instructions,
    how="inner",
    left_index=True,
    right_index=True,
)
eval_data

In [None]:
# Earliest and latest publishing year in the evaluation data. The Series
# reductions are vectorised and skip NaN, whereas the builtin min()/max()
# iterate element by element and give order-dependent results with NaN.
eval_years = eval_data["publishing_year"]
eval_years.min(), eval_years.max()

# Generating full set of instructions

In [19]:
def replace_graph_in_conversations(conversation, graph_data):
    """Return a two-turn conversation with the graph placeholder filled in.

    Args:
        conversation: Sequence whose first item is the human turn (a mapping
            with a "value" entry) and whose second item is the reply turn.
        graph_data: Object whose `str()` form replaces every "<graph>"
            occurrence in the human message.

    Returns:
        A new list with a freshly built human turn followed by the original
        second turn (the same object, not a copy). The input is not modified.
    """
    human_text = conversation[0]["value"].replace("<graph>", str(graph_data))
    human_turn = {"from": "human", "value": human_text}
    return [human_turn, conversation[1]]


In [20]:
def _format_train_row(row):
    """Fill the "<graph>" placeholder of one row's conversation with that row's graph data."""
    return replace_graph_in_conversations(row["conversations"], row["graph"])


train_data["formatted_conversations"] = train_data.apply(_format_train_row, axis=1)

In [21]:
def _format_eval_row(row):
    """Fill the "<graph>" placeholder of one row's conversation with that row's graph data."""
    return replace_graph_in_conversations(row["conversations"], row["graph"])


eval_data["formatted_conversations"] = eval_data.apply(_format_eval_row, axis=1)

In [None]:
# Show the training frame, which now carries the "formatted_conversations" column.
train_data

In [None]:
# Show the evaluation frame, which now carries the "formatted_conversations" column.
eval_data

# Exploring Gemma 2B-it model

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; no token is hardcoded here.
# NOTE(review): presumably required to download the Gemma checkpoint loaded below — confirm.
notebook_login()

In [None]:
%%time

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel

# One checkpoint id shared by tokenizer and model so the two cannot drift apart.
GEMMA_CHECKPOINT = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(GEMMA_CHECKPOINT)
# Load the causal-LM variant of the checkpoint (the one passed to Trainer later).
model = AutoModelForCausalLM.from_pretrained(GEMMA_CHECKPOINT)


In [None]:
%%time

# Prompt in the <human>/<gpt> turn format; it ends with the opening of the
# model's reply so that generation would continue the list.
input_text = """
<human>: List all 40 sub-categories of the 'Computer Science' category in the ArXiv dataset.
<gpt>: Sure! Here are all the 40 sub-categories of the 'Computer Science' category in the ArXiv dataset:
"""
# Generation is currently disabled: only the prompt is defined by this cell.
# input_ids = tokenizer(input_text, return_tensors="pt")

# outputs = model.generate(**input_ids, max_new_tokens=1024)
# print(tokenizer.decode(outputs[0]))

# Fine-tune Gemma on a couple of samples

In [27]:
import logging


# Configure the root logger at DEBUG level, so DEBUG-and-above records from any
# logger that propagates to the root are emitted (third-party libraries included).
# Note: basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.DEBUG)

In [None]:
train_prompt_template = """<human>: {}\n\n<gpt>: {}"""

# Draw three rows reproducibly (random_state=0) and keep their formatted conversations.
sampled_train_rows = train_data.sample(n=3, random_state=0)
train_prompts_dicts = sampled_train_rows["formatted_conversations"].tolist()

# Render every sampled conversation as one prompt: first turn as <human>, second as <gpt>.
train_prompts = [
    train_prompt_template.format(turns[0]["value"], turns[1]["value"])
    for turns in train_prompts_dicts
]

train_prompts

In [None]:
from datasets import Dataset

# Wrap the list of prompts in a single-column ("text") mapping and build a Dataset from it.
fine_tuning_data = dict(text=train_prompts)
fine_tuning_dataset = Dataset.from_dict(fine_tuning_data)
fine_tuning_dataset

In [None]:
print(tokenizer.__class__.__name__)

def tokenize_function(examples, max_length=512):
    """Tokenize the "text" field of a batch, padded and truncated to a fixed length.

    Args:
        examples: Batch mapping with a "text" entry holding the prompts.
        max_length: Length every sequence is padded/truncated to (default 512,
            the value that was previously hard-coded).

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)


tokenized_fine_tuning_dataset = fine_tuning_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs, as torch tensors.
tokenized_fine_tuning_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Hold out one third of the samples for evaluation. The fixed seed makes the
# split reproducible, consistent with the seeded sampling of the prompts.
train_test_split = tokenized_fine_tuning_dataset.train_test_split(test_size=(1/3), seed=0)
train_fine_tuning_dataset = train_test_split["train"]
eval_fine_tuning_dataset = train_test_split["test"]


In [31]:
from transformers import DataCollatorForLanguageModeling

# mlm=False selects causal language modeling instead of masked language modeling.
causal_lm_collator_options = dict(tokenizer=tokenizer, mlm=False)
data_collator = DataCollatorForLanguageModeling(**causal_lm_collator_options)

In [None]:
from transformers import Trainer, TrainingArguments

# Small CPU-only run: two epochs, batch size 1, with evaluation, checkpointing
# and logging once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    num_train_epochs=2,
    # A checkpoint of the full model is written to output_dir after every epoch.
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="./logs",
    # `use_cpu` replaces the deprecated `no_cuda` flag in the transformers
    # releases that accept `eval_strategy`.
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_fine_tuning_dataset,
    eval_dataset=eval_fine_tuning_dataset,
)

In [33]:
import gc

# Drop the large intermediate frames before training. Popping the names from
# the namespace (instead of `del`) keeps the cell re-runnable: running it a
# second time no longer raises NameError.
for _name in (
    "train_data",
    "eval_data",
    "eval_instructions",
    "train_instructions",
    "node_text_features",
    "node_idx_to_paper_id",
    "text_features",
):
    globals().pop(_name, None)
del _name

# NOTE(review): frames that were displayed as a cell result may still be
# referenced by IPython's output cache (`Out`) and are not freed by this.
gc.collect();

In [None]:
# Run the training loop with the Trainer configured above.
trainer.train()

In [None]:
# Emitted only when the preceding training cell finished without raising.
logging.info("Training ran successfully")