# Downloading text features for graph

In [1]:
from pathlib import Path


# The notebook is expected to be run from a sub-directory of the project, so
# the project root is the parent of the current working directory. It is kept
# as a plain string rather than a Path object.
_notebook_dir = Path.cwd()
PROJECT_ROOT_DIR = str(_notebook_dir.parent)
PROJECT_ROOT_DIR

'/root/llm_graph_embedder'

# Exploring the Gemma 2 2B-it model

In [28]:
import torch

# Use the GPU when one is visible to PyTorch and fall back to the CPU
# otherwise, so the notebook does not fail outright on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; run this before the `from_pretrained`
# calls further down.
# NOTE(review): presumably needed because the Gemma checkpoint is
# access-restricted on the Hub — confirm.
notebook_login()

In [None]:
%%time

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel

# Single source of truth for the checkpoint, so the tokenizer and the model
# cannot silently drift apart.
MODEL_ID = "google/gemma-2-2b-it"

# A tokenizer holds no weights, so `device_map` has no meaning for it and is
# not passed here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load the causal-LM weights straight onto the device chosen earlier in the
# notebook instead of hard-coding "cuda" a second time.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map=device)

In [None]:
# Make sure the model lives on `device`. The load call above already passes a
# `device_map`, so this is expected to be a no-op safeguard.
# NOTE(review): confirm it is still needed. As the last expression of the
# cell, the returned model is also displayed as the cell output.
model.to(device)

In [None]:
%%time

# Prompt in the same "<human>: ... / <gpt>: ..." layout that is used for the
# fine-tuning samples later in the notebook; the answer is left open so the
# model continues it.
input_text = """
<human>: List all 40 sub-categories of the 'Computer Science' category in the ArXiv dataset.
<gpt>: Sure! Here are all the 40 sub-categories of the 'Computer Science' category in the ArXiv dataset:
"""

# Encode the prompt as PyTorch tensors.
tokenized_input = tokenizer(input_text, return_tensors="pt")

# Everything handed to the model has to live on `device`. The attention mask
# is treated as optional and is only moved when the tokenizer produced one.
input_ids = tokenized_input["input_ids"].to(device)
attention_mask = tokenized_input.get("attention_mask")
attention_mask = None if attention_mask is None else attention_mask.to(device)

# Let the model continue the prompt, capped at 1024 newly generated tokens.
generation_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
outputs = model.generate(**generation_inputs, max_new_tokens=1024)

In [None]:
# Decode the first sequence of the generated batch (only one prompt was
# tokenized above) back into text and print it. No `skip_special_tokens`
# argument is passed to `decode`.
# NOTE(review): for a causal LM the generated ids presumably contain the
# prompt followed by the continuation, so the prompt is printed too — confirm.
print(tokenizer.decode(outputs[0]))

# Fine-tune Gemma on a couple of samples

In [34]:
import logging


# Configure the root logger at DEBUG level so that the `logging.info(...)`
# call at the end of the notebook is emitted. Because this is the root
# logger, DEBUG output from imported libraries is shown as well.
logging.basicConfig(level=logging.DEBUG)

In [None]:
# One training sample = the human turn and the gpt turn of a conversation,
# separated by a blank line.
train_prompt_template = """<human>: {}\n\n<gpt>: {}"""

# Draw three conversations with a fixed random state so the same rows are
# picked on every run.
sampled_rows = train_data.sample(n=3, random_state=0)
train_prompts_dicts = sampled_rows["formatted_conversations"].tolist()

# Each conversation is indexed as [human_turn, gpt_turn, ...]; only the
# "value" field of the first two turns is rendered into the template.
train_prompts = [
    train_prompt_template.format(turns[0]["value"], turns[1]["value"])
    for turns in train_prompts_dicts
]

train_prompts

In [None]:
from datasets import Dataset

# Column-name -> values mapping: the rendered prompts become the single
# "text" column of the dataset.
fine_tuning_data = dict(text=train_prompts)

# Wrap the mapping in a `datasets.Dataset` and display it as the cell output.
fine_tuning_dataset = Dataset.from_dict(fine_tuning_data)
fine_tuning_dataset

In [None]:
# Show which concrete tokenizer class the auto-class resolved to.
print(type(tokenizer).__name__)


def tokenize_function(examples, max_length=512):
    """Tokenize a batch of prompts into fixed-length sequences.

    Args:
        examples: Batch with a "text" entry holding the prompt strings, as
            handed over by `Dataset.map(..., batched=True)`.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded here.

    Returns:
        Whatever the notebook-level `tokenizer` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


# Tokenize all prompts in batches; the original "text" column is kept next to
# the new tokenizer outputs.
tokenized_fine_tuning_dataset = fine_tuning_dataset.map(tokenize_function, batched=True)

# Return only the two columns the model consumes, as PyTorch tensors.
tokenized_fine_tuning_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask"]
)

# Hold out one third of the samples for evaluation. The split is seeded so
# that the same samples land in train/eval on every run, matching the fixed
# `random_state` used when the prompts were sampled.
train_test_split = tokenized_fine_tuning_dataset.train_test_split(
    test_size=(1 / 3), seed=0
)
train_fine_tuning_dataset = train_test_split["train"]
eval_fine_tuning_dataset = train_test_split["test"]

In [38]:
from transformers import DataCollatorForLanguageModeling

# Collator that turns tokenized examples into training batches. With
# `mlm=False` no tokens are masked out.
# NOTE(review): per the transformers documentation the collator then builds
# `labels` from `input_ids` with padding ignored, which is what causal-LM
# fine-tuning needs — confirm against the installed version.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # Set mlm=False for causal language modeling
)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for a tiny smoke-test fine-tuning run: two epochs with
# batch size 1, evaluating, logging and checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    num_train_epochs=2,
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="./logs",
    # Train on the same device the model was loaded onto. The previous
    # `no_cuda=True` is deprecated in favour of `use_cpu` and forced CPU
    # training even though the notebook places the model on `device`.
    use_cpu=(device == "cpu"),
)

# Wire everything together: the model loaded earlier in the notebook, the
# training arguments, the causal-LM collator and the train/eval splits.
# `model` is the same object that was used for generation above, so
# training operates on that instance.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_fine_tuning_dataset,
    eval_dataset=eval_fine_tuning_dataset,
)

In [None]:
# Run the fine-tuning loop configured by `training_args`. As the last
# expression of the cell, the value returned by `train()` is shown as the
# cell output.
trainer.train()

In [None]:
# Marker in the log that the training cell above finished without raising.
logging.info("Training ran successfully")