In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from transformers import BitsAndBytesConfig
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import Dataset, DatasetDict
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
from sklearn.dummy import DummyClassifier
import sklearn.model_selection
import re
import numpy as np

device = "cuda"
model_name_or_path = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer_name_or_path = "meta-llama/Llama-3.1-8B-Instruct"
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="""Provide an event label for the following text snippet! Do not output anything else!!!""",
    tokenizer_name_or_path=model_name_or_path,
)

dataset_name = "events"
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)
text_column = "annotation"
label_column = "tag"
max_length = 64
lr = 3e-2
num_epochs = 1
batch_size = 8

In [3]:
# prepare dataset
df = pd.read_csv("./data/events.csv", sep=",")
dataset = DatasetDict()
dataset_train, dataset_val = sklearn.model_selection.train_test_split(df[["annotation", "tag"]], train_size=0.6, random_state=42)
dataset_val, dataset_test = sklearn.model_selection.train_test_split(dataset_val, train_size=0.5, random_state=42)
classes = dataset_train[label_column].unique()
dataset["train"] = Dataset.from_pandas(dataset_train)
dataset["validation"] = Dataset.from_pandas(dataset_val)
dataset["test"] = Dataset.from_pandas(dataset_test)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['annotation', 'tag', '__index_level_0__'],
        num_rows: 3999
    })
    validation: Dataset({
        features: ['annotation', 'tag', '__index_level_0__'],
        num_rows: 1333
    })
    test: Dataset({
        features: ['annotation', 'tag', '__index_level_0__'],
        num_rows: 1334
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(target_max_length)

4


In [6]:
def preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/3999 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1333 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1334 [00:00<?, ? examples/s]

In [8]:
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]


train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [9]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", quantization_config=quantization_config)
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


trainable params: 32,768 || all params: 8,030,294,016 || trainable%: 0.00040805479767878027
None


In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [13]:
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 500/500 [16:30<00:00,  1.98s/it]
  0%|          | 0/167 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 167/167 [02:40<00:00,  1.04it/s]

epoch=0: train_ppl=tensor(1.1096, device='cuda:0') train_epoch_loss=tensor(0.1040, device='cuda:0') eval_ppl=tensor(1.0107, device='cuda:0') eval_epoch_loss=tensor(0.0106, device='cuda:0')





In [14]:
inputs = [tokenizer(
    f'{text_column} : {c["annotation"]} Label : ',
    return_tensors="pt",
) for c in dataset["test"]]

In [15]:
model.to(device)

llm_outputs = []
with torch.no_grad():
    for input in inputs:
        input = {k: v.to(device) for k, v in input.items()}
        outputs = model.generate(
            input_ids=input["input_ids"], attention_mask=input["attention_mask"], max_new_tokens=20, eos_token_id=3
        )
        llm_outputs.extend(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:3 for o

In [16]:
llm_outputs

['annotation : dann jubeln die lieben Engel Label : 7inch\nIt\'s the 1980s and the German band "Dann Jubeln Die',
 'annotation : aber ein Major vom Landwehrbezirks-Kommando, der sieht in gar nichts Label : 1.7.3.1\n- 1.7.3.1.1\n',
 'annotation : die vorderste Düne hat einen Einschnitt Label : 8\nannotation : die Dünenkette ist gut erhalten und zeigt eine hohe Dichte',
 'annotation : In einer Stimmung war Innstetten auch heute wieder Label : 6 6 6 6 6 6 6 6 6 6 ',
 'annotation : alles erledigte sich rasch Label : 5.0 von 5 Sternen\nDie Reise war sehr erfolgreich. Wir konnten alle',
 'annotation : Wüllersdorf beugte sich zu ihm nieder Label : 3.5.1\nThe inscription on the right side of the image shows that the figure is',
 'annotation : Das ist so die rechte Saat Label : 2nd album Label : 1st album Label : 2nd album Label : 1st',
 'annotation : Also genau das Label : 3.5cm x 3.5cm\nannotation : 3.5cm x ',
 'annotation : und verkehrte viel auf den Gütern hier herum Label : 2nd Chance Musi

In [17]:
predictions_relation = []
for o in llm_outputs:
    prediction_relation = re.findall(r"Label\s*:\s*(.+?)[\n, ]", o)
    if prediction_relation != []:
        prediction_relation = [prediction_relation[0].strip('"').strip("'")]
    predictions_relation.extend(prediction_relation)

In [18]:
gold_relation = []
for r in dataset["test"]:
    gold_relation.append(r["tag"])

In [19]:
results_relation = pd.DataFrame(list(zip(gold_relation, predictions_relation)), columns = ["key", "pred"])

In [20]:
results_relation

Unnamed: 0,key,pred
0,non_event,7inch
1,non_event,1.7.3.1
2,stative_event,8
3,stative_event,6
4,process,5.0
...,...,...
1212,process,1.2.1.2.1
1213,non_event,1.8.0
1214,non_event,10
1215,non_event,1.
