In [None]:
!pip install transformers
!pip install trl
!pip install datasets
!pip install pandas
!pip install scikit-learn

In [1]:
import sys
import io
import json
import os
import random
import numpy as np
import torch
import pandas as pd
import csv
from textwrap import dedent
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
# Switch off Weights & Biases reporting for the training run.
# NOTE(review): the warning recorded when the trainer is created says this
# variable is deprecated in favour of the `report_to` option -- consider
# migrating once the desired set of logging integrations is confirmed.
os.environ["WANDB_DISABLED"] = "true"

2025-02-10 08:04:41.100423: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Model to load and fine-tune: set this to the path of a local model
# directory or to a Hugging Face Hub model ID.

model_name = "path-to-model"
#model_name = "meta-llama/Llama-3.1-8B-Instruct"

In [3]:
# Handle for the CUDA device.
# NOTE(review): `device` is not referenced again in the cells visible here;
# the model is placed via device_map="auto" when it is loaded.
device = torch.device("cuda")

In [4]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Seed handed unchanged to each library's seeding function.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)


# Fix the global random state once for the notebook.
seed_everything(0)

In [5]:
# 4-bit quantization makes the model smaller and lets it fit on most GPUs

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
# Load tokenizer and model

# `device_map` is a model-loading option; the tokenizer has no weights to
# place on a device, so the argument is passed to the model only.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The model is loaded with the 4-bit settings from `quantization_config`
# and dispatched across the available devices automatically.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [7]:
# Register a dedicated padding token and pad sequences on the right

PAD_TOKEN = "<|pad|>"

tokenizer.padding_side = "right"
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})

# Resize the embedding matrix to the new vocabulary size (padded to a multiple of 8)
model.resize_token_embeddings(
    len(tokenizer),
    pad_to_multiple_of=8,
)

# Show the padding token and the id it was assigned
print(tokenizer.pad_token, tokenizer.pad_token_id)

<|pad|> 128256


In [8]:
# Create prompt template

def format_example(row: dict):
    """Render one labelled example as a single chat-formatted training string.

    Args:
        row: Mapping-like record with an 'annotation' entry (the text snippet)
            and a 'tag' entry (its event label). The notebook applies this
            function row-wise to the loaded DataFrame.

    Returns:
        The result of `tokenizer.apply_chat_template(..., tokenize=False)` for
        the system / user / assistant conversation built from the row.
    """
    # Build the user turn directly instead of running `dedent` over an
    # f-string: dedent is applied *after* interpolation, so an annotation that
    # contains line breaks or leading whitespace would change how much
    # indentation is stripped (or prevent stripping altogether). For a
    # single-line annotation without leading whitespace this produces exactly
    # the string the dedent-based version produced: newline, text, newline.
    prompt = f"\n{row['annotation']}\n"
    messages = [
        {"role": "system",
         "content": "Provide an event label for the following text snippet! Do not output anything else!!!"},
        {"role": "user", "content": prompt},
        # The gold label is the assistant turn the model is trained to produce.
        {"role": "assistant", "content": row['tag']}
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)

In [9]:
# Load data

# Data from https://doi.org/10.5334/johd.83
# Keep only the snippet and its label, then add the chat-formatted training
# string for every row as a third column.
df = pd.read_csv('./events.csv', sep=',')[['annotation', 'tag']]
df = df.assign(text=df.apply(format_example, axis=1))
df

Unnamed: 0,annotation,tag,text
0,In Front des schon seit Kurfürst Georg Wilhelm...,stative_event,<|begin_of_text|><|start_header_id|>system<|en...
1,während nach der Park- und Gartenseite hin ein...,stative_event,<|begin_of_text|><|start_header_id|>system<|en...
2,"Einige zwanzig Schritte weiter, in Richtung un...",stative_event,<|begin_of_text|><|start_header_id|>system<|en...
3,hinter der der Hohen-Cremmener Schindelturm mi...,stative_event,<|begin_of_text|><|start_header_id|>system<|en...
4,"Fronthaus, Seitenflügel und Kirchhofsmauer bil...",stative_event,<|begin_of_text|><|start_header_id|>system<|en...
...,...,...,...
6661,der bei diesen Worten aufwachte,change_of_state,<|begin_of_text|><|start_header_id|>system<|en...
6662,und Briest sagte ruhig,process,<|begin_of_text|><|start_header_id|>system<|en...
6663,"Ach, Luise",non_event,<|begin_of_text|><|start_header_id|>system<|en...
6664,laß,non_event,<|begin_of_text|><|start_header_id|>system<|en...


In [10]:
# Split into train, validation, test

from sklearn.model_selection import train_test_split

# First hold out 20% of the rows, then split that held-out part again 80/20.
# NOTE(review): this yields roughly 80% train, 16% validation and 4% test --
# confirm that such an uneven validation/test split is intended.
train, temp = train_test_split(df, random_state=1, test_size=0.2)
val, test = train_test_split(temp, random_state=1, test_size=0.2)

# Write every split as JSON Lines (one record per line), ready for training
for frame, path in ((train, "train.json"), (val, "val.json"), (test, "test.json")):
    frame.to_json(path, orient="records", lines=True)

In [11]:
# Transform to Huggingface-style dataset

from datasets import load_dataset

# Split name -> JSON Lines file written by the splitting cell
split_files = {
    "train": "train.json",
    "validation": "val.json",
    "test": "test.json",
}
dataset = load_dataset("json", data_files=split_files)

# Show the first formatted training example as a sanity check
print(dataset["train"][0]["text"])

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Give a value for the following text snippet! The only possible values are: 'Rolle', 'Charakter', 'Alter', 'Physiognomie', 'Kleidung'. Do not output anything other than one of these values!!!<|eot_id|><|start_header_id|>user<|end_header_id|>

Innstetten war ernsthaft gewillt, auf das stille Leben ein  gesellschaftlich angeregteres folgen zu lassen, um seinet- und noch mehr um Effi's willen<|eot_id|><|start_header_id|>assistant<|end_header_id|>

stative_event<|eot_id|>


In [12]:
# Create data collator

from trl import DataCollatorForCompletionOnlyLM

# Marker that separates the prompt from the completion the loss is computed on.
# A bare "<|end_header_id|>" also closes the system and user headers of every
# example, so picking out the assistant turn would depend on which occurrence
# the collator matches. The full assistant header occurs once per example
# and ends at the same position, which makes the match unambiguous.
response_template = "<|start_header_id|>assistant<|end_header_id|>"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [None]:
# get target module names for LoRA
# `torch.nn.Linear` is spelled out because `nn` is never imported in this
# notebook (the original check raised NameError). The ".layers." filter still
# matches "decoder.layers." names and additionally matches block names of the
# form "model.layers.N....".
# NOTE(review): confirm the naming scheme against model.named_modules() for
# the checkpoint that is actually loaded.
target_names = [
    name
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear) and ".layers." in name
]
target_names

In [13]:
# Configure LoRA

from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training
)

# Adapters are attached to the attention projections and the MLP projections
# listed in `target_modules`.
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj"
    ],
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.CAUSAL_LM
)

# NOTE(review): `model` is rebound here, so this cell is presumably not safe
# to run twice on the same kernel without reloading the model -- verify.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# print_trainable_parameters() prints the summary itself and returns None;
# wrapping it in print() only added a stray "None" line to the cell output.
model.print_trainable_parameters()

trainable params: 83,886,080 || all params: 8,114,212,864 || trainable%: 1.0338166055782685
None


In [14]:
# Configure fine tuning

from trl import SFTConfig, SFTTrainer

OUTPUT_DIR = "events"

sft_config = SFTConfig(
    # --- where results go / reproducibility ---
    output_dir=OUTPUT_DIR,
    seed=1,
    # --- data handling: the chat-formatted string lives in the 'text' column ---
    dataset_text_field="text",
    max_seq_length=4096,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
    # --- batching: 8 sequences x 4 accumulation steps per device and update ---
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # --- optimisation ---
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    fp16=True,
    # --- evaluation, logging and checkpoints ---
    # eval_steps / save_steps are given as 0.2; in the recorded run this
    # corresponds to an evaluation every 34 of 166 steps.
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=10,
    save_strategy="steps",
    save_steps=0.2,
    save_total_limit=2,
    save_safetensors=True,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=sft_config,
    data_collator=collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map:   0%|          | 0/5332 [00:00<?, ? examples/s]

Map:   0%|          | 0/1067 [00:00<?, ? examples/s]

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [15]:
# Start fine tuning

# Runs the training configured by the trainer cell; the returned TrainOutput
# (global step, training loss, run metrics) is shown as the cell result.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
34,0.3695,0.303238
68,0.2655,0.223762
102,0.2194,0.201577
136,0.1881,0.192676


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


TrainOutput(global_step=166, training_loss=0.7382727344352079, metrics={'train_runtime': 1185.991, 'train_samples_per_second': 4.496, 'train_steps_per_second': 0.14, 'total_flos': 2.678691772465152e+16, 'train_loss': 0.7382727344352079, 'epoch': 0.9955022488755623})