# Import libraries

In [1]:
import os
import torch

from huggingface_hub import login
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments, TrainerCallback
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Set Paths and Hyperparameters

In [2]:
# Base path
base_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..', '..', '..'))

# Source and target language
source_language = "English"
target_language = "Early Modern Bohemian German"

# Translation direction
translation_direction = "DE_to_EN" if source_language == "Early Modern Bohemian German" else "EN_to_DE"

# Model parameters
unsloth_model_name = 'unsloth/gemma-2-9b-it-bnb-4bit'
company_name = 'alphabet'

model_name = unsloth_model_name.split('/')[1]
max_new_tokens = 2000           # Maximum number of model output
max_seq_length = 5000           # Maximum of input tokens
dtype = None                    # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True             # Use 4bit quantization to reduce memory usage. Can be False.

# Model output path
model_output_path = os.path.join(
    base_path, 
    'models', 
    company_name, 
    model_name,
    'finetuning',
    translation_direction)

# Finetuning prompts path
finetuning_prompts_path = os.path.join(
    base_path, 
    'data', 
    'icl_prompts',
    company_name,
    model_name,
    translation_direction, 
    'finetuning_prompt_check.txt'
)

# Print paths
print(f'Company name: {company_name}')
print(f'Model name: {model_name}')
print(f'Base path: {base_path}')
print(f'Translation direction: {translation_direction}')
print(f'Finetuning prompts path: {finetuning_prompts_path}')
print(f'Model output path: {model_output_path}')

# Hugging face login
hub_token = "hf_..."
login(hub_token, add_to_git_credential=True)

Company name: alphabet
Model name: gemma-2-9b-it-bnb-4bit
Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Translation direction: EN_to_DE
Finetuning prompts path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_prompts/alphabet/gemma-2-9b-it-bnb-4bit/EN_to_DE/finetuning_prompt_check.txt
Model output path: /cs/student/msc/csml/2023/ngriessh/historical_mt/models/alphabet/gemma-2-9b-it-bnb-4bit/finetuning/EN_to_DE
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /cs/student/msc/csml/2023/ngriessh/.cache/huggingface/token
Login successful


# Load Training and Validation Dataset

In [3]:
# Data preparation
dataset = load_dataset("niclasgriesshaber/EarlyModernGerman_to_EN_finetuning")
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]

In [4]:
train_dataset

Dataset({
    features: ['Early Modern Bohemian German', 'English'],
    num_rows: 2429
})

In [5]:
validation_dataset

Dataset({
    features: ['Early Modern Bohemian German', 'English'],
    num_rows: 269
})

# Prompt template

In [6]:
prompt_template = """Translate the following from {} to {}:

### Input
{}

### Translation
{}"""

In [7]:
# Apply prompt template
def formatting_prompts_func(examples, source_language, target_language):

    source_texts = examples[source_language]
    target_texts = examples[target_language]
    texts = []

    for source_text, target_text in zip(source_texts, target_texts):
        # Format the prompt with dynamic source and target languages
        text = prompt_template.format(
            source_language, 
            target_language,
            source_text,
            target_text
        ) + '<eos>'
        texts.append(text)

    return {"text": texts}

# Apply Prompt Template to Validation and Test Dataset

In [8]:
# Apply prompt template to train dataset
train_dataset = train_dataset.map(
    lambda examples: formatting_prompts_func(examples, source_language, target_language),
    batched=True
)

# Apply prompte template to validation dataset
validation_dataset = validation_dataset.map(
    lambda examples: formatting_prompts_func(examples, source_language, target_language),
    batched=True
)

In [9]:
# Output a text file to check prompt
with open(finetuning_prompts_path, "w") as f:
    f.write(validation_dataset['text'][0])

# Load Model

In [10]:
# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = unsloth_model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.9.post4: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.575 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


# PEFT

In [11]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.9.post4 patched 42 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# Finetuning

In [12]:
# Custom Callback to Save LoRA Parameters and Track Validation Loss after Each Epoch
class SaveLoRAPeftCallback(TrainerCallback):

    def on_epoch_end(self, args, state, control, **kwargs):

        # Get epoch
        num_epoch = round(state.epoch)

        # Print end of epoch
        print(f'End of epoch {num_epoch}')

        # Evaluate the model on the validation dataset
        eval_results = trainer.evaluate()

        # Extract the validation loss
        validation_loss = eval_results["eval_loss"]

        # Print the validation loss
        print(f"Validation Loss after epoch {num_epoch}: {validation_loss}")

        # Save the validation loss to a file
        with open("validation_loss.txt", "a") as f:
            f.write(f"Validation Loss after epoch {num_epoch}: {validation_loss}\n")

        # Create folder for each epoch
        output_dir = os.path.join(args.output_dir, f"epoch_{num_epoch}")
        os.makedirs(output_dir)

        # Print saving LoRA parameters
        print(f"Saving LoRA parameters to {output_dir} at the end of epoch {round(state.epoch)}")
        
        # Save only the LoRA adapters
        kwargs['model'].save_pretrained(output_dir)

        print('Saved. Emptying torch.cuda.empty_cache() now.')
        torch.cuda.empty_cache()

# Initialize the trainer with the callback
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        eval_strategy="epoch",
        save_strategy="no",
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir=model_output_path,
        seed=0,
        log_level="info",
        report_to="none",
    ),
    callbacks=[SaveLoRAPeftCallback()],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map (num_proc=2):   0%|          | 0/2429 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/269 [00:00<?, ? examples/s]

Using auto half precision backend


In [13]:
# Run evaluation at the beginning of the first epoch
print('Evaluation at the beginning of epoch 1')
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 269
  Batch size = 1


Evaluation at the beginning of epoch 1


{'eval_loss': 4.2469868659973145,
 'eval_model_preparation_time': 0.0056,
 'eval_runtime': 50.6026,
 'eval_samples_per_second': 5.316,
 'eval_steps_per_second': 5.316}

In [14]:
# Start finetuning
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,429 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 3,035
 "-____-"     Number of trainable parameters = 108,036,096


Epoch,Training Loss,Validation Loss,Model Preparation Time
0,1.7982,2.038273,0.0056
1,1.5674,1.969005,0.0056



***** Running Evaluation *****
  Num examples = 269
  Batch size = 1


End of epoch 1
Validation Loss after epoch 1: 2.038273334503174
Saving LoRA parameters to /cs/student/msc/csml/2023/ngriessh/historical_mt/models/alphabet/gemma-2-9b-it-bnb-4bit/finetuning/EN_to_DE/epoch_1 at the end of epoch 1


loading configuration file config.json from cache at /cs/student/msc/csml/2023/ngriessh/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/c7297a426ef0190c1213fbdd7958d7fb6d482bbe/config.json
Model config Gemma2Config {
  "_name_or_path": "unsloth/gemma-2-9b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "b

Saved. Emptying torch.cuda.empty_cache() now.



***** Running Evaluation *****
  Num examples = 269
  Batch size = 1


End of epoch 2
Validation Loss after epoch 2: 1.9690049886703491
Saving LoRA parameters to /cs/student/msc/csml/2023/ngriessh/historical_mt/models/alphabet/gemma-2-9b-it-bnb-4bit/finetuning/EN_to_DE/epoch_2 at the end of epoch 2


loading configuration file config.json from cache at /cs/student/msc/csml/2023/ngriessh/.cache/huggingface/hub/models--unsloth--gemma-2-9b-it-bnb-4bit/snapshots/c7297a426ef0190c1213fbdd7958d7fb6d482bbe/config.json
Model config Gemma2Config {
  "_name_or_path": "unsloth/gemma-2-9b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 16,
  "num_hidden_layers": 42,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "b

Saved. Emptying torch.cuda.empty_cache() now.



***** Running Evaluation *****
  Num examples = 269
  Batch size = 1


End of epoch 3


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.65 GiB. GPU 0 has a total capacity of 23.57 GiB of which 2.57 GiB is free. Including non-PyTorch memory, this process has 11.37 GiB memory in use. Process 166437 has 9.51 GiB memory in use. Of the allocated memory 10.89 GiB is allocated by PyTorch, and 162.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)