# Import libraries

In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import time
import pandas as pd
#import torch

from huggingface_hub import login
from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from peft import PeftModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Set Paths and Hyperparameters

In [None]:
# Base path: five directory levels above the current working directory
base_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..', '..', '..'))

# Name of the checkpoint directory holding the best fine-tuned model
# TODO: update once the final best checkpoint has been selected
best_model = "epoch_2"

# Source and target language (these strings are also the dataset column names)
source_language = "English"
target_language = "Early Modern Bohemian German"

# Translation direction, derived from the source language
translation_direction = "DE_to_EN" if source_language == "Early Modern Bohemian German" else "EN_to_DE"

# Model parameters
unsloth_model_name = 'unsloth/gemma-2-2b-it-bnb-4bit'
company_name = 'alphabet'

model_name = unsloth_model_name.split('/')[1]   # Repository name without the organisation prefix
max_new_tokens = 2000           # Maximum number of tokens generated per translation
max_seq_length = 5000           # Maximum sequence length (in tokens) passed to the model loader
dtype = None                    # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True             # Use 4bit quantization to reduce memory usage. Can be False.

# Directory from which the LoRA adapters are loaded
load_lora_model_path = os.path.join(
    base_path,
    'models',
    company_name,
    model_name,
    'finetuning',
    translation_direction,
    best_model,
)

# File to which the inference results are written
save_path = os.path.join(
    base_path,
    'results',
    company_name,
    model_name,
    'finetuning',
    translation_direction,
    f'{translation_direction}_finetuning.json',
)

# Print paths
print(f'Company name: {company_name}')
print(f'Model name: {model_name}')
print(f'Base path: {base_path}')
print(f'Translation direction: {translation_direction}')
print(f'LoRA model path: {load_lora_model_path}')
print(f'Save finetuning inference dataset path: {save_path}')

# Hugging Face login
# The access token is taken from the HF_TOKEN environment variable so that it is never
# stored in the notebook. When the variable is unset, None is passed to `login`
# (NOTE(review): per the huggingface_hub docs this triggers the interactive prompt - confirm).
hub_token = os.environ.get("HF_TOKEN")
login(hub_token, add_to_git_credential=True)

Company name: alphabet
Model name: gemma-2-2b-it-bnb-4bit
Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Translation direction: EN_to_DE
LoRA model path: /cs/student/msc/csml/2023/ngriessh/historical_mt/models/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/EN_to_DE/epoch_2
Save finetuning inference dataset path: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/EN_to_DE/EN_to_DE_finetuning.json
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /cs/student/msc/csml/2023/ngriessh/.cache/huggingface/token
Login successful


# Load Test Dataset

In [3]:
# Load the dataset and keep its 'test' split, which is the data translated in this notebook
dataset = load_dataset('niclasgriesshaber/EarlyModernGerman_to_EN_finetuning')
test_dataset = dataset['test']
print(f'Loaded test dataset: \n {test_dataset}')

Loaded test dataset: 
 Dataset({
    features: ['Early Modern Bohemian German', 'English'],
    num_rows: 1000
})


# Prompt template

In [4]:
# Prompt skeleton with four positional placeholders, filled in this order:
# source language, target language, source text, translation (left empty at inference time)
prompt_template = (
    "Translate the following from {} to {}:\n"
    "\n"
    "### Input\n"
    "{}\n"
    "\n"
    "### Translation\n"
    "{}"
)

In [5]:
# Apply prompt template to test dataset
def formatting_prompts_func(examples, source_language, target_language):
    """Build one inference prompt per source text in a batch.

    Args:
        examples: Batch mapping; ``examples[source_language]`` is iterated to
            obtain the texts to translate.
        source_language: Name of the source language; used both as the key into
            ``examples`` and as the first value filled into the prompt.
        target_language: Name of the target language filled into the prompt.

    Returns:
        A dict with the single key ``"text"`` holding the formatted prompts, in
        the same order as the source texts. The translation slot of the
        template is filled with an empty string.
    """
    prompts = [
        prompt_template.format(source_language, target_language, sentence, "")
        for sentence in examples[source_language]
    ]
    return {"text": prompts}

# Apply Prompt Template to Test Dataset

In [6]:
# Format every test sample in batches; the language names are forwarded to the
# formatting function as keyword arguments
test_dataset = test_dataset.map(
    formatting_prompts_func,
    batched=True,
    fn_kwargs={
        "source_language": source_language,
        "target_language": target_language,
    },
)

In [7]:
# Write the first formatted prompt to a text file so it can be checked by hand.
# The encoding is set explicitly so that non-ASCII characters in the prompt are
# written the same way regardless of the platform's default encoding.
with open('inference_prompt_check', "w", encoding="utf-8") as f:
    f.write(test_dataset['text'][0])

# Load Model

In [8]:
# Load the base model and its tokenizer through Unsloth, using the model name,
# sequence length, dtype and 4-bit setting defined in the configuration cell
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=unsloth_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.9.post4: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.575 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


# Load LoRA adapters

In [9]:
# Load LoRA adapters from Peft
# Fail early with a clear message when the checkpoint directory is missing,
# instead of relying on whatever error the adapter loader raises for a bad path.
if not os.path.isdir(load_lora_model_path):
    raise FileNotFoundError(f'LoRA adapter directory not found: {load_lora_model_path}')

print('Loading LoRA adapters')
# Wrap the base model with the fine-tuned adapters stored under load_lora_model_path
model = PeftModel.from_pretrained(model, load_lora_model_path)
print('LoRA adapters loaded successfully')
print(f'Lora model path under: {load_lora_model_path}')

Loading LoRA adapaters
LoRA adapters loaded successfully
Lora model path under: /cs/student/msc/csml/2023/ngriessh/historical_mt/models/alphabet/gemma-2-2b-it-bnb-4bit/finetuning/EN_to_DE/epoch_2


# Inference

In [10]:
# Set model to inference mode via Unsloth. As the last expression of the cell,
# the call's return value is also displayed as the cell output.
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): 

In [11]:
# Save inferences in new dataset
def _extract_translation(output, search_string="### Translation"):
    """Return the text that follows the first ``search_string`` in ``output``.

    The special tokens ``<end_of_turn>`` and ``<eos>`` are removed and the
    result is stripped of surrounding whitespace. If ``search_string`` does not
    occur in ``output``, the string ``'NA'`` is returned.
    """
    start_idx = output.find(search_string)
    if start_idx == -1:
        return 'NA'

    extracted_text = output[start_idx + len(search_string):]

    # Remove the special tokens left in the decoded text (decoding keeps them
    # because skip_special_tokens=False is used by the caller)
    extracted_text = extracted_text.replace("<end_of_turn>", "").strip()
    extracted_text = extracted_text.replace("<eos>", "").strip()
    return extracted_text


def process_dataset(test_dataset, model, tokenizer, max_new_tokens, translation_direction):
    """Translate every prompt in ``test_dataset`` and collect the results.

    Args:
        test_dataset: Data convertible with ``pd.DataFrame(...)``; must provide
            a ``'text'`` column containing the formatted prompts.
        model: Object whose ``generate`` method produces the output sequences.
        tokenizer: Callable that encodes a list of prompts and offers
            ``batch_decode`` for turning the generated sequences into text.
        max_new_tokens: Upper bound on generated tokens, passed to ``generate``.
        translation_direction: Prefix of the result column, which is named
            ``f'{translation_direction}_finetuning'``.

    Returns:
        A pandas DataFrame with the input columns plus the result column. Rows
        whose inference raised an exception keep a missing value in that
        column; the error is printed and processing continues with the next row.
    """
    output_column = f'{translation_direction}_finetuning'

    # Convert Hugging Face dataset to Pandas DataFrame
    df = pd.DataFrame(test_dataset)

    # Create the result column up front so that it exists even when every row
    # fails, and so that failed rows hold an explicit missing value
    df[output_column] = None

    total_time = 0  # Accumulated wall-clock time over all rows

    # Loop through each row in the dataframe
    for i, row in df.iterrows():
        start_time = time.time()  # Start timer for the current inference

        try:
            print(f"Processing test point {i + 1} of {len(df)}")

            # Tokenize the prompt of the current row and move it to the GPU
            inputs = tokenizer([row['text']], return_tensors="pt").to("cuda")

            # Generate the model outputs
            # NOTE(review): per the transformers generation docs, temperature and
            # top_p only take effect when sampling is enabled (do_sample=True),
            # which is not passed here - confirm that this is intended.
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                use_cache=True,
                repetition_penalty=1.3,
                temperature=0.7,
                top_p=0.5,
            )

            # Decode the outputs, converting from token IDs back to text
            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)

            # Only one prompt was passed in, so take the single decoded output
            output = decoded_outputs[0]

            # Keep only what follows the translation marker of the prompt
            extracted_text = _extract_translation(output)

            # Print the output and extracted text
            print('_________________________________________________')
            print(output)
            print('_________________________________________________')
            print(extracted_text)
            print('_________________________________________________')

            # Add extracted_text directly to the dataframe at index i
            df.at[i, output_column] = extracted_text

        except Exception as e:
            # Best effort: report the failure and continue with the next row
            print(f"Error occurred at test point {i + 1}: {str(e)}")

        end_time = time.time()  # End timer for the current inference
        elapsed_time = end_time - start_time  # Calculate elapsed time
        total_time += elapsed_time  # Accumulate total time

        print(f"Time for test point {i + 1}: {elapsed_time:.2f} seconds, Total time: {total_time:.2f} seconds")

    return df

In [12]:
# Inspect the tokenizer's end-of-sequence token
tokenizer.eos_token

'<eos>'

In [13]:
# Make sure the results directory exists before the long-running inference
# starts, so that the run cannot be lost to a missing folder when saving
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Run inference over the whole test dataset
processed_dataset = process_dataset(test_dataset, model, tokenizer, max_new_tokens, translation_direction)

# Save the dataset as a JSON Lines file (one record per line, non-ASCII characters kept as-is)
processed_dataset.to_json(save_path, orient='records', lines=True, force_ascii=False)
print(f"Dataset saved successfully to {save_path}")

Processing test point 1 of 1000


AUTOTUNE bmm(8x66x256, 8x256x66)
  triton_bmm_1 0.0072 ms 100.0%
  bmm 0.0080 ms 90.0%
  triton_bmm_2 0.0092 ms 77.8%
  triton_bmm_7 0.0102 ms 70.0%
  triton_bmm_3 0.0113 ms 63.6%
  triton_bmm_4 0.0113 ms 63.6%
  triton_bmm_6 0.0113 ms 63.6%
  triton_bmm_5 0.0143 ms 50.0%
  triton_bmm_10 0.0143 ms 50.0%
  triton_bmm_9 0.0154 ms 46.7%
SingleProcess AUTOTUNE benchmarking takes 3.9026 seconds and 0.0056 seconds precompiling
AUTOTUNE bmm(8x66x66, 8x66x256)
  bmm 0.0063 ms 100.0%
  triton_bmm_21 0.0072 ms 87.9%
  triton_bmm_22 0.0072 ms 87.9%
  triton_bmm_26 0.0072 ms 87.9%
  triton_bmm_19 0.0082 ms 77.0%
  triton_bmm_23 0.0082 ms 77.0%
  triton_bmm_24 0.0082 ms 77.0%
  triton_bmm_25 0.0082 ms 77.0%
  triton_bmm_33 0.0089 ms 70.9%
  triton_bmm_20 0.0092 ms 68.4%
SingleProcess AUTOTUNE benchmarking takes 3.7910 seconds and 0.0008 seconds precompiling


_________________________________________________
<bos>Translate the following from English to Early Modern Bohemian German:

### Input
Since they did not split the rafting-wood, they shall for this reason split it within 14 days of today and without any delay on pain of 1 Weiss Schocks, signed 3 Oct 1588

### Translation
Demnach sie das floßholz nit gespalten haben sollen solches von dato in.14 tagen vnd ohn allen verzug spalten bey der Peen..1 weiss sso Signatum den...9 Octobris Ao p 88<eos>
_________________________________________________
Demnach sie das floßholz nit gespalten haben sollen solches von dato in.14 tagen vnd ohn allen verzug spalten bey der Peen..1 weiss sso Signatum den...9 Octobris Ao p 88
_________________________________________________
Time for test point 1: 11.68 seconds, Total time: 11.68 seconds
Processing test point 2 of 1000
_________________________________________________
<bos>Translate the following from English to Early Modern Bohemian German:

### Input