# Import libraries

In [1]:
# Import libraries
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import time
import pandas as pd

from huggingface_hub import login
from datasets import load_dataset
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Set Paths and Hyperparameters

In [12]:
# Base path
base_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..', '..', '..'))

# Source and target language
source_language = "Early Modern Bohemian German" #"Early Modern Bohemian German"
target_language = "English"

# Translation direction
translation_direction = "DE_to_EN" if source_language == "Early Modern Bohemian German" else "EN_to_DE"

# Model parameters
unsloth_model_name = 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'
company_name = 'meta'

model_name = unsloth_model_name.split('/')[1]
max_new_tokens = 2000       # Maximum number of model output
max_seq_length = 128000     # Maximum of input tokens
dtype = None                # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True            # Use 4bit quantization to reduce memory usage. Can be False.

# Number of icl examples
shots = 128
formatted_shots = f"{shots:03}" 

# Icl examples path
icl_examples_path = os.path.join(
    base_path, 
    'data', 
    'icl_examples',
    translation_direction, 
    f'{formatted_shots}_example_prompt.txt'
)

# Icl prompts path
icl_prompts_path = os.path.join(
    base_path, 
    'data', 
    'icl_prompts',
    company_name,
    model_name,
    translation_direction, 
    f'{formatted_shots}_prompt_check.txt'
)

# Save inference dataset
save_path = os.path.join(
    base_path, 
    'results', 
    f'{company_name}',
    f'{model_name}', 
    'icl', 
    translation_direction, 
    f'{translation_direction}_{formatted_shots}_example_prompt.json'
)

# Print paths
print(f'Company name: {company_name}')
print(f'Model name: {model_name}')
print(f'Base path: {base_path}')
print(f'Translation direction: {translation_direction}')
print(f'Number of icl examples: {shots}')
print(f'ICL examples path: {icl_examples_path}')
print(f'ICL prompts path: {icl_prompts_path}')
print(f'Save ICL inference dataset path: {save_path}')

# Hugging face login
hub_token = "hf_..."
login(hub_token, add_to_git_credential=True)

Company name: meta
Model name: Meta-Llama-3.1-8B-Instruct-bnb-4bit
Base path: /cs/student/msc/csml/2023/ngriessh/historical_mt
Translation direction: DE_to_EN
Number of icl examples: 128
ICL examples path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_examples/DE_to_EN/128_example_prompt.txt
ICL prompts path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/icl_prompts/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/DE_to_EN/128_prompt_check.txt
Save ICL inference dataset path: /cs/student/msc/csml/2023/ngriessh/historical_mt/results/meta/Meta-Llama-3.1-8B-Instruct-bnb-4bit/icl/DE_to_EN/DE_to_EN_128_example_prompt.json
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /cs/student/msc/csml/2023/ngriessh/.cache/huggingface/token
Login successful


# Load Dataset & ICL examples

In [13]:
# Load test dataset
dataset = load_dataset('niclasgriesshaber/EarlyModernGerman_to_EN')
test_dataset = dataset['test']
print(f'Loaded test dataset: \n {test_dataset}')

Loaded test dataset: 
 Dataset({
    features: ['Early Modern Bohemian German', 'English'],
    num_rows: 1000
})


In [14]:
# Load icl examples as string
with open(icl_examples_path, 'r') as file:
    icl_examples = file.read()

# Prompt Template

In [15]:
# Prompt template
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant tasked with translating from {} to {}. NEVER provide an introduction to the translation (e.g. 'Here is the translation:', 'Translate to', 'Hier ist die Übersetzung:', etc.), explanations or clarifications.
NEVER provide a note after your translation. In the following, there are some examples how you should translate in the translation task.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{}
### Translation Task. Only translate the following text. Nothing else!

{}:
{}

Translate to {} and match the structure of the source text. Output only this translation and nothing else.<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>{}"""

In [16]:
# Apply prompt template to test dataset
def formatting_prompts_func(examples, source_language, target_language, icl_examples):
    
    source_texts = examples[source_language]
    texts = []

    for source_text in source_texts:
        # Format the prompt with dynamic source and target languages
        text = prompt_template.format(
            source_language,  # Dynamic source language
            target_language,  # Dynamic target language
            icl_examples,     # Example shots (prompt text before asking for translation)
            source_language,  # Target language for translation request
            source_text,      # Actual source text to translate
            target_language,  # Target language for translation output (no actual output included)
            ""                # Placeholder for the output, left empty for inference
        ) 
        texts.append(text)

    return {"text": texts}

# Apply Prompt Template to Test Dataset

In [17]:
# Apply prompt template to all test samples
test_dataset = test_dataset.map(
    lambda examples: formatting_prompts_func(examples, source_language, target_language, icl_examples),
    batched=True
)

# Output a text file to check prompt
with open(icl_prompts_path, "w") as f:
    f.write(test_dataset['text'][1])


# Load Model

In [8]:
# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=unsloth_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090 Ti. Max memory: 23.574 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [9]:
# Set model to inference mode
FastLanguageModel.for_inference(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,),

In [18]:
# Save inferences in new dataset
def process_dataset(test_dataset, model, tokenizer, max_new_tokens, formatted_shots, translation_direction):

    # Extract target language
    if translation_direction == "DE_to_EN":
        target_language = "English"
    else:
        target_language = "Early Modern Bohemian German"
    
    # Convert Hugging Face dataset to Pandas DataFrame
    df = pd.DataFrame(test_dataset)
    
    total_time = 0  # Initialize the total time accumulator

    # Loop through each row in the dataframe
    for i, row in df.iterrows():
        start_time = time.time()  # Start timer for the current inference

        try:
            print(f"Processing test point {i + 1} of {len(df)}")

            # Get the text for the current row
            inputs = tokenizer([row['text']], return_tensors="pt").to("cuda")

            # Generate the model outputs
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True, repetition_penalty=1.1, temperature=0.01)

            # Decode the outputs, converting from token IDs back to text
            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)

            # Since decoded_output has only one entry, extract the single output
            output = decoded_outputs[0]

            # Define the exact substring to search for
            search_string = f"<|start_header_id|>assistant<|end_header_id|>\n"

            # Find the index of the search_string in the output
            start_idx = output.find(search_string)

            # Extract everything after the search_string if it's found
            if start_idx != -1:
                extracted_text = output[start_idx + len(search_string):]
            else:
                extracted_text = 'NA'

            # Manually remove the special token '<|eot_id|>' from the beginning and the end, if present
            #extracted_text = extracted_text.replace("<|start_header_id|>assistant<|end_header_id|>\n", "").strip()
            extracted_text = extracted_text.replace("<|eot_id|>", "").strip()

            # Print the output and extracted text
            #print('_________________________________________________')
            #print(output)
            print('Sheilagh Ogilvie:')
            print('_________________________________________________')
            print(test_dataset[i][target_language])
            print('_________________________________________________')
            #print('LLM-generated Response:')
            print('_________________________________________________')
            print(extracted_text)
            print('_________________________________________________')
            print('_________________________________________________')
            print('_________________________________________________')
            print('_________________________________________________')

            # Add extracted_text directly to the dataframe at index i
            df.at[i, f'{translation_direction}_{formatted_shots}_example_prompt'] = extracted_text

        except Exception as e:
            print(f"Error occurred at test point {i + 1}: {str(e)}")
        
        end_time = time.time()  # End timer for the current inference
        elapsed_time = end_time - start_time  # Calculate elapsed time
        total_time += elapsed_time  # Accumulate total time

        print(f"Time for test point {i + 1}: {elapsed_time:.2f} seconds, Total time: {total_time:.2f} seconds")

    return df

In [19]:
# Call the function
processed_dataset = process_dataset(test_dataset, model, tokenizer, max_new_tokens, formatted_shots, translation_direction)

# Save the dataset as a JSON file
processed_dataset.to_json(save_path, orient='records', lines=True, force_ascii=False)
print(f"Dataset saved successfully to {save_path}")

Sheilagh Ogilvie:
_________________________________________________
The lady complains on account of many points against her serfs’ disobedience: 1. That they did not want to cover-in and stick the shepherding, would rather lay down their bodies and lives concerning it. 2. When the village headman set in several, without prior knowledge of the manor he had summoned together the entire community. 3. That the serfs would not take or cart-in any straw. 4. That they would not do the malt-carting according to the decision. A decision was issued concerning this, and the peasants were punished for their disobedience.
_________________________________________________
_________________________________________________
The lady complains about several points of disobedience by her serfs. 1. That they did not want to cover and clean the threshing floor and wanted to sacrifice their lives and livelihood over it. 2. As the judge had summoned several of them without the knowledge of the manor, 3. Tha

KeyboardInterrupt: 