In [None]:
import os
import sys
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configuration: the English source file and one output file per target language.
input_file = "test/colours_sentences.txt"
output_nl = "test/colours_dutch.txt"
output_de = "test/colours_german.txt"

MAX_NEW_TOKENS = 100  # generation budget per translated sentence
PROGRESS_EVERY = 20   # report progress after every N source sentences

# Load the model
model_id = "Unbabel/Tower-Plus-2B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE: the saved run output contains a warning that `torch_dtype` is deprecated
# in favour of `dtype`. The old keyword is kept so the cell still loads on
# transformers versions that predate the rename.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


def translate_sentence(sentence, target_language):
    """Translate one English sentence into ``target_language`` with the loaded model.

    Args:
        sentence: English source sentence (a single line of text).
        target_language: English name of the target language as it should
            appear in the prompt, e.g. ``"Dutch"`` or ``"German"``.

    Returns:
        The decoded continuation generated after the prompt, with special
        tokens removed and all whitespace (including line breaks) collapsed to
        single spaces so the result always fits on one output line.
    """
    prompt = (
        f"Translate the following English source text to {target_language}:\n"
        f"English: {sentence}\n{target_language}: "
    )
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy decoding
        )

    # The returned sequence starts with the prompt tokens; decode only what follows.
    prompt_length = input_ids.shape[-1]
    decoded = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # The output files hold exactly one translation per line, aligned with the
    # source file. A line break inside a translation would shift every later
    # line, so collapse all whitespace runs into single spaces.
    return " ".join(decoded.split())


# Load data (blank lines are skipped, surrounding whitespace is removed)
with open(input_file, "r", encoding="utf-8") as f:
    english_lines = [line.strip() for line in f if line.strip()]

print(f"Start met het vertalen van {len(english_lines)} zinnen...")

# Translation loop: each source sentence is translated into both languages
dutch_translations = []
german_translations = []

for idx, line in enumerate(english_lines, start=1):
    dutch_translations.append(translate_sentence(line, "Dutch"))
    german_translations.append(translate_sentence(line, "German"))

    if idx % PROGRESS_EVERY == 0:
        print(f"Voortgang: {idx}/{len(english_lines)} zinnen voltooid.")

# Save translations to files, one translation per line. Writing line by line
# (instead of join + "\n") leaves the file empty when there is nothing to write.
with open(output_nl, "w", encoding="utf-8") as f:
    f.writelines(translation + "\n" for translation in dutch_translations)

with open(output_de, "w", encoding="utf-8") as f:
    f.writelines(translation + "\n" for translation in german_translations)

print("Saved translations:")
print(f"Dutch: {output_nl}")
print(f"German: {output_de}")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.86s/it]


Start met het vertalen van 800 zinnen...
Voortgang: 20/800 zinnen voltooid.
Voortgang: 40/800 zinnen voltooid.
Voortgang: 60/800 zinnen voltooid.
Voortgang: 80/800 zinnen voltooid.
Voortgang: 100/800 zinnen voltooid.
Voortgang: 120/800 zinnen voltooid.
Voortgang: 140/800 zinnen voltooid.
Voortgang: 160/800 zinnen voltooid.
Voortgang: 180/800 zinnen voltooid.
Voortgang: 200/800 zinnen voltooid.
Voortgang: 220/800 zinnen voltooid.
Voortgang: 240/800 zinnen voltooid.
Voortgang: 260/800 zinnen voltooid.
Voortgang: 280/800 zinnen voltooid.
Voortgang: 300/800 zinnen voltooid.
Voortgang: 320/800 zinnen voltooid.
Voortgang: 340/800 zinnen voltooid.
Voortgang: 360/800 zinnen voltooid.
Voortgang: 380/800 zinnen voltooid.
Voortgang: 400/800 zinnen voltooid.
Voortgang: 420/800 zinnen voltooid.
Voortgang: 440/800 zinnen voltooid.
Voortgang: 460/800 zinnen voltooid.
Voortgang: 480/800 zinnen voltooid.
Voortgang: 500/800 zinnen voltooid.
Voortgang: 520/800 zinnen voltooid.
Voortgang: 540/800 zinnen v

In [None]:
import os
import sys
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configuration: the English source file and the Arabic output file.
input_file = "test/colours_sentences.txt"
output_ar = "test/colours_arabic.txt"

MAX_NEW_TOKENS = 150  # generation budget per translated sentence
PROGRESS_EVERY = 20   # report progress after every N source sentences

# Load the model
model_id = "google/gemma-3-12b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE: the saved run output contains a warning that `torch_dtype` is deprecated
# in favour of `dtype`. The old keyword is kept so the cell still loads on
# transformers versions that predate the rename.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


def translate_to_arabic(sentence):
    """Translate one English sentence into Arabic with the loaded model.

    Args:
        sentence: English source sentence (a single line of text).

    Returns:
        The decoded continuation generated after the prompt, with special
        tokens removed and all whitespace (including line breaks) collapsed to
        single spaces so the result always fits on one output line.
    """
    prompt = (
        "Translate the following English sentence to Arabic. "
        "Only provide the translation, no extra text.\n"
        f"English: {sentence}"
    )
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy decoding
        )

    # The returned sequence starts with the prompt tokens; decode only what follows.
    prompt_length = input_ids.shape[-1]
    decoded = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # The output file holds exactly one translation per line, aligned with the
    # source file. A line break inside a translation would shift every later
    # line, so collapse all whitespace runs into single spaces.
    return " ".join(decoded.split())


# Load the data; fail early with a clear message when the input is missing
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Bestand {input_file} niet gevonden!")

# Blank lines are skipped, surrounding whitespace is removed
with open(input_file, "r", encoding="utf-8") as f:
    english_lines = [line.strip() for line in f if line.strip()]

print(f"Start met het vertalen van {len(english_lines)} zinnen naar het Arabisch...")

# Translation loop
arabic_translations = []

for idx, line in enumerate(english_lines, start=1):
    arabic_translations.append(translate_to_arabic(line))

    if idx % PROGRESS_EVERY == 0:
        print(f"Voortgang: {idx}/{len(english_lines)} zinnen voltooid.")

# Save the results, one translation per line. Writing line by line (instead of
# join + "\n") leaves the file empty when there is nothing to write.
with open(output_ar, "w", encoding="utf-8") as f:
    f.writelines(translation + "\n" for translation in arabic_translations)

print(f"Arabic translations saved to: {output_ar}")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 5 files: 100%|██████████| 5/5 [00:09<00:00,  1.97s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:26<00:00,  5.35s/it]
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Start met het vertalen van 800 zinnen naar het Arabisch...
Voortgang: 20/800 zinnen voltooid.
Voortgang: 40/800 zinnen voltooid.
Voortgang: 60/800 zinnen voltooid.
Voortgang: 80/800 zinnen voltooid.
Voortgang: 100/800 zinnen voltooid.
Voortgang: 120/800 zinnen voltooid.
Voortgang: 140/800 zinnen voltooid.
Voortgang: 160/800 zinnen voltooid.
Voortgang: 180/800 zinnen voltooid.
Voortgang: 200/800 zinnen voltooid.
Voortgang: 220/800 zinnen voltooid.
Voortgang: 240/800 zinnen voltooid.
Voortgang: 260/800 zinnen voltooid.
Voortgang: 280/800 zinnen voltooid.
Voortgang: 300/800 zinnen voltooid.
Voortgang: 320/800 zinnen voltooid.
Voortgang: 340/800 zinnen voltooid.
Voortgang: 360/800 zinnen voltooid.
Voortgang: 380/800 zinnen voltooid.
Voortgang: 400/800 zinnen voltooid.
Voortgang: 420/800 zinnen voltooid.
Voortgang: 440/800 zinnen voltooid.
Voortgang: 460/800 zinnen voltooid.
Voortgang: 480/800 zinnen voltooid.
Voortgang: 500/800 zinnen voltooid.
Voortgang: 520/800 zinnen voltooid.
Voortgang