In [2]:
%pip install --upgrade transformers tokenizers

Collecting transformers
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers
  Downloading tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Downloading transformers-4.57.6-py3-none-any.whl (12.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m179.8 MB/s[0m  [33m0:00:00[0m
[?25hDownloading tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m144.2 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
[2K  Attempting uninstall: tokenizers
[2K    Found existing installation: tokenizers 0.19.1
[2K    Uninstalling tokenizers-0.19.1:
[2K      Successfully uninstalled tokenizers-0.19.1
[2K  Attempting uninstall: transformers━━━━━━━━━━━━━━━━━━[0m [32m0/2[0m [tokenizers]
[2K    Found existing

In [None]:
import os
import sys
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Input file with the English sentences and output files for the translations
input_file = "shopping_sentences.txt"
output_nl = "shopping_dutch.txt"
output_de = "shopping_german.txt"

# Upper bound on the number of tokens generated per translation
MAX_NEW_TOKENS = 100

# Load data first, so a missing input file fails before the model is loaded.
# Blank lines are dropped; every remaining line yields exactly one output line.
with open(input_file, "r", encoding="utf-8") as f:
    english_lines = [line.strip() for line in f if line.strip()]

# Load the model (`dtype` replaces the deprecated `torch_dtype` keyword)
model_id = "Unbabel/Tower-Plus-2B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto"
)


def translate_with_tower(prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Send one user prompt to the loaded model and return its reply on a single line.

    Args:
        prompt: Full text of the user message (instruction plus source sentence).
        max_new_tokens: Maximum number of tokens to generate for the reply.

    Returns:
        The decoded reply without the prompt tokens and without special tokens.
        Line breaks inside the reply are replaced by single spaces so that each
        translation occupies exactly one line in the output files.
    """
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,  # also yields the attention mask that generate() expects
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding: deterministic output
        )

    # Only keep the newly generated tokens (everything after the prompt)
    reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return " ".join(part.strip() for part in reply.splitlines() if part.strip())


# Translation loop
dutch_translations = []
german_translations = []
targets = [("Dutch", dutch_translations), ("German", german_translations)]

for line in english_lines:
    for language, target_list in targets:
        prompt = (
            f"Translate the following English source text to {language}:\n"
            f"English: {line}\n{language}: "
        )
        target_list.append(translate_with_tower(prompt))

# Save translations to files, one translation per line.
# An empty input produces empty files rather than a single blank line.
for path, translations in [(output_nl, dutch_translations), (output_de, german_translations)]:
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(translation + "\n" for translation in translations)

print("Saved translations:")
print(f"Dutch: {output_nl}")
print(f"German: {output_de}")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.11s/it]


Saved translations:
Dutch: test/shopping_dutch.txt
German: test/shopping_german.txt


In [None]:
import os
import sys
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Input file with the English sentences and output file for the translations
input_file = "shopping_sentences.txt"
output_ar = "shopping_arabic.txt"

# Load the data first, so a missing input file fails before the model is loaded.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Bestand {input_file} niet gevonden!")

# Blank lines are dropped; every remaining line yields exactly one output line.
with open(input_file, "r", encoding="utf-8") as f:
    english_lines = [line.strip() for line in f if line.strip()]

# Load the model (`dtype` replaces the deprecated `torch_dtype` keyword)
model_id = "google/gemma-3-12b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto"
)

print(f"Start met het vertalen van {len(english_lines)} zinnen naar het Arabisch...")

# Translation loop
arabic_translations = []

for line in english_lines:
    messages = [
        {"role": "user", "content": f"Translate the following English sentence to Arabic. Only provide the translation, no extra text.\nEnglish: {line}"}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,  # also yields the attention mask that generate() expects
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=150,
            do_sample=False,  # greedy decoding: deterministic output
        )

    # Only keep the newly generated tokens (everything after the prompt)
    reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Replace line breaks inside the reply by spaces so that each translation
    # stays on exactly one line and the output lines up with the input.
    arabic_translations.append(
        " ".join(part.strip() for part in reply.splitlines() if part.strip())
    )

# Save the results, one translation per line.
# An empty input produces an empty file rather than a single blank line.
with open(output_ar, "w", encoding="utf-8") as f:
    f.writelines(translation + "\n" for translation in arabic_translations)

print(f"Arabic translations saved to: {output_ar}")

Loading checkpoint shards: 100%|██████████| 5/5 [00:29<00:00,  5.81s/it]
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Start met het vertalen van 910 zinnen naar het Arabisch...
Arabic translations saved to: test/shopping_arabic.txt
