In [21]:
"""
---------------------------------------------------------
Translation with Helsinki-NLP/opus-mt-tc-big-en-tr
for English-to-Turkish (Longer Context Friendly)
---------------------------------------------------------

This script:
  - Reads a CSV file with at least the following columns:
      ['source', 'article_url', 'title', 'date', 'shortened_full_text']
  - Translates 'title' and 'shortened_full_text' from English to Turkish
  - Adds new columns 'title_tr' and 'shortened_full_text_tr'
  - Saves the resulting DataFrame to a CSV file
"""

import pandas as pd
import torch
from transformers import pipeline

In [22]:
# Load the articles to translate; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="./balanced_data_1000.csv")

In [23]:
# Use Apple's MPS backend only when PyTorch reports it as both available and
# built; otherwise run on the CPU.
use_mps = torch.backends.mps.is_available() and torch.backends.mps.is_built()
device_arg = "mps" if use_mps else "cpu"
print("MPS is available. Using Apple GPU acceleration." if use_mps else "Using CPU.")

MPS is available. Using Apple GPU acceleration.


In [24]:
# Build the translation pipeline once; the same checkpoint identifier is used
# for both the model weights and the tokenizer.
model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
translator = pipeline(
    task="translation",
    model=model_name,
    tokenizer=model_name,
    device=device_arg,  # "mps" or "cpu", chosen in the device-selection cell
    # Request truncation of inputs, with a length cap of 1024.
    # NOTE(review): 2048 was suggested as an alternative if memory allows;
    # confirm the checkpoint actually supports inputs that long before raising it.
    truncation=True,
    max_length=1024,
)

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/833k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [25]:
def translate_text(text: object) -> str:
    """Translate one cell value with the module-level ``translator`` pipeline.

    Parameters
    ----------
    text : object
        The value to translate. Only ``str`` values are translated; anything
        else (for example the float NaN pandas uses for missing cells) is
        treated as "no text".

    Returns
    -------
    str
        The ``translation_text`` of the pipeline's first result, or ``""``
        when ``text`` is not a string or contains only whitespace.
    """
    # Non-string and blank values have nothing to translate, so skip the
    # model call entirely and return an empty translation.
    if not isinstance(text, str) or not text.strip():
        return ""
    result = translator(text)
    # The pipeline returns a list of dicts; a single input yields one entry.
    return result[0]["translation_text"]

In [26]:
# Translate each text column element by element and store the result in a
# sibling column carrying the "_tr" suffix.
for source_col, target_col in (
    ("title", "title_tr"),
    ("shortened_full_text", "shortened_full_text_tr"),
):
    df[target_col] = df[source_col].apply(translate_text)

In [27]:
# Write the frame, including the new translated columns, to CSV without the
# index, then confirm where it was saved.
df.to_csv(path_or_buf="translated_file_helsinki.csv", index=False)
print("Translation complete. Saved to 'translated_file_helsinki.csv'.")

Translation complete. Saved to 'translated_file_helsinki.csv'.
