# Requirements and Initialization

In [None]:
!pip install --quiet datasets pandas sentencepiece sentence_transformers torch tqdm transformers 

In [None]:
import pandas as pd

from torch import device
from torch.cuda import is_available
from tqdm import trange
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer

In [None]:
# Identifier handed to `from_pretrained` for both the tokenizer and the model.
checkpoint = "Helsinki-NLP/opus-tatoeba-en-tr"

# CSV that is read as input, and CSV that the translated rows are written to.
dataset_path = "path/to/dataset.csv"
translated_dataset_path = "path/to/translated_dataset.csv"

# Number of dataframe rows sent through the model per loop iteration.
batch_size = 32

# Model Loading

In [None]:
# Reach `torch.device` through the module rather than the bare `device` name
# from the imports cell: this cell rebinds `device` to an instance, so calling
# `device(...)` again on a re-run would raise
# "TypeError: 'torch.device' object is not callable".
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the seq2seq model from the same checkpoint, and move
# the model's weights onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

# Translation

In [None]:
# Load the input dataset; the translation cell reads its `src` and `tgt` columns.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Translate the `src` column batch by batch, collecting one
# [src, tgt, translation] row per input row in `data`.
data = []

# Row range to process; adjust `beg`/`end` to translate only part of the dataset.
beg = 0
end = len(df)

for start in trange(beg, end, batch_size):
    batch = df.iloc[start : start + batch_size]
    # Materialise each column once per batch; both lists are reused below.
    sources = list(batch['src'])
    targets = list(batch['tgt'])

    # padding=True pads every sequence to the longest one in the batch so they
    # stack into a single tensor. truncation=True caps inputs at the
    # tokenizer's maximum length, so one over-long row cannot exceed the
    # model's supported sequence length and abort the whole run.
    encoded = tokenizer(
        sources, padding=True, truncation=True, return_tensors='pt'
    ).to(device)

    generated = model.generate(**encoded)
    # Decode the whole batch in one call, dropping special tokens (pad/eos).
    translations = tokenizer.batch_decode(generated, skip_special_tokens=True)

    data.extend([src, tgt, trs] for src, tgt, trs in zip(sources, targets, translations))

In [None]:
# Turn the collected [src, tgt, translation] rows into a table with named columns.
output_df = pd.DataFrame(data=data, columns=['src', 'tgt', 'trs'])
# Last expression in the cell, so the notebook renders the first five rows.
output_df.head(n=5)

In [None]:
# Write the results to disk as UTF-8 CSV, leaving out the DataFrame index column.
output_df.to_csv(path_or_buf=translated_dataset_path, index=False, encoding='utf-8')