In [None]:
from google.colab import drive

# Directory inside the Colab runtime where Google Drive is attached.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


Importing the texts to translate:

In [None]:
import pandas as pd
import numpy as np
import torch

# Run on the GPU when the runtime exposes one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The input is read as a tab-separated, UTF-16 encoded file.
# NOTE(review): "XX" looks like a placeholder for the dataset/language name — confirm before running.
path = '/content/drive/MyDrive/CBS/CBS Thesis Lydia & Sara/Data/03_Data_Modeling/XX.csv'
df = pd.read_csv(path, encoding='utf-16', sep='\t')

# Preview the first rows as the cell output.
df.head()

Translation models:


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face checkpoint ids; every name follows the "opus-mt-<source>-en" pattern.
model_da = "Helsinki-NLP/opus-mt-da-en"
model_se = "Helsinki-NLP/opus-mt-sv-en"
model_fi = "Helsinki-NLP/opus-mt-fi-en"
model_de = "Helsinki-NLP/opus-mt-de-en"

# Set to current language
model_name = model_da

# Tokenizer and model are both loaded from the selected checkpoint;
# the model is then moved to the device chosen in the data-loading cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

Translation set-up:

In [None]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

class TextDataset(Dataset):
    """Map-style dataset that serves the items of an indexable collection of texts."""

    def __init__(self, texts):
        # Only a reference to the caller's collection is stored; nothing is copied.
        self.texts = texts

    def __len__(self):
        """Return how many texts the dataset holds."""
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, idx):
        """Return the text stored under ``idx``."""
        text = self.texts[idx]
        return text

def translate_batch(texts, tokenizer, model, device, batch_size=64):
    """Translate a collection of texts in fixed-size batches.

    Args:
        texts: Indexable collection of strings (wrapped in ``TextDataset``).
        tokenizer: Tokenizer used both to encode the inputs and to decode
            the generated ids.
        model: Sequence-to-sequence model exposing ``eval()`` and
            ``generate()``; it is expected to already live on ``device``.
        device: Device the encoded inputs are moved to before generation.
        batch_size: Number of texts handed to the model per step.

    Returns:
        list[str]: One decoded translation per input text, in input order.
    """
    model.eval()
    loader = DataLoader(TextDataset(texts), batch_size=batch_size)

    translated_texts = []

    for batch in tqdm(loader, desc="Translating batches"):
        # Each batch is padded to a common length; inputs longer than
        # 512 tokens are truncated, so their tail is not translated.
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            # Pass the attention mask explicitly: the batch is padded, and
            # without the mask generate() has to guess which positions are
            # padding instead of using what the tokenizer already computed.
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=512,
                num_beams=4,
                early_stopping=True,
            )

        translations = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        translated_texts.extend(translations)

    return translated_texts


Translating batches of Hit Sentences:

In [None]:
# Pull the source sentences out of the dataframe as a plain Python list.
texts = df['Hit Sentence'].tolist()

# Run the translation with the tokenizer/model loaded above.
translations = translate_batch(
    texts,
    tokenizer=tokenizer,
    model=model,
    device=device,
)

# Store the results next to the originals in a new column.
df['Translation'] = translations

Saving the dataframe with the new Translation column:

In [None]:
# Write back to the file the data was loaded from, reusing `path` from the
# data-loading cell so the input and output locations cannot drift apart
# when the file name is changed for another language.
# NOTE: this overwrites the input CSV in place (same separator and encoding
# as it was read with).
df.to_csv(path,
          sep='\t',
          encoding='utf-16',
          index=False)