In [None]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
from tqdm import tqdm
import os
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import json
import numpy as np

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# Later cells read and write the thesis CSV files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Thesis data folder on the mounted Drive; the trailing slash lets file names
# be appended directly.
root = "/content/drive/MyDrive/MASTER_THESIS/"

# Path of the French captions table, and the table itself.
FR_CAPTIONS = f"{root}subject_matter_captions_FR.csv"
subject_matter_captions_FR = pd.read_csv(filepath_or_buffer=FR_CAPTIONS)

# Preview the first three rows as the cell output.
subject_matter_captions_FR.head(n=3)

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Show the selected device as the cell output.
device

In [None]:
class TranslationDataset(Dataset):
    """Map-style dataset that serves source texts by position.

    Args:
        source_texts: Iterable of texts to translate. It is materialised into
            a list once, so that ``dataset[i]`` always means "the i-th text"
            no matter which container was passed in (list, tuple, generator,
            or a pandas Series whose index is not 0..n-1).
    """

    def __init__(self, source_texts):
        # Copy into a list: indexing a pandas Series with a non-default index
        # is label-based and would fail (or return the wrong row) for the
        # positional indices a DataLoader sampler produces.
        self.source_texts = list(source_texts)

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.source_texts)

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        return self.source_texts[idx]
    
def translate_batch(source_captions, model, tokenizer, device, batch_size=16, template_df=None):
    """Translate captions in batches and return them as a captions table.

    Args:
        source_captions: Sequence of caption strings to translate, in the same
            row order as ``template_df``.
        model: Translation model; ``model.generate(**inputs)`` is called on
            each tokenized batch.
        tokenizer: Tokenizer for ``model``. It is called on a list of strings
            and its ``decode`` converts generated ids back to text.
        device: Device the tokenized inputs are moved to before generation.
        batch_size: Number of captions sent through the model per step.
        template_df: DataFrame that is copied to build the result; its
            ``caption`` column is replaced by the translations. When omitted,
            the notebook-level ``subject_matter_captions_FR`` table is used.

    Returns:
        A copy of ``template_df`` whose ``caption`` column holds the
        translations and whose ``length_tokenization`` column is set to NaN.

    Raises:
        ValueError: If the number of captions differs from the number of rows
            in ``template_df``.
    """
    if template_df is None:
        # The table must be a DataFrame: the previous code copied FR_CAPTIONS,
        # which is the CSV *path* string and has no ``.copy()`` method.
        template_df = subject_matter_captions_FR

    # Fail before the expensive translation loop rather than at the final
    # column assignment.
    if len(source_captions) != len(template_df):
        raise ValueError(
            f"Got {len(source_captions)} captions for a template with "
            f"{len(template_df)} rows; they must match one-to-one."
        )

    dataset = TranslationDataset(source_captions)
    # shuffle=False keeps the translations aligned with the template rows.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    translated_texts = []

    for batch in tqdm(dataloader):
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        inputs = inputs.to(device)

        # Perform translation
        translated = model.generate(**inputs)

        # Decode translations
        translated_texts.extend(
            tokenizer.decode(t, skip_special_tokens=True) for t in translated
        )

    resulting_df = template_df.copy()
    resulting_df["caption"] = translated_texts
    # Placeholder column; no token lengths are computed in this function.
    resulting_df["length_tokenization"] = np.nan
    return resulting_df

In [None]:
# Keep the checkpoint id under its own name so that `model_fr_en` only ever
# refers to the loaded model, never to a string.
checkpoint_fr_en = "Helsinki-NLP/opus-mt-fr-en"
tokenizer_fr_en = MarianTokenizer.from_pretrained(checkpoint_fr_en)
model_fr_en = MarianMTModel.from_pretrained(checkpoint_fr_en).to(device)

# Columns noted for the captions table: record_id, row_type, caption, number_of_tokens
# NOTE(review): translate_batch writes a `length_tokenization` column rather than
# `number_of_tokens` — confirm which name downstream code expects.
subject_matter_captions_EN = translate_batch(
    list(subject_matter_captions_FR["caption"]),
    model_fr_en,
    tokenizer_fr_en,
    device,
    batch_size=100,
)
subject_matter_captions_EN.to_csv(root + "subject_matter_captions_EN.csv", index=False)

In [None]:
# Dutch is produced by pivoting through English: per the original note, the
# captions cannot be translated directly from French to Dutch with OPUS-MT, so
# the English translations from the previous cell are used as the source.
# The checkpoint id has its own name so `model_en_nl` is only ever the model.
checkpoint_en_nl = "Helsinki-NLP/opus-mt-en-nl"
tokenizer_en_nl = MarianTokenizer.from_pretrained(checkpoint_en_nl)
model_en_nl = MarianMTModel.from_pretrained(checkpoint_en_nl).to(device)

subject_matter_captions_NL = translate_batch(
    list(subject_matter_captions_EN["caption"]),
    model_en_nl,
    tokenizer_en_nl,
    device,
    batch_size=100,
)
subject_matter_captions_NL.to_csv(root + "subject_matter_captions_NL.csv", index=False)