In [1]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv 
import os
import torch

In [2]:
# Load variables from the .env file into the process environment.
load_dotenv("../../private_data/.env")

# PARENT gets us to the root of the project
PARENT = "./../../"

# Fail early with a readable message: os.getenv returns None for an unset
# variable, which would otherwise surface as an opaque TypeError on the
# string concatenation below (and an empty value would point at PARENT itself).
_captions_relpath = os.getenv("OUTPUT_MOONDREAM_CAPTIONS")
if not _captions_relpath:
    raise RuntimeError(
        "Environment variable OUTPUT_MOONDREAM_CAPTIONS is not set or is empty; "
        "check ../../private_data/.env"
    )

# Path of the captions CSV, expressed relative to this notebook.
OUTPUT_MOONDREAM_CAPTIONS = PARENT + _captions_relpath

In [3]:
# Read the captions table from the CSV configured above and display it.
captions = pd.read_csv(filepath_or_buffer=OUTPUT_MOONDREAM_CAPTIONS)
captions

Unnamed: 0,recordID,task,EN,FR,NL
0,64,caption,A religious scene features a central figure o...,,
1,64,What objects do you see ?,"In the image, there are two people on a cross...",,
2,64,What colors do you see ?,The image features a painting with a predomin...,,
3,64,Is this image bright or dark ?,The image is dark.,,
4,64,What emotion do you feel when looking at this ...,"When looking at this image, I feel a sense of...",,
...,...,...,...,...,...
455,324,caption,"A nude woman in a blue and white dress, with ...",,
456,324,What objects do you see ?,"In the image, there are several objects that ...",,
457,324,What colors do you see ?,The image features a painting with a woman an...,,
458,324,Is this image bright or dark ?,This image is dark.,,


In [4]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Language pair to translate; these are also the column names used in `captions`.
source_lang = "EN"
target_lang = "FR"

# Supported (source, target) pairs and the pre-trained checkpoint used for each.
MODEL_NAMES = {
    ("EN", "FR"): "Helsinki-NLP/opus-mt-en-fr",
    ("EN", "NL"): "Helsinki-NLP/opus-mt-en-nl",
}

# An unsupported pair must fail loudly. Without this check `model_name` would be
# undefined on a fresh kernel, or - worse - silently keep the value left over
# from a previous run of this cell and load the wrong model.
if (source_lang, target_lang) not in MODEL_NAMES:
    supported_pairs = ", ".join(f"{src}->{tgt}" for src, tgt in MODEL_NAMES)
    raise ValueError(
        f"Unsupported language pair {source_lang}->{target_lang}; "
        f"supported pairs: {supported_pairs}"
    )
model_name = MODEL_NAMES[(source_lang, target_lang)]

print(f"Model name: {model_name}")
# Load the pre-trained model and tokenizer, and move the model to the selected device.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

Model name: Helsinki-NLP/opus-mt-en-fr




In [6]:
# Display the device the model is on after the `.to(device)` call made when loading it.
model.device

device(type='cuda', index=0)

In [7]:
# Collect the source-language column as a plain Python list and show how many entries it has.
source = captions[source_lang].tolist()
len(source)

460

In [29]:
def translate_batch_to_french(texts, batch_size=8):
    """Translate ``texts`` chunk by chunk and return the decoded strings.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. Despite
    the function's name, the output language is whatever the currently loaded
    ``model`` produces.

    Args:
        texts: Sliceable sequence of input sentences (it is sliced with
            ``texts[start:start + batch_size]`` and passed to the tokenizer).
        batch_size: Number of sentences sent through the model per step.

    Returns:
        A list with one decoded translation per generated sequence, in the
        order the chunks were processed.
    """
    results = []
    chunk_starts = range(0, len(texts), batch_size)

    for start in tqdm(chunk_starts):
        chunk = texts[start:start + batch_size]

        # Tokenize the chunk as PyTorch tensors (with padding and truncation
        # enabled) and move them to the device the model lives on.
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True).to(device)

        # Run the model to generate the translated token ids.
        generated = model.generate(**encoded)

        # Turn each generated sequence back into text, dropping special tokens.
        for token_ids in generated:
            results.append(tokenizer.decode(token_ids, skip_special_tokens=True))

    return results

# Translate every source sentence, 28 sentences per model call.
batch_size = 28
translated_sentences = translate_batch_to_french(texts=source, batch_size=batch_size)

# Store the translations in the target-language column, then write the table
# back to the same CSV it was read from (without the DataFrame index).
captions[target_lang] = translated_sentences
captions.to_csv(path_or_buf=OUTPUT_MOONDREAM_CAPTIONS, index=False)
captions

 76%|███████▋  | 13/17 [08:52<02:36, 39.20s/it]