In [None]:
import pandas as pd

# Read the cleaned Spanish -> English dataset (semicolon-separated, UTF-8)
# and pull the source sentences out of the 'input_text' column as a list.
df = pd.read_csv(
    '../datasets/cleaned_es_en_dataset.csv',
    encoding='utf-8',
    sep=';',
)
inputs = df['input_text'].tolist()

In [None]:
from transformers import pipeline

# Build the translation pipeline on top of the Helsinki-NLP
# OPUS-MT Spanish -> English checkpoint
translator = pipeline(
    task='translation',
    model='Helsinki-NLP/opus-mt-es-en',
)

In [None]:
import time

# Translate every input sentence one at a time, recording the latency of
# each call and a rough size measure for each input.
# (Per the original note, the full run takes ~4.5 minutes.)
translations = []
times = []
token_counts = []
for text in inputs:
    # "Tokens" here are whitespace-separated words of the source text, not
    # the model's own subword tokens; they only normalise the timings.
    tokens = text.split()
    token_count = len(tokens)

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # difference of time.time() readings can.
    start_time = time.perf_counter()
    translation = translator(text)[0]['translation_text']
    end_time = time.perf_counter()

    translations.append(translation)
    times.append(end_time - start_time)
    token_counts.append(token_count)

total_time = sum(times)
total_tokens = sum(token_counts)
# With no inputs (or inputs containing no words) the average is undefined;
# report NaN instead of raising ZeroDivisionError.
average_time_per_token = total_time / total_tokens if total_tokens else float('nan')

In [None]:
# Aggregate timing statistics, followed by a separator line
print(
    f'Total time taken: {total_time:.6f} seconds',
    f'Total number of tokens: {total_tokens}',
    f'Average time per token: {average_time_per_token:.6f} seconds',
    '=================================================================',
    sep='\n',
)

# One record per sentence: source text, translation, latency and word count.
# The trailing '\n' on the last field leaves a blank line between records.
for text, translation, duration, token_count in zip(inputs, translations, times, token_counts):
    print(
        f'Spanish text: {text}',
        f'English translation: {translation}',
        f'Time taken: {duration:.6f} seconds',
        f'Number of tokens: {token_count}\n',
        sep='\n',
    )

In [None]:
# Write the translations out as a single-column, semicolon-separated CSV
# (no index column).
tr_df = pd.DataFrame({'translated_text': translations})
tr_df.to_csv(
    '../translated-datasets/opus-mt-es-en-translated_es_en_dataset.csv',
    sep=';',
    index=False,
)