# Compare Translations: mBART vs. Google Translate API
Author: Loc Dao (LD) - locdao.fw@gmail.com

In [22]:
import easyocr
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import time
import json
from google.cloud import translate_v2 as translate
from concurrent.futures import ThreadPoolExecutor, as_completed

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [23]:
# Download and check if model works
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-50-many-to-many-mmt"
local_dir = "./model/"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = False, cache_dir=local_dir)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=local_dir)

print("Model and tokenizer successfully downloaded and loaded!")

text = "Hello, This is LD. How are you doing today?"
source_lang = "en_XX"  
target_lang = "fr_XX"

tokenizer.src_lang = source_lang
inputs = tokenizer(text, return_tensors="pt")

translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
    max_length=50
)

translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
print("Input (English):", text)
print("Translated (French):", translated_text)

  from .autonotebook import tqdm as notebook_tqdm


Model and tokenizer successfully downloaded and loaded!
Input (English): Hello, This is LD. How are you doing today?
Translated (French): Bonjour, je suis LD. Comment va-t-il aujourd'hui?


In [9]:
# Prepare ocr
supported_lang_ch = ['ch_sim', 'en']  
supported_lang_others = ['en', 'es', 'de', 'fr']  

reader_ch = easyocr.Reader(supported_lang_ch, gpu=True) 
reader_others = easyocr.Reader(supported_lang_others, gpu=True)

In [12]:
# Utils func
def prepare_ocr_data(dir_path, context):
    print(f"Progressing directory {dir_path}:") 
    valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}
    
    results = []
    ocr_reader = context["reader"]
    source_lang = context["source_lang"]
           
    for filename in os.listdir(dir_path):
        file_path = os.path.join(dir_path, filename)
        if os.path.isfile(file_path) and os.path.splitext(filename)[1].lower() in valid_extensions:
            print(f"    Progressing {file_path}") 
            ocr_result = ocr_reader.readtext(file_path, detail = 0, paragraph=True)
            for idx, text in enumerate(ocr_result):
                result = {
                    "source_dir": dir_path,
                    "source_file": filename,
                    "source_lang": source_lang,
                    "token_index": idx,
                    "ocr_original_text": text
                }
                results.append(result)
        else:
            print(f"    Not a file or does not exist: {filename}")
    return results

def make_samples_data():
    results = []

    results.extend(prepare_ocr_data("sample/chi", {"source_lang": "zh_CN", "reader": reader_ch}))
    results.extend(prepare_ocr_data("sample/es", {"source_lang": "es_XX", "reader": reader_others}))
    results.extend(prepare_ocr_data("sample/ger", {"source_lang": "de_DE", "reader": reader_others}))
    
    df = pd.DataFrame(results, columns=[
        "source_dir", "source_file", "source_lang",
        "token_index", "ocr_original_text", 
    ])
    return df

def load(sample_file):
    if os.path.exists(sample_file):
        print(f"Loading sample file...")
        df = pd.read_csv(sample_file)
    else:
        print(f"Making sample file...")
        df = make_samples_data()
        df.to_csv(sample_file, index=False)
    return df

In [13]:
# Load or make sample_file
# print(prepare_ocr_data("sample/chi", {"source_lang": "chi_sim", "target_lang": "en_US", "reader": reader_ch}))
# print(prepare_ocr_data("sample/es", {"source_lang": "es", "target_lang": "en_US", "reader": reader_others}))
# print(prepare_ocr_data("sample/ger", {"source_lang": "ger", "target_lang": "en_US", "reader": reader_others}))

sample_file = "ocr_results.csv"

df = load(sample_file)
df

Loading sample file...


Unnamed: 0,source_dir,source_file,source_lang,token_index,ocr_original_text
0,sample/chi,foreign-language-immunization-schedules.png,zh_CN,0,"IMMUNIZATION RECORD (预防接穆e,) Personal Health A..."
1,sample/chi,foreign-language-immunization-schedules.png,zh_CN,1,IMUNIZATION RECORD (嶷 簇
2,sample/chi,foreign-language-immunization-schedules.png,zh_CN,2,IIMUNIZATIOV (疫苗)
3,sample/chi,foreign-language-immunization-schedules.png,zh_CN,3,DOINY 日月年
4,sample/chi,foreign-language-immunization-schedules.png,zh_CN,4,DDIIAIY
...,...,...,...,...,...
170,sample/ger,3 - ger.jpg,de_DE,0,Weitere Schutzimpfungen Other Vaccinations Imp...
171,sample/ger,2 - ger.jpg,de_DE,0,MEXIVHAPOIHOE CBHIETEJIBCTBO 0 BAKLILHALIMI ...
172,sample/ger,2 - ger.jpg,de_DE,1,KeM I3rO TOBJCHa [ounIc6 IOJUKHOCTb OquunabHas...
173,sample/ger,2 - ger.jpg,de_DE,2,2


In [20]:
# model supported lang
# ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 
#  'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 
#  'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 
#  'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 
#  'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 
#  'vi_VN', 'zh_CN', 'af_ZA', 'az_AZ', 
#  'bn_IN', 'fa_IR', 'he_IL', 'hr_HR', 
#  'id_ID', 'ka_GE', 'km_KH', 'mk_MK', 
#  'ml_IN', 'mn_MN', 'mr_IN', 'pl_PL', 
#  'ps_AF', 'pt_XX', 'sv_SE', 'sw_KE', 
#  'ta_IN', 'te_IN', 'th_TH', 'tl_XX', 
#  'uk_UA', 'ur_PK', 'xh_ZA', 'gl_ES', 'sl_SI']
def translate_with_facebook_model(text, source_lang, target_lang):
    tokenizer.src_lang = source_lang
    inputs = tokenizer(text, return_tensors="pt")
    
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
        max_length=300
    )
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

def translate_row(row, target_lang="en_XX"):
    source_lang = row["source_lang"]
    text = row["ocr_original_text"]
    translated = translate_with_facebook_model(str(text), source_lang, target_lang)
    return row.name, translated

def do_facebook(): 
    max_workers = min(4, os.cpu_count() or 1)
    total_rows = len(df)
    print(f"Starting translation for {total_rows} rows with {max_workers} threads...")
    
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {executor.submit(translate_row, row): row.name for index, row in df.iterrows()}
        with tqdm(total=total_rows, desc="Translating", unit="row") as pbar:
            completed = 0
            failed = 0
            for future in as_completed(future_to_index):
                try:
                    index, translated = future.result()
                    df.at[index, "translated_by_mbart"] = translated
                    completed += 1
                    pbar.update(1)
                except Exception as e:
                    print(f"Future failed for index {future_to_index[future]}: {e}")
                    failed += 1
                    pbar.update(1)
                
                # Periodic status update
                if completed % 10 == 0 or completed + failed == total_rows:
                    elapsed = time.time() - start_time
                    remaining = (elapsed / completed * (total_rows - completed - failed)) if completed > 0 else 0
                    print(f"Status: {completed} completed, {failed} failed, {total_rows - completed - failed} pending, "
                          f"Elapsed: {elapsed:.1f}s, ETA: {remaining:.1f}s")
    
    df.to_csv("translated_with_facebook_model.csv", index = False)

In [17]:
config_path = "./config.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = config_path
google_client = translate.Client()

def translate_with_google(text, source_lang, target_lang):
    return google_client.translate(text,source_language=source_lang,target_language=target_lang)
    
def translate_row_(row, target_lang="en"):
    lang_map = {
        "chi_sim": "zh-CN",
        "es_XX": "es",
        "de_DE": "de",
        "en_XX": "en"
    }
    source_lang = lang_map.get(row["source_lang"], row["source_lang"])
    text = row["ocr_original_text"]
    translated = translate_with_google(str(text), source_lang, target_lang)
    return row.name, translated["translatedText"]
    
def do_google():
    max_workers = min(4, os.cpu_count() or 1)
    total_rows = len(df)
    print(f"Starting translation for {total_rows} rows with {max_workers} threads...")
    
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {executor.submit(translate_row_, row): row.name for index, row in df.iterrows()}
        with tqdm(total=total_rows, desc="Translating", unit="row") as pbar:
            completed = 0
            failed = 0
            for future in as_completed(future_to_index):
                try:
                    index, translated = future.result()
                    df.at[index, "translated_by_google"] = translated
                    completed += 1
                    pbar.update(1)
                except Exception as e:
                    print(f"Future failed for index {future_to_index[future]}: {e}")
                    failed += 1
                    pbar.update(1)
                
                # Periodic status update
                if completed % 10 == 0 or completed + failed == total_rows:
                    elapsed = time.time() - start_time
                    remaining = (elapsed / completed * (total_rows - completed - failed)) if completed > 0 else 0
                    print(f"Status: {completed} completed, {failed} failed, {total_rows - completed - failed} pending, "
                          f"Elapsed: {elapsed:.1f}s, ETA: {remaining:.1f}s")
    df.to_csv("translated_with_google.csv", index = False)

Starting translation for 175 rows with 4 threads...


Translating:   7%|█▋                      | 12/175 [00:00<00:05, 28.87row/s]

Status: 10 completed, 0 failed, 165 pending, Elapsed: 0.6s, ETA: 10.0s


Translating:  15%|███▌                    | 26/175 [00:00<00:04, 34.96row/s]

Status: 20 completed, 0 failed, 155 pending, Elapsed: 0.9s, ETA: 7.1s


Translating:  19%|████▋                   | 34/175 [00:01<00:04, 31.18row/s]

Status: 30 completed, 0 failed, 145 pending, Elapsed: 1.2s, ETA: 5.8s


Translating:  27%|██████▍                 | 47/175 [00:01<00:02, 47.32row/s]

Status: 40 completed, 0 failed, 135 pending, Elapsed: 1.5s, ETA: 4.9s
Status: 50 completed, 0 failed, 125 pending, Elapsed: 1.6s, ETA: 4.0s


Translating:  39%|█████████▎              | 68/175 [00:01<00:01, 53.75row/s]

Status: 60 completed, 0 failed, 115 pending, Elapsed: 1.8s, ETA: 3.4s
Status: 70 completed, 0 failed, 105 pending, Elapsed: 1.9s, ETA: 2.9s


Translating:  50%|████████████            | 88/175 [00:02<00:01, 54.02row/s]

Status: 80 completed, 0 failed, 95 pending, Elapsed: 2.2s, ETA: 2.6s
Status: 90 completed, 0 failed, 85 pending, Elapsed: 2.3s, ETA: 2.2s


Translating:  61%|█████████████▉         | 106/175 [00:02<00:01, 54.47row/s]

Status: 100 completed, 0 failed, 75 pending, Elapsed: 2.5s, ETA: 1.9s


Translating:  67%|███████████████▌       | 118/175 [00:02<00:01, 48.88row/s]

Status: 110 completed, 0 failed, 65 pending, Elapsed: 2.7s, ETA: 1.6s
Status: 120 completed, 0 failed, 55 pending, Elapsed: 2.9s, ETA: 1.3s


Translating:  78%|██████████████████     | 137/175 [00:03<00:00, 51.62row/s]

Status: 130 completed, 0 failed, 45 pending, Elapsed: 3.1s, ETA: 1.1s
Status: 140 completed, 0 failed, 35 pending, Elapsed: 3.3s, ETA: 0.8s


Translating:  89%|████████████████████▎  | 155/175 [00:03<00:00, 52.28row/s]

Status: 150 completed, 0 failed, 25 pending, Elapsed: 3.5s, ETA: 0.6s
Status: 160 completed, 0 failed, 15 pending, Elapsed: 3.7s, ETA: 0.3s


Translating:  98%|██████████████████████▌| 172/175 [00:03<00:00, 42.58row/s]

Status: 170 completed, 0 failed, 5 pending, Elapsed: 3.9s, ETA: 0.1s


Translating: 100%|███████████████████████| 175/175 [00:03<00:00, 44.21row/s]

Status: 175 completed, 0 failed, 0 pending, Elapsed: 4.1s, ETA: 0.0s





In [41]:
df_facebook = pd.read_csv("translated_with_facebook_model.csv")
text = df_facebook.at[158 - 1, "ocr_original_text"]
src_lang = df_facebook.at[158 - 1, "source_lang"]
target_lang = "en_XX"
print(f"[FB] Original text: {text}")
print(f"[FB] translated: {translate_with_facebook_model(text, src_lang, target_lang)}")

df_google = pd.read_csv("translated_with_google.csv")
text = df_google.at[158 - 1, "ocr_original_text"]
src_lang = "es" if df_google.at[158 - 1, "source_lang"] == "es_XX" else "es_XX"
target_lang = "en"
print(f"[GG] Original text: {text}")
print(f"[GG] translated: {translate_with_google(text, src_lang, target_lang)}")

[FB] Original text: DATE GIVEN focha de vacunación
[FB] translated: The Committee recommends that the State party take all necessary measures to ensure the full enjoyment of all human rights and fundamental freedoms, including the right to education, including the right to health, the right to food, the right
[GG] Original text: DATE GIVEN focha de vacunación
[GG] translated: {'translatedText': 'DATE GIVEN vaccination token', 'input': 'DATE GIVEN focha de vacunación'}
