In [1]:
import pandas as pd
import os
from tqdm import trange
import requests
import json
from typing import List, Dict, Tuple, Any
import time


## load env variables
from dotenv import load_dotenv

load_dotenv()
HF_KEY = os.environ['HUGGINGFACE']
API_URL = os.environ['HF_API_URL']

In [4]:
# Create global var - Num of HF inference calls

def load_region_data(region: str) -> pd.DataFrame:
    # Define the file path based on the region
    region_path = f"../data/official_data/feedback_{region}.xlsx"

    # Specify columns to read
    columns_to_read = ["Feedback id", "Feedback 1", "Feedback 2", "URL"]

    # Load the data into a DataFrame
    df = pd.read_excel(region_path, usecols=columns_to_read)

    # Filter out rows with missing or invalid data
    df_filtered = df[
        (df['Feedback 1'].notna()) &
        (df['Feedback 2'].notna()) &
        (df['Feedback 2'] != '{"description":""}')
    ].copy()  # Ensure df_filtered is a separate copy

    # Extract the 'description' field from JSON in 'Feedback 2'
    df_filtered.loc[:, 'Feedback 2'] = df_filtered['Feedback 2'].apply(
        lambda x: json.loads(x)['description'] if isinstance(x, str) else None
    )

    # Convert 'Feedback id' to numeric and drop rows with invalid IDs
    df_filtered.loc[:, 'Feedback id'] = pd.to_numeric(df_filtered['Feedback id'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Feedback id'])
    df_filtered.reset_index(drop=True, inplace=True)

    return df_filtered

def translate_batch(region_lowercased: str, foreign_texts: list):
    payload = {
        "inputs": [text for text in foreign_texts],
        "parameters": {"src_lang": region_lowercased,
                        "tgt_lang": "en"}
    }
    
    response = requests.post(
        API_URL, headers={"Authorization": f"Bearer {HF_KEY}"}, json=payload
    )    
    output = response.json()

    if 'error' in output:
        raise Exception(output['error'])
    
    translated_text = [{"Original": foreign_texts[i],
                        "Translated" : output[i]['translation_text'][3:]} for i in range(len(output))]
    
    return translated_text


def format_llm_input(df: pd.DataFrame) -> Tuple[List[Dict[str, str]], Dict[int, str]]:
    # Extract feedback IDs and feedback text
    feedback_ids = list(df['Feedback id'])
    feedback_texts = list(df['Feedback 2'])

    # Create a dictionary mapping feedback IDs to feedback text
    id_feedback = {int(feedback_id): feedback for feedback_id, feedback in zip(feedback_ids, feedback_texts)}

    # Prepare the LLM input as a list of dictionaries
    llm_input = [{'id': feedback_id, 'feedback': feedback} for feedback_id, feedback in id_feedback.items()]

    return llm_input, id_feedback


# Loop through the llm_input
def translate_region_text(llm_input, region_lowercased):
    print("\nTranslation in progress now...\n")
    
    collected_translations = []
    
    for i in trange(0,len(llm_input)-2,2):
        # slice the input 
        batch_list_of_dicts = llm_input[i:i+2]
        batch_list = [dic['feedback'] for dic in batch_list_of_dicts]
        res = translate_batch(region_lowercased, batch_list)
        collected_translations.extend(res)
        time.sleep(1)
        if i >=5:
            break
    return collected_translations

In [6]:
region  = "VN_Article"
region_lowercased = "vn"
df = load_region_data(region)
llm_input, id_feedback = format_llm_input(df)
translations = translate_region_text(llm_input, region_lowercased)

  warn("""Cannot parse header or footer so it will be ignored""")



Translation in progress now...



  1%|▏         | 3/215 [00:58<1:09:09, 19.57s/it]


Exception: Model facebook/mbart-large-50-many-to-many-mmt is currently loading