In [7]:
import pandas as pd
import os
from tqdm import trange
import requests
import json
from typing import List, Dict, Tuple, Any
import time


## load env variables
from dotenv import load_dotenv

load_dotenv()
HF_KEY = os.environ['HUGGINGFACE']
API_URL = os.environ['HF_API_URL']


In [3]:
def load_region_data(region: str) -> pd.DataFrame:
    # Define the file path based on the region
    region_path = f"../data/official_data/feedback_{region}.xlsx"

    # Specify columns to read
    columns_to_read = ["Feedback id", "Feedback 1", "Feedback 2", "URL"]

    # Load the data into a DataFrame
    df = pd.read_excel(region_path, usecols=columns_to_read)

    # Filter out rows with missing or invalid data
    df_filtered = df[
        (df['Feedback 1'].notna()) &
        (df['Feedback 2'].notna()) &
        (df['Feedback 2'] != '{"description":""}')
    ].copy()  # Ensure df_filtered is a separate copy

    # Extract the 'description' field from JSON in 'Feedback 2'
    df_filtered.loc[:, 'Feedback 2'] = df_filtered['Feedback 2'].apply(
        lambda x: json.loads(x)['description'] if isinstance(x, str) else None
    )

    # Convert 'Feedback id' to numeric and drop rows with invalid IDs
    df_filtered.loc[:, 'Feedback id'] = pd.to_numeric(df_filtered['Feedback id'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Feedback id'])
    df_filtered.reset_index(drop=True, inplace=True)

    return df_filtered

def translate_batch(region_lowercased: str, foreign_texts: list):
    payload = {
        "inputs": [text for text in foreign_texts],
        "parameters": {"src_lang": region_lowercased,
                        "tgt_lang": "en"}
    }
    
    response = requests.post(
        API_URL, headers={"Authorization": f"Bearer {HF_KEY}"}, json=payload
    )    
    output = response.json()

    if 'error' in output:
        raise Exception(output['error'])
    
    translated_text = [{"Original": foreign_texts[i],
                        "Translated" : output[i]['translation_text'][3:]} for i in range(len(output))]
    
    return translated_text

def format_llm_input(df: pd.DataFrame) -> Tuple[List[Dict[str, str]], Dict[int, str]]:
    # Extract feedback IDs and feedback text
    feedback_ids = list(df['Feedback id'])
    feedback_texts = list(df['Feedback 2'])

    # Create a dictionary mapping feedback IDs to feedback text
    id_feedback = {int(feedback_id): feedback for feedback_id, feedback in zip(feedback_ids, feedback_texts)}

    # Prepare the LLM input as a list of dictionaries
    llm_input = [{'id': feedback_id, 'feedback': feedback} for feedback_id, feedback in id_feedback.items()]

    return llm_input, id_feedback

In [6]:
region  = "VN_Article"
df = load_region_data(region)
llm_input, id_feedback = format_llm_input(df)
llm_input



  warn("""Cannot parse header or footer so it will be ignored""")


[{'id': 3558716, 'feedback': 'B·∫°n c√≥ th·ªÉ kho√° lu√¥n tk gian h√†ng c·ªßa t√¥i nh√©'},
 {'id': 3557525,
  'feedback': 'Kh√¥ng h∆∞·ªõng d·∫´n tr√™n b·∫£ng ƒëi·ªán tho·∫°i hay m√°y m√°y r√µ r√†ng, t√¨m ho√†i kh√¥ng ra'},
 {'id': 3554240, 'feedback': 'T√¥i l√† ng∆∞·ªùi mua'},
 {'id': 3552371, 'feedback': 'm√¨nh kh√¥ng h·ªßy ƒëuoc d·ªãch v·ª• freeship extra.'},
 {'id': 3552236,
  'feedback': 'T√¥i v√†o t√†i kho·∫£n c·ªßa t√¥i kh√¥ng c√≥ m·ª•c C√¥ng c·ª• thi·∫øt k·∫ø h√¨nh ·∫£nh !'},
 {'id': 3550988,
  'feedback': 'kh√¥ng c√≥ th√¥ng tin nh∆∞ trong h√¨nh ·∫£nh h∆∞·ªõng d·∫´n'},
 {'id': 3550622,
  'feedback': 'b·∫£n d√πng cho web c√≥ kh√¥ng, t·∫°i t√¥i th·∫•y h∆∞·ªõng d·∫´n cho app ak k th·∫•y h∆∞·ªõng d·∫´n cho WEb\n'},
 {'id': 3550497,
  'feedback': 'L√Ω gi·∫£i chi ti·∫øt h∆°n b·∫±ng video h∆∞·ªõng d·∫´n gi·∫£i th√≠ch c·ª• th·ªÉ v√¨ sao'},
 {'id': 3549241, 'feedback': 'G·ªçi kh√¥ng ƒë∆∞·ª£c'},
 {'id': 3547268, 'feedback': 'B·ªè r·ªìi √†?'},
 {'id': 3545071,
  'feedback': 'h√†ng gi·∫£ th

In [None]:
# Loop through the llm_input
def translate_region_text(llm_input, region_lowercased):
    window = 2
    for i in range(0,len(llm_input), window):
        batch = 
        res = translate_batch(region_lowercased, batch)
        res
    return

In [39]:
region_lowercased = region.lower()
sample_text_1 = "H√£y lu√¥n tin t∆∞·ªüng v√†o b·∫£n th√¢n, v√¨ b·∫°n m·∫°nh m·∫Ω h∆°n b·∫°n nghƒ©."
sample_text_2 = "Nh·ªØng ƒëi·ªÅu nh·ªè b√© h√†ng ng√†y t·∫°o n√™n s·ª± kh√°c bi·ªát l·ªõn trong cu·ªôc s·ªëng."
foreign_texts = [sample_text_1, sample_text_2]



  warn("""Cannot parse header or footer so it will be ignored""")


[{'Original': 'H√£y lu√¥n tin t∆∞·ªüng v√†o b·∫£n th√¢n, v√¨ b·∫°n m·∫°nh m·∫Ω h∆°n b·∫°n nghƒ©.',
  'Translated': 'always believe in yourself, because you are stronger than you think.'},
 {'Original': 'Nh·ªØng ƒëi·ªÅu nh·ªè b√© h√†ng ng√†y t·∫°o n√™n s·ª± kh√°c bi·ªát l·ªõn trong cu·ªôc s·ªëng.',
  'Translated': "Every day's little things make a big difference in life."}]