In [18]:
import pandas as pd
import os
from tqdm import trange
import requests
import json
from typing import List, Dict, Tuple, Any


## Load environment variables
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
# Hugging Face token, sent as the Bearer credential by translate_batch.
# Raises KeyError right here if HUGGINGFACE is not set.
hf_key = os.environ['HUGGINGFACE']
# Hosted inference endpoint for the mBART-50 many-to-many translation model.
API_URL = "https://api-inference.huggingface.co/models/facebook/mbart-large-50-many-to-many-mmt"


In [20]:
def load_region_data(region: str) -> pd.DataFrame:
    """Load one region's feedback workbook and keep only the usable rows.

    Reads ``../data/official_data/feedback_{region}.xlsx`` and keeps rows that
    have a non-null "Feedback 1", a "Feedback 2" cell holding a JSON object
    with a non-blank string ``description``, and a numeric "Feedback id".
    "Feedback 2" is replaced by that description text (not stripped).

    Args:
        region: Region code inserted verbatim into the file name (e.g. "MY").

    Returns:
        A new DataFrame with the columns "Feedback id", "Feedback 1",
        "Feedback 2" and "URL". The original row index is preserved.

    Raises:
        Any error raised by ``pd.read_excel`` (missing file, missing column).
    """
    # Define the file path based on the region
    region_path = f"../data/official_data/feedback_{region}.xlsx"

    # Specify columns to read
    columns_to_read = ["Feedback id", "Feedback 1", "Feedback 2", "URL"]

    # Load the data into a DataFrame
    df = pd.read_excel(region_path, usecols=columns_to_read)

    def _extract_description(raw):
        """Return the non-blank 'description' string of a JSON cell, else None."""
        if not isinstance(raw, str):
            return None
        try:
            parsed = json.loads(raw)
        except ValueError:
            # One malformed cell must not abort the whole load; treat it as invalid.
            return None
        if not isinstance(parsed, dict):
            return None
        description = parsed.get('description')
        # Parsing first (instead of comparing raw text) also catches
        # '{"description": ""}' and other serializations of an empty answer.
        if not isinstance(description, str) or not description.strip():
            return None
        return description

    # Keep rows where both feedback columns are present; .copy() avoids
    # writing into a view of the raw frame.
    df_filtered = df[
        df['Feedback 1'].notna() &
        df['Feedback 2'].notna()
    ].copy()

    # Extract the 'description' field from JSON in 'Feedback 2'
    df_filtered.loc[:, 'Feedback 2'] = df_filtered['Feedback 2'].apply(_extract_description)

    # Convert 'Feedback id' to numeric; unparseable IDs become NaN
    df_filtered.loc[:, 'Feedback id'] = pd.to_numeric(df_filtered['Feedback id'], errors='coerce')

    # Drop rows with an invalid ID or without a usable description
    df_filtered = df_filtered.dropna(subset=['Feedback id', 'Feedback 2'])

    return df_filtered

def translate_batch(region_lowercased: str, foreign_texts: list, timeout: float = 120.0):
    """Translate a batch of texts to English through the hosted inference API.

    Args:
        region_lowercased: Sent verbatim as the ``src_lang`` request parameter.
            NOTE(review): confirm the model accepts a bare region code here;
            its documented language codes look like ``en_XX``.
        foreign_texts: Texts to translate.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ``{"Original": ..., "Translated": ...}`` dicts in input
        order. An empty input returns ``[]`` without calling the API.

    Raises:
        Exception: If the server reports an error, answers with a non-JSON
            body, or returns something other than a list of results.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    texts = list(foreign_texts)
    if not texts:
        # Nothing to translate; skip the network round trip.
        return []

    payload = {
        "inputs": texts,
        "parameters": {"src_lang": region_lowercased,
                        "tgt_lang": "en"}
    }

    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.post(
        API_URL,
        headers={"Authorization": f"Bearer {hf_key}"},
        json=payload,
        timeout=timeout,
    )

    try:
        output = response.json()
    except ValueError as exc:
        raise Exception(
            f"Translation server error: non-JSON response (HTTP {response.status_code})"
        ) from exc

    if isinstance(output, dict) and 'error' in output:
        # Surface the server's own explanation instead of discarding it.
        raise Exception(f"Translation server error: {output['error']}")
    if not isinstance(output, list):
        raise Exception(f"Translation server error: unexpected response {output!r}")

    # The first three characters of every translation are dropped, as before.
    # Presumably this removes a leading language tag — TODO confirm.
    translated_text = [
        {"Original": original, "Translated": item['translation_text'][3:]}
        for original, item in zip(texts, output)
    ]

    return translated_text

def format_llm_input(df: pd.DataFrame) -> Tuple[List[Dict[str, str]], Dict[int, str]]:
    """Turn the cleaned feedback frame into LLM-ready records plus a lookup.

    Args:
        df: Frame with a "Feedback id" and a "Feedback 2" column.

    Returns:
        ``(llm_input, id_feedback)``. ``id_feedback`` maps each integer
        feedback id to its feedback text; when an id repeats, the last text
        wins. ``llm_input`` has one ``{'id': ..., 'feedback': ...}`` dict per
        entry of that mapping, in the same order.
    """
    # Walk both columns in lockstep, keying each text by its integer id.
    id_feedback = {}
    for raw_id, text in zip(df['Feedback id'], df['Feedback 2']):
        id_feedback[int(raw_id)] = text

    # One record per unique id, in the mapping's insertion order.
    llm_input = []
    for key, text in id_feedback.items():
        llm_input.append({'id': key, 'feedback': text})

    return llm_input, id_feedback

In [17]:
# Load the cleaned feedback for the "MY" region and preview its first rows.
region = "MY"
df = load_region_data(region)
df.head()

  warn("""Cannot parse header or footer so it will be ignored""")


Unnamed: 0,Feedback id,Feedback 1,Feedback 2,URL
407,3530749,I didn't find any useful information.,At times customers ret goods Bcos of modifying...,https://seller.shopee.com.my/edu/courseDetail/...
943,3461260,I didn't find any useful information.,lack rules，already provide the proof to show t...,https://seller.shopee.com.my/edu/courseDetail/...
3927,3132374,I didn't find any useful information.,13-6-2024: We can't find the media space butt...,https://seller.shopee.com.my/edu/courseDetail/...
4926,3062338,The lessons were too difficult.|Images/gifs di...,Good thank you,https://seller.shopee.com.my/edu/courseDetail/...
5671,3000004,I didn't find any useful information.,I cant find where is the logistic fee for parc...,https://seller.shopee.com.my/edu/courseDetail/381


In [22]:
# Build the LLM input records and the id -> feedback lookup, then display the lookup.
llm_input, id_feedback = format_llm_input(df)
id_feedback


{3530749: 'At times customers ret goods Bcos of modifying so it’s unfair to us (seller) . Spareparts heavy & when returning our packing all destroyed . Unfair to',
 3461260: 'lack rules，already provide the proof to show that false from courier side. Still add my penalty point. ',
 3132374: "13-6-2024:  We can't find the media space button in shopee",
 3062338: 'Good thank you ',
 3000004: 'I cant find where is the logistic fee for parcel??? and now shopee system deduct me too over, useless system',
 2996871: 'Stupid procedure to hold seller payment. Buyer already confirm received payment still don’t want release for seller!!',
 2991228: 'cod no good',
 2874584: 'Why is my product approved for listing and live but yet I see nothing listed in my shop? None of the articles provide any information regarding this ',
 2874583: 'Why is my product approved for listing and live but yet I see nothing listed in my shop? None of the articles provide any information regarding this ',
 2874582: 'Why

In [None]:


# Smoke-test translate_batch on two sample sentences.
region = "MY"
region_lowercased = region.lower()
sample_text_1 = "Kebahagiaan sebenar datang dari hati yang bersyukur."
sample_text_2 = "Setiap cabaran adalah peluang untuk belajar dan berkembang."
foreign_texts = [sample_text_1, sample_text_2]

# Pass the list built above rather than rebuilding the same list inline.
res = translate_batch(region_lowercased, foreign_texts)
res