In [1]:
import json

In [1]:
# Source dataset; uncomment to download it next to this notebook:
# !wget https://raw.githubusercontent.com/gururise/AlpacaDataCleaned/main/alpaca_data_cleaned.json
# Local path of the dataset. It is also used as the prefix of the per-chunk
# output files written by the translation loop.
filename = 'alpaca_data_cleaned.json'

In [20]:
# Load the source dataset. The path comes from `filename` so it is defined in
# one place only, and the file is decoded explicitly as UTF-8 (the JSON
# interchange encoding) instead of relying on the platform default.
with open(filename, encoding='utf-8') as source_file:
    data = json.load(source_file)

In [21]:
# Number of top-level entries loaded from the dataset file.
len(data)

51732

In [10]:
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

In [11]:
# Read the API key from the environment so no secret is ever stored in the
# notebook. Falls back to the previous empty value when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [12]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def translate_text(value):
    """Ask the chat model to translate ``value`` and return the stripped reply.

    The call is wrapped by ``retry`` with a random exponential wait
    (``min=1``, ``max=60``) and stops after 3 attempts.

    Args:
        value: Text to embed in the translation prompt.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace removed.
    """
    # NOTE(review): "standard Malaysia" presumably means standard Malay --
    # confirm. The prompt text is intentionally left unchanged here.
    prompt = f"Translate the following text to standard Malaysia: '{value}'"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=1024,
        temperature=0,  # request the least random output
    )
    first_choice = completion.choices[0]
    return first_choice["message"]["content"].strip()

In [22]:
def translate_item(item):
    """Translate every non-empty field of one record.

    Args:
        item: Mapping of field name to value.

    Returns:
        A new dict with the same keys. Truthy values are replaced by the
        result of ``translate_text``; falsy values become ``''``.
    """
    return {
        field: translate_text(text) if text else ''
        for field, text in item.items()
    }


# Upper bound on the number of translation requests running in parallel
MAX_PARALLEL_REQUESTS = 50

In [26]:
CHUNK_SIZE = 1000

# Translate the data in chunks of CHUNK_SIZE items. Every chunk is written to
# its own file, so an interrupted run can be resumed: chunks whose output file
# already exists are skipped.
for start in range(0, len(data), CHUNK_SIZE):
    # The nominal (unclamped) end index is kept in the filename so it matches
    # files produced by earlier runs; slicing clamps to len(data) by itself.
    end = start + CHUNK_SIZE

    new_filename = f'{filename}_{start}_to_{end}.json'
    if os.path.exists(new_filename):
        continue

    data_new = data[start:end]

    with ThreadPoolExecutor(max_workers=MAX_PARALLEL_REQUESTS) as executor:
        futures = [executor.submit(translate_item, item) for item in data_new]

        # as_completed drives the progress bar only; calling result() here
        # surfaces a failed translation as soon as it happens.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Translating"):
            future.result()

        # Collect in submission order so translated_data[i] corresponds to
        # data_new[i] (completion order is nondeterministic).
        translated_data = [future.result() for future in futures]

    # Write to a temporary file and rename it into place, so a crash while
    # writing never leaves a partial chunk that the resume check would skip.
    tmp_filename = f'{new_filename}.tmp'
    with open(tmp_filename, 'w', encoding='utf-8') as f:
        json.dump(translated_data, f, ensure_ascii=False, indent=4)
    os.replace(tmp_filename, new_filename)

Translating: 100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:41<00:00,  3.56it/s]
Translating: 100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:06<00:00,  4.05it/s]
Translating: 100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:36<00:00,  3.62it/s]
Translating: 100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:32<00:00,  3.67it/s]
Translating: 100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:29<00:00,  3.71it/s]
Translating: 100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:42<00:00,  3.53it/s]
Translating:  53%|████████████████████████████████████████████▏                                       | 526/1000 [02:22<02:12,  3.57it/s]IOPub message rate exceeded.
The no

In [30]:
from glob import glob

files = glob('alpaca_data_cleaned.json_*.json')
data = []
for f in files:
    with open(f) as fopen:
        data.extend(json.load(fopen))
        
len(data)

49000

In [33]:
# Persist the merged translations. As with the per-chunk files, non-ASCII
# characters are written as-is (ensure_ascii=False), so the file is opened
# explicitly as UTF-8.
with open('translated-alpaca_data_cleaned.json', 'w', encoding='utf-8') as fopen:
    json.dump(data, fopen, ensure_ascii=False)