In [9]:
# One-time download of the semi-supervised sentiment training set (uncomment to run):
# !wget https://f000.backblazeb2.com/file/malay-dataset/sentiment/semisupervised/train-set.json

In [2]:
import json

In [16]:
# Name of the local dataset file; it is also used as the prefix of the
# per-chunk output files written further down.
filename = "train-set.json"

In [19]:
# Load the 'train_X' entry of the dataset file into `data`.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which is locale-dependent).
with open(filename, encoding='utf-8') as fopen:
    data = json.load(fopen)['train_X']

In [8]:
# `data` already holds the 'train_X' list (the load cell indexes it), so it must
# not be indexed with 'train_X' again -- on a list that raises TypeError.
len(data)

167004

In [10]:
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

In [11]:
# Take the API key from the environment so it never has to be typed into (and
# saved with) the notebook. Falls back to the empty string when OPENAI_API_KEY
# is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [31]:
@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=60))
def translate_text(value):
    """Ask gpt-3.5-turbo for a sentiment label and explanations for ``value``.

    Despite its name, this function does not translate: the prompt asks the
    model to categorize the text as negative, positive or neutral and to
    answer as JSON with the keys 'sentiment', 'explain_en' and 'explain_ms'.

    The call is wrapped in a tenacity ``retry`` that stops after 3 attempts and
    waits a randomized exponential interval (min=1, max=60) between them.

    Args:
        value: The text to classify; it is interpolated into the prompt
            between backticks.

    Returns:
        The content of the first choice's message with surrounding whitespace
        stripped. The reply is returned as a raw string and is not parsed or
        validated as JSON here.
    """
    prompt = f"""
text `{value}`, categorize the text with label ['negative', 'positive', 'neutral'] and explain, return as JSON key {{'sentiment', 'explain_en', 'explain_ms'}}
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
        temperature=0.1,
        max_tokens=1500,
    )
    reply = completion.choices[0]["message"]["content"]
    return reply.strip()

In [32]:
def translate(data):
    """Pair an input text with the model reply obtained for it.

    Args:
        data: A single text, forwarded unchanged to ``translate_text``.

    Returns:
        A 2-tuple ``(data, reply)`` so that each result carries its own source
        text.
    """
    reply = translate_text(data)
    return data, reply

In [35]:
MAX_PARALLEL_REQUESTS = 100
CHUNK_SIZE = 1000
# NOTE(review): processing begins at index 10, so data[0:10] is never sent to
# the model -- confirm this offset is intentional.
start = 10
end = len(data)
# Process the data in chunks of CHUNK_SIZE items, writing one JSON file per
# chunk so that an interrupted run can resume without redoing finished chunks.
for chunk_start in range(start, end, CHUNK_SIZE):
    # The upper bound is not clamped to len(data): it is part of the file name,
    # and the resume check below depends on that name staying the same.
    chunk_end = chunk_start + CHUNK_SIZE

    new_filename = f'{filename}_{chunk_start}_to_{chunk_end}.json'
    if os.path.exists(new_filename):
        # This chunk was completed by an earlier run.
        continue

    translated_data = []
    data_new = data[chunk_start:chunk_end]

    with ThreadPoolExecutor(max_workers=MAX_PARALLEL_REQUESTS) as executor:
        futures = [executor.submit(translate, item) for item in data_new]

        # Results arrive in completion order, not input order; every result is
        # a (text, reply) pair, so each reply stays attached to its text.
        # future.result() re-raises any exception from the worker, which aborts
        # the run before this chunk's file is written.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Translating"):
            translated_data.append(future.result())

    # Write under a temporary name and rename afterwards, so an interrupted
    # write can never leave a truncated file under the final name (which the
    # os.path.exists check above would then treat as a finished chunk).
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the locale default.
    tmp_filename = f'{new_filename}.tmp'
    with open(tmp_filename, 'w', encoding='utf-8') as f:
        json.dump(translated_data, f, ensure_ascii=False, indent=4)
    os.replace(tmp_filename, new_filename)

Translating: 100%|██████████████████████████| 1000/1000 [01:08<00:00, 14.63it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:10<00:00, 14.09it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:08<00:00, 14.50it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:10<00:00, 14.21it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:14<00:00, 13.46it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:16<00:00, 13.05it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:14<00:00, 13.39it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:12<00:00, 13.74it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:10<00:00, 14.13it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:10<00:00, 14.12it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:13<00:00, 13.55it/s]
Translating: 100%|██████████████████████████| 1000/1000 [01:09<00:00, 14.33it/s]
Translating: 100%|██████████

In [36]:
from glob import glob

# Merge every per-chunk result file back into one list.
# NOTE: this rebinds `data`, which until now held the raw input texts.
data = []
# glob() returns paths in arbitrary order; sorting makes the merged order
# reproducible from run to run (lexicographic by file name).
for f in sorted(glob('train-set.json_*.json')):
    # The chunk files contain unescaped non-ASCII text, so read them as UTF-8
    # rather than with the locale's default encoding.
    with open(f, encoding='utf-8') as fopen:
        data.extend(json.load(fopen))

In [37]:
# Show the first ten merged entries as a quick sanity check.
data[:10]

[['Terimakasih ya',
  '{\n    "sentiment": "positive",\n    "explain_en": "The text is categorized as positive because it expresses gratitude or thankfulness.",\n    "explain_ms": "Teks ini dikategorikan sebagai positif karena mengungkapkan rasa terima kasih atau syukur."\n}'],
 ['Enjin klik trus scroll ini yu',
  '{\n    "sentiment": "neutral",\n    "explain_en": "The text is neutral as it does not express any positive or negative sentiment.",\n    "explain_ms": "Teks ini adalah neutral kerana ia tidak mengungkapkan sebarang sentimen positif atau negatif."\n}'],
 ['Karya tangan anak bangsa sekarang patut diacungi jempol apalagi produknya yang yang bagus, banggalah kita memakai k',
  '{\n    "sentiment": "positive",\n    "explain_en": "The text is positive because it praises the quality of products made by Indonesian people and encourages people to be proud of using them.",\n    "explain_ms": "Teks ini positif karena memuji kualitas produk yang dibuat oleh orang Indonesia dan mendorong

In [39]:
# Persist the merged results as a single JSON file. json.dump is left at its
# defaults here, so non-ASCII characters are written as \uXXXX escapes.
with open('explain-sentiment.json', 'w') as out_file:
    json.dump(data, out_file)