In [4]:
# !wget https://huggingface.co/datasets/mesolitica/semisupervised-abstractive-summarization-ms-news/resolve/main/populate-news.json.semisupervised

In [7]:
import json

# Articles with this many whitespace-separated words or fewer are dropped.
MIN_WORDS = 50

# Load the article texts: the file is parsed one JSON object per line, and only
# the 'text' field of sufficiently long articles is kept.
data = []
# Read explicitly as UTF-8 (the standard encoding for JSON); open()'s default
# encoding depends on the platform locale and may not be UTF-8.
with open('populate-news.json.semisupervised', encoding='utf-8') as fopen:
    for line in fopen:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        record = json.loads(line)
        if len(record['text'].split()) > MIN_WORDS:
            data.append(record['text'])

In [17]:
# Name of the source dataset file. The chunked-processing cell further down uses it
# as the prefix of every per-chunk output file (f'{filename}_{start}_to_{end}.json').
filename = 'populate-news.json.semisupervised'

In [9]:
# Number of article texts kept by the loading cell (those with more than 50 words).
len(data)

78034

In [11]:
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

In [14]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def translate_text(value):
    """Ask gpt-3.5-turbo for a short Bahasa Melayu summary of ``value``.

    Despite the name, the prompt requests a summary (under 128 words), not a
    translation. The function is wrapped by tenacity's ``retry``, configured
    with a randomized exponential wait (min=1, max=60) and a limit of three
    attempts.

    Args:
        value: Text that is embedded, between backticks, in the prompt.

    Returns:
        The message content of the first completion choice, with leading and
        trailing whitespace stripped.
    """
    # The prompt layout (leading newline, trailing indentation) is kept exactly
    # as sent to the model.
    prompt = f"""
paragraph `{value}`, summarize less than 128 words to bahasa melayu
    """
    request = dict(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1500,
        temperature=0.1,
    )
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice["message"]["content"].strip()

In [15]:
def translate(data):
    """Pair one input text with the output generated for it.

    Args:
        data: A single text item, passed unchanged to ``translate_text``.

    Returns:
        A 2-tuple ``(data, result)`` where ``result`` is the value returned by
        ``translate_text(data)``.
    """
    result = translate_text(data)
    return data, result

In [None]:
MAX_PARALLEL_REQUESTS = 100
CHUNK_SIZE = 100
start = 2
end = len(data)
# Process the data in chunks of CHUNK_SIZE items. Each finished chunk is saved to
# its own JSON file, so an interrupted run can be resumed by re-running this cell.
for chunk_start in range(start, end, CHUNK_SIZE):
    # The upper bound is deliberately not clamped to len(data): it is part of the
    # file name, and changing it would stop chunk files written by earlier runs
    # from being recognised. Slicing past the end of the list is harmless.
    chunk_end = chunk_start + CHUNK_SIZE

    new_filename = f'{filename}_{chunk_start}_to_{chunk_end}.json'
    if os.path.exists(new_filename):
        # This chunk was already completed by a previous run.
        continue

    translated_data = []
    data_new = data[chunk_start:chunk_end]

    with ThreadPoolExecutor(max_workers=MAX_PARALLEL_REQUESTS) as executor:
        futures = {executor.submit(translate, item): item for item in data_new}

        # Results arrive in completion order, not input order; each result tuple
        # carries its own source text. future.result() re-raises any exception
        # from the worker, so a chunk file is only written when every item in
        # the chunk succeeded.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Translating"):
            translated_data.append(future.result())

    # Write to a temporary file and rename it into place: the existence of
    # new_filename is the "chunk done" marker, so a half-written file left by an
    # interrupted dump must never carry that name. UTF-8 is set explicitly
    # because ensure_ascii=False emits non-ASCII characters verbatim.
    tmp_filename = f'{new_filename}.tmp'
    with open(tmp_filename, 'w', encoding='utf-8') as f:
        json.dump(translated_data, f, ensure_ascii=False, indent=4)
    os.replace(tmp_filename, new_filename)

Translating: 100%|████████████████████████████| 100/100 [00:18<00:00,  5.36it/s]
Translating: 100%|████████████████████████████| 100/100 [00:14<00:00,  7.04it/s]
Translating: 100%|████████████████████████████| 100/100 [00:17<00:00,  5.62it/s]
Translating: 100%|████████████████████████████| 100/100 [00:17<00:00,  5.77it/s]
Translating: 100%|████████████████████████████| 100/100 [00:16<00:00,  6.17it/s]
Translating: 100%|████████████████████████████| 100/100 [00:16<00:00,  5.91it/s]
Translating: 100%|████████████████████████████| 100/100 [00:21<00:00,  4.66it/s]
Translating: 100%|████████████████████████████| 100/100 [00:14<00:00,  6.70it/s]
Translating: 100%|████████████████████████████| 100/100 [00:14<00:00,  6.76it/s]
Translating: 100%|████████████████████████████| 100/100 [00:15<00:00,  6.58it/s]
Translating: 100%|████████████████████████████| 100/100 [00:15<00:00,  6.64it/s]
Translating: 100%|████████████████████████████| 100/100 [00:15<00:00,  6.52it/s]
Translating: 100%|██████████