In [1]:
import json
import os
from pydantic import BaseModel

class Translation(BaseModel):
    standard_en: str
    standard_ms: str
        
schema = Translation.model_json_schema()
schema

{'properties': {'standard_en': {'title': 'Standard En', 'type': 'string'},
  'standard_ms': {'title': 'Standard Ms', 'type': 'string'}},
 'required': ['standard_en', 'standard_ms'],
 'title': 'Translation',
 'type': 'object'}

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('casperhansen/llama-3-70b-instruct-awq')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
texts = []
with open('manglish.texts') as fopen:
    for l in fopen:
        l = json.loads(l)
        texts.append(l)

In [4]:
len(texts)

2000001

In [5]:
prompts = []
for d in texts:
    s = f"""
```
{d}
```

translate everything, DO NOT FORGOT ABOUT THE INITIAL, to standard english and standard malay, NO NEED TO EXPLAIN, return in JSON {{'standard_en', 'standard_ms'}}
""".strip()
    prompts.append(s)

In [6]:
directory = 'translate-manglish'
# !rm -rf {directory}
!mkdir {directory}

mkdir: cannot create directory ‘translate-manglish’: File exists


In [7]:
from glob import glob

files = glob(f'{directory}/*.json')
len(files)

203188

In [8]:
from openai import OpenAI
import requests

headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json',
}

def answer(q, i, verbose = False):
    filename = f'{directory}/{i}.json'
    if os.path.exists(filename):
        return
    
    results = []
    openai = OpenAI(
        base_url='',
        api_key='empty',
    )
    response = None
    while True:
        try:
            json_data = {
                'messages': [
                    {
                        'role': 'user',
                        'content': q,
                    },
                ],
                'model': 'model',
                'stop': [
                    '<|eot_id|>',
                ],
                'temperature': 0.9,
                'max_tokens': 2048,
            }
            response = requests.post(
                '', 
                headers=headers, json=json_data, timeout = 60 * 10)
            if verbose:
                print(response, response.__dict__)
            r = response.json()['choices'][0]['message']['content']
            results.append(r)
            break
        except Exception as e:
            if verbose:
                print(e)
            if response is not None and response.status_code != 503:
                break
    
    
    if len(results): 
        with open(filename, 'w') as fopen:
            json.dump(results, fopen)

In [10]:
for f in files:
    with open(f) as fopen:
        d = json.load(fopen)
        
    if len(d) == 0:
        os.remove(f)

In [11]:
def consumer(queue, name):
    while True:
        if queue.qsize() == 0:
            break
        item = queue.get()
        answer(*item)
    print(f'consumer {name} done')

In [12]:
urls = [(q, no) for no, q in enumerate(prompts)]

In [13]:
answer(*urls[0])

In [14]:
answer(*urls[int(1e6)])

In [15]:
with open(f'{directory}/{int(1e6)}.json') as fopen:
    print(fopen.read())

["{\n\"standard_en\": \"By the way, front passenger windows only cost around RM90, and adding a labor charge of about RM45 plus a vacuum service at the car wash for only RM15. So, it's still cheap compared to replacing all the keys.\",\n\"standard_ms\": \"Bagi maklumat, tingkap penumpang hadapan hanya berharga sekitar RM90, dan menambahkan caj buruh sebanyak kira-kira RM45 plus perkhidmatan vakum di car wash hanya RM15. Jadi, ia masih murah jika dibandingkan dengan menggantikan semua kunci.\"\n}"]


In [16]:
from threading import Thread
from queue import Queue
from tqdm import tqdm

queue = Queue()
for u in urls:
    queue.put(u)
    
ori_size = queue.qsize()

In [None]:
max_worker = 50
consumers = [Thread(target=consumer, args=(queue,i)) for i in range(max_worker)]
for i in range(len(consumers)):
    consumers[i].start()
    
pbar = tqdm(total=ori_size)
last_size = 0
while True:
    size = queue.qsize()
    if size == 0:
        break
    left = ori_size - size
    minus = left - last_size
    if minus > 0:
        pbar.update(minus)
        last_size += minus

pbar.close()

 10%|██████▍                                                            | 193043/2000001 [3:00:47<44:51:41, 11.19it/s]