In [1]:
import json
import os
from pydantic import BaseModel

class Translation(BaseModel):
    standard_en: str
    standard_ms: str
        
schema = Translation.model_json_schema()
schema

{'properties': {'standard_en': {'title': 'Standard En', 'type': 'string'},
  'standard_ms': {'title': 'Standard Ms', 'type': 'string'}},
 'required': ['standard_en', 'standard_ms'],
 'title': 'Translation',
 'type': 'object'}

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('casperhansen/llama-3-70b-instruct-awq')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
texts = []
with open('mandarin.texts') as fopen:
    for l in fopen:
        l = json.loads(l)
        texts.append(l)

In [4]:
len(texts)

3000001

In [5]:
prompts = []
for d in texts:
    s = f"""
```
{d}
```

translate everything, DO NOT FORGOT ABOUT THE INITIAL, to standard english and standard malay, NO NEED TO EXPLAIN, return in JSON {{'standard_en', 'standard_ms'}}
""".strip()
    prompts.append(s)

In [6]:
m = [
    {'role': 'user', 'content': prompts[0]}
]
inputs = tokenizer.apply_chat_template(m, tokenize = False)

In [7]:
# import requests

# headers = {
#     'accept': 'application/json',
#     'Content-Type': 'application/json',
# }

# json_data = {
#   "inputs": inputs,
#   "parameters": {
#     "do_sample": True,
#     "grammar": {"type": "json", "value": schema},
#     "max_new_tokens": 2048,
#     "repetition_penalty": 1.03,
#     "stop": [
#       "<|eot_id|>"
#     ],
#     "temperature": 0.9,
#     "top_k": 50,
#     "top_p": 0.95,
#     "watermark": False
#   }
# }

# response = requests.post(
#     'https://llama-3.us.mesolitica.com/generate', 
#     headers=headers, json=json_data, timeout = 60 * 10)

In [8]:
# response.json()['generated_text']

In [9]:
directory = 'translate-c.cari'
# !rm -rf {directory}
!mkdir {directory}

mkdir: cannot create directory ‘translate-c.cari’: File exists


In [10]:
from glob import glob

files = glob(f'{directory}/*.json')
len(files)

342252

In [11]:
from openai import OpenAI
import requests

headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json',
}

def answer(q, i, verbose = False):
    filename = f'{directory}/{i}.json'
    if os.path.exists(filename):
        return
    
    results = []
    openai = OpenAI(
        base_url='',
        api_key='empty',
    )
    response = None
    while True:
        try:
            json_data = {
                'messages': [
                    {
                        'role': 'user',
                        'content': q,
                    },
                ],
                'model': 'model',
                'stop': [
                    '<|eot_id|>',
                ],
                'temperature': 0.9,
                'max_tokens': 2048,
            }
            response = requests.post(
                '', 
                headers=headers, json=json_data, timeout = 60 * 10)
            if verbose:
                print(response, response.__dict__)
            r = response.json()['choices'][0]['message']['content']
            results.append(r)
            break
        except Exception as e:
            if verbose:
                print(e)
            if response is not None and response.status_code != 503:
                break
    
    
    if len(results): 
        with open(filename, 'w') as fopen:
            json.dump(results, fopen)

In [13]:
def consumer(queue, name):
    while True:
        if queue.qsize() == 0:
            break
        item = queue.get()
        answer(*item)
    print(f'consumer {name} done')

In [14]:
urls = [(q, no) for no, q in enumerate(prompts)]

In [15]:
# answer(*urls[1000000], verbose = True)

In [16]:
answer(*urls[1000001], verbose = False)

In [17]:
with open(f'{directory}/1000001.json') as fopen:
    print(fopen.read())

["{\n\"standard_en\": \"Original post by Devil.May.Cry on 11-7-2009 07:36 PM. I've checked, no virus found. Could it be a hardware problem? Please help, experts! Open Task Manager and see which program is using up your resources. Otherwise, you can upload a screenshot of your Task Manager for us to take a look.\",\n\"standard_ms\": \"Pos asal oleh Devil.May.Cry pada 11-7-2009 07:36 PT. Saya telah menyemak, tidak ada virus. Adakah ia masalah keras? Sila bantu, pakar! Buka Pengurus Tugas dan lihat mana program yang menggunakan sumber anda. Jika tidak, anda boleh memuat naik screenshot Pengurus Tugas anda untuk kami lihat.\"\n}"]


In [18]:
answer(*urls[0])

In [19]:
from threading import Thread
from queue import Queue
from tqdm import tqdm

queue = Queue()
for u in urls:
    queue.put(u)
    
ori_size = queue.qsize()

In [None]:
max_worker = 50
consumers = [Thread(target=consumer, args=(queue,i)) for i in range(max_worker)]
for i in range(len(consumers)):
    consumers[i].start()
    
pbar = tqdm(total=ori_size)
last_size = 0
while True:
    size = queue.qsize()
    if size == 0:
        break
    left = ori_size - size
    minus = left - last_size
    if minus > 0:
        pbar.update(minus)
        last_size += minus

pbar.close()

  6%|███▊                                                                | 166646/3000001 [39:42<298:38:54,  2.64it/s]