In [6]:
# !wget https://huggingface.co/datasets/malaysia-ai/filtered-aya-dataset-zsm/resolve/main/filtered-aya_dataset-zsm.jsonl

In [27]:
import json
import os
from tqdm import tqdm
from transformers import AutoTokenizer
from huggingface_hub import InferenceClient

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=2048,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
)

client = InferenceClient(model="http://127.0.0.1:8080")

In [5]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-7b-32k-instructions-v4')

In [36]:
data, answers = [], []
with open('filtered-aya_dataset-zsm.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        data.append(l['inputs'])
        answers.append(l['targets'].strip())

In [10]:
data[0]

'Tuliskan sebuah cerita fiksyen pendek berdasarkan judul "Bukan Airin"'

In [19]:
!mkdir generated-malaysian-mistral

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
def answer(q, i):
    filename = f'generated-malaysian-mistral/{i}.json'
    if os.path.exists(filename):
        return
    
    formatted_prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': q}], tokenize = False)
    while True:
        try:
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            answer = stream.generated_text.strip()
            break
        except Exception as e:
            # print(e)
            pass 
    
    d = {
        'question': q,
        'answer': answer,
    }
    
    with open(filename, 'w') as fopen:
        json.dump(d, fopen)

In [21]:
def consumer(queue, name):
    while True:
        if queue.qsize() == 0:
            break
        item = queue.get()
        answer(*item)
    print(f'consumer {name} done')

In [49]:
from threading import Thread
from queue import Queue

queue = Queue()
urls = [(q, no) for no, q in enumerate(data)]
for u in urls:
    queue.put(u)
    
ori_size = queue.qsize()

In [50]:
max_worker = 50
consumers = [Thread(target=consumer, args=(queue,i)) for i in range(max_worker)]
for i in range(len(consumers)):
    consumers[i].start()
    
pbar = tqdm(total=ori_size)
last_size = 0
while True:
    size = queue.qsize()
    if size == 0:
        break
    left = ori_size - size
    minus = left - last_size
    if minus > 0:
        pbar.update(minus)
        last_size += minus

pbar.close()

consumer 19 doneconsumer 17 done
consumer 1 done
consumer 15 done
consumer 3 done
consumer 16 done
consumer 0 done
consumer 10 done
consumer 9 done
consumer 12 done
consumer 11 done
consumer 8 done
consumer 6 done

consumer 14 done
consumer 5 done
consumer 20 done
consumer 21 doneconsumer 22 done

consumer 23 done
consumer 24 done
consumer 25 doneconsumer 26 done

consumer 27 done
consumer 28 done
consumer 29 doneconsumer 30 done

consumer 31 done
consumer 32 done
consumer 33 done
consumer 34 done
consumer 35 done
consumer 36 done
consumer 37 done
consumer 38 done
consumer 39 done
consumer 40 done
consumer 41 done
consumer 42 done
consumer 43 done
consumer 44 doneconsumer 45 done
consumer 46 done

consumer 47 done
consumer 48 doneconsumer 49 done


  0%|                                                                                                      | 0/10073 [00:00<?, ?it/s]



consumer 7 done
consumer 2 done
consumer 18 done
consumer 4 done
consumer 13 done


In [51]:
from glob import glob

files = glob('generated-malaysian-mistral/*.json')
len(files)

10073

In [52]:
rejected = []
for f in tqdm(files):
    try:
        with open(f) as fopen:
            json.load(fopen)
    except:
        rejected.append(f)

100%|███████████████████████████████████████████████████████████████████████████████████████| 10073/10073 [00:00<00:00, 78412.68it/s]


In [53]:
len(rejected)

0

In [54]:
all_data = []
for i in range(len(data)):
    with open(f'generated-malaysian-mistral/{i}.json') as fopen:
        generated = json.load(fopen)
    all_data.append({
        'prompt': data[i],
        'chosen': answers[i],
        'rejected': generated['answer'].strip()
    })
len(all_data)

10073

In [55]:
from datasets import Dataset

In [56]:
dataset = Dataset.from_list(all_data)
dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 10073
})

In [57]:
dataset.push_to_hub(repo_id = 'mesolitica/DPO-filtered-aya_dataset-zsm')

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/689 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
