In [1]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/master/llm-benchmark/tatabahasabm.tripod.com/quiz-tatabahasa.jsonl

In [2]:
from tqdm import tqdm
import os
import json
import random

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-llama2-7b-32k-instructions')
model = AutoModelForCausalLM.from_pretrained(
    'mesolitica/malaysian-llama2-7b-32k-instructions', 
    use_flash_attention_2 = True, 
    torch_dtype = torch.float16,
    device_map="cuda:0"
)

[2023-11-11 04:31:05,927] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
questions = []
with open('quiz-tatabahasa.jsonl') as fopen:
    for no, l in enumerate(fopen):
        l = json.loads(l)
        soalan = [l['question']]
        jawapan = None
        for c, k in l['choices'].items():
            soalan.append(f"{c}. {k['text']}")
            if k['answer']:
                jawapan = c
        
        data = {
            'no': no,
            'objektif': 'Jawab soalan yang diberikan' if l['instruction'] is None else l['instruction'],
            'soalan': '\n'.join(soalan),
            'jawapan': jawapan,
        }
        questions.append(data)
len(questions)

349

In [5]:
arange = set(range(len(questions)))

In [6]:
def convert_prompt(row, answer = False):
    if answer:
        prompt = f"""
objektif: {row['objektif']}
soalan: {row['soalan']}
jawapan: {row['jawapan']}
    """
    else:
        prompt = f"""
objektif: {row['objektif']}
soalan: {row['soalan']}
jawapan:
    """
    return prompt.strip()

In [7]:
i = 0
shots = random.sample(arange - {i}, 1)
prompts = []
for no, s in enumerate(shots):
    prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))

prompts.append(convert_prompt(questions[i]))
prompt = '\n\n'.join(prompts)
print(prompt)

Contoh soalan 1
objektif: Pilih jawapan yang paling sesuai untuk ayat yang bergaris.
soalan: Pilih soalan yang paling sesuai untuk ayat yang bergaris.<br/>Budak perempuan itu menangis kerana kakinya luka.
A. Apakah yang menyebabkan kaki budak perempuan itu luka?
B. Apakah sebabnya maka budak perempuan itu menangis?
C. Bilakah kaki budak perempuan yang menangis itu luka?
D. Adakah budak perempuan itu menangis kerana kakinya luka?
jawapan: B

objektif: Jawab soalan yang diberikan
soalan: ........, sudah dapat memandu kereta rupa-rupanya kamu !
A. Oh
B. Eh
C. Hai
D. Ah
jawapan:


since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 1)


In [8]:
inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
inputs

{'input_ids': tensor([[ 1281,   517, 29882,   577,   284,   273, 29871, 29896,    13,   711,
          9761,   361, 29901,   349,  2638, 29882,   432,  1450, 21419,   343,
           574,  5112,   292,  3999, 29884,  1794,   443, 29873,  2679, 10156,
           271,   343,   574,  7655,  5397,   275, 29889,    13,   578,   284,
           273, 29901,   349,  2638, 29882,   577,   284,   273,   343,   574,
          5112,   292,  3999, 29884,  1794,   443, 29873,  2679, 10156,   271,
           343,   574,  7655,  5397,   275, 19423,  1182,  3779, 29933,   566,
           557,   639,  3451, 12323,   372, 29884,  1757,   574,   275, 13023,
          1648,   413,   557,   262,  3761,  8092,  1335, 29889,    13, 29909,
         29889,  6225,   557,   801,   343,   574,  1757, 29891,   774,   370,
         11052,   413,  9940,  8619,   557,   639,  3451, 12323,   372, 29884,
          8092,  1335, 29973,    13, 29933, 29889,  6225,   557,   801,   409,
         29890,   370,  1460, 29874,  

In [10]:
generate_kwargs = dict(
    inputs,
    max_new_tokens=3,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.05,
)
r = model.generate(**generate_kwargs)
print(tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split())

['C']


In [11]:
for i in tqdm(range(len(questions))):
    shots = random.sample(arange - {i}, 1)
    prompts = []
    for no, s in enumerate(shots):
        prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))

    prompts.append(convert_prompt(questions[i]))
    prompt = '\n\n'.join(prompts)
    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
    repeat = []
    for _ in range(5):
        try:
            generate_kwargs = dict(
                inputs,
                max_new_tokens=3,
                top_p=0.95,
                top_k=50,
                temperature=0.5,
                do_sample=True,
                num_beams=1,
                repetition_penalty=1.05,
            )
            r = model.generate(**generate_kwargs)
            r = tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split()
            repeat.append(r[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0])
    
        except Exception as e:
            print(e)
            pass
    
    questions[i]['output'] = repeat

since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 1)
100%|██████████| 349/349 [02:21<00:00,  2.47it/s]


In [12]:
import json

with open('output-1shot-llama2-7b-32k.json', 'w') as fopen:
    json.dump(questions, fopen)

In [13]:
def most_common(l):
    return max(set(l), key=l.count)

In [14]:
filtered = [q for q in questions if 'output' in q and len(q['output'])]
correct = 0
for q in filtered:
    correct += most_common(q['output']) == q['jawapan']
(correct / len(filtered)) * 100

24.068767908309454