In [1]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/master/llm-benchmark/BM-pt3/BM-A-pt3

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hub checkpoint shared by the tokenizer and the model.
MODEL_NAME = 'mesolitica/llama-13b-hf-32768-fpf'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half-precision weights on the first GPU, with FlashAttention-2 requested.
load_options = {
    'use_flash_attention_2': True,
    'torch_dtype': torch.float16,
    'device_map': "cuda:0",
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_options)

[2023-11-11 01:14:33,875] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from tqdm import tqdm
import os
import random

In [4]:
# Read the raw benchmark file. The encoding is passed explicitly so the Malay
# text is decoded the same way on every platform instead of relying on the
# locale default.
# NOTE(review): assumes the file is UTF-8 — confirm against the downloaded file.
with open('BM-A-pt3', encoding='utf-8') as fopen:
    text = fopen.read()

# Every record begins with a 'no: ' marker; whatever precedes the first marker
# is discarded by the [1:] slice.
questions = []
for record in text.split('no: ')[1:]:
    record = record.strip()
    questions.append({
        # First line of the record is the question number.
        'no': record.split('\n')[0],
        # Instruction text: rest of the line following 'objektif: '.
        'objektif': record.split('objektif: ')[1].split('\n')[0],
        # Question body (may span several lines, including the A-D options).
        'soalan': record.split('soalan:')[1].split('jawapan:')[0].strip(),
        # Only the part of the answer field before the first comma is kept.
        'jawapan': record.split('jawapan: ')[1].split(',')[0].strip(),
    })

In [5]:
# Set of every question index; few-shot examples are drawn from it.
arange = {*range(len(questions))}

In [6]:
def convert_prompt(row, answer = False):
    """Render one question dict as a prompt block.

    Args:
        row: mapping with 'objektif' and 'soalan' keys, plus 'jawapan' when
            ``answer`` is true.
        answer: when true, the answer is appended after 'jawapan:'; otherwise
            the block ends with a bare 'jawapan:' for the model to complete.

    Returns:
        The three-line prompt with surrounding whitespace stripped.
    """
    lines = [
        f"objektif: {row['objektif']}",
        f"soalan: {row['soalan']}",
    ]
    # 'jawapan' is only read when the answer is requested.
    lines.append(f"jawapan: {row['jawapan']}" if answer else "jawapan:")
    return '\n'.join(lines).strip()

In [7]:
# Build a demonstration prompt for the first question with three worked examples.
i = 0
# random.sample() needs a sequence: passing a set is deprecated since
# Python 3.9 and raises TypeError from 3.11. sorted() turns the candidate
# indices (everything except the target question) into an ordered list.
shots = random.sample(sorted(arange - {i}), 3)
prompts = [
    f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True)
    for no, s in enumerate(shots)
]

# The target question goes last, left unanswered for the model to complete.
prompts.append(convert_prompt(questions[i]))
prompt = '\n\n'.join(prompts)
print(prompt)

Contoh soalan 1
objektif: Pilih peribahasa atau maksud peribahasa yang paling sesuai.
soalan: Walaupun sudah beberapa kali dinasihatkan oleh guru, Aiman tetap malas mengulang kaji peiajarannya.
Peribahasa yang sesuai dengan pernyataan di atas ialah

A. melepaskan batuk di tangga
B. mencurah air ke daun keladi
C. alang-alang berdakwat biar hitam
D. pendayung sudah di tangan, perahu sudah di air
jawapan: B

Contoh soalan 2
objektif: Bahagian yang di dalam kurungan dalam ayat-ayat yang berikut mungkin mengandungi kesalahan bahasa dan mungkin juga tidak. Tandakan sama ada A, B atau C jika ayat itu mengandungi kesalahan dan tandakan D jika tiada kesalahan.
soalan: Halimah percaya (percaya benar) anaknya tidak terbabit dengan tuduhan jenayah kolar putih.
A. sungguh percaya benar
B. teramat percaya benar
C. sangat percaya benar
D. percaya benar
jawapan: D

Contoh soalan 3
objektif: Pilih peribahasa atau maksud peribahasa yang paling sesuai.
soalan: Tidak ada seorang pun daripada lima orang an

since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 3)


In [8]:
# Tokenize the prompt without special tokens, then move the tensors to the GPU.
encoded = tokenizer([prompt], return_tensors='pt', add_special_tokens=False)
inputs = encoded.to('cuda')
inputs

{'input_ids': tensor([[ 1281,   517, 29882,   577,   284,   273, 29871, 29896,    13,   711,
          9761,   361, 29901,   349,  2638, 29882,   639,   747,   801, 11290,
           472,   585,  2136, 29879,   566,   639,   747,   801, 11290,   343,
           574,  5112,   292,  3999, 29884,  1794, 29889,    13,   578,   284,
           273, 29901,  5260,   585, 29886,   348,  5053,   801,   367,   495,
         14274,   413,  2606,  4538,  6840,  2455, 11052,   288,   280, 29882,
           330, 20144, 29892,   319, 25895,   260,   300,   481,  4439,   294,
           286,   996,   352,   574,   413,  1175, 29875,  1236,   423,  4758,
           812,  3761, 29889,    13,  5894,   747,   801, 11290,   343,   574,
          3999, 29884,  1794,   972,  6249,   639,  1460,   532,   273,   652,
           472,   294,   474,   284,   801,    13,    13, 29909, 29889,   592,
           280, 29886,  1278,   273, 17152,  2679,   652, 18806,  3249,    13,
         29933, 29889,  1757,  2764,  

In [9]:
# Tokenized prompt merged with the sampling settings for a single trial run.
generate_kwargs = {
    **inputs,
    'max_new_tokens': 3,
    'top_p': 0.95,
    'top_k': 50,
    'temperature': 0.1,
    'do_sample': True,
    'num_beams': 1,
    'repetition_penalty': 1.05,
}
r = model.generate(**generate_kwargs)
# The decoded text contains the prompt too; keep what follows the last 'jawapan:'.
decoded = tokenizer.decode(r[0])
answer_part = decoded.split('jawapan:')[-1]
print(answer_part.strip().split())

['A!', '...']


In [17]:
# Stray characters stripped from the model's answer in the evaluation loop.
# NOTE(review): the last entry looks like U+FFFD (the Unicode replacement
# character), presumably produced when decoding a partial byte sequence — confirm.
chars = ['_', '!', '�']

In [18]:
# Number of worked examples placed before each question.
# NOTE(review): the results file written later is named 'output-3shot-...',
# but this loop samples a single example — confirm which is intended.
n_shots = 1
# Number of sampled generations collected per question.
n_repeats = 5
# Sampling settings are the same for every call, so they are built once.
sampling_kwargs = dict(
    max_new_tokens=3,
    top_p=0.95,
    top_k=50,
    temperature=0.5,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.05,
)

for i in tqdm(range(len(questions))):
    # random.sample() needs a sequence: passing a set is deprecated since
    # Python 3.9 and raises TypeError from 3.11, hence sorted().
    shots = random.sample(sorted(arange - {i}), n_shots)
    prompts = []
    for no, s in enumerate(shots):
        prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))

    # The target question goes last, left unanswered for the model to complete.
    prompts.append(convert_prompt(questions[i]))
    prompt = '\n\n'.join(prompts)
    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
    repeat = []
    for _ in range(n_repeats):
        try:
            generate_kwargs = dict(inputs, **sampling_kwargs)
            r = model.generate(**generate_kwargs)
            # The decoded text contains the prompt too; keep the
            # whitespace-separated words after the last 'jawapan:'.
            words = tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split()
            if not words:
                # Nothing was generated after the answer marker; record no vote.
                print(f'question {i}: empty generation, skipped')
                continue
            # Reduce the first word to a bare option letter: drop dots and the
            # end-of-sequence marker, cut at the first slash or backslash.
            r = words[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0]
            for c in chars:
                r = r.replace(c, '')
            repeat.append(r)

        except Exception as e:
            # Best-effort: a failed generation is reported and skipped so the
            # remaining attempts and questions still run.
            print(f'question {i}: generation failed: {e}')

    # May hold fewer than n_repeats answers (possibly none) if attempts failed.
    questions[i]['output'] = repeat

since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 1)
100%|██████████| 54/54 [00:47<00:00,  1.14it/s]


In [19]:
import json

# Persist the questions together with their sampled model answers.
# NOTE(review): the filename says '3shot' — confirm it matches the number of
# examples actually used by the evaluation loop.
serialized = json.dumps(questions)
with open('output-3shot-llama2-13b-32k.json', 'w') as fopen:
    fopen.write(serialized)

In [20]:
def most_common(l):
    """Return the most frequent element of ``l``.

    Ties are resolved in favour of the element that appears first in ``l``,
    so the result is the same on every run (iterating a set of strings, as a
    ``max(set(l), ...)`` approach does, is not order-stable across processes).

    Args:
        l: list of hashable items.

    Returns:
        The most frequent item, or ``None`` when ``l`` is empty.
    """
    from collections import Counter

    if not l:
        return None
    # Counter.most_common() orders equal counts by first occurrence.
    return Counter(l).most_common(1)[0][0]

In [21]:
# Score only the questions that were actually evaluated.
filtered = [q for q in questions if 'output' in q]
# A question counts as correct when its majority answer equals the reference.
# An empty 'output' list (every generation failed) counts as wrong rather than
# being passed to most_common().
correct = sum(
    bool(q['output']) and most_common(q['output']) == q['jawapan']
    for q in filtered
)
# Accuracy in percent; 0.0 when nothing was evaluated, avoiding a division by zero.
(correct / len(filtered)) * 100 if filtered else 0.0

31.48148148148148