In [1]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/master/llm-benchmark/BM-pt3/BM-A-pt3

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('mesolitica/llama-13b-hf-32768-fpf')
model = AutoModelForCausalLM.from_pretrained(
    'mesolitica/llama-13b-hf-32768-fpf', 
    use_flash_attention_2 = True, 
    torch_dtype = torch.float16,
    device_map="cuda:0"
)

[2023-11-10 06:41:42,817] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from tqdm import tqdm
import os
import random

In [4]:
with open('BM-A-pt3') as fopen:
    text = fopen.read()
    
questions = []
for t in text.split('no: ')[1:]:
    t = t.strip()
    no = t.split('\n')[0]
    objektif = t.split('objektif: ')[1].split('\n')[0]
    soalan = t.split('soalan:')[1].split('jawapan:')[0].strip()
    jawapan = t.split('jawapan: ')[1].split(',')[0].strip()
    data = {
        'no': no,
        'objektif': objektif,
        'soalan': soalan,
        'jawapan': jawapan,
    }
    questions.append(data)

In [5]:
arange = set(range(len(questions)))

In [6]:
def convert_prompt(row, answer = False):
    if answer:
        prompt = f"""
objektif: {row['objektif']}
soalan: {row['soalan']}
jawapan: {row['jawapan']}
    """
    else:
        prompt = f"""
objektif: {row['objektif']}
soalan: {row['soalan']}
jawapan:
    """
    return prompt.strip()

In [7]:
i = 0
shots = random.sample(arange - {i}, 1)
prompts = []
for no, s in enumerate(shots):
    prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))

prompts.append(convert_prompt(questions[i]))
prompt = '\n\n'.join(prompts)
print(prompt)

Contoh soalan 1
objektif: Baca petikan cerpen di bawah ini dengan teliti, kemudian jawab soalan-soalan yang berikut.
soalan: Sungai Layong yang mudah banjir itu menerima tetamu. Ada yang berenang, ada juga yang duduk berceratukan di tebingnya. Ibu-ibu separuh umur
terbongkok-bongkok mencuci pinggan-mangkuk. Anak-anak muda sibuk
mengangkat air. Ada yang sempat mengusik seorang dua anak gadis yang menuju
ke sungai. Beluntung membeliakkan matanya. Apa yang terjadi? Mengapa tiba-tiba
sahaja orang kampung berada di halaman rumahnya?

Tuyog mendengus mengejutkan budak-budak yang mandi di sungai. Mereka
segera naik ke tebing. Jantung Beluntung berdebar-debar. Perasaan ingin tahu
menguasai hatinya. Dia terjun dari belakang Tuyog lalu berlari ke rumah. 

"Mana ibu?" Mereka yang berada di dalam rumah itu tertinjau-tinjau ke arahnya. "Ibu kamu di dapur memasak." Beluntung diam dan terus ke dapur. Dia mendapati
ibu tertawa riang bersama jiran-jiran. 

Beluntung memeluk ibunya sebelum keluar mening

since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 1)


In [8]:
inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
inputs

{'input_ids': tensor([[ 1281,   517, 29882,   577,   284,   273, 29871, 29896,    13,   711,
          9761,   361, 29901,   350, 11989,  5697,  7941,  5147,  2238,   652,
           289,  1450,   801,   297, 29875,   972,  6249, 13547,  4812, 29892,
           413,   331,   566,   713,   432,  1450,   370,   577,   284,   273,
         29899,   578,   284,   273,   343,   574,  7655,   638,   329, 29889,
            13,   578,   284,   273, 29901,   317,   686,  1794,   365,   388,
           549,   343,   574, 17439,   801,  9892, 29926,   381,   372, 29884,
           286,   759,  2946,   260,   300,   314, 29884, 29889, 23255,   343,
           574,   289,  4578,   574, 29892,   594, 29874,  8740, 29874,   343,
           574,   270,   566,  2679,  7655,  2265,   271,  2679,   273,   652,
           734, 10549,  1460, 29874, 29889,   306,  2423, 29899,   747, 29884,
          2903, 16099,  1922,   332,    13,   357, 29890,   549, 29895,   554,
         29899, 29890,   549, 29895,  

In [9]:
generate_kwargs = dict(
    inputs,
    max_new_tokens=3,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.05,
)
r = model.generate(**generate_kwargs)
print(tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split())

['A�']


In [10]:
for i in tqdm(range(len(questions))):
    shots = random.sample(arange - {i}, 1)
    prompts = []
    for no, s in enumerate(shots):
        prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))

    prompts.append(convert_prompt(questions[i]))
    prompt = '\n\n'.join(prompts)
    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
    repeat = []
    for _ in range(3):
        try:
            generate_kwargs = dict(
                inputs,
                max_new_tokens=3,
                top_p=0.95,
                top_k=50,
                temperature=0.5,
                do_sample=True,
                num_beams=1,
                repetition_penalty=1.05,
            )
            r = model.generate(**generate_kwargs)
            r = tokenizer.decode(r[0]).split('jawapan:')[1].strip().split()
            repeat.append(r[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0])
    
        except Exception as e:
            print(e)
            pass
    
    questions[i]['output'] = repeat

since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 1)
100%|██████████| 54/54 [00:26<00:00,  2.02it/s]


In [11]:
import json

with open('output-1shot-llama2-13b-32k.json', 'w') as fopen:
    json.dump(questions, fopen)

In [12]:
def most_common(l):
    return max(set(l), key=l.count)

In [14]:
filtered = [q for q in questions if 'output' in q]
correct = 0
for q in filtered:
    correct += most_common(q['output']) == q['jawapan']
(correct / len(filtered)) * 100

24.074074074074073