In [1]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/master/llm-benchmark/BM-pt3/BM-A-pt3

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('mesolitica/llama-13b-hf-32768-fpf')
model = AutoModelForCausalLM.from_pretrained(
    'mesolitica/llama-13b-hf-32768-fpf', 
    use_flash_attention_2 = True, 
    torch_dtype = torch.float16,
    device_map="cuda:0"
)

[2023-11-11 01:12:35,418] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from tqdm import tqdm
import os
import random

In [4]:
with open('BM-A-pt3') as fopen:
    text = fopen.read()
    
questions = []
for t in text.split('no: ')[1:]:
    t = t.strip()
    no = t.split('\n')[0]
    objektif = t.split('objektif: ')[1].split('\n')[0]
    soalan = t.split('soalan:')[1].split('jawapan:')[0].strip()
    jawapan = t.split('jawapan: ')[1].split(',')[0].strip()
    data = {
        'no': no,
        'objektif': objektif,
        'soalan': soalan,
        'jawapan': jawapan,
    }
    questions.append(data)

In [5]:
arange = set(range(len(questions)))

In [6]:
def convert_prompt(row, answer = False):
    if answer:
        prompt = f"""
objektif: {row['objektif']}
soalan: {row['soalan']}
jawapan: {row['jawapan']}
    """
    else:
        prompt = f"""
objektif: {row['objektif']}
soalan: {row['soalan']}
jawapan:
    """
    return prompt.strip()

In [7]:
i = 0
shots = random.sample(arange - {i}, 1)
prompts = []
for no, s in enumerate(shots):
    prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))

prompts.append(convert_prompt(questions[i]))
prompt = '\n\n'.join(prompts)
print(prompt)

Contoh soalan 1
objektif: Baca petikan di bawah ini dengan teliti, kemudian jawab soalan-soalan yang berikut.
soalan: Maka diambil padi yang dijemur itu segenggam, lalu disembunyikan butir-butir padi itu di
dalam lubang-lubang luka pada tumitnya. Setelah itu ia pun meminta izin dengan baik-
baik untuk pulang ke kampungnya, ke bumi tempat saudaranya yang tetap menantinya.
Maka berkatalah induk semangnya, orang kayangan yang masih juga mencurigainya
itu, "Baiklah jikalau kamu ingin pulang ke kampung kamu, akan tetapi, sebelum itu saya
akan memeriksa dan menggeledah kamu lebih dahulu. Barangkali ada padi yang
disembunyikan untuk kamu bawa sebagai oleh-oleh ke bumi."
Anak yatim piatu itu pun menjawab dengan tenangnya, "Silakan ! Periksalah dengan
teliti kalau-kalau ada apa-apa yang saya bawa. Sekarang saya sudah betul-betul jera
mengambil apa-apa. Saya sungguh-sungguh sudah bertaubat dan tidak mahu lagi
melakukannya."
Kemudian induk semang berkata, "Jikalau benar demikian katamu, maka jela

since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 1)


In [8]:
inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
inputs

{'input_ids': tensor([[ 1281,   517, 29882,   577,   284,   273, 29871, 29896,    13,   711,
          9761,   361, 29901,   350, 11989,  5697,  7941,   652,   289,  1450,
           801,   297, 29875,   972,  6249, 13547,  4812, 29892,   413,   331,
           566,   713,   432,  1450,   370,   577,   284,   273, 29899,   578,
           284,   273,   343,   574,  7655,   638,   329, 29889,    13,   578,
           284,   273, 29901,   341,  8245,   652,  1117,   309,   282, 10129,
           343,   574,   652, 12701,   332,   372, 29884,  2377,   996, 29887,
           314, 29892,   301, 22349,   766,  1590,   348, 29891,  7941,   541,
           381, 29899,  4187,   381,   282, 10129,   372, 29884,   652,    13,
         12293,   314, 14757,   574, 29899, 29870,   574,  8092,  1335,   282,
          1114, 21622,   277,  1460, 29874, 29889,  3789,   295,   801,   372,
         29884, 29871,   423,  6035,  2626, 15073,  5951,   262,   972,  6249,
          9922,   638, 29899,    13,  

In [9]:
generate_kwargs = dict(
    inputs,
    max_new_tokens=3,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.05,
)
r = model.generate(**generate_kwargs)
print(tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split())

['A_', '_']


In [10]:
for i in tqdm(range(len(questions))):
    shots = random.sample(arange - {i}, 1)
    prompts = []
    for no, s in enumerate(shots):
        prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))

    prompts.append(convert_prompt(questions[i]))
    prompt = '\n\n'.join(prompts)
    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
    repeat = []
    for _ in range(3):
        try:
            generate_kwargs = dict(
                inputs,
                max_new_tokens=3,
                top_p=0.95,
                top_k=50,
                temperature=0.5,
                do_sample=True,
                num_beams=1,
                repetition_penalty=1.05,
            )
            r = model.generate(**generate_kwargs)
            r = tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split()
            repeat.append(r[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0])
    
        except Exception as e:
            print(e)
            pass
    
    questions[i]['output'] = repeat

since Python 3.9 and will be removed in a subsequent version.
  shots = random.sample(arange - {i}, 1)
100%|██████████| 54/54 [00:41<00:00,  1.30it/s]


In [11]:
import json

with open('output-1shot-llama2-13b-32k.json', 'w') as fopen:
    json.dump(questions, fopen)

In [12]:
def most_common(l):
    return max(set(l), key=l.count)

In [13]:
filtered = [q for q in questions if 'output' in q]
correct = 0
for q in filtered:
    correct += most_common(q['output']) == q['jawapan']
(correct / len(filtered)) * 100

20.37037037037037