In [1]:
from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [2]:
BATCH_SIZE = 8
DEVICE = 0

In [3]:
model_dir = "/shared/4/models/llama2/pytorch-versions/llama-2-7b-chat/"
data_dir = "../data/mmlu/mmlu_mingqian.csv"
cache_dir= "/shared/4/models/"

In [4]:
data_df = pd.read_csv(data_dir)
data_df

Unnamed: 0,question,subject,true_option,groundtruth,dataset,length,question_id,option1,option2,option3,option4
0,Which of the following is the commonest cause ...,clinical_knowledge,1,Alzheimer's disease.,clinical_knowledge,13,486,Alzheimer's disease,Cerebrovascular (stroke) disease,Lewy body dementia,HIV infection
1,Which of the following is true in diplopia?,clinical_knowledge,2,The outer image is always the false image,clinical_knowledge,8,421,Diplopia can never occur if one eye is covered,The outer image is always the false image,A fourth nerve palsy occurs when the patient l...,A sixth nerve palsy causes a divergent squint
2,Fatty acids are transported into the mitochond...,clinical_knowledge,4,carnitine.,clinical_knowledge,9,404,thiokinase,coenzyme A (CoA),acetyl-CoA,carnitine
3,Which of the answers below best indicates the ...,clinical_knowledge,3,Blood type B (rhesus negative) and blood type ...,clinical_knowledge,25,446,"Blood type AB (rhesus negative), blood type B,...",Blood type B (rhesus positive) and blood type ...,Blood type B (rhesus negative) and blood type ...,Blood type B (rhesus negative) only
4,Which one of the following represents a IIIrd ...,clinical_knowledge,3,Unilateral fixed dilated pupil.,clinical_knowledge,16,437,Unilateral constricted pupil,Bilateral constricted pupils,Unilateral fixed dilated pupil,Oval shaped pupils
...,...,...,...,...,...,...,...,...,...,...,...
2452,Which of the following is NOT a good predictor...,us_foreign_policy,4,Regime type,us_foreign_policy,16,1843,Receipt of sensitive nuclear assistance,Wealth/GDP,Rivalry with a nuclear state,Regime type
2453,What was 'democratic enlargement'?,us_foreign_policy,4,Both b and c,us_foreign_policy,4,1827,A proposal for reform of the US system of gove...,A proposal for the extension of democratic rul...,A proposal for the extension of free markets,Both b and c
2454,What is meant by the phrase 'empire by invitat...,us_foreign_policy,1,Voluntary reliance on an external power for se...,us_foreign_policy,9,1869,Voluntary reliance on an external power for se...,Willful openness to colonization,Cultural imperialism,Open advocacy of imperialism for economic gain
2455,In what way did the George W Bush administrati...,us_foreign_policy,1,"It criticized international organizations, rat...",us_foreign_policy,16,1874,"It criticized international organizations, rat...",It expanded NATO to include former Soviet states,It focused on a more personal style of leadership,It increased international support for the Uni...


In [5]:
template = '''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{{You will be presented with a role-playing context followed by a multiple-choice question. {role_context} Select only the option number that corresponds to the correct answer for the following question.}}\n\n### Input:\n{{{{{question}}} Provide the number of the correct option without explaining your reasoning.}} \n\n### Response:'''
flan_template = '''{role_context} {question} Please select the correct answer number:'''
role_context = "You are a helpful assistant."

answer_prompts = []
for idx, item in data_df.iterrows():
    question_text = item['question']
    option1 = item["option1"]
    option2 = item["option2"]
    option3 = item["option3"]
    option4 = item["option4"]

    choices_text = f'Options: 1. {option1}, 2. {option2}, 3. {option3}, 4. {option4}.'
    question_text = f"{question_text} {choices_text}"
    full_prompt = template.format(role_context=role_context, question=question_text)
    answer_prompts.append(full_prompt)

In [4]:
tokenizer = LlamaTokenizer.from_pretrained(model_dir, 
                                           cache_dir=cache_dir,
                                           padding_side='left',
                                           )

if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = LlamaForCausalLM.from_pretrained(model_dir, 
                                         cache_dir=cache_dir,
                                         #device_map="auto",
#                                         quantization_config=quantization_config,
                                         #load_in_8bit=True
                                        ).to(0)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacty of 23.68 GiB of which 60.12 MiB is free. Including non-PyTorch memory, this process has 23.62 GiB memory in use. Of the allocated memory 23.42 GiB is allocated by PyTorch, and 1.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
q = tokenizer("hi man", return_tensors='pt')
q

In [None]:
model.generate(**q.to("0"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [8]:
BATCH_SIZE = 8
ques_batch = answer_prompts[0:(0+BATCH_SIZE)]
ques_batch_tokenized = tokenizer(ques_batch, return_tensors='pt', truncation=True, max_length=512, padding=True)

In [9]:
ques_batch_tokenized

{'input_ids': tensor([[    2,     2,     2,  ..., 29937, 13291, 29901],
        [    2,     2,     2,  ..., 29937, 13291, 29901],
        [    2,     2,     2,  ..., 29937, 13291, 29901],
        ...,
        [    2,     2,     2,  ..., 29937, 13291, 29901],
        [    2,     2,     2,  ..., 29937, 13291, 29901],
        [    2,     2,     1,  ..., 29937, 13291, 29901]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 1,  ..., 1, 1, 1]])}

In [9]:
answ_generated = model.generate(**ques_batch_tokenized, max_new_tokens=30)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
answ_generated = model.generate(**ques_batch_tokenized, max_length=600)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
for idx in tqdm(range(0, 12, BATCH_SIZE)):
    ques_batch = answer_prompts[idx:(idx+BATCH_SIZE)]
    ques_batch_tokenized = tokenizer(ques_batch, return_tensors='pt', truncation=True, max_length=512, padding=True)
    answ_generated = model.generate(**ques_batch_tokenized, max_new_tokens=30)
    

  0%|                                                                                             | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
Dataset.from_list(list)