In [8]:
# Mount Google Drive at /content/drive; later cells read the dataset from and
# write checkpoints to paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Install the libraries used by the notebook.
# NOTE(review): transformers is first installed unpinned and then pinned to
# 4.31.0, while peft comes from the GitHub default branch and trl/accelerate
# are upgraded with -U — the resolved versions can differ between runs; confirm
# the combination that is actually intended and pin it.
!pip install sentencepiece
!pip install "transformers[sentencepiece]"
!pip install transformers==4.31.0
!pip install -q -U trl accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb
# Drive is mounted again here; the cell output reports it as already mounted.
from google.colab import drive
drive.mount('/content/drive')
import torch
import json
from datasets import load_dataset

import os

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict
)
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Hub repository of the checkpoint that gets fine-tuned.
base_model = "meta-llama/Llama-2-7b-hf"

# Security fix: the Hub access token is read from the environment instead of
# being written into the notebook. Any token that was ever stored in this file
# should be revoked. When HF_TOKEN is unset, None is passed and the libraries
# fall back to their default credential handling.
hf_token = os.environ.get("HF_TOKEN")

# Load the weights in 8-bit with float16 as torch dtype and let the loader
# decide device placement.
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    # quantization_config=quant_config,
    use_auth_token=hf_token,
    device_map="auto",
)


tokenizer = LlamaTokenizer.from_pretrained(base_model, use_auth_token=hf_token)
# Pad with id 0 and pad on the left side of each sequence.
tokenizer.pad_token_id = (0)
tokenizer.padding_side = "left"


def tokenize(tokenizer, prompt, add_eos_token=True, cutoff_len = 512):
    """Encode ``prompt`` and attach causal-LM labels.

    Args:
        tokenizer: Callable that returns a mapping with ``input_ids`` and
            ``attention_mask`` lists and exposes ``eos_token_id``.
        prompt: Text to encode.
        add_eos_token: When true, append the EOS id if the encoding does not
            already end with it and is shorter than ``cutoff_len``.
        cutoff_len: Maximum number of tokens; passed to the tokenizer as
            ``max_length`` with truncation enabled.

    Returns:
        The tokenizer result with an extra ``labels`` entry that is a copy of
        ``input_ids``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]

    # An empty encoding has no last token to inspect; treat it as "does not end
    # with EOS" instead of raising IndexError on input_ids[-1].
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Labels start as a plain copy; callers may mask parts of them afterwards.
    result["labels"] = input_ids.copy()

    return result


# Flag read by the prompt-masking code further down in this cell.
eos_token_added = True

# Prompt templates: one for examples that carry an extra input, one for
# instruction-only examples, plus the marker that precedes the response.
template = {
    "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
    "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
    "response_split": "### Response:"
}

def generate_prompt(instruction, input=None, label=None, template=template, ):
    """Render an instruction (and optional input/label) into a prompt string.

    Args:
        instruction: Task description substituted for ``{instruction}``.
        input: Optional context; when truthy the ``prompt_input`` template is
            used, otherwise ``prompt_no_input``.
        label: Optional response text appended after the rendered template.
            Defaults to ``None`` so that callers which only need the bare
            prompt can omit it (they previously raised ``TypeError``).
        template: Mapping that provides the ``prompt_input`` and
            ``prompt_no_input`` format strings.

    Returns:
        The rendered prompt, followed by ``label`` when one is given.
    """
    if input:
        res = template["prompt_input"].format(
            instruction=instruction, input=input
        )
    else:
        res = template["prompt_no_input"].format(
            instruction=instruction
        )
    if label:
        res = f"{res}{label}"
    return res

training = True
def generate_and_tokenize_prompt(tokenizer, data_point, training, cutoff_len=512):
    """Build and tokenize the full prompt for one dataset record.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize``.
        data_point: Mapping with ``instruction``, ``input`` and ``output`` keys.
        training: When true, labels are left as a full copy of the input ids.
            When false, the label positions that belong to the prompt are
            replaced with -100 so only the response positions keep their ids.
        cutoff_len: Maximum token count forwarded to ``tokenize``.

    Returns:
        The tokenized full prompt (``input_ids``, ``attention_mask``,
        ``labels``).
    """
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(tokenizer, full_prompt, cutoff_len=cutoff_len)
    if not training:
        # Prompt without the answer; the label is passed explicitly as None.
        user_prompt = generate_prompt(
            data_point["instruction"], data_point["input"], None
        )
        # Fixes: the tokenizer and cutoff_len were not forwarded here, which
        # made this call fail. The prompt is encoded without EOS so its length
        # is exactly the number of prompt tokens, also when it gets truncated
        # (subtracting one unconditionally was off by one in that case).
        tokenized_user_prompt = tokenize(
            tokenizer, user_prompt, add_eos_token=False, cutoff_len=cutoff_len
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        tokenized_full_prompt["labels"] = [
            -100
        ] * user_prompt_len + tokenized_full_prompt["labels"][
            user_prompt_len:
        ]
    return tokenized_full_prompt

# Ready the 8-bit base model for training before adapters are attached.
model = prepare_model_for_int8_training(model)

# Low-rank adapters on the attention query and value projections only.
LORA = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=4,
    lora_alpha=4,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, LORA)

# Instruction dataset stored as JSON on the mounted Drive.
data = load_dataset("json", data_files="drive/MyDrive/alpaca_data.json")


# Directory of a checkpoint to resume from, or False to start fresh.
resume_for_cp = False
if resume_for_cp:
    # Prefer a full model checkpoint; otherwise fall back to adapter-only weights.
    full_checkpoint = os.path.join(resume_for_cp, "pytorch_model.bin")
    if os.path.exists(full_checkpoint):
        checkpoint_name = full_checkpoint
    else:
        checkpoint_name = os.path.join(resume_for_cp, "adapter_model.bin")
        # Cleared in the adapter-only case, as in the original flow.
        resume_for_cp = (False)

    if not os.path.exists(checkpoint_name):
        print(f"Checkpoint {checkpoint_name} not found")
    else:
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)

# Full training split: shuffled, rendered to prompts and tokenized with a
# 256-token cutoff.
train_data = (
    data["train"]
    .shuffle()
    .map(lambda x: generate_and_tokenize_prompt(tokenizer, x, training=True, cutoff_len=256))
)

# First 20 rows of a separately shuffled copy, tokenized with the default cutoff.
small_data_train = (
    data["train"]
    .shuffle()
    .select(list(range(20)))
    .map(lambda x: generate_and_tokenize_prompt(tokenizer, x, training=True))
)

import transformers

# Run configuration.
batch_size = 8
num_epochs = 1
learning_rate = 3e-4
gradient_accumulation_steps = 8
val_set_size = 2000
use_wandb = False
group_by_length = True

# Turn Weights & Biases reporting off through its environment switches.
os.environ.update({"WANDB_MODE": "disabled", "WANDB_DISABLED": "true"})

# The generation cache is switched off on the model config for training.
model.config.use_cache = False

# Built inline in the Trainer call so the arguments are constructed in the
# same order as before: training arguments first, then the collator.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    # No evaluation during training; a checkpoint is written to Drive every 25 steps.
    args=transformers.TrainingArguments(
        output_dir="drive/MyDrive/test/",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        warmup_steps=100,
        optim="paged_adamw_32bit",
        fp16=False,
        logging_steps=10,
        evaluation_strategy="no",
        save_strategy="steps",
        save_steps=25,
        load_best_model_at_end=False,
        ddp_find_unused_parameters=None,
        group_by_length=group_by_length,
    ),
    # Pads each batch dynamically, to a multiple of 8 tokens.
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)


# The notebook used to replace ``model.state_dict`` at this point with a wrapper
# that returned ``get_peft_model_state_dict(model, original_state_dict())``.
# That override is intentionally no longer applied: PEFT's own ``save_pretrained``
# already reduces the state dict to the adapter weights, so with the wrapper in
# place the adapter keys were filtered twice and the saved adapter ended up
# without any LoRA tensors. Call ``get_peft_model_state_dict(model)`` directly
# wherever an adapter-only state dict is needed.




config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

ImportError: ignored

## Training and saving the adapter

In [None]:
# Start training without resuming from an earlier checkpoint.
trainer.train(resume_from_checkpoint=False)
# Write the trained model to the Drive folder, first through the model's own
# save_pretrained and then through the trainer (both target the same folder).
model.save_pretrained("drive/MyDrive/test/")
trainer.save_model("drive/MyDrive/test")




Step,Training Loss
10,1.699
20,1.8766
30,2.0011
40,1.9406
50,1.48
60,1.2152
70,1.0554
80,0.9406
90,0.8156
100,0.7812




## Inference

In [None]:
# Serialize whatever model.state_dict() returns into a single file on Drive.
torch.save(model.state_dict(), 'drive/MyDrive/test/model_state_dict.pth')

In [None]:
from transformers import LlamaForCausalLM, LlamaConfig

# # Load the configuration of the pretrained model
# config = LlamaConfig.from_pretrained(base_model)

# # Create a new instance of your model with the same configuration
# new_model = LlamaForCausalLM(config=config)

# LlamaForCausalLM

# Load the saved state dictionary into the new model
# NOTE(review): `new_model` is only created in the commented-out lines of this
# cell, so this statement cannot run until they are restored or the name is
# defined some other way.
new_model.load_state_dict(torch.load('drive/MyDrive/test/model_state_dict.pth'))


KeyboardInterrupt: ignored

In [None]:
# Everything needed to reload the run goes into one folder on Drive.
final_model_dir = "drive/MyDrive/test/final_model"

# Model as saved by the trainer.
trainer.save_model(final_model_dir)

# Tokenizer files and the model configuration are stored next to it.
tokenizer.save_pretrained(final_model_dir)
model.config.save_pretrained(final_model_dir)

In [None]:
# from transformers import BartForConditionalGeneration, BartConfig, AutoTokenizer, LlamaTokenizer
# from transformers import LlamaForCausalLM, LlamaTokenizer

# Load the saved model, tokenizer, and config

# tokenizer = LlamaTokenizer.from_pretrained(model_path)

# Now you can use the loaded model and tokenizer for inference or further training


NameError: ignored

In [2]:
import os

# Folder the fine-tuned artefacts were saved to.
model_path = "drive/MyDrive/test/final_model"
# config = LlamaForCausalLM.from_pretrained(model_path,
#             # load_in_8bit=True,
#             torch_dtype=torch.float16,)

repo_name = "meta-llama/Llama-2-7b-hf"

# Security fix: take the Hub access token from the environment instead of a
# literal in the notebook (revoke any token that was ever stored here). None is
# passed when HF_TOKEN is unset.
hf_token = os.environ.get("HF_TOKEN")

# Reload the base checkpoint in 8-bit with automatic device placement.
base_model = LlamaForCausalLM.from_pretrained(
    repo_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    # quantization_config=quant_config,
    use_auth_token=hf_token,
    device_map="auto",
)





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import os

# Security fix: the Hub access token comes from the HF_TOKEN environment
# variable rather than a literal in the notebook (revoke any token that was
# ever stored here). None is passed when the variable is unset.
tokenizer = LlamaTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
# Pad with id 0, on the left side of each sequence.
tokenizer.pad_token_id = (0)
tokenizer.padding_side = "left"



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

In [None]:
from peft import PeftModel

# adapters_weights = torch.load("drive/MyDrive/test/final_model/adapter_model.bin")
# set_peft_model_state_dict(base_model, adapters_weights)
# Wrap the loaded base model with the adapter stored in final_model/, then merge
# the adapter into the base model and continue with the unwrapped result.
m = PeftModel.from_pretrained(base_model, "drive/MyDrive/test/final_model/")
model = m.merge_and_unload()
# The tokenizer is read from the same folder (an earlier cell saves it there).
trained_tokenizer = LlamaTokenizer.from_pretrained("drive/MyDrive/test/final_model/")



In [None]:
from transformers import pipeline

# Pin the special-token ids on the tokenizer and on the model config.
tokenizer.bos_token_id = 1
base_model.config.pad_token_id = tokenizer.pad_token_id = 0
base_model.config.bos_token_id = 1
base_model.config.eos_token_id = 2

prompt = "Who is the president of United States?"
# NOTE(review): the pipeline is built on `base_model`, not on the merged `model`
# produced by the adapter-loading cell — confirm which one should be evaluated.
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=200)
# NOTE(review): the [INST] wrapper differs from the "### Instruction / ### Response"
# template used to build the training prompts — confirm this is intended.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

NameError: ignored

In [1]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

import os

# Folder that holds the saved adapter.
PEFT_MODEL = "drive/MyDrive/test/final_model"

config = PeftConfig.from_pretrained(PEFT_MODEL)

# Security fix: the Hub access token is read from the HF_TOKEN environment
# variable instead of being embedded in the notebook (revoke any token that was
# ever stored here). None is passed when the variable is unset.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    return_dict=True,
    # quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=os.environ.get("HF_TOKEN"),
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [2]:
import os

# Security fix: the Hub access token is read from the HF_TOKEN environment
# variable instead of being embedded in the notebook (revoke any token that was
# ever stored here). None is passed when the variable is unset.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
# Reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Attach the saved adapter to the base model loaded in the previous cell.
model = PeftModel.from_pretrained(model, PEFT_MODEL)



ValueError: ignored

In [None]:
# Take the model's own generation config object and adjust it in place (the
# name below refers to the same object as model.generation_config).
generation_config = model.generation_config
generation_config.max_new_tokens = 10
# NOTE(review): temperature/top_p are set but do_sample is not touched here —
# confirm sampling is enabled, otherwise these values may have no effect.
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
# Both padding and end-of-sequence use the tokenizer's EOS id.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
# from transformers import PeftForConditionalGeneration, LlamaTokenizer

# Assuming model and tokenizer are already loaded as in your previous code

# Define your prompts
# Questions sent to the model verbatim (no instruction template is applied).
prompts = [
    "Who is the president of the United States?",
    "Which city is the capital of PRC?",
    "1+1=?"
]

# One generation per question, printed next to the question it answers.
for prompt in prompts:
    encoding = trained_tokenizer(prompt, return_tensors="pt").to(torch.device("cuda"))

    # No gradients are needed for generation.
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )

    # Only the first returned sequence is decoded; special tokens are dropped.
    generated_text = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Prompt: {prompt}", f"Generated Response: {generated_text}\n", sep="\n")




Prompt: Who is the president of the United States?
Generated Response: Who is the president of the United States?
Who was the first president to live in the White House and why did he choose to do so? How many presidents have been assassinated while in office? What is a presidential pardon? These and many other questions are answered in this book.
Publisher: New York : Franklin Watts, c1996
Branch Call Number: J 973.049209
Characteristics: 32 p. : col. ill. ; 24 cm
Read more reviews of The Presidents at iDreamBooks.com
United States. — Presidency — Juvenile Literature

Prompt: Which city is the capital of PRC?
Generated Response: Which city is the capital of PRC?

Prompt: 1+1=?
Generated Response: 1+1=?
I’ve been thinking a lot lately about what it means to be a Christian. I’m not talking about being a member of a particular church or denomination, or even about whether or not you believe in God. What I mean is, what does it mean to live as a follower of Jesus Christ? What does that lo

In [None]:
from peft import PeftModel
from transformers import GenerationConfig

# NOTE(review): `base_model` is used as the tokenizer source; other cells bind
# that name both to a repo-id string and to a loaded model object — confirm it
# is a string or path when this cell runs.
tokenizer = LlamaTokenizer.from_pretrained(base_model)
load_8bit = False

# NOTE(review): this guard needs `model` to exist already and to be falsy
# before it loads anything — verify that is the intended condition.
if not model:
  base_model = "drive/MyDrive/test/model.pth"
  model = LlamaForCausalLM.from_pretrained(
      base_model,
      load_in_8bit=load_8bit,
      torch_dtype=torch.float16,
      device_map="auto",
  )

# Folder the adapter is read from.
lora_weights = "drive/MyDrive/test/lora_weights"

model = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
)

# Pin the special-token ids on the model config and the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id = 0
model.config.bos_token_id = 1
model.config.eos_token_id = 2

# Convert to half precision only when the model was not loaded in 8-bit.
if not load_8bit:
  model.half()

model.eval()

ValueError: ignored

In [None]:
def get_response(output: str):
  """Return the text that follows the response marker in ``output``.

  The marker is ``template["response_split"]``. The text between the first and
  a possible second marker is returned, stripped of surrounding whitespace.
  When the marker does not occur at all, the whole output is returned stripped
  instead of raising ``IndexError``.
  """
  parts = output.split(template["response_split"])
  if len(parts) < 2:
    return output.strip()
  return parts[1].strip()

def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    stream_output=False,
    **kwargs,
):
    """Generate a response for one instruction with the notebook-level model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        instruction: Task description rendered into the prompt template.
        input: Optional extra context for the prompt.
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``.
            NOTE(review): ``do_sample`` is not set here, so whether the sampling
            values take effect depends on the defaults — confirm.
        max_new_tokens: Upper bound on generated tokens.
        stream_output: Accepted for interface compatibility; not used.
        **kwargs: Extra fields forwarded to ``GenerationConfig``.

    Returns:
        The generated text after the response marker, as extracted by
        ``get_response``.
    """
    # The label is passed explicitly as None: only the bare prompt is wanted,
    # and the call previously omitted a required argument.
    prompt = generate_prompt(instruction, input, None)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)
    # Pass the tokenizer's mask explicitly instead of leaving it to be inferred.
    attention_mask = inputs["attention_mask"].to(model.device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # Decode the first returned sequence (prompt plus continuation).
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return get_response(output)

#### New approach: load `chainyo/alpaca-lora-7b` and apply a saved LoRA checkpoint

In [2]:
# Checkpoint used as the base for this attempt (the name `base_model` is reused
# from earlier cells).
base_model = "chainyo/alpaca-lora-7b" # meta


# Load it in 8-bit with torch_dtype float16 and automatic device placement.
model = LlamaForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
        )


config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/39 [00:00<?, ?it/s]

pytorch_model-00001-of-00039.bin:   0%|          | 0.00/396M [00:00<?, ?B/s]

pytorch_model-00002-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00003-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00004-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00005-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00006-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00007-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00008-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00009-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00010-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00011-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00012-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00013-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00014-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00015-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00016-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00017-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00018-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00019-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00020-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00021-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00022-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00023-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00024-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00025-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00026-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00027-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00028-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00029-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00030-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00031-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00032-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00033-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00034-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00035-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00036-of-00039.bin:   0%|          | 0.00/315M [00:00<?, ?B/s]

pytorch_model-00037-of-00039.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

pytorch_model-00038-of-00039.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

pytorch_model-00039-of-00039.bin:   0%|          | 0.00/262M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/39 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [3]:
from peft import PeftModel

# Wrap the loaded model with the adapter stored in the checkpoint-800 folder.
# NOTE(review): checkpoints under drive/MyDrive/test/ are written by the
# training cell, which fine-tuned meta-llama/Llama-2-7b-hf — confirm the adapter
# is meant to be applied to the different base model loaded in this section.
model = PeftModel.from_pretrained(
    model,
    "drive/MyDrive/test/checkpoint-800",
    torch_dtype=torch.float16,
)

# Merge the adapter into the base model and continue with the unwrapped result.
model = model.merge_and_unload()



In [3]:
# tokenizer = LlamaTokenizer.from_pretrained("drive/MyDrive/test/final_model")
# tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Tokenizer taken from the same repository as the base model of this section.
tokenizer = LlamaTokenizer.from_pretrained("chainyo/alpaca-lora-7b")

# Pin the special-token ids on the model config and the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

# Put the module in eval mode; as the cell's last expression this also echoes
# the model structure in the output.
model.eval()

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): Llama

In [4]:
# Prompt templates: one for examples that carry an extra input, one for
# instruction-only examples, plus the marker that precedes the response.
template = {
    "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
    "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
    "response_split": "### Response:"
}

def generate_prompt(instruction, input=None, label=None, template=template, ):
    """Render an instruction (and optional input/label) into a prompt string.

    Args:
        instruction: Task description substituted for ``{instruction}``.
        input: Optional context; when truthy the ``prompt_input`` template is
            used, otherwise ``prompt_no_input``.
        label: Optional response text appended after the rendered template.
            Defaults to ``None`` so that callers which only need the bare
            prompt can omit it (they previously raised ``TypeError``).
        template: Mapping that provides the ``prompt_input`` and
            ``prompt_no_input`` format strings.

    Returns:
        The rendered prompt, followed by ``label`` when one is given.
    """
    if input:
        res = template["prompt_input"].format(
            instruction=instruction, input=input
        )
    else:
        res = template["prompt_no_input"].format(
            instruction=instruction
        )
    if label:
        res = f"{res}{label}"
    return res

training = False
def generate_and_tokenize_prompt(tokenizer, data_point, training, cutoff_len=512):
    """Build and tokenize the full prompt for one dataset record.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize``.
        data_point: Mapping with ``instruction``, ``input`` and ``output`` keys.
        training: When true, labels are left as a full copy of the input ids.
            When false, the label positions that belong to the prompt are
            replaced with -100 so only the response positions keep their ids.
        cutoff_len: Maximum token count forwarded to ``tokenize``.

    Returns:
        The tokenized full prompt (``input_ids``, ``attention_mask``,
        ``labels``).
    """
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(tokenizer, full_prompt, cutoff_len=cutoff_len)
    if not training:
        # Prompt without the answer; the label is passed explicitly as None.
        user_prompt = generate_prompt(
            data_point["instruction"], data_point["input"], None
        )
        # Fixes: the tokenizer and cutoff_len were not forwarded here, which
        # made this call fail. The prompt is encoded without EOS so its length
        # is exactly the number of prompt tokens, also when it gets truncated
        # (subtracting one unconditionally was off by one in that case).
        tokenized_user_prompt = tokenize(
            tokenizer, user_prompt, add_eos_token=False, cutoff_len=cutoff_len
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        tokenized_full_prompt["labels"] = [
            -100
        ] * user_prompt_len + tokenized_full_prompt["labels"][
            user_prompt_len:
        ]
    return tokenized_full_prompt

In [5]:
from transformers import GenerationConfig

def evaluate(
  instruction,
  tokenizer,
  model,
  input=None,
  temperature=0.1,
  top_p=0.75,
  top_k=40,
  num_beams=4,
  max_new_tokens=128,
  stream_output=False,
  **kwargs,
):
    """Generate a response for one instruction with the given model.

    Args:
        instruction: Task description rendered into the prompt template.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model whose ``generate`` method produces the continuation.
        input: Optional extra context; it is now forwarded to the prompt
            (it used to be ignored).
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``.
            NOTE(review): ``do_sample`` is not set here, so whether the sampling
            values take effect depends on the defaults — confirm.
        max_new_tokens: Upper bound on generated tokens.
        stream_output: Accepted for interface compatibility; not used.
        **kwargs: Extra fields forwarded to ``GenerationConfig``.

    Returns:
        The text between the first and a possible second ``### Response:``
        marker, stripped; the whole decoded output (stripped) when the marker
        is missing.
    """
    prompt = generate_prompt(instruction, input, None)
    inputs = tokenizer(prompt, return_tensors="pt")
    # Follow the model's device rather than assuming CUDA.
    device = model.device
    input_ids = inputs["input_ids"].to(device)
    # Pass the tokenizer's mask explicitly instead of leaving it to be inferred.
    attention_mask = inputs["attention_mask"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # Decode the first returned sequence (prompt plus continuation).
    output = tokenizer.decode(generation_output.sequences[0])
    # The full decoded text is still echoed, as before.
    print(output)
    parts = output.split("### Response:")
    if len(parts) < 2:
        return output.strip()
    return parts[1].strip()
    # return tokenizer.decode(output[0], skip_special_tokens=True)
    # return tokenizer.decode(output[0], skip_special_tokens=True)

# Smoke test: ask a single question and print the extracted response.
print(evaluate("Who is the president of the United States?", tokenizer, model))

<unk>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Who is the president of the United States?

### Response:
The president of the United States is Joe Biden.
The president of the United States is Joe Biden.
