In [None]:
import os

import torch
import mlflow
import numpy as np
import pandas as pd
import random
from peft import (
    get_peft_config, 
    PeftModel, 
    PeftConfig, 
    get_peft_model, 
    LoraConfig, 
    TaskType,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    TrainerCallback,
    TrainingArguments
)
from trl import SFTTrainer
from datasets import load_dataset, Dataset

In [None]:
# Taken from the lit-gpt repository
def generate_prompt(example):
    """Build the instruction prompt that the model is asked to complete.

    Args:
        example: Mapping with an ``"instruction"`` entry and a ``"context"``
            entry. A falsy ``"context"`` (e.g. an empty string) selects the
            shorter template without an ``### Input:`` section.

    Returns:
        The prompt string, always ending with ``### Response:``.
    """
    context = example["context"]
    instruction = example["instruction"]

    if not context:
        # Instruction-only template.
        preamble = (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request."
        )
        sections = [preamble, f"### Instruction:\n{instruction}"]
    else:
        # Template with an additional input/context section.
        preamble = (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request."
        )
        sections = [
            preamble,
            f"### Instruction:\n{instruction}",
            f"### Input:\n{context}",
        ]

    sections.append("### Response:")
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)


def extract_response_text(input_string):
    """Return the text that follows the first ``### Response:`` marker.

    The response ends at the next ``###`` marker if there is one, otherwise
    at the end of the string. Surrounding whitespace is stripped in both
    cases, so the result does not depend on whether further text was
    generated after the response.

    Args:
        input_string: Full decoded text (prompt followed by the generation).

    Returns:
        The stripped response text, or ``None`` when ``### Response:`` does
        not occur in ``input_string``.
    """
    start_marker = '### Response:'
    end_marker = '###'

    _, found, tail = input_string.partition(start_marker)
    if not found:
        return None

    # Keep only what precedes the next section marker; when there is none,
    # partition leaves the whole tail in the first element.
    body, _, _ = tail.partition(end_marker)
    return body.strip()

In [None]:
# Identifier passed to AutoTokenizer.from_pretrained to load the tokenizer.
tokenizer_id = "meta-llama/Llama-2-13b-hf"

In [None]:
# Identifier passed to AutoModelForCausalLM.from_pretrained to load the model.
# NOTE(review): looks like a local directory of merged fine-tuned weights — confirm it exists before running.
model_id = "merged/llama2-13B-instruct-dolly"

In [None]:
# Quantization settings handed to the model loader: request 8-bit weight loading.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load the tokenizer (use_fast=False) and register "[PAD]" as its padding token.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, use_fast=False)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Load the model with the quantization config above; the device map assigns
# the whole model (the "" key) to device 0.
# The first load is slow.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

In [None]:
# Example request. The empty "context" makes generate_prompt use the
# instruction-only template.
# Alternative example: {'instruction': 'What is the origin of orange wine?', "context": ""}
q = dict(
    instruction="Tell me the meaning of life",
    context="",
)

prompt = generate_prompt(q)

In [None]:
# Tokenize the prompt and place the input IDs on the same device as the model,
# so this cell keeps working if the model's device placement changes.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

In [None]:
# Generate a continuation of the prompt, capped at 256 new tokens.
generation_output = model.generate(
    input_ids=input_ids,
    max_new_tokens=256,
)

# Decode the first returned sequence. Special tokens (e.g. BOS/EOS) are
# skipped so they do not leak into the text handed to extract_response_text.
response = tokenizer.decode(generation_output[0], skip_special_tokens=True)

In [None]:
# Display only the text found after the "### Response:" marker in the decoded output.
extract_response_text(response)