<a href="https://colab.research.google.com/github/ohmreborn/question-generation-AIB2023/blob/main/llama-7b-hf/deployment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install the required packages
# Every dependency is pinned to an exact version; -q reduces pip's output.
!pip install -q loralib==0.1.1
!pip install -q bitsandbytes==0.39.0
!pip install -q datasets==2.12.0
!pip install -q peft==0.3.0
!pip install -q transformers==4.28.1
!pip install -q sentencepiece==0.1.99
!pip install -q gradio==3.33.1
!pip install -q gdown==4.6.6
!pip install -q accelerate==0.19.0

In [None]:
#@title Load the model
# Create the directory that the downloaded LoRA adapter files are moved into.
!mkdir checkpoint
import os
import sys

import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
import gdown
import shutil
# Download the LoRA adapter files from Google Drive into the checkpoint
# directory (the default `lora_weights` of main() is "/content/checkpoint").
checkpoint_dir = 'checkpoint'
# exist_ok keeps this cell re-runnable when the directory is already there.
os.makedirs(checkpoint_dir, exist_ok=True)

# File name inside the checkpoint directory -> Google Drive file id.
adapter_files = {
    'adapter_config.json': '1BsT2l8e00ZZM-Q1RcVUxWimZ3_rw7dXp',
    'adapter_model.bin': '1ErWZE4R_0zZjydVsnAQ7apPUhM31GuO6',
}

for output, file_id in adapter_files.items():
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    destination = os.path.join(checkpoint_dir, output)
    # Write straight to the final location instead of downloading to the
    # working directory and moving the file afterwards.
    dest = gdown.download(url, destination, quiet=False)
    # gdown reports a failed download (e.g. Drive quota or permission
    # problems) by returning None; stop here with a clear message rather
    # than failing later when the adapter is loaded.
    if dest is None:
        raise RuntimeError(
            f"Could not download {output} from Google Drive (file id {file_id})"
        )


def main(
    load_8bit: bool = True,
    base_model: str = "decapoda-research/llama-7b-hf",
    lora_weights: str = "/content/checkpoint",
):
    """Load the LLaMA base model, attach the LoRA adapter, and return both.

    Args:
        load_8bit: Forwarded to ``LlamaForCausalLM.from_pretrained`` as
            ``load_in_8bit``.
        base_model: Identifier or path of the base model. When empty, the
            ``BASE_MODEL`` environment variable is used instead.
        lora_weights: Location of the LoRA adapter passed to
            ``PeftModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and, on
        torch >= 2 outside Windows, wrapped with ``torch.compile``.

    Raises:
        ValueError: If neither ``base_model`` nor ``BASE_MODEL`` names a model.
    """
    base_model = base_model or os.environ.get("BASE_MODEL", "")
    if not base_model:
        # Fail early with a clear message instead of handing an empty
        # string to from_pretrained().
        raise ValueError(
            "No base model given: pass base_model or set the BASE_MODEL "
            "environment variable."
        )

    tokenizer = LlamaTokenizer.from_pretrained(base_model)

    model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
    )

    # unwind broken decapoda-research config
    model.config.pad_token_id = 0  # unk
    tokenizer.pad_token_id = 0
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    model.eval()

    # Compare the major version numerically: a plain string comparison
    # would order "10.0" before "2".
    torch_major = int(torch.__version__.split(".")[0])
    if torch_major >= 2 and sys.platform != "win32":
        model = torch.compile(model)
    return model, tokenizer
# Colab form field: the base model handed to main() below.
base_model='decapoda-research/llama-7b-hf' #@param {type:"string"}
# Load the model and tokenizer once, when this cell runs; evaluate() uses
# these globals as the defaults of its `model` / `tokenizer` parameters.
model,tokenizer = main(base_model=base_model)
# Device that evaluate() moves the tokenized prompt to.
device = torch.device('cuda')

In [None]:
#@title สร้างฟังก์ชันสำหรับ generate ออกมา
from typing import Union
import requests

class Prompter(object):
    """Build prompts from a JSON template and extract the model's answer.

    The template is downloaded from ``TEMPLATE_URL`` and must provide the
    keys ``prompt_input``, ``prompt_no_input`` and ``response_split``.
    """

    TEMPLATE_URL = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/templates/alpaca.json"
    # Seconds to wait for the template server before giving up.
    REQUEST_TIMEOUT = 30
    # Template downloaded by the first instance; reused by later instances so
    # that creating a Prompter per request does not hit the network each time.
    _template_cache = None

    def __init__(self):
        """Load the prompt template, downloading it on first use only.

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        if Prompter._template_cache is None:
            response = requests.request(
                "GET", self.TEMPLATE_URL, timeout=self.REQUEST_TIMEOUT
            )
            # Surface HTTP errors here rather than trying to parse an error
            # page as JSON.
            response.raise_for_status()
            Prompter._template_cache = response.json()
        # Copy so that changes to one instance's template do not leak into
        # the shared cache.
        self.template = dict(Prompter._template_cache)

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        """Return the full prompt for ``instruction`` and optional ``input``.

        Uses the ``prompt_input`` template when ``input`` is non-empty and
        ``prompt_no_input`` otherwise. If a ``label`` (the expected
        response/output) is given, it is appended to the prompt.
        """
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        return res

    def get_response(self, output: str) -> str:
        """Return the text that follows the response marker in ``output``.

        The text is split on the template's ``response_split`` marker and
        the second piece is returned, stripped. When the marker does not
        occur, the whole stripped ``output`` is returned instead of raising
        ``IndexError``.
        """
        pieces = output.split(self.template["response_split"])
        if len(pieces) < 2:
            return output.strip()
        return pieces[1].strip()

def evaluate(
    input=None,
    instruction="Please create an inference question in the style of TOEFL reading comprehension section. Also provide an answer in the format",
    temperature=0.75,
    top_p=0.95,
    top_k=50,
    repetition_penalty=1.2,
    max_new_tokens=1024,
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
):
    """Generate a response for ``input`` with the fine-tuned model.

    Args:
        input: Optional text inserted into the prompt; when empty, the
            template without an input section is used.
        instruction: Instruction placed in the prompt.
        temperature: Softmax temperature; lower values make the model's
            picks more confident. See
            https://stackoverflow.com/questions/58764619/why-should-we-use-temperature-in-softmax/63471046#63471046
        top_p: Nucleus sampling threshold: sample only from the most likely
            tokens whose cumulative probability reaches this value. See
            https://www.linkedin.com/pulse/text-generation-temperature-top-p-sampling-gpt-models-selvakumar
        top_k: Restrict sampling to this many most-likely tokens. For how it
            combines with ``top_p`` see
            https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p#2-pick-from-amongst-the-top-tokens-top-k
        repetition_penalty: Penalty applied to already generated tokens; see
            page 5 of https://arxiv.org/pdf/1909.05858.pdf
        max_new_tokens: Upper bound on the number of generated tokens.
        model: Model used for generation. The default is the global
            ``model`` as it was when this function was defined.
        tokenizer: Tokenizer matching ``model``; the default is bound the
            same way.
        do_sample: Enable sampling. ``temperature``, ``top_p`` and ``top_k``
            only take effect when this is true; pass ``False`` for
            deterministic greedy decoding.

    Returns:
        The decoded text that follows the prompt's response marker.
    """
    prompter = Prompter()
    prompt = prompter.generate_prompt(instruction, input)
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

    generation_config = GenerationConfig(
        # Without do_sample=True, generate() decodes greedily and silently
        # ignores temperature / top_p / top_k.
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )

    with torch.no_grad():
        # output_scores is not requested: the per-step scores were never
        # read and only consumed memory for up to max_new_tokens steps.
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=max_new_tokens,
        )

    # Single prompt in the batch, so take the first (only) sequence.
    sequence = generation_output.sequences[0]
    output = tokenizer.decode(sequence)
    return prompter.get_response(output)



### Sample input

In [None]:
"""
Nutrition is the biochemical and physiological process by which an organism uses food to support its life. 
It provides organisms with nutrients, which can be metabolized to create energy and chemical structures. 
Failure to obtain sufficient nutrients causes malnutrition. 
"""

In [None]:
#@title Run app
import gradio as gr
# Text-in / text-out web UI around evaluate(). Only one input component is
# declared, so evaluate()'s remaining parameters keep their defaults
# (presumably the textbox value arrives as its first argument, `input`).
demo = gr.Interface(fn=evaluate, inputs="text", outputs="text")
# Start serving the interface.
demo.launch()

In [None]:
# 30 วิ