<a href="https://colab.research.google.com/github/ohmreborn/question-generation-AIB2023/blob/main/llama-7b-hf/deployment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install the required packages
!pip install -q loralib==0.1.1
!pip install -q bitsandbytes==0.39.0
!pip install -q datasets==2.12.0
!pip install -q peft==0.3.0
!pip install -q transformers==4.28.1
!pip install -q sentencepiece==0.1.99
!pip install -q gradio==3.33.1
!pip install -q accelerate==0.19.0

In [None]:
#@title Load the model
import os
import sys

import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer


def main(
    load_8bit: bool = True,
    base_model: str = "decapoda-research/llama-7b-hf",
    lora_weights: str = "ohmreborn/llama-lora-7b",
):
    """Load the LLaMA base model with LoRA weights applied, plus its tokenizer.

    Args:
        load_8bit: Forwarded as ``load_in_8bit`` when loading the base model.
        base_model: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the base model.
        lora_weights: Identifier of the LoRA adapter passed to
            ``PeftModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and, on
        torch >= 2 outside Windows, wrapped with ``torch.compile``.
    """
    llama_tokenizer = LlamaTokenizer.from_pretrained(base_model)

    backbone = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    lora_model = PeftModel.from_pretrained(
        backbone,
        lora_weights,
        torch_dtype=torch.float16,
    )

    # Pin the special-token ids explicitly on the model config (and the pad id
    # on the tokenizer) instead of relying on the checkpoint's own settings.
    lora_model.config.pad_token_id = 0
    llama_tokenizer.pad_token_id = 0
    lora_model.config.bos_token_id = 1
    lora_model.config.eos_token_id = 2

    lora_model.eval()

    can_compile = torch.__version__ >= "2" and sys.platform != "win32"
    if can_compile:
        lora_model = torch.compile(lora_model)
    return lora_model, llama_tokenizer
# Base model identifier handed to `main`; exposed as a Colab form field.
base_model='decapoda-research/llama-7b-hf' #@param {type:"string"}
# Load the model and tokenizer once, when this cell is executed.
model,tokenizer = main(base_model=base_model)
# Device that `generate` moves the prompt token ids to.
device = torch.device('cuda')

In [None]:
#@title Define the text-generation function
from typing import Union
import requests

class Prompter(object):
    """Build prompts from a JSON template and extract the answer from output.

    The template is downloaded the first time an instance is created and is
    then cached on the class, so creating further instances does not hit the
    network again. The template is expected to provide the keys
    ``"prompt_input"``, ``"prompt_no_input"`` and ``"response_split"``.
    """

    TEMPLATE_URL = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/templates/alpaca.json"
    REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs forever
    _template_cache = None  # class-level cache shared by all instances

    def __init__(self):
        """Load the prompt template, downloading it only if not cached yet.

        Raises:
            requests.HTTPError: If the template download returns an error status.
        """
        if Prompter._template_cache is None:
            response = requests.request(
                "GET", self.TEMPLATE_URL, timeout=self.REQUEST_TIMEOUT
            )
            # Fail loudly on 4xx/5xx instead of trying to parse an error page.
            response.raise_for_status()
            Prompter._template_cache = response.json()
        # Per-instance copy so that editing one instance's template does not
        # leak into the shared cache.
        self.template = dict(Prompter._template_cache)

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        """Return the full prompt text for an instruction.

        Args:
            instruction: Task description substituted into the template.
            input: Optional context. When truthy the ``"prompt_input"``
                template is used, otherwise ``"prompt_no_input"``.
            label: Optional text appended verbatim after the prompt.

        Returns:
            The formatted prompt, followed by ``label`` when one is given.
        """
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        return res

    def get_response(self, output: str) -> str:
        """Return the stripped text after the first response marker.

        Raises:
            IndexError: If ``output`` does not contain the template's
                ``"response_split"`` marker.
        """
        return output.split(self.template["response_split"])[1].strip()

def generate(
    input=None,
    temperature=0.75,
    top_p=0.95,  # nucleus sampling: sample only from the most likely tokens whose cumulative probability reaches 0.95 https://www.linkedin.com/pulse/text-generation-temperature-top-p-sampling-gpt-models-selvakumar
    top_k=50,  # sample only from the 50 most likely tokens; when top_p is also set, both filters apply https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p#2-pick-from-amongst-the-top-tokens-top-k
    max_new_tokens=1024,
    instruction="Please create an inference question in the style of TOEFL reading comprehension section. Also provide an answer in the format",
    model=model,
    tokenizer=tokenizer,
):
    """Generate a response for a passage with the loaded model.

    Args:
        input: Passage inserted into the prompt; when falsy the template
            without an input section is used.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Number of highest-probability tokens kept for sampling.
        max_new_tokens: Maximum number of tokens to generate after the prompt.
        instruction: Instruction text placed in the prompt.
        model: Model used for generation (defaults to the module-level model).
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded text that follows the prompt's response marker.
    """
    prompter = Prompter()
    prompt = prompter.generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        # Without do_sample=True decoding is greedy and temperature / top_p /
        # top_k are silently ignored.
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        # UI sliders may deliver floats; top_k must be an integer.
        top_k=int(top_k),
        repetition_penalty=1.2,
    )

    with torch.no_grad():
        # Per-step scores are not requested: they were never read and only
        # cost memory for long generations.
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=int(max_new_tokens),
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return prompter.get_response(output)



In [None]:
# Quick check of `generate` on a sample passage. The passage is defined here so
# this cell runs on its own when the notebook is executed top to bottom.
sample_passage = """Education is the process of facilitating learning, or the acquisition of knowledge, skills, values, morals, beliefs, habits, and personal development. There are many types of potential educational aims and objectives, irrespective of the specific subject being learned. Some can cross multiple school disciplines."""
print(generate(input=sample_passage))

In [None]:
#@title Run app
import gradio as gr
# Sample passages offered in the UI. These must be plain strings: a trailing
# comma after the literal would turn each one into a 1-tuple.
example_1 = """Education is the process of facilitating learning, or the acquisition of knowledge, skills, values, morals, beliefs, habits, and personal development. There are many types of potential educational aims and objectives, irrespective of the specific subject being learned. Some can cross multiple school disciplines."""
example_2 = """History – discovery, collection, organization, and presentation of information about past events. History can also mean the period of time after writing was invented (the beginning of recorded history)."""
example_3 = """Culture – a set of patterns of human activity within a community or social group and the symbolic structures that give significance to such activity. Customs, laws, dress, architectural style, social standards, and traditions are all examples of cultural elements. Since 2010, Culture is considered the Fourth Pillar of Sustainable Development by UNESCO."""
example_4 = """Health sciences are those sciences which focus on health, or health care, as core parts of their subject matter. Health sciences relate to multiple academic disciplines, including STEM disciplines and emerging patient safety disciplines."""


def _generate_from_ui(passage, max_new_tokens, temperature, top_p, top_k):
    """Forward the UI values to `generate` by keyword.

    Gradio passes component values positionally in the order of `inputs`,
    which differs from the parameter order of `generate`; mapping by keyword
    keeps every slider attached to the parameter named in its label.
    """
    return generate(
        input=passage,
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        max_new_tokens=int(max_new_tokens),
    )


demo = gr.Interface(
    fn=_generate_from_ui,
    inputs=[
        gr.Textbox(value=example_1),
        gr.Slider(1, 1024, value=1024, step=1, label='max_new_tokens'),
        gr.Slider(0.05, 1, value=0.75, step=0.05, label='temperature'),
        gr.Slider(0.05, 1, value=0.95, step=0.05, label='top_p'),
        gr.Slider(1, 100, value=65, step=1, label='top_k'),
    ],
    outputs=["text"],
    # Each row follows the order of `inputs`:
    # passage, max_new_tokens, temperature, top_p, top_k.
    examples=[
        [example_1, 512, 0.75, 0.95, 65],
        [example_2, 512, 0.75, 0.95, 60],
        [example_3, 512, 0.75, 0.95, 50],
        [example_4, 512, 0.75, 0.95, 45],
    ],
)
demo.launch(share=True)