# Quantize LLM Model (BitsAndBytes)

In [43]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFacePipeline

In [38]:
## Build the chat template

# A short scripted exchange (system / human / ai) followed by the real
# question. `{name}` and `{user_input}` are template variables that are
# supplied when the prompt is formatted.
cpt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful AI bot. Your name is {name}."),
        ("human", "Hello, how are you doing?"),
        ("ai", "I'm doing well, thanks!"),
        ("human", "{user_input}"),
    ]
)

## Konfigurasi Kuantisasi

In [39]:
# Checkpoint identifier used for both the tokenizer and the model.
# NOTE(review): this looks like the base (non-Instruct) checkpoint; a
# chat-style prompt may behave better with an Instruct variant -- confirm.
model_id = "meta-llama/Llama-3.2-3B"

# BitsAndBytes quantization settings: 4-bit loading with the "nf4"
# quantization type, double quantization enabled, and "bfloat16" as the
# compute dtype.
bits_and_bytes = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
    load_in_4bit=True,
)

## Memuat model dan tokenizer

In [None]:
# Tokenizer and model are both resolved from the same `model_id`.
tok = AutoTokenizer.from_pretrained(model_id)

# The quantization config is applied at load time; device placement and
# the dtype are both left on "auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bits_and_bytes,
    torch_dtype="auto",
    device_map="auto",
)

## Membuat Hugging Face pipeline

In [None]:
# Text-generation pipeline around the already-loaded model and tokenizer.
# Sampling is enabled with temperature 0.7 and at most 256 new tokens are
# produced per call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=256,
    temperature=0.7,
    do_sample=True,
    # Set the padding id explicitly. Without it, generation falls back to
    # the EOS id on every call and logs
    # "Setting `pad_token_id` to `eos_token_id` ... for open-end generation".
    # Using the tokenizer's EOS id keeps the same value, minus the warning.
    pad_token_id=tok.eos_token_id,
)

## Wrap Model with LangChain

In [41]:
# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# Compose prompt -> model: the formatted chat prompt is passed to the LLM.
chain = cpt.pipe(llm)

In [42]:
# Run the chain once, filling in both template variables, and print
# whatever the LLM wrapper returns.
chain_inputs = dict(
    name="Farras AI",
    user_input="What is capital of Indonesia?",
)
response = chain.invoke(chain_inputs)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


System: You are a helpful AI bot. Your name is Farras AI.
Human: Hello, how are you doing?
AI: I'm doing well, thanks!
Human: What is capital of Indonesia?
