<a href="https://colab.research.google.com/github/luca-michaelides/deep_learning_project/blob/main/llama_chats/chat_llama2_smoothllm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initial test with Llama2 LLM

In [1]:
import os, gc, time, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub access token, read from the environment (None when unset).
hf_token = os.environ.get("HF_TOKEN")

model_id = "meta-llama/Llama-2-7b-chat-hf"

# Drop any model left over from a previous run of this cell so its memory can
# be reclaimed before a fresh copy is loaded.
try:
    del model
except NameError:
    pass
gc.collect()
torch.cuda.empty_cache()

# 4-bit NF4 quantization with double quantization; compute happens in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

print("Loading tokenizer...")
# `token` is the current keyword; `use_auth_token` is deprecated.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

print("Loading 4-bit quantized model (this may take 60-120s)...")
# Pass the same token to the model download so both loads authenticate alike.
# NOTE: recent transformers releases warn that `torch_dtype` is deprecated in
# favour of `dtype`; it is kept here so older releases keep working.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

# Device of the model's first parameter; the input tensors are moved there.
device = next(model.parameters()).device
print("Model loaded. Sample param device:", device)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

prompt = "<s>[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\nSay hello briefly. [/INST]"
# The prompt already begins with a literal "<s>"; disable automatic special
# tokens so the tokenizer does not prepend a second BOS token.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
inputs.pop("token_type_ids", None)
inputs = {k: v.to(device) for k, v in inputs.items()}

t0 = time.time()
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=24,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
# Wait for queued GPU work to finish so the wall-clock figure covers it.
if torch.cuda.is_available():
    torch.cuda.synchronize()
t1 = time.time()
print("Generation time: %.2fs" % (t1 - t0))
print("Decoded output:\n", tokenizer.decode(out[0], skip_special_tokens=True))

# Report GPU name plus total and used memory as CSV. The second line is an
# IPython shell escape (`!`), so it only runs inside a notebook/IPython session.
print("\n--- nvidia-smi ---")
!nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hLoading tokenizer...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading 4-bit quantized model (this may take 60-120s)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model loaded. Sample param device: cuda:0
CUDA available: True
GPU: Tesla T4
Generation time: 2.37s
Decoded output:
 [INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Say hello briefly. [/INST]  Hello! It's nice to meet you! How can I assist you today?

--- nvidia-smi ---
name, memory.total [MiB], memory.used [MiB]
Tesla T4, 15360 MiB, 6568 MiB


In [17]:
# Reuse the end-of-sequence token for padding: the model config receives the
# token id, the tokenizer receives the token string.
model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

def format_llama2_chat_prompt(user_message, system_message="You are a helpful assistant."):
    """Build a single-turn Llama-2 chat prompt string.

    Args:
        user_message: Text placed between the system block and the closing
            ``[/INST]`` tag.
        system_message: Text placed inside the ``<<SYS>> ... <</SYS>>`` block.

    Returns:
        The prompt string, starting with a literal ``<s>[INST]`` and ending
        with ``[/INST]``.
    """
    system_block = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n"
    return f"<s>[INST] {system_block}{user_message} [/INST]"

import time, torch

# Device of the model's *first* parameter (not necessarily where most
# parameters live). Input tensors are moved here before generation.
# NOTE(review): presumably this is the right device for inputs when the model
# is spread over several devices by an automatic device map - confirm.
_model_device = next(model.parameters()).device

def chat(user_input, max_tokens=64, temperature=0.1, do_sample=None):
    """Send one user message to the loaded chat model and print its reply.

    Args:
        user_input: The user's message; wrapped by ``format_llama2_chat_prompt``.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. Only forwarded to ``generate`` when
            sampling is enabled.
        do_sample: Whether to sample. When None, sampling is enabled exactly
            when ``temperature > 0``.

    Returns:
        None. The reply text (whitespace-stripped) is printed.
    """
    if do_sample is None:
        do_sample = temperature > 0

    prompt = format_llama2_chat_prompt(user_input)
    # The formatted prompt already starts with a literal "<s>"; disable
    # automatic special tokens so a second BOS token is not prepended.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    inputs.pop("token_type_ids", None)

    # Move the input tensors to the device the model expects them on.
    inputs = {name: tensor.to(_model_device) for name, tensor in inputs.items()}

    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
        "use_cache": True,
    }
    # Temperature only matters when sampling; greedy decoding gets no
    # temperature at all rather than an out-of-range 0.0.
    if do_sample:
        generation_kwargs["temperature"] = temperature

    with torch.no_grad():
        out = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + continuation; slice off the prompt tokens so
    # only the reply is decoded. Unlike splitting the decoded text on
    # "[/INST]", this still works when the reply itself contains that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    reply = tokenizer.decode(out[0][prompt_length:], skip_special_tokens=True).strip()
    print(reply)


In [20]:
# Quick check with a one-word factual question; chat() prints the reply.
chat("Answer with 1 word only. What is the capital city of Spain?")

Madrid
