<a href="https://colab.research.google.com/github/mightyoctopus/hugging_face_quantization/blob/main/w3_d4_quantization_actual_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U -q transformers bitsandbytes accelerate

In [2]:
import os
from google.colab import userdata, drive
from huggingface_hub import login, snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer, AutoConfig
import torch

In [3]:
# Mount Google Drive before using a path under /content/drive: without the
# mount, the directory below is not backed by Drive and downloads would not
# persist across Colab sessions.
drive.mount("/content/drive")

# Directory on Drive where Hugging Face downloads are cached between sessions.
cache_path = "/content/drive/MyDrive/Colab Notebooks/huggingface_cache"

In [4]:
# Read the Hugging Face access token from the Colab secrets store, then log in
# and also store the token as a git credential.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [5]:
### Instruct Models:
# Hugging Face Hub repo ids of the instruction-tuned chat models used below.

LLAMA = "meta-llama/Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
# Qwen2 publishes its chat-tuned checkpoint with the "-Instruct" suffix
# ("-Chat" was the Qwen1.5 naming), so the previous id could not be resolved.
QWEN2 = "Qwen/Qwen2-7B-Instruct"
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # Use smaller if out of memory.

In [6]:
# Chat history sent to every model: one system instruction, one user request.
messages = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content="Tell a light-hearted joke for a room of Data Scientists"),
]

In [7]:
# bitsandbytes quantization settings reused by every model load in this notebook.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,        # nested (double) quantization
    bnb_4bit_quant_type="nf4",             # 4-bit quantization data type
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

In [None]:
# Load the tokenizer for the Llama model, caching its files under cache_path.
tokenizer = AutoTokenizer.from_pretrained(LLAMA, cache_dir=cache_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Render the chat with the model's template and tokenize it in one step.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    # Append the assistant-turn header so the model writes a reply instead of
    # continuing the user's message.
    add_generation_prompt=True,
    return_tensors="pt",
    # Ask explicitly for a plain tensor of token ids, which is what the
    # positional `model.generate(inputs, ...)` call in this notebook expects,
    # regardless of the installed library's default.
    return_dict=False,
).to("cuda")
print(inputs)

In [None]:
# Load the Llama checkpoint onto the GPU with the 4-bit quantization config,
# reusing files already present under cache_path.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    cache_dir=cache_path,
    quantization_config=bnb_config_4bit,
    device_map="cuda",
)

In [None]:
# Report the loaded model's memory footprint, converted from bytes (÷ 2**20).
memory = model.get_memory_footprint() / 2**20
print(f"{memory:,.1f} MB")

In [None]:
# Generate at most 80 new tokens for the tokenized chat prompt.
outputs = model.generate(inputs, max_new_tokens=80)

# Decode the first returned sequence back to text.
print("RESULT: ", tokenizer.decode(outputs[0]))

In [27]:
### Clean up: release the model and cached GPU memory (uncomment to run)
# del inputs, outputs, model
# torch.cuda.empty_cache()

In [13]:
### Wrapping everything above into a modular function:

def generate(model_name, messages):
    """Stream a 4-bit quantized chat completion for `messages`, then free the GPU.

    Downloads (or reuses) a snapshot of `model_name` under the notebook-level
    `cache_path`, loads it with the notebook-level `bnb_config_4bit`
    quantization config, and streams up to 80 new tokens to stdout through a
    `TextStreamer`.

    Args:
        model_name: Hugging Face Hub repo id of a chat/instruct model.
        messages: Chat history as a list of {"role": ..., "content": ...} dicts.

    Returns:
        None. The generated text is only printed by the streamer.
    """
    import gc  # local import: only needed for the cleanup at the end

    # Download the repo snapshot into the Drive-backed cache. The former
    # `local_dir_use_symlinks` argument was dropped: it is deprecated and only
    # ever applied together with `local_dir`, which is not used here.
    model_path = snapshot_download(repo_id=model_name, cache_dir=cache_path)

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    streamer = TextStreamer(tokenizer)

    # Pre-bind the names so the cleanup in `finally` is valid even if loading
    # or generation fails part-way.
    model = inputs = outputs = None
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=bnb_config_4bit,
        )

        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            # Append the assistant-turn header so the model writes a reply
            # instead of continuing the user's message.
            add_generation_prompt=True,
            return_tensors="pt",
            # Return input_ids together with the attention_mask so generate()
            # does not have to guess the mask (pad token == eos token here).
            return_dict=True,
        ).to(model.device)  # follow wherever device_map="auto" placed the model

        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            streamer=streamer,
        )
    finally:
        # Always release GPU memory, including after an error, so the next
        # model can be loaded in the same session.
        del model, inputs, outputs
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Run the end-to-end pipeline on the Llama model with the shared chat prompt.
generate(model_name=LLAMA, messages=messages)