[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lwieske/bloom-quantization-and-qlora-finetuning/blob/main/bloom_7b1_4bit_inference.ipynb)
[![Open in SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/lwieske/bloom-quantization-and-qlora-finetuning/blob/main/bloom_7b1_4bit_inference.ipynb)

# BLOOM 7B1 4-bit Inference on an NVIDIA T4

In [1]:
!pip -q install accelerate bitsandbytes transformers

In [2]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [3]:
# Model identifier handed to `from_pretrained` for both the model and the
# tokenizer. Other BLOOM sizes that can be substituted here:
#   "bigscience/bloom-560m"
#   "bigscience/bloom-1b1"
#   "bigscience/bloom-1b7"
#   "bigscience/bloom-3b"
MODEL = "bigscience/bloom-7b1"

# Generation budget: used as `max_new_tokens` when generating.
TOKENS = 20

In [4]:
# Quantization settings passed to `from_pretrained` via `quantization_config`:
# 4-bit loading, the "nf4" quantization type, double quantization enabled,
# and bfloat16 as the compute dtype.
# NOTE(review): this notebook targets a Tesla T4 -- confirm bfloat16 is the
# intended compute dtype on that GPU (float16 is the usual alternative).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [5]:
# Load the checkpoint named by MODEL, applying `bnb_config` as its
# quantization configuration.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Snapshot of GPU state taken right after the model has been loaded.
!nvidia-smi

Mon Dec  4 10:36:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    27W /  70W |   5743MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): the BLOOM tokenizer may already define its own pad token --
# confirm this override is actually needed.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Snapshot of GPU state taken after the tokenizer has been loaded, for
# comparison with the reading taken before it.
!nvidia-smi

Mon Dec  4 10:36:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    27W /  70W |   5743MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Inference with Pretrained BLOOM

In [9]:
def generate_output_tokens(model, input_tokens, max_new_tokens=None, repetition_penalty=1.5):
    """Generate a continuation for an already-tokenized prompt.

    Args:
        model: Object exposing a ``generate`` method that accepts the keyword
            arguments used below.
        input_tokens: Mapping that provides ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            ``None`` (the default) falls back to the notebook-level ``TOKENS``
            constant, looked up at call time so that re-running the config
            cell takes effect without redefining this function.
        repetition_penalty: Forwarded unchanged to ``model.generate``; the
            default of 1.5 is the value that used to be hard-coded here.

    Returns:
        Whatever ``model.generate`` returns for these arguments.

    Note:
        The end-of-sequence id is read from the notebook-level ``tokenizer``,
        so that variable must be defined before this function is called.
    """
    if max_new_tokens is None:
        max_new_tokens = TOKENS

    return model.generate(
        input_ids=input_tokens["input_ids"],
        attention_mask=input_tokens["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        early_stopping=False,
        eos_token_id=tokenizer.eos_token_id,
    )

In [10]:
# Prompt to be completed by the model.
input_words = "What is the meaning of life?"

# Tokenize into PyTorch tensors and move them to the device the model lives
# on, rather than a hard-coded 'cuda', so inputs and weights always agree.
input_tokens = tokenizer(input_words, return_tensors="pt").to(pretrained_model.device)

pretrained_output_tokens = generate_output_tokens(pretrained_model, input_tokens)

# Decode the generated ids back to text, leaving special tokens out.
pretrained_output_words = tokenizer.batch_decode(pretrained_output_tokens, skip_special_tokens=True)

print(pretrained_output_words)

['What is the meaning of life? What does it mean to be human, and what are our responsibilities as humans?\nThe answer lies in']


In [11]:
# Snapshot of GPU state taken after generation has run, for comparison with
# the earlier readings.
!nvidia-smi

Mon Dec  4 10:36:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    66W /  70W |   6497MiB / 15360MiB |     98%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces