[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lwieske/bloom-quantization-and-qlora-finetuning/blob/main/bloom_1b7_inference.ipynb)
[![Open in SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/lwieske/bloom-quantization-and-qlora-finetuning/blob/main/bloom_1b7_inference.ipynb)

# BLOOM 1B7 Inference with Nvidia T4

In [1]:
!pip -q install transformers

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
# Notebook configuration.
#
# Checkpoint passed to `from_pretrained` for both the model and the tokenizer.
# Other BLOOM checkpoints that can be swapped in here:
#   "bigscience/bloom-560m"
#   "bigscience/bloom-1b1"
#   "bigscience/bloom-3b"
#   "bigscience/bloom-7b1"
MODEL = "bigscience/bloom-1b7"

# Upper bound on the number of newly generated tokens (used as `max_new_tokens`).
TOKENS = 20

# Device the model and the tokenized inputs are moved to.
DEVICE = "cuda:0"

In [4]:
# Load the checkpoint named by MODEL and move it straight onto DEVICE.
# No dtype or quantization arguments are passed, so the library defaults apply.
pretrained_model = AutoModelForCausalLM.from_pretrained(MODEL).to(DEVICE)

In [5]:
# GPU snapshot taken right after the model has been moved to DEVICE.
!nvidia-smi

Mon Dec  4 10:03:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    27W /  70W |   6675MiB / 15360MiB |     33%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
# Tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Use the end-of-sequence token as the padding token as well.
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# GPU snapshot taken after the tokenizer has been loaded.
!nvidia-smi

Mon Dec  4 10:03:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    26W /  70W |   6675MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Inference with Pretrained BLOOM

In [8]:
def generate_output_tokens(
    model,
    input_tokens,
    max_new_tokens=None,
    eos_token_id=None,
    repetition_penalty=1.5,
):
    """Generate a continuation for an already tokenized prompt.

    Args:
        model: Object exposing a ``generate(...)`` method accepting the
            keyword arguments passed below.
        input_tokens: Mapping that provides ``"input_ids"`` and
            ``"attention_mask"`` entries for the prompt.
        max_new_tokens: Maximum number of tokens to generate. When ``None``
            the notebook-level ``TOKENS`` constant is used.
        eos_token_id: Token id that ends generation. When ``None`` the
            ``eos_token_id`` of the notebook-level ``tokenizer`` is used.
        repetition_penalty: Value forwarded to ``generate`` unchanged.

    Returns:
        Whatever ``model.generate`` returns.
    """
    # Fall back to the notebook globals only when the caller did not override
    # them, so the function can also be used (and tested) without those globals.
    if max_new_tokens is None:
        max_new_tokens = TOKENS
    if eos_token_id is None:
        eos_token_id = tokenizer.eos_token_id

    return model.generate(
        input_ids=input_tokens["input_ids"],
        attention_mask=input_tokens["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        early_stopping=False,
        eos_token_id=eos_token_id,
    )

In [9]:
# Prompt fed to the pretrained model.
input_words = "What is the meaning of life?"

# Tokenize the prompt into PyTorch tensors and move them to the model's device.
input_tokens = tokenizer(
    input_words,
    return_tensors="pt",
).to(DEVICE)

# Generate the continuation with the pretrained model.
pretrained_output_tokens = generate_output_tokens(
    pretrained_model,
    input_tokens,
)

# Convert the generated token ids back to text, dropping special tokens.
pretrained_output_words = tokenizer.batch_decode(
    pretrained_output_tokens,
    skip_special_tokens=True,
)

print(pretrained_output_words)

['What is the meaning of life? What does it mean to be human, and what do we need in order for us not only survive']


In [10]:
# GPU snapshot taken after running generation with the pretrained model.
!nvidia-smi

Mon Dec  4 10:03:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    71W /  70W |   6743MiB / 15360MiB |     94%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces