In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
import matplotlib.pyplot as plt


# Specify the model name
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# To get access to a gated model on huggingface
# Type: "huggingface-cli login" on your teminal and
# input your access token

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model with INT8 quantization using bitsandbytes
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the model with bitsandbytes in 8-bit quantization mode
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,  # Enable 8-bit quantization
    device_map="auto"   # Automatically assign devices
)

# Move model to MPS if available, otherwise to CPU
#device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device("cpu")
model = model.to(device)

# Define a sample prompt
prompt = "How can I improve my productivity while working from home?"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Function to calculate latency and generate text
def generate_text(input_ids, model, tokenizer, max_length=50):
    # Start the timer
    start_time = time.time()

    # Generate text
    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2)

    # End the timer
    latency = time.time() - start_time

    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text, latency

# #Generate text and calculate latency

# max_length = [25, 50, 100, 150, 200]
# output_latency = []
# for length in max_length:
#     generated_text, latency = generate_text(input_ids, model, tokenizer, max_length=length)
#     output_latency.append(latency)
#     print(f"Max Length: {length}, Latency: {latency:.4f} seconds")

# # Plotting the latency vs max_length
# plt.figure(figsize=(10, 6))
# plt.plot(max_length, output_latency, marker='o')
# plt.title('Latency vs Output Token Length for %s Text Generation' % model_name)
# plt.xlabel('Output Token Length')
# plt.ylabel('Latency (seconds)')
# plt.grid(True)
# plt.show()


generated_text, latency = generate_text(input_ids, model, tokenizer, max_length=200)

print("Generated Text:\n", generated_text)
print(f"Latency: {latency:.4f} seconds")

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


RuntimeError: No GPU found. A GPU is needed for quantization.

In [2]:
model.device

device(type='cuda', index=0)

In [4]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

model.push_to_hub("bloom-560m-8bit")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/817M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dsouzapeter/bloom-560m-8bit/commit/679d97655d1a4f3883548596f3274e8989992fcf', commit_message='Upload BloomForCausalLM', commit_description='', oid='679d97655d1a4f3883548596f3274e8989992fcf', pr_url=None, pr_revision=None, pr_num=None)

In [2]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
