In [2]:
!pip install tqdm



In [5]:
!pip install transformers accelerate bitsandbytes>0.37.0 datasets

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import time
import numpy as np
import torch

# Load the tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load and configure models with BitsAndBytesConfig
quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
quantization_config_4bit = BitsAndBytesConfig(load_in_4bit=True)

# Load models with 8-bit and 4-bit quantization
model_fp32 = AutoModelForCausalLM.from_pretrained(model_name)
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config_8bit)
model_4bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config_4bit)

# Print memory footprints using `get_memory_footprint`
print(f"FP32 Model Memory Usage: {model_fp32.get_memory_footprint() / (1024 ** 3):.2f} GB")
print(f"8-bit Model Memory Usage: {model_8bit.get_memory_footprint() / (1024 ** 3):.2f} GB")
print(f"4-bit Model Memory Usage: {model_4bit.get_memory_footprint() / (1024 ** 3):.2f} GB")

# Measure inference latency
input_text = "The history of quantum mechanics begins with"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Latency measurement for FP32
start_time = time.time()
with torch.no_grad():
    model_fp32(input_ids)
latency_fp32 = time.time() - start_time
print(f"FP32 Model Latency: {latency_fp32:.2f} seconds")

# Latency measurement for 8-bit model
start_time = time.time()
with torch.no_grad():
    model_8bit(input_ids)
latency_8bit = time.time() - start_time
print(f"8-bit Model Latency: {latency_8bit:.2f} seconds")

# Latency measurement for 4-bit model
start_time = time.time()
with torch.no_grad():
    model_4bit(input_ids)
latency_4bit = time.time() - start_time
print(f"4-bit Model Latency: {latency_4bit:.2f} seconds")

# Load a subset of the Wikipedia dataset
dataset = load_dataset("wikipedia", "20220301.en", split="train[:3000]")

# Helper function to compute perplexity
def calculate_perplexity(model, tokenizer, dataset):
    texts = [entry["text"] for entry in dataset]
    encodings = tokenizer("\n\n".join(texts), return_tensors="pt", truncation=True, max_length=512)
    stride = 512
    nlls = []
    for i in range(0, encodings.input_ids.size(1), stride):
        input_ids = encodings.input_ids[:, i:i + stride].to(model.device)
        target_ids = input_ids.clone()
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            nll = outputs.loss.item() * input_ids.size(1)
            nlls.append(nll)
    perplexity = np.exp(np.sum(nlls) / len(encodings.input_ids[0]))
    return perplexity

# Compute perplexity for FP32, 8-bit, and 4-bit models
print("Calculating perplexity...")
perplexity_fp32 = calculate_perplexity(model_fp32, tokenizer, dataset)
perplexity_8bit = calculate_perplexity(model_8bit, tokenizer, dataset)
perplexity_4bit = calculate_perplexity(model_4bit, tokenizer, dataset)

print(f"FP32 Model Perplexity: {perplexity_fp32:.2f}")
print(f"8-bit Model Perplexity: {perplexity_8bit:.2f}")
print(f"4-bit Model Perplexity: {perplexity_4bit:.2f}")

`low_cpu_mem_usage` was None, now set to True since model is quantized.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


FP32 Model Memory Usage: 0.48 GB
8-bit Model Memory Usage: 0.16 GB
4-bit Model Memory Usage: 0.12 GB
FP32 Model Latency: 2.10 seconds
8-bit Model Latency: 3.74 seconds
4-bit Model Latency: 0.10 seconds




README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

train-00000-of-00041.parquet:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

train-00001-of-00041.parquet:   0%|          | 0.00/705M [00:00<?, ?B/s]

train-00002-of-00041.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00003-of-00041.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00004-of-00041.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

train-00005-of-00041.parquet:   0%|          | 0.00/391M [00:00<?, ?B/s]

train-00006-of-00041.parquet:   0%|          | 0.00/366M [00:00<?, ?B/s]

train-00007-of-00041.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

train-00008-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

train-00009-of-00041.parquet:   0%|          | 0.00/312M [00:00<?, ?B/s]

train-00010-of-00041.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

train-00011-of-00041.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00012-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00013-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

train-00014-of-00041.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

train-00015-of-00041.parquet:   0%|          | 0.00/236M [00:00<?, ?B/s]

train-00016-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

train-00017-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00018-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00019-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00020-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00021-of-00041.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00022-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00023-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00024-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00025-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

train-00026-of-00041.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00027-of-00041.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

train-00028-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00029-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

train-00030-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00031-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00032-of-00041.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00034-of-00041.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00035-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00036-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00037-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00038-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00039-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00040-of-00041.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

Calculating perplexity...
FP32 Model Perplexity: 24.92
8-bit Model Perplexity: 25.05
4-bit Model Perplexity: 27.51
