In [None]:
# Install necessary dependencies
# Make sure to install the transformers library from Hugging Face and other necessary libraries
!pip install transformers datasets accelerate torch
!pip install -U bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [5]:
# Import the necessary libraries
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, Dataset
import time
import os
import numpy as np
from huggingface_hub import login

In [3]:
# This will prompt you to enter your Hugging Face token
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [6]:
# Check for CUDA and move model to device (GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"

device

'cuda'

In [7]:
# Load the Llama model and tokenizer from Hugging Face
model_name = "meta-llama/Llama-3.1-8B"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # Binary classification task
#model = LlamaForSequenceClassification.from_pretrained(model_name, num_labels=2)  
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the model to the selected device (GPU or CPU)
model = model.to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
"""
This cell loads the first 20 samples from the Amazon polarity dataset and selects exactly 
10 samples with label 0 (negative reviews) and 10 samples with label 1 (positive reviews). 
It then combines these selected samples and converts them back into a Hugging Face Dataset.
"""

# Load the dataset (first 20 samples)
dataset = load_dataset("amazon_polarity", split="test")

# Separate datapoints based on labels
positive_samples = [example for example in dataset if example["label"] == 1][:10]
negative_samples = [example for example in dataset if example["label"] == 0][:10]

# Combine selected samples
filtered_dataset = positive_samples + negative_samples

# Convert back to Hugging Face Dataset
filtered_dataset = Dataset.from_list(filtered_dataset)


In [9]:
# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "[PAD]"
    tokenizer.add_special_tokens({'pad_token': tokenizer.pad_token})
    model.resize_token_embeddings(len(tokenizer))

# ✅ Explicitly set the model's pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id


inputs = tokenizer(filtered_dataset['content'], return_tensors="pt", padding=True, truncation=True)

true_labels = torch.tensor(filtered_dataset['label']).to(device)  


In [10]:
# Function to run inference and calculate time and performance
def run_inference(model, inputs):
    # Move inputs to the selected device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    start_time = time.time()

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # Get the raw prediction logits
        predictions = torch.argmax(logits, dim=-1)  # Get the predicted labels (0 or 1)

    end_time = time.time()
    inference_time = end_time - start_time
    return inference_time, predictions


In [11]:
# Function to calculate accuracy
def calculate_accuracy(predictions, true_labels):
    correct = (predictions == true_labels).sum().item()
    accuracy = correct / len(true_labels)
    return accuracy

In [12]:
# Run inference in different precision modes (float32, float16, bf16, int8)
dtypes = [torch.float32, torch.float16, torch.bfloat16]

inference_times = {}
performance_results = {}
accuracies = {}

# Run for each dtype mode
for dtype in dtypes:
    model = model.to(dtype)
    inference_time, predictions = run_inference(model, inputs)
    inference_times[str(dtype)] = inference_time
    performance_results[str(dtype)] = predictions  # Store predictions for performance
    
    # Calculate accuracy for each dtype
    accuracy = calculate_accuracy(predictions, true_labels)
    accuracies[str(dtype)] = accuracy

In [18]:
# Load model with quantization support (on GPU)
# PyTorch .to(torch.int8) is not supported for model weights.
model_name = "meta-llama/Llama-3.1-8B"

model_int8 = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    load_in_8bit=True,
)

# ✅ Explicitly set the model's pad_token_id
model_int8.config.pad_token_id = tokenizer.pad_token_id

inference_time_int8, predictions_int8 = run_inference(model_int8, inputs)

inference_times["int8"] = inference_time_int8
performance_results["int8"] = predictions_int8  # Store predictions for int8
accuracy_int8 = calculate_accuracy(predictions_int8, true_labels)
accuracies["int8"] = accuracy_int8



The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Check impact on storage requirements (model size)


# Function to estimate model size in memory based on precision
def estimate_model_size(model, precision="float32"):
    # Get the model's parameters and their types
    total_params = sum(p.numel() for p in model.parameters())  # Total number of parameters

    # Define bytes per parameter based on the precision
    if precision == "float32":
        bytes_per_param = 4  # float32 is 4 bytes
    elif precision == "float16":
        bytes_per_param = 2  # float16 is 2 bytes
    elif precision == "bfloat16":
        bytes_per_param = 2  # bfloat16 is also 2 bytes
    elif precision == "int8":
        bytes_per_param = 1  # int8 is 1 byte
    else:
        raise ValueError("Unsupported precision type")

    # Estimate total model size in bytes and convert to MB
    model_size_bytes = total_params * bytes_per_param
    model_size_mb = model_size_bytes / (1024 * 1024)  # Convert to MB

    return model_size_mb


In [24]:
# Display the results
print("Inference Times (in seconds):")
for dtype, time in inference_times.items():
    print(f"{dtype}: {time:.4f} sec")

print("\nAccuracies:")
for dtype, accuracy in accuracies.items():
    print(f"{dtype}: {accuracy:.4f}")

# Estimate model size in different precisions
model_size_float32 = estimate_model_size(model, precision="float32")
print(f"Model size (float32): {model_size_float32:.2f} MB")

model_size_float16 = estimate_model_size(model, precision="float16")
print(f"Model size (float16): {model_size_float16:.2f} MB")

model_size_bf16 = estimate_model_size(model, precision="bfloat16")
print(f"Model size (bfloat16): {model_size_bf16:.2f} MB")

# For int8 quantized model (requires additional quantization)
# If using bitsandbytes or torch quantization, apply 8-bit quantization
# Here we demonstrate a simple estimation based on quantized parameters
# You can use the actual quantization method as well for exact size
model_size_int8 = estimate_model_size(model, precision="int8")
print(f"Model size (int8): {model_size_int8:.2f} MB")

# Conclusion
# Based on the results, we can determine the trade-offs between precision, storage, and performance.
# If int8 shows significant reduction in storage without a large drop in performance or inference speed,
# it can be worth it for your application.


Inference Times (in seconds):
torch.float32: 2.1082 sec
torch.float16: 0.3358 sec
torch.bfloat16: 0.3302 sec
int8: 0.3743 sec

Accuracies:
torch.float32: 0.5000
torch.float16: 0.5000
torch.bfloat16: 0.5000
int8: 0.5000
Model size (float32): 28629.05 MB
Model size (float16): 14314.52 MB
Model size (bfloat16): 14314.52 MB
Model size (int8): 7157.26 MB
