In [1]:

# Install core packages

!pip install -q --upgrade pip

!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

!pip install -q -U bitsandbytes accelerate


!pip install -q transformers datasets evaluate

!pip install -q optimum[onnxruntime]

!pip install -q textattack

!pip install -q codecarbon mlflow

!pip install -q scikit-learn python-multipart fastapi uvicorn

print("Installation finished. If you see bitsandbytes warnings → restart runtime now.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m0.9/1.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m25.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?

In [2]:

# Create the project directory together with its src/ package folder.
!mkdir -p /content/energy-efficient-llm-pipeline/src
# Work from the project root: the following cells write and run files through paths
# relative to it (src/..., ./mlruns, ./distilbert-*-sst2).
%cd /content/energy-efficient-llm-pipeline


/content/energy-efficient-llm-pipeline


In [3]:
%%writefile src/__init__.py
# Intentionally empty: this file only marks `src` as a package for the `from src.* import ...` statements.

Writing src/__init__.py


In [4]:
%%writefile src/data.py
from datasets import load_dataset

def load_and_preprocess_glue():
    """Load GLUE SST-2 and return small, fixed-size train and test subsets.

    Returns:
        tuple: ``(train_ds, test_ds)``. ``train_ds`` holds 1000 rows taken from the
        ``"train"`` split after shuffling it with seed 42; ``test_ds`` holds the first
        500 rows of the ``"validation"`` split in their stored order.
    """
    sst2 = load_dataset("glue", "sst2")

    # Training subset: deterministic shuffle, then keep the leading 1000 rows.
    shuffled_train = sst2["train"].shuffle(seed=42)
    train_subset = shuffled_train.select(range(1000))

    # Evaluation subset: no shuffle, just the leading 500 validation rows.
    test_subset = sst2["validation"].select(range(500))

    return train_subset, test_subset

def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize the ``"sentence"`` entry of ``examples`` to a fixed length.

    Args:
        examples: Mapping with a ``"sentence"`` key (the value is handed to the
            tokenizer unchanged).
        tokenizer: Callable tokenizer; invoked with truncation enabled and
            ``padding="max_length"``.
        max_length: Target length passed to the tokenizer (default 128).

    Returns:
        Whatever ``tokenizer`` returns for the given sentences.
    """
    sentences = examples["sentence"]
    encoding = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    return encoding


Writing src/data.py


In [16]:
%%writefile src/adversarial.py
import textattack
from textattack import Attacker
from textattack.attack_recipes import TextFoolerJin2019
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.datasets import HuggingFaceDataset
from textattack.attack_results import SuccessfulAttackResult
from textattack.constraints.semantics.sentence_encoders import UniversalSentenceEncoder
from datasets import Dataset, concatenate_datasets, Features, Value, ClassLabel

def generate_adversarial_examples(model, tokenizer, dataset, num_examples=800):
    """Attack ``model`` with TextFooler and collect the successful perturbations.

    Args:
        model: Classification model to attack; wrapped together with ``tokenizer``
            in a ``HuggingFaceModelWrapper``.
        tokenizer: Tokenizer used with ``model``.
        dataset: Hugging Face dataset to draw attack targets from. It is shuffled
            with seed 42 before the subset is taken.
        num_examples: Maximum number of examples to attack; capped at
            ``len(dataset)``.

    Returns:
        datasets.Dataset: Columns ``"sentence"`` (perturbed text) and ``"label"``
        (the ground-truth label of the attacked example), one row per successful
        attack. Empty when no attack succeeded.
    """
    model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

    # Strong attack: TextFoolerJin2019
    attack = TextFoolerJin2019.build(model_wrapper)

    # Relax USE threshold for more successes
    for constraint in attack.constraints:
        if isinstance(constraint, UniversalSentenceEncoder):
            constraint.threshold = 0.70
            print("Relaxed USE similarity threshold to 0.70 for higher success rate")

    subset = dataset.shuffle(seed=42).select(range(min(num_examples, len(dataset))))
    hf_dataset = HuggingFaceDataset(subset, split="train")

    # Without explicit AttackArgs the Attacker falls back to TextAttack's default of
    # attacking only 10 examples, which silently ignores `num_examples`. Request the
    # whole selected subset instead.
    attack_args = textattack.AttackArgs(num_examples=len(subset))
    attacker = Attacker(attack, hf_dataset, attack_args)
    results = attacker.attack_dataset()

    adv_texts = []
    adv_labels = []
    success_count = 0

    for result in results:
        # Keep only attacks that succeeded and actually changed the text.
        if isinstance(result, SuccessfulAttackResult) and result.perturbed_text() != result.original_text():
            adv_texts.append(result.perturbed_text())
            adv_labels.append(result.original_result.ground_truth_output)
            success_count += 1
            if success_count % 20 == 0:
                print(f"Success {success_count}: {result.perturbed_text()[:60]}...")

    print(f"\nGenerated {len(adv_texts)} successful adversarial examples")

    # Explicit schema so the result has a ClassLabel "label" column even when empty.
    features = Features({
        "sentence": Value("string"),
        "label": ClassLabel(names=["negative", "positive"])
    })

    return Dataset.from_dict({"sentence": adv_texts, "label": adv_labels}, features=features)


def create_mixed_dataset(original_ds, adv_ds, adv_ratio=0.4):
    """Replace a share of the original examples with adversarial ones.

    At most ``adv_ratio * len(original_ds)`` original rows are swapped out, and never
    more than there are adversarial rows to take their place, so the mixed dataset
    keeps the size of ``original_ds``.

    Args:
        original_ds: Clean dataset (needs ``select`` and ``len``).
        adv_ds: Adversarial dataset (needs ``select`` and ``len``).
        adv_ratio: Target fraction of the result that should be adversarial.

    Returns:
        The shuffled (seed 42) concatenation of the kept original rows and the
        selected adversarial rows, or ``original_ds`` unchanged when no adversarial
        rows can be mixed in.
    """
    if len(adv_ds) == 0:
        print("No adversarial examples generated → using original dataset only")
        return original_ds

    # Cap the swap count by what is actually available: dropping more originals than
    # we can replace would shrink the training set, and appending more adversarial
    # rows than requested would overshoot the ratio.
    num_adv = min(int(len(original_ds) * adv_ratio), len(adv_ds))
    if num_adv == 0:
        print("Adversarial share rounds to zero examples → using original dataset only")
        return original_ds

    original_subset = original_ds.select(range(len(original_ds) - num_adv))
    adv_subset = adv_ds.select(range(num_adv))

    mixed = concatenate_datasets([original_subset, adv_subset])
    mixed = mixed.shuffle(seed=42)

    print(f"Mixed dataset size: {len(mixed)} (original: {len(original_subset)}, adv: {len(adv_subset)})")
    return mixed

Overwriting src/adversarial.py


In [22]:
%%writefile src/model.py
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch

def load_baseline_model(model_dir="distilbert-finetuned-sst2"):
    """Load the un-quantized two-label classifier and its tokenizer.

    Args:
        model_dir: Model identifier or directory handed to ``from_pretrained``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2)
    return classifier, AutoTokenizer.from_pretrained(model_dir)

def load_quantized_bitsandbytes(model_dir):
    """Load the classifier from ``model_dir`` with 4-bit bitsandbytes quantization.

    The quantization config requests 4-bit NF4 weights with double quantization and
    float16 as the compute dtype; device placement is delegated to
    ``device_map="auto"``.

    Args:
        model_dir: Model identifier or directory handed to ``from_pretrained``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    bnb_settings = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    }
    quantized = AutoModelForSequenceClassification.from_pretrained(
        model_dir,
        quantization_config=BitsAndBytesConfig(**bnb_settings),
        device_map="auto",
    )
    return quantized, AutoTokenizer.from_pretrained(model_dir)

def create_pipeline(model, tokenizer):
    """Build a ``text-classification`` pipeline around ``model`` and ``tokenizer``.

    The pipeline is configured with a batch size of 32 and truncation to at most
    128 tokens.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    pipeline_options = {
        "model": model,
        "tokenizer": tokenizer,
        "batch_size": 32,
        "truncation": True,
        "max_length": 128,
    }
    return pipeline("text-classification", **pipeline_options)

Overwriting src/model.py


In [7]:
%%writefile src/train.py
import sys
import os
# Make the directory above this script importable so the absolute `src.*` imports
# below resolve when the file is launched as `python src/train.py`.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

import mlflow
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from evaluate import load
import torch
from codecarbon import OfflineEmissionsTracker
from src.data import load_and_preprocess_glue, preprocess_function
from src.adversarial import generate_adversarial_examples, create_mixed_dataset
from src.model import load_baseline_model

# Store MLflow tracking data under ./mlruns (relative to the working directory) and
# group every run of this script under a single experiment name.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("energy-efficient-distilbert-sst2")

# Accuracy metric loaded through `evaluate`; consumed by compute_metrics.
accuracy_metric = load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``; ``logits`` must offer
            ``argmax``, which is applied over the last axis to get class ids.

    Returns:
        The result of ``accuracy_metric.compute`` for those predictions and labels.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = raw_scores.argmax(-1)
    return accuracy_metric.compute(references=gold_labels, predictions=predicted_classes)

# Data: reduced SST-2 train/test subsets plus the tokenizer of the base checkpoint.
train_ds, test_ds = load_and_preprocess_glue()
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize both subsets in batches (preprocess_function pads/truncates to its default length).
tokenized_train = train_ds.map(lambda ex: preprocess_function(ex, tokenizer), batched=True)
tokenized_test  = test_ds.map(lambda ex: preprocess_function(ex, tokenizer), batched=True)

# Fresh two-label classification head on top of the pretrained DistilBERT encoder.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Shared training configuration. Evaluation and checkpointing both happen once per
# epoch, and the checkpoint with the highest accuracy is restored when training ends.
# The same object is reused (with a modified epoch count) by the adversarial stage.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key produced by compute_metrics
    greater_is_better=True,
    fp16=True,  # NOTE(review): mixed precision presumably needs a CUDA GPU — confirm for CPU-only runs
    report_to="none",  # MLflow logging is done explicitly by this script instead
)

# Stage 1: baseline fine-tuning, recorded as its own MLflow run.
with mlflow.start_run(run_name="baseline_finetune"):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # Track emissions for the training loop only. The finally clause guarantees the
    # tracker is shut down even when training raises.
    tracker = OfflineEmissionsTracker(project_name="baseline_train", log_level="error")
    tracker.start()
    try:
        trainer.train()
    finally:
        co2_kg = tracker.stop()

    eval_results = trainer.evaluate()
    mlflow.log_params({
        "epochs": 3,
        "batch_size": 16,
        "task": "sst2",
        "model": "distilbert-base-uncased"
    })
    mlflow.log_metric("final_accuracy", eval_results["eval_accuracy"])

    # CodeCarbon's stop() may return None when it could not produce a measurement.
    # Skip the emissions metrics in that case instead of crashing before the trained
    # model has been written to disk.
    if co2_kg is not None:
        mlflow.log_metric("co2_kg", co2_kg)
        # Rough conversion; assumes 0.35 kg CO2 per kWh — TODO confirm this factor.
        mlflow.log_metric("approx_energy_kwh", co2_kg / 0.35)
    else:
        print("CodeCarbon returned no emissions value; skipping emissions metrics for baseline run.")

    trainer.save_model("./distilbert-finetuned-sst2")
    tokenizer.save_pretrained("./distilbert-finetuned-sst2")

# Adversarial training
# Stage 2: attack the model produced by stage 1, mix the successful adversarial
# examples into the training data and continue fine-tuning the same model object.
print("Generating adversarial examples...")
adv_ds = generate_adversarial_examples(model, tokenizer, train_ds, num_examples=800)
mixed_ds = create_mixed_dataset(train_ds, adv_ds, adv_ratio=0.4)
tokenized_mixed = mixed_ds.map(lambda ex: preprocess_function(ex, tokenizer), batched=True)

with mlflow.start_run(run_name="adversarial_finetune"):
    # Reuse the baseline configuration, but train for fewer epochs.
    training_args.num_train_epochs = 2

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_mixed,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # Track emissions for the training loop only. The finally clause guarantees the
    # tracker is shut down even when training raises.
    tracker = OfflineEmissionsTracker(project_name="adv_train", log_level="error")
    tracker.start()
    try:
        trainer.train()
    finally:
        co2_kg_adv = tracker.stop()

    eval_results_adv = trainer.evaluate()
    mlflow.log_metric("final_accuracy_adv", eval_results_adv["eval_accuracy"])

    # CodeCarbon's stop() may return None when it could not produce a measurement.
    # Skip the emissions metrics in that case instead of crashing before the robust
    # model has been written to disk.
    if co2_kg_adv is not None:
        mlflow.log_metric("co2_kg_adv", co2_kg_adv)
        # Rough conversion; assumes 0.35 kg CO2 per kWh — TODO confirm this factor.
        mlflow.log_metric("approx_energy_kwh_adv", co2_kg_adv / 0.35)
    else:
        print("CodeCarbon returned no emissions value; skipping emissions metrics for adversarial run.")

    trainer.save_model("./distilbert-robust-sst2")
    tokenizer.save_pretrained("./distilbert-robust-sst2")

print("Training completed. Models saved to:")
print("  - Baseline:   ./distilbert-finetuned-sst2")
print("  - Robust:     ./distilbert-robust-sst2")



Writing src/train.py


In [26]:
%%writefile src/inference_eval.py
import sys
import os

# Fix import path for Colab: put the directory above this script at the front of
# sys.path so the absolute `src.*` imports below resolve.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

import time
import torch
import mlflow
from codecarbon import OfflineEmissionsTracker
from transformers import AutoTokenizer
from src.data import load_and_preprocess_glue, preprocess_function
from src.model import load_quantized_bitsandbytes, create_pipeline, load_baseline_model



# Load test set (500 samples from GLUE validation)
_, test_ds = load_and_preprocess_glue()

model_dir = "./distilbert-robust-sst2"  # use robust model

# Baseline (full-precision) model.
# load_baseline_model does not choose a device, while the quantized model below is
# placed through device_map="auto". Move the baseline onto the GPU when one is
# available so both models are benchmarked on the same hardware.
baseline_model, tokenizer = load_baseline_model(model_dir)
if torch.cuda.is_available():
    baseline_model = baseline_model.to("cuda")
baseline_pipe = create_pipeline(baseline_model, tokenizer)

# Quantized model (4-bit NF4 via bitsandbytes, as configured in src/model.py)
quant_model, _ = load_quantized_bitsandbytes(model_dir)
quant_pipe = create_pipeline(quant_model, tokenizer)

def measure_latency(model, tokenizer, texts, runs=50, batch_size=128):
    """
    Average per-sample latency of manually batched forward passes.

    Each run tokenizes ``texts`` in chunks of ``batch_size`` and feeds every chunk
    through ``model`` with gradients disabled. When the model lives on a CUDA device
    the device is synchronized before the timer starts and before it stops, so that
    asynchronously queued GPU work is included in the measurement.

    Args:
        model: Model exposing ``eval()``, ``parameters()`` and ``__call__(**inputs)``.
        tokenizer: Callable returning an object with ``.to(device)`` that can be
            unpacked as keyword arguments for ``model``.
        texts: Sequence of input strings.
        runs: Number of timed passes over ``texts``.
        batch_size: Number of texts per forward pass.

    Returns:
        float: Mean seconds per sample, averaged over ``runs``.

    Raises:
        ValueError: If ``texts`` is empty or ``runs``/``batch_size`` is not positive.
    """
    if len(texts) == 0:
        raise ValueError("texts must contain at least one sample")
    if runs <= 0 or batch_size <= 0:
        raise ValueError("runs and batch_size must be positive")

    model.eval()
    device = next(model.parameters()).device
    on_cuda = getattr(device, "type", None) == "cuda"

    def _sync():
        # CUDA kernels are launched asynchronously; wait for them so the wall-clock
        # timestamps bracket the real work.
        if on_cuda:
            torch.cuda.synchronize(device)

    times = []
    with torch.no_grad():
        for _ in range(runs):
            _sync()
            start = time.perf_counter()
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]
                inputs = tokenizer(
                    batch_texts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=128
                ).to(device)
                model(**inputs)
            _sync()
            times.append(time.perf_counter() - start)

    avg_time_total = sum(times) / runs
    return avg_time_total / len(texts)  # seconds per sample

# Use a large sample for stable measurement
sample_texts = list(test_ds["sentence"][:400])  # or full len(test_ds) if you want max accuracy

baseline_lat = measure_latency(baseline_model, tokenizer, sample_texts)
quant_lat    = measure_latency(quant_model, tokenizer, sample_texts)

# The quantized model is loaded by load_quantized_bitsandbytes (4-bit NF4 in
# src/model.py), so it is labelled "Quantized" here rather than "8-bit".
print(f"Baseline latency: {baseline_lat:.4f} s/sample | Quantized: {quant_lat:.4f} s/sample")
if baseline_lat > 0:
    print(f"Latency reduction: {((baseline_lat - quant_lat) / baseline_lat) * 100:.1f}%")


num_loops = 100
batch_size_energy = 64

device_baseline = next(baseline_model.parameters()).device
device_quant = next(quant_model.parameters()).device


def _measure_inference_emissions(model, device, project_name):
    """Run the fixed inference workload under a CodeCarbon tracker.

    The workload is ``num_loops`` forward passes over the first ``batch_size_energy``
    entries of ``sample_texts`` (tokenization included), with gradients disabled.

    Args:
        model: Model to run.
        device: Device the tokenized inputs are moved to (the model's device).
        project_name: Name given to the ``OfflineEmissionsTracker``.

    Returns:
        The value returned by ``tracker.stop()`` (kg CO₂eq as reported by
        CodeCarbon), which may be ``None`` when no measurement was produced.
    """
    tracker = OfflineEmissionsTracker(project_name=project_name)
    tracker.start()
    try:
        for _ in range(num_loops):
            inputs = tokenizer(
                sample_texts[:batch_size_energy],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128
            ).to(device)
            with torch.no_grad():
                model(**inputs)
        # CUDA work is queued asynchronously; wait for it so the tracker is not
        # stopped while the GPU is still busy with this workload.
        if device.type == "cuda":
            torch.cuda.synchronize(device)
    finally:
        # Always shut the tracker down, even if inference fails.
        measured = tracker.stop()
    return measured


# Baseline energy
baseline_emissions = _measure_inference_emissions(
    baseline_model, device_baseline, "baseline_inference"
)

# Quantized energy
emissions = _measure_inference_emissions(
    quant_model, device_quant, "quantized_inference"
)

if baseline_emissions is None or emissions is None:
    # Formatting None with ":.6f" would raise; report the missing measurement instead.
    print("\nCodeCarbon returned no emissions value; skipping energy comparison.")
else:
    print(f"\nBaseline energy: {baseline_emissions:.6f} kg CO₂eq")
    print(f"Quantized energy: {emissions:.6f} kg CO₂eq")
    if baseline_emissions > 0:
        reduction = ((baseline_emissions - emissions) / baseline_emissions) * 100
        print(f"Energy reduction: {reduction:.1f}%")

print("\nEvaluation finished.")

Overwriting src/inference_eval.py


In [9]:
%%writefile src/app.py
import os
import secrets

from fastapi import FastAPI, HTTPException, Security, Depends
from fastapi.security import APIKeyHeader
from pydantic import BaseModel

from src.model import load_quantized_bitsandbytes, create_pipeline

app = FastAPI(title="Energy-Efficient DistilBERT Inference")

# Basic API key security.
# The expected key is taken from the API_KEY environment variable so that a real
# deployment does not keep its secret in source code. "test123" remains only as the
# fallback for local testing and must not be relied on in production.
API_KEY = os.environ.get("API_KEY", "test123")
# Clients send the key in the X-API-Key request header.
api_key_header = APIKeyHeader(name="X-API-Key")

async def get_api_key(api_key: str = Security(api_key_header)):
    """Dependency that validates the API key sent with a request.

    Args:
        api_key: Value extracted by ``api_key_header``.

    Returns:
        The supplied key when it matches ``API_KEY``.

    Raises:
        HTTPException: 403 when the key does not match.
    """
    # Constant-time comparison so response timing does not reveal how much of the key
    # was correct. Both sides are encoded to bytes because compare_digest rejects
    # non-ASCII str arguments.
    if not secrets.compare_digest(api_key.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(status_code=403, detail="Invalid API Key")
    return api_key

# Request body schema for POST /predict.
class PredictRequest(BaseModel):
    # Raw input text; the predict endpoint hands it to the classification pipeline.
    text: str

# Load quantized robust model at startup
# This runs at import time, so importing this module requires the
# ./distilbert-robust-sst2 directory (saved by src/train.py) to exist relative to the
# working directory.
model, tokenizer = load_quantized_bitsandbytes("./distilbert-robust-sst2")
# One shared pipeline instance serves every request.
pipe = create_pipeline(model, tokenizer)

@app.post("/predict", dependencies=[Depends(get_api_key)])
async def predict(request: PredictRequest):
    """Classify ``request.text`` and return the top prediction.

    The endpoint is protected by the ``get_api_key`` dependency. The response holds
    the predicted ``label``, its ``score`` as a plain float, and a fixed ``model``
    identifier.
    """
    predictions = pipe(request.text)
    top = predictions[0]

    response = {"label": top["label"]}
    response["score"] = float(top["score"])
    # NOTE(review): the identifier says "8bit", but load_quantized_bitsandbytes in
    # src/model.py configures load_in_4bit=True — confirm which name is intended.
    response["model"] = "distilbert-8bit-quantized-robust"
    return response

@app.get("/health")
async def health():
    """Health probe: returns a fixed payload and performs no checks of its own."""
    payload = dict(status="healthy", model_loaded=True)
    return payload

Writing src/app.py


In [24]:
# Run the training script as a separate process: baseline fine-tuning followed by
# adversarial fine-tuning; it saves both models under the current directory.
!python src/train.py

2026-02-03 19:51:19.471985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770148279.494351    6619 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770148279.501691    6619 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770148279.526586    6619 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770148279.526619    6619 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770148279.526628    6619 computation_placer.cc:177] computation placer alr

In [27]:
# Run the latency / emissions comparison between the full-precision and the quantized
# model; it loads ./distilbert-robust-sst2, so the training script must have run first.
!python src/inference_eval.py

2026-02-03 19:55:08.138482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770148508.158099    7778 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770148508.164122    7778 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770148508.179323    7778 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770148508.179346    7778 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770148508.179353    7778 computation_placer.cc:177] computation placer alr