# Setting Up

In [None]:
# Install the notebook's dependencies. Every line except the last must end with
# a backslash, otherwise the remaining packages are not passed to pip.
!pip install -U \
  "transformers==4.44.2" \
  "datasets==3.0.1" \
  "evaluate>=0.4.3" \
  "accelerate>=1.0.0" \
  "scikit-learn>=1.5.0" \
  "statsmodels>=0.14.2" \
  "scipy>=1.13.0" \
  "pandas"

Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==3.0.1
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate>=0.4.3
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting accelerate>=1.0.0
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn>=1.5.0
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=202

In [None]:
# imports
import os
from huggingface_hub import hf_hub_download, notebook_login
import torch as t
import numpy as np
import random
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
import collections
import time
import math
import pandas as pd
import json
import platform
import matplotlib.pyplot as plt
import tempfile
from datetime import datetime
from collections import Counter
from google.colab import files

In [None]:
# Log in to the Hugging Face Hub; notebook_login() renders an interactive
# login widget in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Define a single seed for reproducibility and apply it to Python's `random`,
# NumPy's global RNG, and PyTorch.
SET_SEED = 3
random.seed(SET_SEED)
np.random.seed(SET_SEED)
t.manual_seed(SET_SEED)

# Mark the compute device (CPU)
DEVICE = t.device("cpu")


In [None]:
# Helping methods for the models
# load_mlm --> load a pre-trained masked language model and its tokenizer
def load_mlm(model_name):
  """Load the fast tokenizer and masked-LM weights for `model_name`.

  Args:
    model_name (str): model identifier passed to `from_pretrained`.

  Returns:
    (tokenizer, model): the model is moved to the CPU and put in eval mode.
  """
  mlm = AutoModelForMaskedLM.from_pretrained(model_name)
  mlm = mlm.to("cpu")
  mlm.eval()
  tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
  return tok, mlm

# dptq --> dynamic post-training quantization of the linear layers
def dptq(model):
  """Return the result of dynamically quantizing `model`'s Linear layers to int8.

  Only `torch.nn.Linear` modules are listed in the quantization spec; the call
  is delegated to `torch.ao.quantization.quantize_dynamic`.
  """
  linear_only = {t.nn.Linear}
  quantized = t.ao.quantization.quantize_dynamic(
    model, qconfig_spec=linear_only, dtype=t.qint8
  )
  return quantized

def rebuild_datasets():
  """Load the CrowS-Pairs test split and the StereoSet intrasentence validation split.

  Prints the size of each dataset as a quick check that it loaded.

  Returns:
    (crows, stereoset): the two loaded datasets.
  """
  # crows_pairs ships a custom loading script. Without trust_remote_code=True
  # load_dataset stops at an interactive [y/N] prompt, which blocks "Run All".
  # NOTE: this executes code from the dataset repository (https://hf.co/datasets/crows_pairs).
  crows = load_dataset("crows_pairs", split="test", trust_remote_code=True)
  print("CrowS-Pair dataset size:", len(crows))

  # load the StereoSet dataset then verify the len to ensure loaded properly
  stereoset = load_dataset("stereoset", "intrasentence", split="validation")
  print("StereoSet dataset size:", len(stereoset))
  return crows, stereoset

crows, stereoset = rebuild_datasets()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


crows_pairs.py: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

The repository for crows_pairs contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/crows_pairs.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/438k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1508 [00:00<?, ? examples/s]

CrowS-Pair dataset size: 1508


README.md: 0.00B [00:00, ?B/s]

intrasentence/validation-00000-of-00001.(…):   0%|          | 0.00/599k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/2106 [00:00<?, ? examples/s]

StereoSet dataset size: 2106


In [None]:
# pseudo-log-likelihood (PLL) for MLMs, masking one token at a time

@t.inference_mode()
def pll_sentence(cur_model, cur_tokenizer, cur_text):
  """Return the pseudo-log-likelihood of `cur_text` under a masked language model.

  Each token position except the first and the last (assumed to be the special
  tokens the tokenizer adds around the sentence) is replaced by the mask token
  in turn, and the log-probability the model assigns to the original token at
  that position is summed.

  Args:
    cur_model: masked LM; called with `input_ids` and `attention_mask`, and its
      output must expose `.logits`.
    cur_tokenizer: tokenizer matching `cur_model`; must define `mask_token_id`.
    cur_text (str): sentence to score.

  Returns:
    float: sum of the per-token log-probabilities.

  Raises:
    ValueError: if the tokenizer has no mask token. Scoring unmasked input
      would not be a pseudo-log-likelihood.
  """
  # Use the tokenizer that was passed in, not a global one: the mask id must
  # belong to the same vocabulary as the model being scored.
  mask_id = cur_tokenizer.mask_token_id
  if mask_id is None:
    raise ValueError("pll_sentence requires a tokenizer that defines a mask token")

  # encode cur_text; take the token IDs from the batch dimension and keep the attention mask
  encode = cur_tokenizer(cur_text, return_tensors="pt")
  input_ids = encode["input_ids"][0]
  attn = encode["attention_mask"]
  total = 0.0

  # iterate over every token in the input sequence except the first and last
  for i in range(1, input_ids.size(0) - 1):
    # copy the original tokens and replace position i with the mask token
    masked = input_ids.clone()
    masked[i] = mask_id

    # run the model and take the raw predictions (logits) at the masked position
    out = cur_model(input_ids=masked.unsqueeze(0), attention_mask=attn)
    logits = out.logits[0, i]

    # add the log-probability of the original token at position i
    total += t.log_softmax(logits, dim=-1)[input_ids[i]].item()

  return float(total)


# ======================================

# Crows-Pair eval breaking into stereotypical choice % broken by bias type
# Goal: get as close to 50/50 as possible

def crows_pair_eval(cur_model, cur_tokenizer, dataset, limit=None):
  """Measure how often the model prefers the stereotypical sentence of each pair.

  For every example both sentences are scored with `pll_sentence`; the example
  counts as a "stereotypical pick" when the higher-scoring sentence is the one
  flagged as stereotypical by `crows_pair_eval_getter`.

  Args:
    cur_model, cur_tokenizer: forwarded to `pll_sentence`.
    dataset: iterable of examples; must support `.select(range(limit))` when
      `limit` is given.
    limit (int | None): evaluate only the first `limit` examples; None = all.

  Returns:
    dict: "crows_overall_stereo_pct" plus one "crows_<bias_type>_stereo_pct"
    entry per raw `bias_type` value seen (percentages in [0, 100]).
  """
  examples = dataset.select(range(limit)) if limit is not None else dataset

  # whole-dataset tallies
  stereo_picks = 0
  n_seen = 0
  # per-bias-type tallies
  picks_per_type = collections.Counter()
  seen_per_type = collections.Counter()

  for ex in examples:
    # the category returned by the getter is not used; the raw field is read below
    s_more, s_less, more_is_stereo, _ = crows_pair_eval_getter(ex)

    score_more = pll_sentence(cur_model, cur_tokenizer, s_more)
    score_less = pll_sentence(cur_model, cur_tokenizer, s_less)
    prefers_more = score_more > score_less

    # a hit is when the preferred sentence is the stereotypical one
    if prefers_more:
      picked_stereo = bool(more_is_stereo)
    else:
      picked_stereo = not more_is_stereo
    hit = int(picked_stereo)

    stereo_picks += hit
    n_seen += 1

    bias_key = ex["bias_type"]
    picks_per_type[bias_key] += hit
    seen_per_type[bias_key] += 1

  result = {"crows_overall_stereo_pct": 100.0 * stereo_picks / max(n_seen, 1)}
  for bias_key in seen_per_type:
    pct = 100.0 * picks_per_type[bias_key] / max(seen_per_type[bias_key], 1)
    result[f"crows_{bias_key}_stereo_pct"] = pct
  return result

# ======================================

def crows_pair_eval_getter(ex, default_more_is_stereo=True):
  """Pull the sentence pair, stereotype flag and bias type out of one example.

  Args:
    ex: mapping with `.get`; sentences are read from "sent_more"/"sent_less",
      falling back to "sentence_more"/"sentence_less".
    default_more_is_stereo: value used when there is no usable "stereotype" field.

  Returns:
    (s_more, s_less, more_is_stereo, bias_type) where bias_type is the
    upper-cased string form of ex["bias_type"] ("UNKNOWN" if absent).
  """
  def first_present(primary, fallback):
    return ex.get(primary, ex.get(fallback, None))

  s_more = first_present("sent_more", "sentence_more")
  s_less = first_present("sent_less", "sentence_less")

  # start from the default and override only for a recognised label type
  more_is_stereo = default_more_is_stereo
  if "stereotype" in ex:
    label = ex["stereotype"]
    if isinstance(label, str):
      more_is_stereo = label.strip().lower().startswith("stereo")
    elif isinstance(label, int):
      # bool is a subclass of int, so this covers both; truthy means
      # "sent_more is stereotypical"
      more_is_stereo = bool(label)

  raw_type = ex.get("bias_type", "UNKNOWN")
  return s_more, s_less, more_is_stereo, str(raw_type).upper()


In [None]:
def norm_label(name: str) -> str:
    """Map a raw label to "anti-stereotype", "stereotype" or "unrelated".

    The label is stringified and lower-cased; "anti" is checked before "stereo"
    because "anti-stereotype" contains both substrings.
    """
    text = str(name).lower().replace("_", "-")
    for needle, label in (("anti", "anti-stereotype"), ("stereo", "stereotype")):
        if needle in text:
            return label
    return "unrelated"

def eval_stereoset_intra_gold(model, tokenizer, dataset, limit=None, verbose=False):
    """Score StereoSet intrasentence examples by pseudo-log-likelihood.

    For each example the first stereotype, anti-stereotype and unrelated
    sentence are scored with `pll_sentence`; the highest-scoring one wins.

    Args:
        model, tokenizer: forwarded to `pll_sentence`.
        dataset: examples with a "sentences" dict holding parallel "sentence"
            and "gold_label" lists; must support `.select` when `limit` is set.
        limit (int | None): evaluate only the first `limit` examples.
        verbose (bool): print the used/skipped example counts.

    Returns:
        dict with
          "stereoset_stereotype_pct": stereotype wins as a % of stereotype + anti wins,
          "stereoset_lm_ok_pct": % of examples where the unrelated sentence did not win,
          "stereoset_total": number of examples scored.
    """
    # gold_label id -> name mapping from the dataset schema, if it is available
    try:
        gold_names = dataset.features["sentences"].feature["gold_label"].names
    except Exception:
        gold_names = None

    label_order = ("stereotype", "anti-stereotype", "unrelated")
    wins = {label: 0 for label in label_order}
    total = 0
    n_used = 0
    n_skipped = 0

    examples = dataset.select(range(limit)) if limit is not None else dataset
    for ex in examples:
        group = ex["sentences"]
        sentences = group.get("sentence", [])
        gold_ids = group.get("gold_label", [])

        # malformed example (non-list fields or mismatched lengths) --> skip
        well_formed = (
            isinstance(sentences, list)
            and isinstance(gold_ids, list)
            and len(sentences) == len(gold_ids)
        )
        if not well_formed:
            n_skipped += 1
            continue

        # group the non-blank sentences under their normalized gold label
        by_label = {label: [] for label in label_order}
        for sentence, gold in zip(sentences, gold_ids):
            if not (isinstance(sentence, str) and sentence.strip()):
                continue
            if gold_names and isinstance(gold, int) and 0 <= gold < len(gold_names):
                raw_label = gold_names[gold]
            else:
                raw_label = gold
            label = norm_label(raw_label)
            if label in by_label:
                by_label[label].append(sentence.strip())

        # every category needs at least one sentence, otherwise skip
        if any(not by_label[label] for label in label_order):
            n_skipped += 1
            continue

        # first sentence of each category (simple, deterministic)
        candidates = [(label, by_label[label][0]) for label in label_order]
        scores = [pll_sentence(model, tokenizer, text) for _, text in candidates]
        best_label = candidates[int(np.argmax(scores))][0]
        wins[best_label] += 1

        total += 1
        n_used += 1

    if verbose:
        print(f"used: {n_used} | skipped: {n_skipped}")

    stereo = wins["stereotype"]
    anti = wins["anti-stereotype"]
    unrelated = wins["unrelated"]
    denom = max(stereo + anti, 1)
    return {
        "stereoset_stereotype_pct": 100.0 * stereo / denom,
        "stereoset_lm_ok_pct": 100.0 * (1.0 - unrelated / max(total,1)),
        "stereoset_total": total,
    }

In [None]:
# Smoke test: make sure the model loads and both evaluators run end-to-end on
# a small slice (40 examples each) before starting a full run.
# The model and tokenizer are loaded once and shared by both checks.
try:
  tokenizer, model = load_mlm("bert-base-uncased")
except Exception as e:
  print(f"Smoke test: model load failed: {type(e).__name__}: {e}")
else:
  # Each check has its own try/except so one failure does not hide the other,
  # and the message says which check failed and with what exception type.
  try:
    output = crows_pair_eval(model, tokenizer, crows, limit=40)
    print("Crows smoke: ", output)
  except Exception as e:
    print(f"Crows smoke failed: {type(e).__name__}: {e}")

  try:
    out = eval_stereoset_intra_gold(model, tokenizer, stereoset, limit=40, verbose=True)
    print("StereoSet smoke:", out)
  except Exception as e:
    print(f"StereoSet smoke failed: {type(e).__name__}: {e}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

In [None]:
# Just for our sanity, here are the CrowS feature names.
# If the `bias_type` feature exposes `.names`, build an id -> name map from it;
# otherwise the map is empty and a placeholder message is printed instead.
feat = crows.features.get("bias_type")
id2label = {i:name for i,name in enumerate(getattr(feat, "names", []))} if feat else {}
print("CrowS bias_type labels:", id2label or "(not labeled in this mirror)")

# Running the Models

In [None]:
# Constants

# Models iterated over by the FP32 vs INT8 comparison cell
MODELS_PARTIAL = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "distilroberta-base",
]

# Per-dataset example limits passed to the evaluators; None runs the full dataset.
# Temporary smoke-test values used previously: 80, 200
CROWS_SMOKE_LIMIT = None
STEREO_SMOKE_LIMIT = None

# Directory the result CSV/JSON files are written to
RESULTS_DIR = "/content/results"

In [None]:
# clean the model name so it can be used in a file name without issues
def clean_model_name(name):
  """Return `name` with every "/" and " " replaced by "_"."""
  unsafe_to_safe = str.maketrans({"/": "_", " ": "_"})
  return name.translate(unsafe_to_safe)

# ===========================

# function to calculate the on-disk size of the model
def model_disk_size_mb(model):
    """Return the size in MiB of `model.state_dict()` when saved with `torch.save`.

    The state dict is written to a temporary ".pt" file, the file size is read,
    and the file is removed again (removal errors are ignored).
    """
    handle, path = tempfile.mkstemp(suffix=".pt")
    os.close(handle)
    try:
        # detach tensors and move them to the CPU; leave non-tensor entries untouched
        cpu_state = {
            key: (value.detach().cpu() if isinstance(value, t.Tensor) else value)
            for key, value in model.state_dict().items()
        }
        t.save(cpu_state, path)
        size_bytes = os.path.getsize(path)
    finally:
        try:
            os.unlink(path)
        except OSError:
            pass
    return size_bytes / (1024 * 1024)

In [None]:
# NOTE: This is for non-large models.
# For each model in MODELS_PARTIAL: evaluate the FP32 model, dynamically
# quantize it to INT8, evaluate again, and record one result row per variant.
crows, stereo = rebuild_datasets()

SMOKE_LIMITS = dict(crows=CROWS_SMOKE_LIMIT, stereo=STEREO_SMOKE_LIMIT) # see constants section to change / set limits
os.makedirs(RESULTS_DIR, exist_ok=True)
compression_rows = []

for model_name in MODELS_PARTIAL:
    print(f"Evaluating: {model_name}")
    tokenizer, model = load_mlm(model_name)

    # --- measure FP32 size ---
    size_fp = model_disk_size_mb(model)

    # --- run FP32 evaluation ---
    # "secs" covers both the CrowS and the StereoSet evaluation.
    # FP32 rows carry no "compression_ratio", so that column is NaN for them.
    t0 = time.time()
    crows_fp = crows_pair_eval(model, tokenizer, crows, limit=SMOKE_LIMITS["crows"])
    stereo_fp = eval_stereoset_intra_gold(model, tokenizer, stereo, limit=SMOKE_LIMITS["stereo"])
    compression_rows.append({
        "model": model_name,
        "quantized": 0,
        "secs": time.time() - t0,
        "size_mb": size_fp,
        **crows_fp,
        **stereo_fp
    })

    # --- quantize ---
    quantized_model = dptq(model.cpu()).eval()

    # --- measure INT8 size ---
    size_q = model_disk_size_mb(quantized_model)

    # --- run INT8 evaluation ---
    t1 = time.time()
    crows_q = crows_pair_eval(quantized_model, tokenizer, crows, limit=SMOKE_LIMITS["crows"])
    stereo_q = eval_stereoset_intra_gold(quantized_model, tokenizer, stereo, limit=SMOKE_LIMITS["stereo"])
    compression_rows.append({
        "model": model_name,
        "quantized": 1,
        "secs": time.time() - t1,
        "size_mb": size_q,
        "compression_ratio": size_fp / max(size_q, 1e-6),
        **crows_q,
        **stereo_q
    })

# Save + display: one table for all models, written as CSV and JSON and then
# offered for download through Colab.
comp_df = pd.DataFrame(compression_rows)
comp_df = comp_df.sort_values(["model", "quantized"]).reset_index(drop=True)
print(comp_df.to_string(index=False))

out_csv = f"{RESULTS_DIR}/partD_compression_full.csv"
out_json = f"{RESULTS_DIR}/partD_compression_full.json"
comp_df.to_csv(out_csv, index=False)
comp_df.to_json(out_json, orient="records", indent=2)
files.download(out_csv)
files.download(out_json)
print("Saved results to:")
print("  CSV :", out_csv)
print("  JSON:", out_json)

CrowS-Pair dataset size: 1508
StereoSet dataset size: 2106
Evaluating: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, 

            model  quantized      secs    size_mb  crows_overall_stereo_pct  crows_0_stereo_pct  crows_1_stereo_pct  crows_2_stereo_pct  stereoset_stereotype_pct  stereoset_lm_ok_pct  stereoset_total  compression_ratio
bert-base-uncased          0 64.876558 417.827373                      60.0           66.666667                 0.0               100.0                      60.0                100.0                5                NaN
bert-base-uncased          1 22.009970 195.545247                      80.0           66.666667               100.0               100.0                     100.0                 80.0                5            2.13673


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved results to:
  CSV : /content/results/partD_compression_full.csv
  JSON: /content/results/partD_compression_full.json


In [None]:
# NOTE: this is for the large models, adjust as needed.
# NOTE(review): ARE_DOING_CROWS / ARE_DOING_STEREO / SAVE_SUFFIX are defined
# here but not consulted below; both evaluations always run. Confirm whether
# they should gate the evaluations before relying on them.
ARE_DOING_CROWS = False
ARE_DOING_STEREO = not ARE_DOING_CROWS
SAVE_SUFFIX = "crows" if ARE_DOING_CROWS else "stereo"
BERT_LARGE = "bert-large-uncased"
ROBERTA_LARGE = "roberta-large"
RESULTS_DIR = "/content/results"

models_large = [ROBERTA_LARGE]
crows, stereo = rebuild_datasets()

SMOKE_LIMITS = dict(crows=CROWS_SMOKE_LIMIT, stereo=STEREO_SMOKE_LIMIT) # see constants section to change / set limits
os.makedirs(RESULTS_DIR, exist_ok=True)
compression_rows = []

for model_name in models_large:
    print(f"Evaluating: {model_name}")
    tokenizer, model = load_mlm(model_name)

    # --- measure FP32 size ---
    size_fp = model_disk_size_mb(model)

    # --- run FP32 evaluation ---
    # "secs" covers both the CrowS and the StereoSet evaluation.
    t0 = time.time()
    crows_fp = crows_pair_eval(model, tokenizer, crows, limit=SMOKE_LIMITS["crows"])
    stereo_fp = eval_stereoset_intra_gold(model, tokenizer, stereo, limit=SMOKE_LIMITS["stereo"])
    compression_rows.append({
        "model": model_name,
        "quantized": 0,
        "secs": time.time() - t0,
        "size_mb": size_fp,
        **crows_fp,
        **stereo_fp
    })

    # --- quantize ---
    quantized_model = dptq(model.cpu()).eval()

    # --- measure INT8 size ---
    size_q = model_disk_size_mb(quantized_model)

    # --- run INT8 evaluation ---
    t1 = time.time()
    crows_q = crows_pair_eval(quantized_model, tokenizer, crows, limit=SMOKE_LIMITS["crows"])
    stereo_q = eval_stereoset_intra_gold(quantized_model, tokenizer, stereo, limit=SMOKE_LIMITS["stereo"])
    compression_rows.append({
        "model": model_name,
        "quantized": 1,
        "secs": time.time() - t1,
        "size_mb": size_q,
        "compression_ratio": size_fp / max(size_q, 1e-6),
        **crows_q,
        **stereo_q
    })

# Save + display
comp_df = pd.DataFrame(compression_rows)
comp_df = comp_df.sort_values(["model", "quantized"]).reset_index(drop=True)
print(comp_df.to_string(index=False))

# Use file names distinct from the non-large run so these results do not
# overwrite partD_compression_full.csv / .json written by that cell.
out_csv = f"{RESULTS_DIR}/partD_compression_large.csv"
out_json = f"{RESULTS_DIR}/partD_compression_large.json"
comp_df.to_csv(out_csv, index=False)
comp_df.to_json(out_json, orient="records", indent=2)
files.download(out_csv)
files.download(out_json)
print("Saved results to:")
print("  CSV :", out_csv)
print("  JSON:", out_json)

CrowS-Pair dataset size: 1508
StereoSet dataset size: 2106
Evaluating: roberta-large


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  return t.ao.quantization.quantize_dynamic(


        model  quantized         secs     size_mb  crows_overall_stereo_pct  crows_0_stereo_pct  crows_1_stereo_pct  crows_2_stereo_pct  crows_3_stereo_pct  crows_4_stereo_pct  crows_5_stereo_pct  crows_6_stereo_pct  crows_7_stereo_pct  crows_8_stereo_pct  stereoset_stereotype_pct  stereoset_lm_ok_pct  stereoset_total  compression_ratio
roberta-large          0 47732.458862 1355.913981                 67.506631           68.604651           72.674419            62.21374                70.0           55.974843           67.857143           74.603175           75.238095           71.264368                 64.404297            97.245964             2106                NaN
roberta-large          1 17215.110769  538.100942                 61.339523           60.077519           65.697674            62.21374                60.0           52.201258           67.857143           68.253968           64.761905           59.770115                 63.351298            96.913580             2106   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved results to:
  CSV : /content/results/partD_compression_full.csv
  JSON: /content/results/partD_compression_full.json


# Statistics

In [None]:
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
import glob
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Collect the result CSVs for the statistics below.
# The evaluation cells in this notebook write partD_*.csv directly into
# /content/results, so the search is recursive ("**" also matches zero
# directories) and accepts both the partC_ and partD_ prefixes. Paths are
# de-duplicated and sorted so the concatenation order is deterministic.
RESULT_CSV_PATTERNS = ("partC_*.csv", "partD_*.csv")
all_csvs = sorted({
    path
    for pattern in RESULT_CSV_PATTERNS
    for path in glob.glob(f"/content/results/**/{pattern}", recursive=True)
})
if not all_csvs:
    raise FileNotFoundError(
        "No result CSVs found under /content/results — run the evaluation cells first!"
    )

df_all = pd.concat([pd.read_csv(f) for f in all_csvs], ignore_index=True)

# make sure the FP32 (0) / INT8 (1) indicator is an integer column
df_all["quantized"] = df_all["quantized"].astype(int)

NameError: name 'glob' is not defined

In [None]:
# Linear Regression
# Does compression predict bias % (CrowS overall)?
# This cell regresses the CrowS overall stereotype % on the `quantized`
# indicator (plus an intercept) and tests the slope on `quantized`:

# H0: Compression does not significantly change bias levels
# Ha: Compression does significantly change bias levels
# Use a significance level of alpha = 0.05 for this test

## THIS DOES NOT WORK YET, SINCE THERE ARE ONLY TWO DATA POINTS WITH
## ONE MODEL, WE CANNOT GENERATE INFERENCE STATS (0 degrees of freedom)
## when more models are added, this will work and be interpretable

y = df_all["crows_overall_stereo_pct"]
X = sm.add_constant(df_all["quantized"])
ols = sm.OLS(y, X).fit()

print("\n=== Linear Regression: CrowS Bias ~ Quantization ===")
print(ols.summary())

In [None]:
# Logistic Regression
# Convert bias to a binary outcome: 1 if strictly above the median bias.
# Reuses `y` and `X` defined in the linear-regression cell.

# H0: Compression has no effect on the probability of being above-median bias
# Ha: Compression does significantly change that probability
# Use a significance level of alpha = 0.05 for this test

## ODDS RATIO interpretation template: compressed models have roughly
## <odds ratio> times the odds of exhibiting above-median bias compared
## with uncompressed ones

median_bias = y.median()
df_all["above_median_bias"] = (y > median_bias).astype(int)

logit = sm.Logit(df_all["above_median_bias"], X).fit()
print("\n=== Logistic Regression: Above-Median Bias ~ Quantization ===")
#print(logit.summary())

# Coefficients and their confidence bounds are on the log-odds scale;
# exponentiating them gives odds ratios.
params = logit.params
conf = logit.conf_int()
conf['OR'] = np.exp(params)
conf.columns = ['2.5%', '97.5%', 'OR']
# NOTE(review): conf_exp is not referenced again in this cell
conf_exp = np.exp(conf[['2.5%', '97.5%']])

# Extract specifically for the compression variable
or_est = np.exp(params['quantized'])
ci_low, ci_high = np.exp(conf.loc['quantized', ['2.5%', '97.5%']])

print(f"\nOdds Ratio for Compression (INT8 vs FP32): {or_est:.3f}")
print(f"95% Confidence Interval: [{ci_low:.3f}, {ci_high:.3f}]")

# Optional: print all coefficients as odds ratios
print("\nAll model parameters (in odds-ratio form):")
print(np.exp(logit.params))

In [None]:
# t-test
# group 1: uncompressed (quantized == 0)
# group 2: compressed (quantized == 1)

## we are testing whether the average bias percentage differs significantly
## between the two groups

# H0: mu1 = mu2
# Ha: mu1 != mu2

# equal_var=False runs Welch's t-test (no equal-variance assumption).
# NOTE(review): each model appears to contribute one FP32 and one INT8 row, so
# a paired test may be more appropriate — confirm.
fp32 = df_all[df_all["quantized"] == 0]["crows_overall_stereo_pct"]
int8 = df_all[df_all["quantized"] == 1]["crows_overall_stereo_pct"]
t_stat, p_val = st.ttest_ind(fp32, int8, equal_var=False)

print("\n=== T-Test: FP32 vs INT8 CrowS Bias ===")
print(f"t = {t_stat:.3f},  p = {p_val:.4f}")

In [None]:
# we are using repeated sampling WITH REPLACEMENT (1000 bootstrap resamples)
# to generate a pseudo confidence interval for the mean CrowS bias level of
# the INT8 rows (pooled, not per model)

# these intervals will be potentially useful in more advanced statistical
# analysis

# uses NumPy's global RNG; the 2.5th and 97.5th percentiles of the resampled
# means form the 95% interval
boot_means = [np.mean(np.random.choice(int8, size=len(int8), replace=True))
              for _ in range(1000)]
ci_low, ci_high = np.percentile(boot_means, [2.5, 97.5])
print(f"95% bootstrap CI for INT8 bias: [{ci_low:.2f}, {ci_high:.2f}]")

In [None]:
## simple visual: side-by-side box plots of the CrowS overall stereotype %
## for the FP32 and INT8 rows
plt.figure(figsize=(5,4))
plt.boxplot([fp32, int8], labels=["FP32", "INT8"])
plt.ylabel("CrowS Overall Stereotype %")
plt.title("Bias vs Compression Type")
plt.show()

# Variable Bit Quantization


In [None]:
import copy

def quantize_tensor_uniform(x, N):
  """Simulate symmetric uniform N-bit quantization of a tensor.

  The values are scaled so the largest magnitude maps to the largest signed
  N-bit integer, rounded to the nearest integer, clamped to the signed N-bit
  range, and scaled back. The result stays a float tensor so it can be used
  in normal PyTorch operations.

  Args:
    x: tensor to quantize.
    N: number of bits; must be at least 2.

  Returns:
    The quantized tensor. `x` itself is returned unchanged when it is empty
    or all zeros, since there is nothing to quantize.

  Raises:
    ValueError: if N < 2. With a single bit the largest positive level is 0,
      so the scale would be a division by zero and the weights would become NaN.
  """
  if N < 2:
    raise ValueError(f"N must be at least 2 bits, got {N}")

  # an empty tensor has no maximum; nothing to quantize
  if x.numel() == 0:
    return x

  # min/max of the signed N-bit integer range (for N=8 --> [-128, 127])
  bit_min = -(2 ** (N-1))
  bit_max = (2 ** (N-1)) - 1

  # if everything is 0, nothing to quantize
  tensor_max = x.abs().max()
  if tensor_max == 0:
    return x

  # scale so that the largest magnitude in the tensor maps to bit_max
  scaler = tensor_max / bit_max

  # (x / scaler) --> map each float weight into "integer" units
  # .round() --> snap to the nearest integer value (the quantization step)
  # .clamp(bit_min, bit_max) --> keep every value inside the signed N-bit range
  # x_int * scaler --> map back to floats
  x_int = (x / scaler).round().clamp(bit_min, bit_max)
  x_quantized = x_int * scaler
  return x_quantized

def dptq_nb(model, bits: int, modules=(t.nn.Linear,)):
  """Return a deep copy of `model` whose selected layers are quantized to `bits` bits.

  Every sub-module that is an instance of `modules` has its weight, and its
  bias when present, replaced by the output of `quantize_tensor_uniform`.
  The input model is left untouched.
  """
  quantized = copy.deepcopy(model)

  with t.no_grad():
    for layer in quantized.modules():
      if not isinstance(layer, modules):
        continue
      layer.weight.data = quantize_tensor_uniform(layer.weight.data, bits)
      if layer.bias is not None:
        layer.bias.data = quantize_tensor_uniform(layer.bias.data, bits)

  return quantized

# theoretical size of the model when stored with N bits instead of 32
def theoretical_size(size_fp32_model, N):
  """Scale the FP32 model size by N / 32."""
  fraction_of_fp32 = N / 32.0
  return size_fp32_model * fraction_of_fp32

In [None]:
# Driver cell: for each model below, simulate N_BITS-bit quantization, run the
# CrowS-Pairs and StereoSet evaluations on the quantized copy, and save the results.
VAR_QUAN_MODELS = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "distilroberta-base",
]

# Total number of bits each quantized value uses
# (i.e. the target bit width, not the number of bits removed from 32)
N_BITS = 16

# Optional evaluation limits for a quick smoke test (e.g. 80 for CrowS, 200 for
# StereoSet); set to None to run a full test
CROWS_SMOKE_LIMIT_VAR = None
STEREO_SMOKE_LIMIT_VAR = None

# Datasets used for the (non-large) variable-bit compression models
crows, stereo = rebuild_datasets()

SMOKE_LIMITS = dict(crows=CROWS_SMOKE_LIMIT_VAR, stereo=STEREO_SMOKE_LIMIT_VAR)
os.makedirs(RESULTS_DIR, exist_ok=True)
# One result dict per evaluated model; accumulates across loop iterations
compression_rows = []
cleaned_model_name = ""

for model_name in VAR_QUAN_MODELS:
    print(f"Evaluating: {model_name}")
    # cleaned name is only used to build the output file names below
    cleaned_model_name = clean_model_name(model_name)
    tokenizer, model = load_mlm(model_name)

    # size of the unquantized model; used as the 32-bit baseline below
    size_fp = model_disk_size_mb(model)

    quant_bits = N_BITS

    # --- quantize: dptq_nb deep-copies the (CPU) model and quantizes the copy ---
    quantized_model = dptq_nb(model.cpu(), bits=quant_bits).eval()

    # --- theoretical size at quant_bits bits per value (baseline * bits / 32);
    # this is computed, not measured from a saved file
    size_q_theoretical = theoretical_size(size_fp, quant_bits)

    # --- run the evaluation on the N-bit quantized model ---
    # the timer starts after quantization, so "secs" covers only the two evaluations
    t1 = time.time()
    crows_q = crows_pair_eval(quantized_model, tokenizer, crows, limit=SMOKE_LIMITS["crows"])
    stereo_q = eval_stereoset_intra_gold(quantized_model, tokenizer, stereo, limit=SMOKE_LIMITS["stereo"])
    compression_rows.append({
        "model": model_name,
        "quantized": 1,
        "bits": quant_bits,
        "secs": time.time() - t1,
        "size_mb": size_q_theoretical,
        # the max() guards against dividing by a zero size
        "compression_ratio": size_fp / max(size_q_theoretical, 1e-6),
        **crows_q,
        **stereo_q
    })

    # Save + display
    # NOTE(review): comp_df is rebuilt from ALL rows collected so far, so the
    # files named after this model also contain the rows of every model that was
    # evaluated before it - confirm this cumulative checkpointing is intended.
    comp_df = pd.DataFrame(compression_rows)
    comp_df = comp_df.sort_values(["model", "quantized", "bits"]).reset_index(drop=True)
    print(comp_df.to_string(index=False))

    out_csv = f"{RESULTS_DIR}/{cleaned_model_name}_{N_BITS}_quant.csv"
    out_json = f"{RESULTS_DIR}/{cleaned_model_name}_{N_BITS}_quant.json"
    comp_df.to_csv(out_csv, index=False)
    comp_df.to_json(out_json, orient="records", indent=2)
    # google.colab helper: hands both files to the browser for download
    files.download(out_csv)
    files.download(out_json)
    print("Saved results to:")
    print("  CSV :", out_csv)
    print("  JSON:", out_json)

CrowS-Pair dataset size: 1508
StereoSet dataset size: 2106
Evaluating: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


            model  quantized  bits         secs    size_mb  compression_ratio  crows_overall_stereo_pct  crows_0_stereo_pct  crows_1_stereo_pct  crows_2_stereo_pct  crows_3_stereo_pct  crows_4_stereo_pct  crows_5_stereo_pct  crows_6_stereo_pct  crows_7_stereo_pct  crows_8_stereo_pct  stereoset_stereotype_pct  stereoset_lm_ok_pct  stereoset_total
bert-base-uncased          1    16 15258.463891 208.913686                2.0                 60.941645           59.689922            58.72093           57.251908           78.333333           46.540881           77.380952            69.84127           73.333333            60.91954                 60.544554            95.916429             2106


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved results to:
  CSV : /content/results/bert-base-uncased_16_quant.csv
  JSON: /content/results/bert-base-uncased_16_quant.json
Evaluating: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

            model  quantized  bits         secs    size_mb  compression_ratio  crows_overall_stereo_pct  crows_0_stereo_pct  crows_1_stereo_pct  crows_2_stereo_pct  crows_3_stereo_pct  crows_4_stereo_pct  crows_5_stereo_pct  crows_6_stereo_pct  crows_7_stereo_pct  crows_8_stereo_pct  stereoset_stereotype_pct  stereoset_lm_ok_pct  stereoset_total
bert-base-uncased          1    16 15258.463891 208.913686                2.0                 60.941645           59.689922            58.72093           57.251908           78.333333           46.540881           77.380952           69.841270           73.333333           60.919540                 60.544554            95.916429             2106
     roberta-base          1    16 18649.446432 237.873464                2.0                 61.273210           58.139535            66.27907           57.633588           71.666667           57.232704           66.666667           63.492063           69.523810           64.367816                 63.9

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved results to:
  CSV : /content/results/roberta-base_16_quant.csv
  JSON: /content/results/roberta-base_16_quant.json
Evaluating: distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

                  model  quantized  bits         secs    size_mb  compression_ratio  crows_overall_stereo_pct  crows_0_stereo_pct  crows_1_stereo_pct  crows_2_stereo_pct  crows_3_stereo_pct  crows_4_stereo_pct  crows_5_stereo_pct  crows_6_stereo_pct  crows_7_stereo_pct  crows_8_stereo_pct  stereoset_stereotype_pct  stereoset_lm_ok_pct  stereoset_total
      bert-base-uncased          1    16 15258.463891 208.913686                2.0                 60.941645           59.689922           58.720930           57.251908           78.333333           46.540881           77.380952           69.841270           73.333333           60.919540                 60.544554            95.916429             2106
distilbert-base-uncased          1    16  9546.506688 127.781483                2.0                 60.809019           57.364341           61.627907           58.015267           80.000000           51.572327           76.190476           65.079365           72.380952           59.770115   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved results to:
  CSV : /content/results/distilbert-base-uncased_16_quant.csv
  JSON: /content/results/distilbert-base-uncased_16_quant.json
Evaluating: distilroberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


                  model  quantized  bits         secs    size_mb  compression_ratio  crows_overall_stereo_pct  crows_0_stereo_pct  crows_1_stereo_pct  crows_2_stereo_pct  crows_3_stereo_pct  crows_4_stereo_pct  crows_5_stereo_pct  crows_6_stereo_pct  crows_7_stereo_pct  crows_8_stereo_pct  stereoset_stereotype_pct  stereoset_lm_ok_pct  stereoset_total
      bert-base-uncased          1    16 15258.463891 208.913686                2.0                 60.941645           59.689922           58.720930           57.251908           78.333333           46.540881           77.380952           69.841270           73.333333           60.919540                 60.544554            95.916429             2106
distilbert-base-uncased          1    16  9546.506688 127.781483                2.0                 60.809019           57.364341           61.627907           58.015267           80.000000           51.572327           76.190476           65.079365           72.380952           59.770115   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved results to:
  CSV : /content/results/distilroberta-base_16_quant.csv
  JSON: /content/results/distilroberta-base_16_quant.json
