# Model Compression - Emotions Dataset - Transformer-based models

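This notebook fine-tunes several RoBERTa-family classifiers on the Emotions dataset, selects the best one by validation macro-F1, and then compresses it.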

## Compression techniques:
1. **Pruning** - global magnitude pruning of the model weights, followed by a short recovery fine-tune
2. **Quantization** - dynamic int8 quantization (CPU)


## Import Required Libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.util import ngrams
import warnings

'''For compression part'''
import os
import json
import time
import argparse
import inspect
from dataclasses import dataclass
from typing import Dict, List
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    balanced_accuracy_score,
    cohen_kappa_score)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    set_seed)

# Silence every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Fetch the 'punkt' and 'stopwords' NLTK resources without progress chatter.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# Confirm the imports worked and report the core library versions,
# one message per line.
print(
    "✅ Libraries imported successfully",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)


✅ Libraries imported successfully
Pandas version: 2.2.2
NumPy version: 2.0.2


## Load Data


### Mount

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; later cells read the course data from it.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Copy train and validation data & unzip

In [3]:
# Copy the course data folder from the mounted Drive into the working directory.
# (Google-native files such as .gsheet cannot be copied; cp reports them and carries on.)
!cp -r gdrive/MyDrive/__PHd_2025/courses/2026a/NLP/data .
# Extract the archive; the CSV files are inflated into the current directory ...
!unzip data/data.zip
# ... so move them under data/, next to the archive.
!mv *csv data/.
# List the folder to confirm train.csv and validation.csv are in place.
!ls data/

cp: cannot open 'gdrive/MyDrive/__PHd_2025/courses/2026a/NLP/data/train.gsheet' for reading: Operation not supported
Archive:  data/data.zip
  inflating: validation.csv          
  inflating: train.csv               
data.zip  train.csv  validation.csv


### Load and take a look

In [4]:
# Read the training and validation splits from the unpacked data folder.
train_csv_path, val_csv_path = './data/train.csv', './data/validation.csv'
df_t = pd.read_csv(train_csv_path)
df_v = pd.read_csv(val_csv_path)

# Quick overview: dimensions and column names of both frames.
print(f"Dataset shape: train: {df_t.shape} val: {df_v.shape}")
print(f"Columns: train: {list(df_t.columns)}, val: {list(df_v.columns)}")

# Per-class example counts, training split first, then validation.
for heading, frame in (
    ("Train labels distribution:", df_t),
    ("Validation labels distribution:", df_v),
):
    print(heading)
    print(frame.label.value_counts())

Dataset shape: train: (16000, 2) val: (2000, 2)
Columns: train: ['text', 'label'], val: ['text', 'label']
Train labels distribution:
label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64
Validation labels distribution:
label
1    704
0    550
3    275
4    212
2    178
5     81
Name: count, dtype: int64


## Clone code from repo

In [5]:
!rm -rf ModelCompression_NLP/ # remove previous version

#if you are clonning a public version, use:
!git clone https://github.com/natalyasegal/ModelCompression_NLP.git

import sys
sys.path.append('/content/ModelCompression_NLP')   # add package root to Python path

#from utils.swap import swap_categories
from compress.compress import parse_int_list, supersample_train_df, train_one_model, apply_global_magnitude_pruning, linear_sparsity, CSVDataset, make_training_args, compute_metrics, WeightedLossTrainer, dynamic_int8_quantize, plot_losses_from_trainer, model_disk_size_mb, TrainResult

from eval.eval import evaluate_all_versions_from_outputs

Cloning into 'ModelCompression_NLP'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 29 (delta 8), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (29/29), 150.55 KiB | 2.95 MiB/s, done.
Resolving deltas: 100% (8/8), done.


## Compress Roberta and compare

In [6]:
def main(argv=None):
    """Train candidate classifiers, keep the best one, and compress it two ways.

    The pipeline runs strictly in this order:

    1. Parse the command-line style options (see the ``add_argument`` calls).
    2. Seed the libraries, read ``--train_csv`` and make a stratified
       train/validation split of it.
    3. Supersample the training split only.
    4. Fine-tune every model in ``candidates`` with ``train_one_model`` and keep
       the result with the highest ``best_metric``.
    5. Compression 1: prune the best model, fine-tune it for
       ``--prune_recover_epochs`` epochs, evaluate it and save it under
       ``<out_dir>/BEST_PRUNED``.
    6. Compression 2: quantize a freshly loaded copy of the best model,
       evaluate it on the validation split and save its ``state_dict`` under
       ``<out_dir>/BEST_INT8_CPU``.
    7. Write ``<out_dir>/summary.json`` with the metrics of all three variants.

    Args:
        argv: Optional list of argument strings. When ``None``, argparse reads
            ``sys.argv``. Unrecognised arguments are ignored, which is what
            lets this run inside a notebook kernel.

    Returns:
        None. Results are printed and written to ``--out_dir``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--train_csv", type=str, default="data/train.csv")
    p.add_argument("--text_col", type=str, default="text")
    p.add_argument("--label_col", type=str, default="label")
    p.add_argument("--out_dir", type=str, default="./outputs_part2")
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--val_ratio", type=float, default=0.15)
    p.add_argument("--epochs", type=int, default=10)
    p.add_argument("--batch_size", type=int, default=16)
    p.add_argument("--lr", type=float, default=2e-5)
    p.add_argument("--weight_decay", type=float, default=0.01)
    p.add_argument("--max_length", type=int, default=128)
    p.add_argument("--patience", type=int, default=2)
    p.add_argument("--no_weighted_loss", action="store_true")
    p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
    p.add_argument("--show_plots", action="store_true")

    p.add_argument(
        "--supersample_factors",
        type=str,
        default="1,1,4,2,3,8",
        help='e.g. "1,1,1,1,1,1" (no oversample) or "1,1,1,1,1,2" (double class 5)'
    )

    # Model Compression
    p.add_argument("--prune_amount", type=float, default=0.35)
    p.add_argument("--prune_recover_epochs", type=int, default=1)

    # parse_known_args returns (namespace, leftovers); the leftovers are dropped,
    # so unknown arguments do not raise.
    args, _ = p.parse_known_args(argv)

    set_seed(args.seed)
    os.makedirs(args.out_dir, exist_ok=True)

    df = pd.read_csv(args.train_csv)
    # Number of classes = number of distinct values in the label column.
    num_labels = int(df[args.label_col].nunique())

    # Stratified hold-out split of the training CSV; the separate validation.csv
    # is not read by this function.
    train_df, val_df = train_test_split(
        df,
        test_size=args.val_ratio,
        random_state=args.seed,
        stratify=df[args.label_col],
    )

    # Apply supersampling ONLY to training split
    # NOTE(review): the option has a string default, so via the CLI this
    # condition is always true; pass "1,1,1,1,1,1" to disable oversampling.
    if args.supersample_factors is not None:
        factors = parse_int_list(args.supersample_factors)
        train_df = supersample_train_df(train_df, args.label_col, factors=factors, seed=args.seed)
        print("Applied supersampling factors:", factors)
        print("New train label counts:\n", train_df[args.label_col].value_counts().sort_index())

    # Pretrained checkpoints to fine-tune and compare.
    candidates = [
        "roberta-base",
        "cardiffnlp/twitter-roberta-base",
        "distilroberta-base"]

    results: List[TrainResult] = []
    for m in candidates:
        print(f"\n=== Training {m} ===")
        r = train_one_model(
            model_name=m,
            train_df=train_df,
            val_df=val_df,
            text_col=args.text_col,
            label_col=args.label_col,
            num_labels=num_labels,
            out_root=args.out_dir,
            seed=args.seed,
            epochs=args.epochs,
            batch_size=args.batch_size,
            lr=args.lr,
            weight_decay=args.weight_decay,
            max_length=args.max_length,
            patience=args.patience,
            use_weighted_loss=(not args.no_weighted_loss),
            device=args.device,
            show_plots=args.show_plots)
        results.append(r)
        print("Saved:", r.saved_dir)
        print("Loss plot:", r.loss_plot_path)
        print("Size(MB):", round(r.size_mb, 2))
        print("Eval macro-F1:", round(r.best_metric, 6))

    # Winner = candidate with the highest best_metric (reported above as macro-F1).
    best = max(results, key=lambda x: x.best_metric)
    print("\n=== BEST MODEL ===")
    print("Best:", best.model_name)
    print("Dir :", best.saved_dir)

    # -------------------------
    # COMPRESSION 1: PRUNING + RECOVERY FINETUNE
    # -------------------------
    print("\n=== COMPRESSION 1: PRUNING + RECOVERY FINETUNE ===")
    # Reload the winning checkpoint and its tokenizer from disk.
    best_tok = AutoTokenizer.from_pretrained(best.saved_dir, use_fast=True)
    best_model = AutoModelForSequenceClassification.from_pretrained(best.saved_dir)

    pruned_model = apply_global_magnitude_pruning(best_model, amount=args.prune_amount)
    sp = linear_sparsity(pruned_model)
    print(f"Pruned linear sparsity: {sp:.3f}")

    pruned_dir = os.path.join(args.out_dir, "BEST_PRUNED")
    os.makedirs(pruned_dir, exist_ok=True)

    train_ds = CSVDataset(train_df, best_tok, args.text_col, args.label_col, max_length=args.max_length)
    val_ds = CSVDataset(val_df, best_tok, args.text_col, args.label_col, max_length=args.max_length)
    collator = DataCollatorWithPadding(best_tok)

    # Inverse-frequency class weights, normalised to sum to 1. Counts come from the
    # (already supersampled) training split, indexed by label id 0..num_labels-1
    # with absent classes counted as 0; the 1e-9 guards against division by zero.
    class_counts = train_df[args.label_col].value_counts().sort_index().reindex(range(num_labels), fill_value=0).values
    inv = 1.0 / torch.tensor(class_counts + 1e-9, dtype=torch.float)
    class_weights = inv / inv.sum()

    pruned_args = make_training_args(
        output_dir=pruned_dir,
        num_train_epochs=args.prune_recover_epochs,
        # Recovery fine-tune uses a learning rate capped at 1e-5.
        learning_rate=min(args.lr, 1e-5),
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        logging_strategy="epoch",
        report_to="none",
        seed=args.seed,
        # Mixed precision only when a CUDA device was requested and is available.
        fp16=("cuda" in args.device and torch.cuda.is_available()))

    # Same trainer setup either way; the weighted variant additionally receives
    # the class weights computed above.
    if args.no_weighted_loss:
        pruned_trainer = Trainer(
            model=pruned_model,
            args=pruned_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=best_tok,
            data_collator=collator,
            compute_metrics=compute_metrics)
    else:
        pruned_trainer = WeightedLossTrainer(
            class_weights=class_weights,
            model=pruned_model,
            args=pruned_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=best_tok,
            data_collator=collator,
            compute_metrics=compute_metrics)

    pruned_trainer.train()
    pruned_metrics = pruned_trainer.evaluate()

    # Persist the recovered model together with its tokenizer.
    pruned_trainer.save_model(pruned_dir)
    best_tok.save_pretrained(pruned_dir)

    pruned_loss_png = os.path.join(pruned_dir, "loss_curve.png")
    plot_losses_from_trainer(pruned_trainer, pruned_loss_png, show=args.show_plots)

    pruned_size = model_disk_size_mb(pruned_trainer.model)
    print("Pruned dir:", pruned_dir)
    print("Pruned size(MB):", round(pruned_size, 2))
    # Only numeric entries are printed (rounded to 6 decimals).
    print("Pruned metrics:", {k: round(float(v), 6) for k, v in pruned_metrics.items() if isinstance(v, (int, float, np.floating))})

    # -------------------------
    # COMPRESSION 2: DYNAMIC INT8 QUANTIZATION (CPU)
    # -------------------------
    print("\n=== COMPRESSION 2: DYNAMIC INT8 QUANTIZATION (CPU) ===")
    # Load the best checkpoint again from disk, so quantization starts from the
    # saved (unpruned) model rather than from the pruned one above.
    best_model_clean = AutoModelForSequenceClassification.from_pretrained(best.saved_dir)
    qmodel = dynamic_int8_quantize(best_model_clean)

    texts = val_df[args.text_col].astype(str).tolist()
    labels = val_df[args.label_col].astype(int).to_numpy()

    # Manual batched inference over the validation texts; the batch size here is
    # fixed and independent of --batch_size. Inputs are not moved to any device.
    preds_all = []
    bs = 32
    for i in range(0, len(texts), bs):
        batch = texts[i:i + bs]
        enc = best_tok(batch, padding=True, truncation=True, max_length=args.max_length, return_tensors="pt")
        with torch.no_grad():
            logits = qmodel(**enc).logits
        preds_all.append(logits.argmax(dim=-1).cpu().numpy())

    preds = np.concatenate(preds_all, axis=0)

    q_metrics = {
        "macro_f1": float(f1_score(labels, preds, average="macro")),
        "accuracy": float(accuracy_score(labels, preds)),
        "balanced_accuracy": float(balanced_accuracy_score(labels, preds)),
        "kappa": float(cohen_kappa_score(labels, preds)),
    }
    # Per-class accuracy (fraction of class-c examples predicted as c); NaN when
    # the class does not occur in the validation split.
    # NOTE(review): json.dump writes NaN as the bare token `NaN`, which strict
    # JSON parsers reject — confirm downstream readers tolerate it.
    for c in range(num_labels):
        mask = labels == c
        q_metrics[f"acc_c{c}"] = float((preds[mask] == labels[mask]).mean()) if mask.sum() else float("nan")

    q_size = model_disk_size_mb(qmodel)

    quant_dir = os.path.join(args.out_dir, "BEST_INT8_CPU")
    os.makedirs(quant_dir, exist_ok=True)
    # Only the state_dict is stored; meta.json records which checkpoint it was
    # derived from.
    torch.save(qmodel.state_dict(), os.path.join(quant_dir, "pytorch_model.bin"))
    best_tok.save_pretrained(quant_dir)
    with open(os.path.join(quant_dir, "meta.json"), "w") as f:
        json.dump({"base_model_dir": best.saved_dir, "note": "dynamic int8 quantized (CPU) Linear layers"}, f, indent=2)

    print("Quant dir:", quant_dir)
    print("Quant size(MB):", round(q_size, 2))
    print("Quant metrics:", {k: round(float(v), 6) for k, v in q_metrics.items()})

    # Summary JSON
    summary_path = os.path.join(args.out_dir, "summary.json")
    summary = {
        "best_model": best.model_name,
        "best_dir": best.saved_dir,
        "best_metrics": best.metrics,
        "pruned_dir": pruned_dir,
        # Non-numeric trainer outputs are filtered out before serialisation.
        "pruned_metrics": {k: float(v) for k, v in pruned_metrics.items() if isinstance(v, (int, float, np.floating))},
        "pruned_sparsity": sp,
        "quant_dir": quant_dir,
        "quant_metrics": q_metrics,
    }
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)

    print("\n=== DONE ===")
    print("Summary:", summary_path)
    print("All outputs in:", args.out_dir)


# Run the full pipeline with default options when this cell is executed.
if __name__ == "__main__":
    main()


Applied supersampling factors: [1, 1, 4, 2, 3, 8]
New train label counts:
 label
0    3966
1    4558
2    4432
3    3670
4    4941
5    3888
Name: count, dtype: int64

=== Training roberta-base ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Balanced Accuracy,Kappa,Acc C0,Acc C1,Acc C2,Acc C3,Acc C4,Acc C5
1,0.3998,0.228453,0.900251,0.929583,0.94169,0.908847,0.958571,0.896766,0.979592,0.947531,0.87931,0.988372
2,0.1635,0.244094,0.899826,0.93,0.930583,0.909321,0.96,0.894279,0.989796,0.966049,0.889655,0.883721
3,0.1229,0.27556,0.901075,0.9325,0.927239,0.912443,0.962857,0.902985,0.989796,0.938272,0.92069,0.848837
4,0.0904,0.34494,0.895528,0.929167,0.907315,0.907716,0.955714,0.914179,0.918367,0.953704,0.934483,0.767442
5,0.0625,0.428163,0.898781,0.930417,0.91703,0.909465,0.948571,0.920398,0.913265,0.941358,0.941379,0.837209


Saved: ./outputs_part2/roberta-base
Loss plot: ./outputs_part2/roberta-base/loss_curve.png
Size(MB): 475.58
Eval macro-F1: 0.901075

=== Training cardiffnlp/twitter-roberta-base ===


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Balanced Accuracy,Kappa,Acc C0,Acc C1,Acc C2,Acc C3,Acc C4,Acc C5
1,0.3558,0.227766,0.899978,0.92875,0.941152,0.907801,0.954286,0.899254,0.984694,0.944444,0.875862,0.988372
2,0.14,0.254911,0.902477,0.930833,0.92394,0.910251,0.96,0.896766,0.989796,0.978395,0.893103,0.825581
3,0.0888,0.325324,0.897504,0.928333,0.919407,0.906758,0.968571,0.909204,0.954082,0.92284,0.889655,0.872093
4,0.0597,0.383849,0.894258,0.932083,0.901161,0.911325,0.967143,0.926617,0.877551,0.966049,0.913793,0.755814


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Saved: ./outputs_part2/cardiffnlp__twitter-roberta-base
Loss plot: ./outputs_part2/cardiffnlp__twitter-roberta-base/loss_curve.png
Size(MB): 475.58
Eval macro-F1: 0.902477

=== Training distilroberta-base ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Balanced Accuracy,Kappa,Acc C0,Acc C1,Acc C2,Acc C3,Acc C4,Acc C5
1,0.3974,0.236061,0.901322,0.92875,0.942218,0.907751,0.957143,0.894279,0.97449,0.972222,0.855172,1.0
2,0.1511,0.233079,0.906029,0.9325,0.9334,0.912438,0.96,0.899254,0.97449,0.950617,0.92069,0.895349
3,0.1054,0.24851,0.905696,0.935417,0.930716,0.916158,0.968571,0.90796,0.984694,0.947531,0.903448,0.872093
4,0.0774,0.295665,0.898656,0.929583,0.918726,0.908429,0.957143,0.911692,0.943878,0.978395,0.872414,0.848837


Saved: ./outputs_part2/distilroberta-base
Loss plot: ./outputs_part2/distilroberta-base/loss_curve.png
Size(MB): 313.32
Eval macro-F1: 0.906029

=== BEST MODEL ===
Best: distilroberta-base
Dir : ./outputs_part2/distilroberta-base

=== COMPRESSION 1: PRUNING + RECOVERY FINETUNE ===
Pruned linear sparsity: 0.350


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Balanced Accuracy,Kappa,Acc C0,Acc C1,Acc C2,Acc C3,Acc C4,Acc C5
1,0.0865,0.255354,0.908203,0.93625,0.939298,0.917388,0.96,0.902985,0.989796,0.969136,0.906897,0.906977


Pruned dir: ./outputs_part2/BEST_PRUNED
Pruned size(MB): 313.32
Pruned metrics: {'eval_loss': 0.255354, 'eval_macro_f1': 0.908203, 'eval_accuracy': 0.93625, 'eval_balanced_accuracy': 0.939298, 'eval_kappa': 0.917388, 'eval_acc_c0': 0.96, 'eval_acc_c1': 0.902985, 'eval_acc_c2': 0.989796, 'eval_acc_c3': 0.969136, 'eval_acc_c4': 0.906897, 'eval_acc_c5': 0.906977, 'eval_runtime': 1.6171, 'eval_samples_per_second': 1484.11, 'eval_steps_per_second': 92.757, 'epoch': 1.0}

=== COMPRESSION 2: DYNAMIC INT8 QUANTIZATION (CPU) ===
Quant dir: ./outputs_part2/BEST_INT8_CPU
Quant size(MB): 190.14
Quant metrics: {'macro_f1': 0.904116, 'accuracy': 0.93125, 'balanced_accuracy': 0.931705, 'kappa': 0.91083, 'acc_c0': 0.954286, 'acc_c1': 0.902985, 'acc_c2': 0.969388, 'acc_c3': 0.947531, 'acc_c4': 0.92069, 'acc_c5': 0.895349}

=== DONE ===
Summary: ./outputs_part2/summary.json
All outputs in: ./outputs_part2


## Take a look at the model outputs

In [8]:
# List the artefacts written by main(): one folder per trained model, the two
# compressed variants, and summary.json.
!ls outputs_part2

BEST_INT8_CPU  cardiffnlp__twitter-roberta-base  roberta-base
BEST_PRUNED    distilroberta-base		 summary.json


In [11]:
# Print the JSON summary comparing the best, pruned and quantized models.
!cat outputs_part2/summary.json

{
  "best_model": "distilroberta-base",
  "best_dir": "./outputs_part2/distilroberta-base",
  "best_metrics": {
    "eval_loss": 0.23307904601097107,
    "eval_macro_f1": 0.9060287962991002,
    "eval_accuracy": 0.9325,
    "eval_balanced_accuracy": 0.9333998839323306,
    "eval_kappa": 0.9124375599627049,
    "eval_acc_c0": 0.96,
    "eval_acc_c1": 0.8992537313432836,
    "eval_acc_c2": 0.9744897959183674,
    "eval_acc_c3": 0.9506172839506173,
    "eval_acc_c4": 0.9206896551724137,
    "eval_acc_c5": 0.8953488372093024,
    "eval_runtime": 1.5772,
    "eval_samples_per_second": 1521.726,
    "eval_steps_per_second": 95.108,
    "epoch": 4.0
  },
  "pruned_dir": "./outputs_part2/BEST_PRUNED",
  "pruned_metrics": {
    "eval_loss": 0.25535401701927185,
    "eval_macro_f1": 0.9082033722603821,
    "eval_accuracy": 0.93625,
    "eval_balanced_accuracy": 0.9392983485622555,
    "eval_kappa": 0.9173879101076072,
    "eval_acc_c0": 0.96,
    "eval_acc_c1": 0.9029850746268657,
    "eval_acc_

## Save

In [12]:
# Rename this run's output folder (presumably to keep it apart from other runs — confirm).
# NOTE(review): if outputs_part2_2 already exists, mv moves the folder *inside* it.
!mv outputs_part2 outputs_part2_2
# Optional: uncomment to copy the renamed folder to Google Drive.
#!cp -r outputs_part2_2 gdrive/MyDrive/__PHd_2025/courses/2026a/NLP/.

In [13]:
#!ls outputs_part2_1 gdrive/MyDrive/__PHd_2025/courses/2026a/NLP/outputs_part2_2