In [None]:
!pip install transformers datasets torch scikit-learn matplotlib seaborn

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [None]:
!pip install sentence-transformers setfit scipy evaluate seqeval

Collecting setfit
  Downloading setfit-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading setfit-1.1.1-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m825.3 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=a0cb31ccb9e87ce8f603d38d34b3a27e902a17ce00b89def

# Load Dataset

In [1]:
import pandas as pd
# Read the raw corpus; the file is decoded as latin-1.
raw_df = pd.read_csv('super_sms_dataset.csv', encoding='latin-1')

# Drop rows with no message text BEFORE casting to str. Casting first turns a
# missing value into the literal string 'nan', which a later dropna() can no
# longer detect, so empty messages would silently become training examples.
messages_df = raw_df.dropna(subset=['SMSes'])

# Build the two-column frame used by the rest of the notebook.
df = pd.DataFrame({
    'sms': messages_df['SMSes'].astype(str),
    # Rows without a label are assigned class 0.
    # NOTE(review): presumably 0 means "not spam" -- confirm against the data source.
    'label': messages_df['Labels'].fillna(0).astype(int),
}).reset_index(drop=True)
df

Unnamed: 0,sms,label
0,There be an update for your delivery CC 017281...,1
1,watch your favorite english movies of all genr...,1
2,aur what is the status for fms,0
3,hi shalini sundi thank you for dialling speci...,1
4,m tryin to understand too...,0
...,...,...
67005,Haha sorry I will be arriving late.,0
67006,Not going out today something cropped up I am ...,0
67007,No no I didn't decide not to turn up I oversle...,0
67008,Spring apartment homes,0


# Train Test Split

In [2]:
from sklearn.model_selection import train_test_split

# Pull the message texts and their integer labels out of the frame as plain lists.
texts, labels = (df[column].to_list() for column in ('sms', 'label'))

# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

print(f"Train size: {len(train_texts)}, Test size: {len(test_texts)}")

Train size: 53608, Test size: 13402


# Tokenizing

In [3]:
from transformers import RobertaTokenizer, DistilBertTokenizer

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Upper bound on tokens per message. With padding=True every example in a call
# is padded to the longest sequence in that call, so without a cap a single
# very long message inflates every row of the split (up to the model's maximum
# input length) and, with it, host and GPU memory during training.
# NOTE(review): 128 tokens is assumed to cover typical SMS text -- confirm
# against the length distribution of the corpus.
MAX_SEQ_LENGTH = 128

train_encodings_roberta = roberta_tokenizer(
    train_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH
)
test_encodings_roberta = roberta_tokenizer(
    test_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH
)

train_encodings_distilbert = distilbert_tokenizer(
    train_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH
)
test_encodings_distilbert = distilbert_tokenizer(
    test_texts, truncation=True, padding=True, max_length=MAX_SEQ_LENGTH
)




vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [4]:
import torch

class SpamDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one entry per example. Indexing returns a
    dict holding one tensor per encoding field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenizer's encodings together with the matching label list.
# RoBERTa: train / test
train_dataset_roberta, test_dataset_roberta = (
    SpamDataset(train_encodings_roberta, train_labels),
    SpamDataset(test_encodings_roberta, test_labels),
)

# DistilBERT: train / test
train_dataset_distilbert, test_dataset_distilbert = (
    SpamDataset(train_encodings_distilbert, train_labels),
    SpamDataset(test_encodings_distilbert, test_labels),
)


In [5]:
from transformers import RobertaForSequenceClassification, DistilBertForSequenceClassification

# Instantiate both pre-trained encoders with a two-class classification head.
roberta_model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should pr

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.we

In [6]:
import os
# Set the WANDB_DISABLED flag in this process's environment.
os.environ.update({"WANDB_DISABLED": "true"})

In [7]:
from transformers import Trainer, TrainingArguments

def _build_training_args(output_dir):
    """Return the shared fine-tuning configuration with checkpoints under ``output_dir``.

    Every trainer gets the same hyper-parameters; only the checkpoint
    directory differs, so that two trainers never write identically named
    ``checkpoint-N`` folders into the same place.
    """
    return TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        # NOTE(review): newer transformers releases name this argument
        # `eval_strategy`; switch once the installed version is confirmed.
        evaluation_strategy="epoch",
        # Keep only the most recent checkpoints so a multi-epoch run does not
        # fill the disk with one full model + optimizer copy per save step.
        save_total_limit=2,
    )


# Default arguments, kept under this name for cells that reuse `training_args`.
training_args = _build_training_args('./results')

# One Trainer per model, each with its own checkpoint directory.
trainer_roberta = Trainer(
    model=roberta_model,
    args=_build_training_args('./results/roberta'),
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta
)

trainer_distilbert = Trainer(
    model=distilbert_model,
    args=_build_training_args('./results/distilbert'),
    train_dataset=train_dataset_distilbert,
    eval_dataset=test_dataset_distilbert
)

2025-02-15 21:18:31.401633: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [8]:
# Fine-tune RoBERTa on the training split.
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error on a 3.80 GiB GPU that another process was also using; free GPU
# memory (or use a larger device) before re-running.
trainer_roberta.train()

# Fine-tune DistilBERT on the same training split; runs only after the
# RoBERTa training call above has returned.
trainer_distilbert.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 3.80 GiB of which 7.75 MiB is free. Process 117635 has 1.22 GiB memory in use. Including non-PyTorch memory, this process has 2.52 GiB memory in use. Of the allocated memory 2.37 GiB is allocated by PyTorch, and 54.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def evaluate_model(trainer, test_dataset, test_labels, model_name):
    """Run ``trainer`` on ``test_dataset``, print the metrics and return them.

    Args:
        trainer: object whose ``predict(test_dataset)`` result exposes the
            class logits as ``.predictions``.
        test_dataset: dataset passed to ``trainer.predict``.
        test_labels: ground-truth binary labels, in the same order as
            ``test_dataset``.
        model_name: label used as the prefix of the printed summary line.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"``,
        ``"recall"`` and ``"roc_auc"``.
    """
    predictions = trainer.predict(test_dataset)
    logits = predictions.predictions
    # NOTE(review): some model heads reportedly return a tuple whose first
    # element holds the logits -- unwrap defensively.
    if isinstance(logits, tuple):
        logits = logits[0]
    pred_labels = logits.argmax(axis=-1)

    # ROC AUC needs a continuous score, not hard 0/1 predictions. The margin
    # between the class-1 and class-0 logits ranks examples exactly like the
    # softmax probability of class 1.
    positive_scores = logits[:, 1] - logits[:, 0]

    accuracy = accuracy_score(test_labels, pred_labels)
    f1 = f1_score(test_labels, pred_labels)
    precision = precision_score(test_labels, pred_labels)
    recall = recall_score(test_labels, pred_labels)
    roc_auc = roc_auc_score(test_labels, positive_scores)

    print(f"{model_name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, ROC AUC: {roc_auc:.4f}")

    # Return the values so callers can tabulate them instead of re-parsing stdout.
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "roc_auc": roc_auc,
    }

# Report test-set metrics for each fine-tuned model, RoBERTa first.
for _trainer, _dataset, _name in (
    (trainer_roberta, test_dataset_roberta, "RoBERTa"),
    (trainer_distilbert, test_dataset_distilbert, "DistilBERT"),
):
    evaluate_model(_trainer, _dataset, test_labels, _name)

In [None]:
def calculate_model_size(model, model_name):
    """Print and return the in-memory size of ``model``'s parameters in MiB.

    Args:
        model: object whose ``parameters()`` yields tensors.
        model_name: label used as the prefix of the printed line.

    Returns:
        float: total parameter storage in mebibytes (bytes / 1024**2).
    """
    # Use each tensor's real element width instead of assuming 4-byte floats,
    # so half-precision or otherwise converted models are sized correctly.
    size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    model_size = size_bytes / 1024**2
    print(f"{model_name} Model Size: {model_size:.2f} MB")
    # Return the value so callers can store it alongside the metrics.
    return model_size

# Print the parameter footprint of each model, RoBERTa first.
for _model, _name in ((roberta_model, "RoBERTa"), (distilbert_model, "DistilBERT")):
    calculate_model_size(_model, _name)

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load BERT with a two-class classification head, plus its tokenizer.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization. max_length caps the padded width: with padding=True every
# example is padded to the longest sequence in the call, so one very long
# message would otherwise inflate every row and the memory needed to train.
# NOTE(review): 128 tokens is assumed to cover typical SMS text -- confirm.
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True, max_length=128)

train_dataset_bert = SpamDataset(train_encodings_bert, train_labels)
test_dataset_bert = SpamDataset(test_encodings_bert, test_labels)

# Define trainer for BERT.
# NOTE(review): `training_args` is shared with other trainers, so checkpoints
# from different models are written under the same output directory.
trainer_bert = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset_bert,
    eval_dataset=test_dataset_bert
)

# Train, then report metrics and model size.
trainer_bert.train()
evaluate_model(trainer_bert, test_dataset_bert, test_labels, "BERT")
# Trailing ';' keeps the cell from echoing any return value.
calculate_model_size(bert_model, "BERT");

In [None]:
from transformers import MobileBertForSequenceClassification, MobileBertTokenizer

# Load MobileBERT with a two-class classification head, plus its tokenizer.
mobilebert_model = MobileBertForSequenceClassification.from_pretrained('google/mobilebert-uncased', num_labels=2)
mobilebert_tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

# Tokenization. max_length caps the padded width: with padding=True every
# example is padded to the longest sequence in the call, so one very long
# message would otherwise inflate every row and the memory needed to train.
# NOTE(review): 128 tokens is assumed to cover typical SMS text -- confirm.
train_encodings_mobilebert = mobilebert_tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings_mobilebert = mobilebert_tokenizer(test_texts, truncation=True, padding=True, max_length=128)

train_dataset_mobilebert = SpamDataset(train_encodings_mobilebert, train_labels)
test_dataset_mobilebert = SpamDataset(test_encodings_mobilebert, test_labels)

# Define trainer for MobileBERT.
# NOTE(review): `training_args` is shared with other trainers, so checkpoints
# from different models are written under the same output directory.
trainer_mobilebert = Trainer(
    model=mobilebert_model,
    args=training_args,
    train_dataset=train_dataset_mobilebert,
    eval_dataset=test_dataset_mobilebert
)

# Train and Evaluate
trainer_mobilebert.train()
evaluate_model(trainer_mobilebert, test_dataset_mobilebert, test_labels, "MobileBERT")
# Trailing ';' keeps the cell from echoing any return value.
calculate_model_size(mobilebert_model, "MobileBERT");

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load TinyBERT with a two-class classification head, plus its tokenizer.
tinybert_model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)
tinybert_tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

# Tokenization. max_length caps the padded width: with padding=True every
# example is padded to the longest sequence in the call, so one very long
# message would otherwise inflate every row and the memory needed to train.
# NOTE(review): 128 tokens is assumed to cover typical SMS text -- confirm.
train_encodings_tinybert = tinybert_tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings_tinybert = tinybert_tokenizer(test_texts, truncation=True, padding=True, max_length=128)

train_dataset_tinybert = SpamDataset(train_encodings_tinybert, train_labels)
test_dataset_tinybert = SpamDataset(test_encodings_tinybert, test_labels)

# Define trainer for TinyBERT.
# NOTE(review): `training_args` is shared with other trainers, so checkpoints
# from different models are written under the same output directory.
trainer_tinybert = Trainer(
    model=tinybert_model,
    args=training_args,
    train_dataset=train_dataset_tinybert,
    eval_dataset=test_dataset_tinybert
)

# Train, then report metrics and model size.
trainer_tinybert.train()
evaluate_model(trainer_tinybert, test_dataset_tinybert, test_labels, "TinyBERT")
# Trailing ';' keeps the cell from echoing any return value.
calculate_model_size(tinybert_model, "TinyBERT");

In [None]:
# Accumulates one row per evaluated model:
# [name, accuracy, f1, precision, recall, roc_auc, "<size> MB"]
results = []

def evaluate_and_store(trainer, test_dataset, test_labels, model_name, model):
    """Evaluate one model and append its summary row to the global ``results`` list.

    Calls ``evaluate_model`` and then ``calculate_model_size``. A falsy return
    from either one is replaced by an empty dict / 0 respectively, and any
    metric key missing from the dict is recorded as 0.
    """
    metrics = evaluate_model(trainer, test_dataset, test_labels, model_name)
    if not metrics:
        metrics = {}

    model_size = calculate_model_size(model, model_name)
    if not model_size:
        model_size = 0

    row = [model_name]
    for metric_key in ("accuracy", "f1", "precision", "recall", "roc_auc"):
        # Absent metrics fall back to 0.
        row.append(metrics.get(metric_key, 0))
    row.append(f"{model_size:.2f} MB")
    results.append(row)

# Evaluate every fine-tuned model in a fixed order and collect its row.
for _trainer, _dataset, _name, _model in (
    (trainer_bert, test_dataset_bert, "BERT", bert_model),
    (trainer_roberta, test_dataset_roberta, "RoBERTa", roberta_model),
    (trainer_distilbert, test_dataset_distilbert, "DistilBERT", distilbert_model),
    (trainer_mobilebert, test_dataset_mobilebert, "MobileBERT", mobilebert_model),
    (trainer_tinybert, test_dataset_tinybert, "TinyBERT", tinybert_model),
):
    evaluate_and_store(_trainer, _dataset, test_labels, _name, _model)