##Dataset processing and import

In [None]:
%%capture
!pip install datasets>=2.18.0 transformers>=4.38.2 sentence-transformers>=2.5.1 setfit>=1.0.3 accelerate>=0.27.2 seqeval>=1.2.2

In [1]:
from datasets import load_dataset
from tqdm import tqdm

# Load dataset from CSV using datasets library with encoding specified
dataset = load_dataset("csv", data_files="spam.csv", encoding="latin-1")

# Correct the preprocessing function and apply with batched=True
def preprocess_function(examples):
    # Process labels correctly by checking each string in 'v1'
    examples["label"] = [1 if lbl == "spam" else 0 for lbl in examples["v1"]]
    return examples

# Apply preprocessing to the dataset with batched=True
dataset = dataset.map(preprocess_function, batched=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Split dataset
train_data = dataset["train"].train_test_split(test_size=0.2, seed=42)["train"]
test_data = dataset["train"].train_test_split(test_size=0.2, seed=42)["test"]

##BERT-Model

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#Load model and tokenizer
model_id = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [4]:
train_data

Dataset({
    features: ['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'label'],
    num_rows: 4457
})

In [5]:
from transformers import DataCollatorWithPadding

# Pad to the longest sequence in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
   """Tokenize input data"""
   return tokenizer(examples["v2"], truncation=True)

# Tokenize train/test data
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

2025-02-14 17:21:06.367419: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-14 17:21:06.379074: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-14 17:21:06.382733: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-14 17:21:06.392680: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Map: 100%|███████████████████████████████████████| 11

In [6]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Define the compute_metrics function to calculate precision, recall, and F1 score
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    """Calculate precision, recall, and F1 score."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Load the metric modules
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")

    # Compute the metrics
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]

    return {"precision": precision, "recall": recall, "f1": f1}

In [8]:
from transformers import TrainingArguments, Trainer

# Training arguments for parameter tuning
training_args = TrainingArguments(
   "model",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=1,
   weight_decay=0.01,
   save_strategy="epoch",
   report_to="none"
)

# Trainer which executes the training process
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [9]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [10]:
import torch

torch.cuda.empty_cache()  # Clears cache memory
torch.cuda.ipc_collect()  # Frees up unused memory


In [11]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 3.80 GiB of which 11.69 MiB is free. Including non-PyTorch memory, this process has 3.73 GiB memory in use. Of the allocated memory 3.19 GiB is allocated by PyTorch, and 443.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.evaluate()

##MobileBERT Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#Load model and tokenizer
model_id = "google/mobilebert-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from transformers import DataCollatorWithPadding

# Pad to the longest sequence in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
   """Tokenize input data"""
   return tokenizer(examples["v2"], truncation=True)

# Tokenize train/test data
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

In [None]:
from transformers import TrainingArguments, Trainer

# Training arguments for parameter tuning
training_args = TrainingArguments(
   "model",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=1,
   weight_decay=0.01,
   save_strategy="epoch",
   report_to="none"
)

# Trainer which executes the training process
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

##TinyBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#Load model and tokenizer
model_id = "huawei-noah/TinyBERT_General_4L_312D"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from transformers import DataCollatorWithPadding

# Pad to the longest sequence in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
   """Tokenize input data"""
   return tokenizer(examples["v2"], truncation=True)

# Tokenize train/test data
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

##Model Size Evaluation

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

def calculate_model_size(model):
    """Calculate model size in MB without loading into GPU"""
    with torch.no_grad():
        param_size = 0
        for param in model.parameters():
            param_size += param.nelement() * param.element_size()
        buffer_size = 0
        for buffer in model.buffers():
            buffer_size += buffer.nelement() * buffer.element_size()
        size_all_mb = (param_size + buffer_size) / (1024 ** 2)
    return size_all_mb

# List of models to evaluate
models_to_compare = [
    ("google/mobilebert-uncased", "MobileBERT"),
    ("huawei-noah/TinyBERT_General_4L_312D", "TinyBERT"),
    ("bert-base-cased", "Original BERT")  # For comparison
]

# Calculate and print model sizes
print("\nModel Size Comparison:")
for model_id, model_name in models_to_compare:
    # Load model in CPU mode
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Calculate size
    model_size = calculate_model_size(model)
    param_count = sum(p.numel() for p in model.parameters())

    print(f"\n{model_name}:")
    print(f"- Parameters: {param_count:,}")
    print(f"-Memory Size: {model_size:.2f} MB")
    print(f"- Tokenizer Vocab Size: {tokenizer.vocab_size:,}")

# Then proceed with training code from previous answer