
Details of the code:
*   Trained TinyBERT on IMDB dataset.
*   Optimized using PSO.
*   Applied L1 Pruning.

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Step 1: Load the IMDb dataset
dataset = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Step 2: Load TinyBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
model = BertForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", num_labels=2)
model.to(device)

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [None]:
# Step 3: Preprocess the data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)

In [None]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Step 4: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
)



In [None]:
# Step 5: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)

In [None]:
# Step 6: Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3198,0.427635
2,0.3093,0.380393
3,0.2309,0.411779


TrainOutput(global_step=4689, training_loss=0.31010834350594074, metrics={'train_runtime': 228.2647, 'train_samples_per_second': 328.566, 'train_steps_per_second': 20.542, 'total_flos': 268856179200000.0, 'train_loss': 0.31010834350594074, 'epoch': 3.0})

In [None]:
# Step 7: Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

Evaluation results: {'eval_loss': 0.41177889704704285, 'eval_runtime': 19.0756, 'eval_samples_per_second': 1310.574, 'eval_steps_per_second': 81.937, 'epoch': 3.0}


In [None]:
import numpy as np
from datasets import load_metric

# Step 1: Load the accuracy metric
metric = load_metric("accuracy")

# Step 2: Define a custom compute_metrics function to calculate accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Step 3: Redefine the Trainer to include compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,  # Include accuracy computation
)

# Step 4: Evaluate the model
eval_result = trainer.evaluate()

# Step 5: Print accuracy
print(f"Accuracy: {eval_result['eval_accuracy']:.4f}")

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Accuracy: 0.8585


In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Step 1: Load the IMDb dataset
dataset = load_dataset("imdb")

# Step 2: Load TinyBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
model = BertForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", num_labels=2)
model.to(device)

# Step 3: Preprocess the data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Step 4: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,  # L2 regularization
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
)

# Step 5: Implement custom Trainer with L1 regularization
class L1Trainer(Trainer):
    def __init__(self, l1_lambda=1e-5, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.l1_lambda = l1_lambda

    # Override the compute_loss method to add L1 regularization
    def compute_loss(self, model, inputs, return_outputs=False):
        # Compute the standard loss using Hugging Face's method
        loss, outputs = super().compute_loss(model, inputs, return_outputs=True)

        # Apply L1 regularization (penalty for non-zero weights)
        l1_loss = 0
        for param in model.parameters():
            l1_loss += torch.sum(torch.abs(param))

        # Add L1 regularization to the loss
        loss += self.l1_lambda * l1_loss

        return (loss, outputs) if return_outputs else loss

# Step 6: Define the L1Trainer with L1 regularization
trainer = L1Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    l1_lambda=1e-5,  # L1 regularization coefficient
)

# Step 7: Train the model with L1 regularization
trainer.train()

# Step 8: Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

# Step 9: Load the accuracy metric for evaluation
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

# Step 10: Define a custom compute_metrics function to calculate accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Step 11: Redefine the Trainer to include compute_metrics and L1 regularization
trainer = L1Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
    l1_lambda=1e-5  # L1 regularization coefficient
)

# Step 12: Evaluate the model with L1 regularization
eval_result = trainer.evaluate()

# Step 13: Print the evaluation results and accuracy
print(f"Evaluation results: {eval_result}")
print(f"Accuracy: {eval_result['eval_accuracy']:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,4.2171,4.368706
2,3.8392,3.907282
3,3.6981,3.842487


Evaluation results: {'eval_loss': 3.842486619949341, 'eval_runtime': 20.7792, 'eval_samples_per_second': 1203.129, 'eval_steps_per_second': 75.22, 'epoch': 3.0}


Evaluation results: {'eval_loss': 3.842486619949341, 'eval_model_preparation_time': 0.0012, 'eval_accuracy': 0.85944, 'eval_runtime': 21.296, 'eval_samples_per_second': 1173.931, 'eval_steps_per_second': 73.394}
Accuracy: 0.8594
