In [None]:
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn.utils.prune as prune
from pyswarm import pso

# Load IMDb dataset
dataset = load_dataset('imdb')

# Use a small subset of the dataset for faster experimentation
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

# Load TinyBERT model and tokenizer using AutoModelForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)

# Make all model parameters contiguous
for param in model.parameters():
    param.data = param.data.contiguous()

# Tokenize data with fixed sequence length
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)  # Define max_length

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

# Remove columns that are not tensors
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define PSO fitness function (objective function)
def fitness_function(hyperparams):
    lr, batch_size, epochs = hyperparams
    batch_size = int(batch_size)
    epochs = int(epochs)

    # Reduce training arguments for faster execution
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=lr,
        evaluation_strategy="steps",
        eval_steps=50,  # Evaluate every 50 steps to save time
        save_steps=50,  # Save model checkpoint every 50 steps
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics  # Pass the compute metrics function
    )

    # Make sure model parameters are contiguous before training
    for param in model.parameters():
        param.data = param.data.contiguous()

    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Return negative accuracy to minimize the fitness function
    return -eval_results['eval_accuracy']

# PSO to optimize hyperparameters: learning rate, batch size, epochs
lb = [1e-5, 8, 1]  # Lower bounds for hyperparameters
ub = [1e-3, 16, 2]  # Upper bounds for hyperparameters, reduced search space

best_hyperparams, _ = pso(fitness_function, lb, ub, swarmsize=5, maxiter=3)

# Output optimized hyperparameters
print("Best hyperparameters:", best_hyperparams)

# Train final model with optimized hyperparameters
final_lr, final_batch_size, final_epochs = best_hyperparams
final_batch_size = int(final_batch_size)
final_epochs = int(final_epochs)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=final_epochs,
    per_device_train_batch_size=final_batch_size,
    learning_rate=final_lr,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
     # Pass the compute metrics function
)

# Make all parameters contiguous before final training
for param in model.parameters():
    param.data = param.data.contiguous()

# Apply L1 unstructured pruning to all linear layers in the model
def apply_l1_pruning(model, amount=0.4):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')  # Remove the pruning mask, making it permanent

# Apply L1 pruning with 40% sparsity
apply_l1_pruning(model, amount=0.4)

# Train the final pruned model and evaluate
trainer.train()
final_results = trainer.evaluate()
print("Final accuracy after pruning:", final_results['eval_accuracy'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy
50,0.7061,0.690304,0.512
100,0.688,0.676241,0.551




Step,Training Loss,Validation Loss,Accuracy
50,0.5988,0.571143,0.718
100,0.5352,0.581957,0.72




Step,Training Loss,Validation Loss,Accuracy
50,0.3715,0.606737,0.741
100,0.393,0.617281,0.731




Step,Training Loss,Validation Loss,Accuracy
50,0.2589,0.616885,0.746




Step,Training Loss,Validation Loss,Accuracy
50,0.4778,0.694002,0.689




Step,Training Loss,Validation Loss,Accuracy
50,0.1564,0.867132,0.738




Step,Training Loss,Validation Loss,Accuracy
50,0.0645,1.105214,0.744




Step,Training Loss,Validation Loss,Accuracy
50,0.0746,1.158583,0.751




Step,Training Loss,Validation Loss,Accuracy
50,0.0532,1.171298,0.751




Step,Training Loss,Validation Loss,Accuracy
50,0.1361,1.392025,0.71




Step,Training Loss,Validation Loss,Accuracy
50,0.0421,1.573745,0.752




Step,Training Loss,Validation Loss,Accuracy
50,0.0515,1.787563,0.749




Step,Training Loss,Validation Loss,Accuracy
50,0.091,1.738338,0.752




Step,Training Loss,Validation Loss,Accuracy
50,0.0853,1.758838,0.751




Step,Training Loss,Validation Loss,Accuracy
50,0.133,2.105597,0.744




Step,Training Loss,Validation Loss,Accuracy
50,0.0335,2.17104,0.749
100,0.4915,2.055323,0.753




Step,Training Loss,Validation Loss,Accuracy
50,0.0254,2.279177,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0122,2.251292,0.75




Step,Training Loss,Validation Loss,Accuracy
50,0.0001,2.310796,0.748




Step,Training Loss,Validation Loss,Accuracy
50,0.0143,2.738878,0.741
100,0.4619,2.303587,0.743


Stopping search: maximum iterations reached --> 3
Best hyperparameters: [1.20512667e-04 1.43749001e+01 1.87742404e+00]




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.037895,0.747


Final accuracy after pruning: 0.747


In [None]:
# Define the directory where the model and tokenizer will be saved
output_dir = "./saved_model"

# Save the model and tokenizer after training
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Optionally, you can also save the training arguments and optimizer states if needed
trainer.save_model(output_dir)  # This will save the model, tokenizer, and additional components


In [None]:
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn.utils.prune as prune
from pyswarm import pso

# Load IMDb dataset
dataset = load_dataset('imdb')

# Use a small subset of the dataset for faster experimentation
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

# Load TinyBERT model and tokenizer using AutoModelForSequenceClassification
tokenizer2 = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model2 = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)

# Make all model parameters contiguous
for param in model2.parameters():
    param.data = param.data.contiguous()

# Tokenize data with fixed sequence length
def tokenize_function(examples):
    return tokenizer2(examples['text'], padding="max_length", truncation=True, max_length=128)  # Define max_length

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

# Remove columns that are not tensors
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define PSO fitness function (objective function)
def fitness_function(hyperparams):
    lr, batch_size, epochs = hyperparams
    batch_size = int(batch_size)
    epochs = int(epochs)

    # Reduce training arguments for faster execution
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=lr,
        evaluation_strategy="steps",
        eval_steps=50,  # Evaluate every 50 steps to save time
        save_steps=50,  # Save model checkpoint every 50 steps
        logging_steps=50,
    )

    trainer = Trainer(
        model=model2,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics  # Pass the compute metrics function
    )

    # Make sure model parameters are contiguous before training
    for param in model2.parameters():
        param.data = param.data.contiguous()

    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Return negative accuracy to minimize the fitness function
    return -eval_results['eval_accuracy']

# PSO to optimize hyperparameters: learning rate, batch size, epochs
lb = [1e-6, 8, 2] # Lower bounds for hyperparameters
ub = [1e-2, 32, 4]  # Upper bounds for hyperparameters, reduced search space

best_hyperparams, _ = pso(fitness_function, lb, ub, swarmsize=10, maxiter=10)

# Output optimized hyperparameters
print("Best hyperparameters:", best_hyperparams)

# Train final model with optimized hyperparameters
final_lr, final_batch_size, final_epochs = best_hyperparams
final_batch_size = int(final_batch_size)
final_epochs = int(final_epochs)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=final_epochs,
    per_device_train_batch_size=final_batch_size,
    learning_rate=final_lr,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
     # Pass the compute metrics function
)

# Make all parameters contiguous before final training
for param in model2.parameters():
    param.data = param.data.contiguous()

# Apply L1 unstructured pruning to all linear layers in the model
def apply_l1_pruning(model, amount=0.4):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')  # Remove the pruning mask, making it permanent

# Apply L1 pruning with 40% sparsity
apply_l1_pruning(model2, amount=0.4)

# Train the final pruned model and evaluate
trainer.train()
final_results = trainer.evaluate()
print("Final accuracy after pruning:", final_results['eval_accuracy'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy
50,0.72,0.692737,0.502
100,0.693,0.684405,0.575




Step,Training Loss,Validation Loss,Accuracy
50,0.7444,0.795512,0.512
100,0.7511,0.705262,0.488
150,0.703,0.710719,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6949,0.692914,0.512
100,0.7009,0.692874,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.8259,0.693855,0.488
100,0.7404,0.772562,0.488
150,0.7389,0.704731,0.512
200,0.7688,0.694125,0.488
250,0.7328,0.729898,0.488
300,0.7428,0.693204,0.512
350,0.7249,0.693658,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7871,0.925119,0.488
100,0.7601,0.727703,0.512
150,0.7104,0.698703,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7216,0.696518,0.488
100,0.7186,0.692891,0.512
150,0.694,0.692872,0.512
200,0.7057,0.693176,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.824,0.711986,0.512
100,0.7468,0.692917,0.512
150,0.7403,0.693023,0.512
200,0.7083,0.694107,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7054,0.694026,0.488
100,0.7038,0.692957,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.725,0.694345,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7554,0.717207,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7231,0.720801,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7366,0.702134,0.488
100,0.7152,0.692985,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6944,0.693119,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7388,0.696474,0.512
100,0.7052,0.696484,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7311,0.697414,0.512
100,0.7075,0.694162,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7023,0.696854,0.488
100,0.7014,0.692526,0.514




Step,Training Loss,Validation Loss,Accuracy
50,0.7085,0.698629,0.519




Step,Training Loss,Validation Loss,Accuracy
50,0.6918,0.695477,0.514




Step,Training Loss,Validation Loss,Accuracy
50,0.7015,0.692967,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6991,0.696892,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6692,0.696851,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7204,0.713115,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6982,0.69386,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7237,0.697949,0.512
100,0.7097,0.706213,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7144,0.693492,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.6958,0.697726,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6893,0.693458,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7047,0.707811,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6917,0.692864,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7089,0.697317,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7083,0.699389,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7089,0.768785,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6756,0.693956,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7043,0.703227,0.49




Step,Training Loss,Validation Loss,Accuracy
50,0.7313,0.693302,0.513




Step,Training Loss,Validation Loss,Accuracy
50,0.6886,0.703871,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7333,0.711941,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6839,0.706777,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7127,0.694786,0.511
100,0.6593,0.694952,0.489




Step,Training Loss,Validation Loss,Accuracy
50,0.6605,0.713758,0.512
100,0.5964,0.698066,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.6243,0.692934,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7129,0.797185,0.512
100,0.5696,0.822207,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.5654,0.893482,0.512
100,0.5059,0.76516,0.513




Step,Training Loss,Validation Loss,Accuracy
50,0.6802,0.722474,0.558




Step,Training Loss,Validation Loss,Accuracy
50,0.818,0.700396,0.488
100,0.7087,0.693702,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.71,0.704051,0.491




Step,Training Loss,Validation Loss,Accuracy
50,0.6614,0.709933,0.536
100,0.6058,0.722244,0.56




Step,Training Loss,Validation Loss,Accuracy
50,0.6352,0.739404,0.548




Step,Training Loss,Validation Loss,Accuracy
50,0.607,0.855736,0.545
100,0.5257,0.813366,0.532




Step,Training Loss,Validation Loss,Accuracy
50,0.6354,0.822557,0.541
100,0.5173,0.755572,0.536




Step,Training Loss,Validation Loss,Accuracy
50,0.5321,0.776867,0.545




Step,Training Loss,Validation Loss,Accuracy
50,0.5207,0.782078,0.537
100,0.4649,0.812105,0.541




Step,Training Loss,Validation Loss,Accuracy
50,0.4826,0.838679,0.533
100,0.4569,0.808638,0.536




Step,Training Loss,Validation Loss,Accuracy
50,0.5036,0.797293,0.524




Step,Training Loss,Validation Loss,Accuracy
50,0.6582,0.795255,0.524
100,0.7132,0.742887,0.529




Step,Training Loss,Validation Loss,Accuracy
50,0.7225,0.787626,0.486




Step,Training Loss,Validation Loss,Accuracy
50,0.7236,0.779239,0.516
100,0.6927,0.797702,0.511




Step,Training Loss,Validation Loss,Accuracy
50,0.7217,0.737734,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7194,0.693801,0.512
100,0.7019,0.69321,0.461




Step,Training Loss,Validation Loss,Accuracy
50,0.7306,0.698339,0.512
100,0.7071,0.693577,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7155,0.692901,0.512
100,0.7013,0.694085,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7188,0.699668,0.512
100,0.6964,0.693323,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7244,0.715591,0.488
100,0.7066,0.693483,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7245,0.699057,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7754,0.700599,0.505
100,0.7427,0.78284,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7247,0.731671,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7178,0.707123,0.489
100,0.6989,0.70737,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7185,0.703722,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7153,0.692943,0.512
100,0.6987,0.694252,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7325,0.699132,0.512
100,0.7138,0.692871,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.7154,0.692934,0.512
100,0.7012,0.694111,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.706,0.693866,0.488
100,0.6986,0.694623,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7063,0.693135,0.512
100,0.7005,0.693826,0.488




Step,Training Loss,Validation Loss,Accuracy
50,0.7219,0.703884,0.51




Step,Training Loss,Validation Loss,Accuracy
50,0.6884,0.78441,0.518
100,0.5885,0.794289,0.529




Step,Training Loss,Validation Loss,Accuracy
50,0.6145,0.822555,0.515




Step,Training Loss,Validation Loss,Accuracy
50,0.5986,0.783828,0.546
100,0.5704,0.81965,0.513




Step,Training Loss,Validation Loss,Accuracy
50,0.58,0.793872,0.519
100,0.5634,0.811548,0.52




Step,Training Loss,Validation Loss,Accuracy
50,0.5555,0.777554,0.542
100,0.5449,0.801468,0.504




Step,Training Loss,Validation Loss,Accuracy
50,0.618,0.821645,0.531
100,0.5139,0.797517,0.528




Step,Training Loss,Validation Loss,Accuracy
50,0.5406,0.805403,0.525
100,0.5158,0.789481,0.517




Step,Training Loss,Validation Loss,Accuracy
50,0.5202,0.787494,0.523
100,0.5121,0.779889,0.516




Step,Training Loss,Validation Loss,Accuracy
50,0.5125,0.78939,0.524
100,0.5104,0.828143,0.516




Step,Training Loss,Validation Loss,Accuracy
50,0.5658,1.005315,0.505




Step,Training Loss,Validation Loss,Accuracy
50,0.5289,0.712593,0.522
100,0.5176,0.73585,0.516




Step,Training Loss,Validation Loss,Accuracy
50,0.4957,0.795823,0.52




Step,Training Loss,Validation Loss,Accuracy
50,0.4875,0.816818,0.523
100,0.4661,0.823618,0.494




Step,Training Loss,Validation Loss,Accuracy
50,0.4575,0.811931,0.503
100,0.4131,0.796653,0.522




Step,Training Loss,Validation Loss,Accuracy
50,0.4051,0.815235,0.517
100,0.3989,0.877808,0.53




Step,Training Loss,Validation Loss,Accuracy
50,0.4408,0.853057,0.517
100,0.3926,0.888861,0.519




Step,Training Loss,Validation Loss,Accuracy
50,0.3856,1.080053,0.506
100,0.3821,0.975994,0.504




Step,Training Loss,Validation Loss,Accuracy
50,0.4378,1.039556,0.509




Step,Training Loss,Validation Loss,Accuracy
50,0.3881,0.999146,0.496
100,0.3645,0.977009,0.516




Step,Training Loss,Validation Loss,Accuracy
50,0.4175,0.821214,0.516




Step,Training Loss,Validation Loss,Accuracy
50,0.5039,0.744664,0.512




Step,Training Loss,Validation Loss,Accuracy
50,0.3966,0.980867,0.518




Step,Training Loss,Validation Loss,Accuracy
50,0.3866,1.064654,0.52
100,0.3663,0.957733,0.493




Step,Training Loss,Validation Loss,Accuracy
50,0.4532,0.808009,0.501
100,0.3929,0.913511,0.507




Step,Training Loss,Validation Loss,Accuracy
50,0.3904,0.996037,0.498
100,0.3752,0.944867,0.495




Step,Training Loss,Validation Loss,Accuracy
50,0.4409,0.898167,0.51
100,0.4136,0.877962,0.519




Step,Training Loss,Validation Loss,Accuracy
50,0.3705,0.965718,0.503
100,0.3463,0.971133,0.518




Step,Training Loss,Validation Loss,Accuracy
50,0.3984,0.98399,0.522
100,0.375,0.922323,0.527




Step,Training Loss,Validation Loss,Accuracy
50,0.3724,0.99664,0.516
100,0.3325,0.986392,0.52




Step,Training Loss,Validation Loss,Accuracy
50,0.3908,0.999755,0.513




Step,Training Loss,Validation Loss,Accuracy
50,0.4896,0.838047,0.508




Step,Training Loss,Validation Loss,Accuracy
50,0.3724,1.013661,0.511




Step,Training Loss,Validation Loss,Accuracy
50,0.3633,0.923375,0.493
100,0.342,0.900181,0.494




Step,Training Loss,Validation Loss,Accuracy
50,0.3415,0.876085,0.53




Step,Training Loss,Validation Loss,Accuracy
50,0.3809,0.862216,0.507
100,0.3821,0.938072,0.506




Step,Training Loss,Validation Loss,Accuracy
50,0.3456,1.115117,0.518
100,0.3175,0.984769,0.525


Stopping search: maximum iterations reached --> 10
Best hyperparameters: [3.74602665e-03 3.08171434e+01 3.46398788e+00]




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.975413,0.524
2,No log,0.923616,0.527
3,No log,0.986001,0.519


Final accuracy after pruning: 0.519


In [None]:
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn.utils.prune as prune
from pyswarm import pso

# Load IMDb dataset
dataset = load_dataset('imdb')

# Use a small subset of the dataset for faster experimentation
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

# Load TinyBERT model and tokenizer using AutoModelForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)

# Make all model parameters contiguous
for param in model.parameters():
    param.data = param.data.contiguous()

# Tokenize data with fixed sequence length
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)  # Define max_length

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

# Remove columns that are not tensors
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define PSO fitness function (objective function)
def fitness_function(hyperparams):
    lr, batch_size, epochs = hyperparams
    batch_size = int(batch_size)
    epochs = int(epochs)

    # Reduce training arguments for faster execution
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=lr,
        evaluation_strategy="steps",
        eval_steps=50,  # Evaluate every 50 steps to save time
        save_steps=50,  # Save model checkpoint every 50 steps
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics  # Pass the compute metrics function
    )

    # Make sure model parameters are contiguous before training
    for param in model.parameters():
        param.data = param.data.contiguous()

    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Return negative accuracy to minimize the fitness function
    return -eval_results['eval_accuracy']

# PSO to optimize hyperparameters: learning rate, batch size, epochs
lb = [1e-5, 8, 1]  # Lower bounds for hyperparameters
ub = [1e-3, 16, 2]  # Upper bounds for hyperparameters, reduced search space

best_hyperparams, _ = pso(fitness_function, lb, ub, swarmsize=5, maxiter=3)

# Output optimized hyperparameters
print("Best hyperparameters:", best_hyperparams)

# Train final model with optimized hyperparameters
final_lr, final_batch_size, final_epochs = best_hyperparams
final_batch_size = int(final_batch_size)
final_epochs = int(final_epochs)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=final_epochs,
    per_device_train_batch_size=final_batch_size,
    learning_rate=final_lr,
    evaluation_strategy="epoch",
    lr_scheduler_type="cosine",  # Use cosine decay scheduler
    warmup_ratio=0.1  # Warm up for the first 10% of steps
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
     # Pass the compute metrics function
)

# Make all parameters contiguous before final training
for param in model.parameters():
    param.data = param.data.contiguous()

# Apply L1 unstructured pruning to all linear layers in the model
def apply_l1_pruning(model, amount=0.4):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')  # Remove the pruning mask, making it permanent

# Apply L1 pruning with 40% sparsity
apply_l1_pruning(model, amount=0.4)

# Train the final pruned model and evaluate
trainer.train()
final_results = trainer.evaluate()
print("Final accuracy after pruning:", final_results['eval_accuracy'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,0.692,0.676135,0.603




Step,Training Loss,Validation Loss,Accuracy
50,0.6544,0.740582,0.55
100,0.6177,0.626256,0.67




Step,Training Loss,Validation Loss,Accuracy
50,0.4054,0.625574,0.671




Step,Training Loss,Validation Loss,Accuracy
50,0.4314,0.697841,0.698
100,0.406,0.757886,0.689




Step,Training Loss,Validation Loss,Accuracy
50,0.2291,1.47688,0.652
100,0.5428,0.812483,0.686




Step,Training Loss,Validation Loss,Accuracy
50,0.1963,1.365159,0.684
100,0.2688,1.022093,0.705




Step,Training Loss,Validation Loss,Accuracy
50,0.0368,2.062054,0.694




Step,Training Loss,Validation Loss,Accuracy
50,0.026,1.878678,0.727




Step,Training Loss,Validation Loss,Accuracy
50,0.2185,2.434066,0.671
100,0.3616,0.922229,0.667
150,0.0721,1.222421,0.738




Step,Training Loss,Validation Loss,Accuracy
50,0.0973,1.879456,0.715




Step,Training Loss,Validation Loss,Accuracy
50,0.043,1.879284,0.738
100,0.1046,1.342417,0.75




Step,Training Loss,Validation Loss,Accuracy
50,0.0131,2.024549,0.752




Step,Training Loss,Validation Loss,Accuracy
50,0.0001,2.430258,0.74
100,0.0358,2.45667,0.735




Step,Training Loss,Validation Loss,Accuracy
50,0.0,3.128725,0.744
100,0.0357,4.067825,0.668




Step,Training Loss,Validation Loss,Accuracy
50,0.0201,3.685213,0.737




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.0418,0.741




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.470319,0.743
100,0.0,4.553106,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.665607,0.743
100,0.0,4.705122,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.865749,0.743
100,0.0,4.917309,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.992571,0.745


Stopping search: maximum iterations reached --> 3
Best hyperparameters: [3.93913342e-04 1.49004917e+01 1.86924486e+00]




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.992364,0.74


Final accuracy after pruning: 0.74
