In [None]:
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn.utils.prune as prune
from pyswarm import pso

# Load IMDb dataset
dataset = load_dataset('imdb')

# Use a small subset of the dataset for faster experimentation
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

# Load TinyBERT model and tokenizer using AutoModelForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)

# Make all model parameters contiguous
for param in model.parameters():
    param.data = param.data.contiguous()

# Tokenize data with fixed sequence length
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)  # Define max_length

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

# Remove columns that are not tensors
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define PSO fitness function (objective function)
def fitness_function(hyperparams):
    lr, batch_size, epochs = hyperparams
    batch_size = int(batch_size)
    epochs = int(epochs)

    # Reduce training arguments for faster execution
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=lr,
        evaluation_strategy="steps",
        eval_steps=50,  # Evaluate every 50 steps to save time
        save_steps=50,  # Save model checkpoint every 50 steps
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics  # Pass the compute metrics function
    )

    # Make sure model parameters are contiguous before training
    for param in model.parameters():
        param.data = param.data.contiguous()

    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Return negative accuracy to minimize the fitness function
    return -eval_results['eval_accuracy']

# PSO to optimize hyperparameters: learning rate, batch size, epochs
lb = [1e-5, 8, 1]  # Lower bounds for hyperparameters
ub = [1e-3, 16, 2]  # Upper bounds for hyperparameters, reduced search space

best_hyperparams, _ = pso(fitness_function, lb, ub, swarmsize=5, maxiter=3)

# Output optimized hyperparameters
print("Best hyperparameters:", best_hyperparams)

# Train final model with optimized hyperparameters
final_lr, final_batch_size, final_epochs = best_hyperparams
final_batch_size = int(final_batch_size)
final_epochs = int(final_epochs)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=final_epochs,
    per_device_train_batch_size=final_batch_size,
    learning_rate=final_lr,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
     # Pass the compute metrics function
)

# Make all parameters contiguous before final training
for param in model.parameters():
    param.data = param.data.contiguous()

# Apply L1 unstructured pruning to all linear layers in the model
def apply_l1_pruning(model, amount=0.4):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')  # Remove the pruning mask, making it permanent

# Apply L1 pruning with 40% sparsity
apply_l1_pruning(model, amount=0.4)

# Train the final pruned model and evaluate
trainer.train()
final_results = trainer.evaluate()
print("Final accuracy after pruning:", final_results['eval_accuracy'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy
50,0.7061,0.690304,0.512
100,0.688,0.676241,0.551




Step,Training Loss,Validation Loss,Accuracy
50,0.5988,0.571143,0.718
100,0.5352,0.581957,0.72




Step,Training Loss,Validation Loss,Accuracy
50,0.3715,0.606737,0.741
100,0.393,0.617281,0.731




Step,Training Loss,Validation Loss,Accuracy
50,0.2589,0.616885,0.746




Step,Training Loss,Validation Loss,Accuracy
50,0.4778,0.694002,0.689




Step,Training Loss,Validation Loss,Accuracy
50,0.1564,0.867132,0.738




Step,Training Loss,Validation Loss,Accuracy
50,0.0645,1.105214,0.744




Step,Training Loss,Validation Loss,Accuracy
50,0.0746,1.158583,0.751




Step,Training Loss,Validation Loss,Accuracy
50,0.0532,1.171298,0.751




Step,Training Loss,Validation Loss,Accuracy
50,0.1361,1.392025,0.71




Step,Training Loss,Validation Loss,Accuracy
50,0.0421,1.573745,0.752




Step,Training Loss,Validation Loss,Accuracy
50,0.0515,1.787563,0.749




Step,Training Loss,Validation Loss,Accuracy
50,0.091,1.738338,0.752




Step,Training Loss,Validation Loss,Accuracy
50,0.0853,1.758838,0.751




Step,Training Loss,Validation Loss,Accuracy
50,0.133,2.105597,0.744




Step,Training Loss,Validation Loss,Accuracy
50,0.0335,2.17104,0.749
100,0.4915,2.055323,0.753




Step,Training Loss,Validation Loss,Accuracy
50,0.0254,2.279177,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0122,2.251292,0.75




Step,Training Loss,Validation Loss,Accuracy
50,0.0001,2.310796,0.748




Step,Training Loss,Validation Loss,Accuracy
50,0.0143,2.738878,0.741
100,0.4619,2.303587,0.743


Stopping search: maximum iterations reached --> 3
Best hyperparameters: [1.20512667e-04 1.43749001e+01 1.87742404e+00]




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.037895,0.747


Final accuracy after pruning: 0.747


In [None]:
# Define the directory where the model and tokenizer will be saved
output_dir = "./saved_model"

# Save the model and tokenizer after training
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Optionally, you can also save the training arguments and optimizer states if needed
trainer.save_model(output_dir)  # This will save the model, tokenizer, and additional components


In [None]:
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn.utils.prune as prune
from pyswarm import pso

# Load IMDb dataset
dataset = load_dataset('imdb')

# Use a small subset of the dataset for faster experimentation
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

# Load TinyBERT model and tokenizer using AutoModelForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)

# Make all model parameters contiguous
for param in model.parameters():
    param.data = param.data.contiguous()

# Tokenize data with fixed sequence length
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)  # Define max_length

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

# Remove columns that are not tensors
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define PSO fitness function (objective function)
def fitness_function(hyperparams):
    lr, batch_size, epochs = hyperparams
    batch_size = int(batch_size)
    epochs = int(epochs)

    # Reduce training arguments for faster execution
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=lr,
        evaluation_strategy="steps",
        eval_steps=50,  # Evaluate every 50 steps to save time
        save_steps=50,  # Save model checkpoint every 50 steps
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics  # Pass the compute metrics function
    )

    # Make sure model parameters are contiguous before training
    for param in model.parameters():
        param.data = param.data.contiguous()

    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Return negative accuracy to minimize the fitness function
    return -eval_results['eval_accuracy']

# PSO to optimize hyperparameters: learning rate, batch size, epochs
lb = [1e-5, 8, 1]  # Lower bounds for hyperparameters
ub = [1e-3, 16, 2]  # Upper bounds for hyperparameters, reduced search space

best_hyperparams, _ = pso(fitness_function, lb, ub, swarmsize=5, maxiter=3)

# Output optimized hyperparameters
print("Best hyperparameters:", best_hyperparams)

# Train final model with optimized hyperparameters
final_lr, final_batch_size, final_epochs = best_hyperparams
final_batch_size = int(final_batch_size)
final_epochs = int(final_epochs)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=final_epochs,
    per_device_train_batch_size=final_batch_size,
    learning_rate=final_lr,
    evaluation_strategy="epoch",
    lr_scheduler_type="cosine",  # Use cosine decay scheduler
    warmup_ratio=0.1  # Warm up for the first 10% of steps
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
     # Pass the compute metrics function
)

# Make all parameters contiguous before final training
for param in model.parameters():
    param.data = param.data.contiguous()

# Apply L1 unstructured pruning to all linear layers in the model
def apply_l1_pruning(model, amount=0.4):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')  # Remove the pruning mask, making it permanent

# Apply L1 pruning with 40% sparsity
apply_l1_pruning(model, amount=0.4)

# Train the final pruned model and evaluate
trainer.train()
final_results = trainer.evaluate()
print("Final accuracy after pruning:", final_results['eval_accuracy'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,0.692,0.676135,0.603




Step,Training Loss,Validation Loss,Accuracy
50,0.6544,0.740582,0.55
100,0.6177,0.626256,0.67




Step,Training Loss,Validation Loss,Accuracy
50,0.4054,0.625574,0.671




Step,Training Loss,Validation Loss,Accuracy
50,0.4314,0.697841,0.698
100,0.406,0.757886,0.689




Step,Training Loss,Validation Loss,Accuracy
50,0.2291,1.47688,0.652
100,0.5428,0.812483,0.686




Step,Training Loss,Validation Loss,Accuracy
50,0.1963,1.365159,0.684
100,0.2688,1.022093,0.705




Step,Training Loss,Validation Loss,Accuracy
50,0.0368,2.062054,0.694




Step,Training Loss,Validation Loss,Accuracy
50,0.026,1.878678,0.727




Step,Training Loss,Validation Loss,Accuracy
50,0.2185,2.434066,0.671
100,0.3616,0.922229,0.667
150,0.0721,1.222421,0.738




Step,Training Loss,Validation Loss,Accuracy
50,0.0973,1.879456,0.715




Step,Training Loss,Validation Loss,Accuracy
50,0.043,1.879284,0.738
100,0.1046,1.342417,0.75




Step,Training Loss,Validation Loss,Accuracy
50,0.0131,2.024549,0.752




Step,Training Loss,Validation Loss,Accuracy
50,0.0001,2.430258,0.74
100,0.0358,2.45667,0.735




Step,Training Loss,Validation Loss,Accuracy
50,0.0,3.128725,0.744
100,0.0357,4.067825,0.668




Step,Training Loss,Validation Loss,Accuracy
50,0.0201,3.685213,0.737




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.0418,0.741




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.470319,0.743
100,0.0,4.553106,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.665607,0.743
100,0.0,4.705122,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.865749,0.743
100,0.0,4.917309,0.743




Step,Training Loss,Validation Loss,Accuracy
50,0.0,4.992571,0.745


Stopping search: maximum iterations reached --> 3
Best hyperparameters: [3.93913342e-04 1.49004917e+01 1.86924486e+00]




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.992364,0.74


Final accuracy after pruning: 0.74


In [None]:
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score
import torch
import torch.nn.utils.prune as prune
from torch.nn.functional import cross_entropy
from pyswarm import pso

# Load IMDb dataset
dataset = load_dataset('imdb')

# Use a small subset of the dataset for faster experimentation
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

# Load TinyBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)

# Make all model parameters contiguous
for param in model.parameters():
    param.data = param.data.contiguous()

# Tokenize data with fixed sequence length
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

# Remove columns that are not tensors
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Define compute metrics function with additional metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Convert logits and labels to tensors to compute cross-entropy loss
    logits_tensor = torch.tensor(logits)
    labels_tensor = torch.tensor(labels)

    accuracy = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average="binary")
    f1 = f1_score(labels, predictions, average="binary")
    loss = cross_entropy(logits_tensor, labels_tensor).item()  # Cross-entropy loss calculation

    return {
        "accuracy": accuracy,
        "recall": recall,
        "f1": f1,
        "loss": loss
    }

# Define PSO fitness function (objective function)
def fitness_function(hyperparams):
    lr, batch_size, epochs = hyperparams
    batch_size = int(batch_size)
    epochs = int(epochs)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=lr,
        evaluation_strategy="steps",
        eval_steps=50,
        save_steps=50,
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics
    )

    for param in model.parameters():
        param.data = param.data.contiguous()

    trainer.train()
    eval_results = trainer.evaluate()
    return -eval_results['eval_accuracy']  # Minimize negative accuracy for PSO

# PSO to optimize hyperparameters
lb = [1e-5, 8, 1]
ub = [1e-3, 16, 2]
best_hyperparams, _ = pso(fitness_function, lb, ub, swarmsize=5, maxiter=3)

print("Best hyperparameters:", best_hyperparams)
final_lr, final_batch_size, final_epochs = best_hyperparams
final_batch_size = int(final_batch_size)
final_epochs = int(final_epochs)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=final_epochs,
    per_device_train_batch_size=final_batch_size,
    learning_rate=final_lr,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

for param in model.parameters():
    param.data = param.data.contiguous()

# Apply structured pruning on entire channels of linear layers
def apply_structured_pruning(model, amount=0.4):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.ln_structured(module, name='weight', amount=amount, n=2, dim=0)
            prune.remove(module, 'weight')

# Apply structured pruning with 40% sparsity
apply_structured_pruning(model, amount=0.4)

# Train the final pruned model and evaluate with added metrics
trainer.train()
final_results = trainer.evaluate()
print("Final evaluation after structured pruning:", final_results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.6897,0.667675,0.628,0.627049,0.621951




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.6274,0.671813,0.631,0.875,0.698283
100,0.5414,0.610344,0.686,0.735656,0.695736




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.3235,0.641385,0.697,0.745902,0.706111




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.3973,0.763328,0.711,0.854508,0.742654
100,0.3882,0.576943,0.722,0.797131,0.736742




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.2478,1.684048,0.626,0.954918,0.713629
100,0.3771,1.078143,0.71,0.663934,0.690832




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.1684,1.183388,0.689,0.915984,0.741909
100,0.272,0.942802,0.726,0.793033,0.73855




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0931,1.156278,0.733,0.702869,0.719832




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.022,1.30978,0.737,0.739754,0.732995




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0982,1.142066,0.717,0.631148,0.685206
100,0.2601,1.22488,0.684,0.858607,0.72617
150,0.0218,1.507363,0.725,0.756148,0.728529




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0999,1.991304,0.718,0.618852,0.681716




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0337,2.242895,0.705,0.891393,0.746781
100,0.1795,1.59365,0.73,0.678279,0.7103




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0229,2.577767,0.72,0.678279,0.70276




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0299,2.708285,0.724,0.64959,0.696703
100,0.1673,2.325746,0.733,0.711066,0.722164




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0,3.294989,0.736,0.803279,0.748092
100,0.3619,2.269863,0.732,0.786885,0.741313
150,0.034,2.229881,0.731,0.790984,0.741595




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0,3.497745,0.73,0.786885,0.739884




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0001,4.455726,0.708,0.856557,0.741135
100,0.0692,3.187846,0.73,0.688525,0.713376




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0084,3.80882,0.731,0.821721,0.748833
100,0.0268,3.257816,0.736,0.727459,0.728953
150,0.0143,3.190976,0.736,0.815574,0.750943




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0,4.09306,0.72,0.625,0.685393
100,0.0,3.94911,0.729,0.768443,0.734574
150,0.0,3.9518,0.73,0.770492,0.735812




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0,4.557057,0.728,0.854508,0.754069
100,0.1587,4.294619,0.682,0.438525,0.573727
150,0.0353,3.856544,0.716,0.637295,0.686534
200,0.0557,3.768021,0.72,0.651639,0.694323




Step,Training Loss,Validation Loss,Accuracy,Recall,F1
50,0.0066,5.766375,0.689,0.485656,0.603822


Stopping search: maximum iterations reached --> 3
Best hyperparameters: [7.10991852e-04 8.16467595e+00 1.96990985e+00]




Epoch,Training Loss,Validation Loss,Accuracy,Recall,F1
1,No log,1.549414,0.742,0.758197,0.741483


Final evaluation after structured pruning: {'eval_loss': 1.549413800239563, 'eval_accuracy': 0.742, 'eval_recall': 0.7581967213114754, 'eval_f1': 0.7414829659318637, 'eval_runtime': 0.689, 'eval_samples_per_second': 1451.29, 'eval_steps_per_second': 181.411, 'epoch': 1.0}


In [None]:
# Define the directory where the model and tokenizer will be saved
output_dir = "./saved_model"

# Save the model and tokenizer after training
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Optionally, you can also save the training arguments and optimizer states if needed
trainer.save_model(output_dir)  # This will save the model, tokenizer, and additional components