In [None]:
# !pip install accelerate -U
# !pip install install torch torchvision torchaudio --upgrade

In [None]:
# !pip install huggingface_hub
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

In [None]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding

dataset = load_dataset("glue", "rte")

In [None]:
print(dataset['train'][0])

{'sentence1': 'No Weapons of Mass Destruction Found in Iraq Yet.', 'sentence2': 'Weapons of Mass Destruction Found in Iraq.', 'label': 1, 'idx': 0}


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
from datasets import load_metric
import numpy as np

def compute_metrics(eval_pred):
    metric = load_metric("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
import torch

# Check CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# device="cuda:0"

# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    log_level='info',
    load_best_model_at_end=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Add this line to evaluate metrics
)

# Start training
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2,490
  Num Epochs = 30
  Instantaneous batch size per device = 64
  Training with DataParallel so batch size has been adjusted to: 512
  Total train batch size (w. paral

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.749554,0.527076
2,0.777700,0.745954,0.527076
3,0.777700,0.739669,0.527076
4,0.761300,0.730535,0.530686
5,0.761300,0.718336,0.534296
6,0.736800,0.703721,0.530686
7,0.736800,0.689945,0.541516
8,0.702500,0.678946,0.577617
9,0.702500,0.678548,0.570397
10,0.673100,0.676953,0.577617


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 277
  Batch size = 512
  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Saving model checkpoint to ./results/checkpoint-5
Configuration saved in ./results/checkpoint-5/config.json
Model weights saved in ./results/checkpoint-5/model.safetensors
tokenizer config file saved in ./results/checkpoint-5/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5/special_tokens_map.json
The following columns in the evaluation set don't have a correspondin

TrainOutput(global_step=150, training_loss=0.5008951179186503, metrics={'train_runtime': 349.2537, 'train_samples_per_second': 213.885, 'train_steps_per_second': 0.429, 'total_flos': 1.9654395835392e+16, 'train_loss': 0.5008951179186503, 'epoch': 30.0})

In [None]:
import torch
import torch.nn.utils.prune as prune
import copy

def apply_pruning_to_distilbert(model):
    for module in model.modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=0.5)

    return model

copied_model = copy.deepcopy(model)
pruned_model = apply_pruning_to_distilbert(copied_model)

In [None]:
def check_pruning_effectiveness(model):
    total_pruned = 0
    total_params = 0

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            # Get the weight tensor
            tensor = module.weight.data
            # Count the zeros in the tensor
            pruned = torch.sum(tensor == 0)
            total_pruned += pruned.item()
            total_params += tensor.numel()

            # Optional: print details for each layer
            print(f"{name} - Pruned: {pruned.item()} of {tensor.numel()} weights ({100.0 * pruned.item() / tensor.numel():.2f}%)")

    total_pruning_percentage = 100.0 * total_pruned / total_params
    print(f"Overall pruning across linear layers: {total_pruning_percentage:.2f}% of weights are zero.")

check_pruning_effectiveness(pruned_model)

bert.encoder.layer.0.attention.self.query - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.0.attention.self.key - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.0.attention.self.value - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.0.attention.output.dense - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.0.intermediate.dense - Pruned: 1179648 of 2359296 weights (50.00%)
bert.encoder.layer.0.output.dense - Pruned: 1179648 of 2359296 weights (50.00%)
bert.encoder.layer.1.attention.self.query - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.1.attention.self.key - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.1.attention.self.value - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.1.attention.output.dense - Pruned: 294912 of 589824 weights (50.00%)
bert.encoder.layer.1.intermediate.dense - Pruned: 1179648 of 2359296 weights (50.00%)
bert.encoder.layer.1.output.dense - Pruned: 1179648 of 23592

In [None]:
from transformers import Trainer, TrainingArguments

def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_pruned',
    num_train_epochs=30,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_pruned',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Initialize the Trainer with the pruned model
trainer = Trainer(
    model=pruned_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the pruned model
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2,490
  Num Epochs = 30
  Instantaneous batch size per device = 64
  Training with DataParallel so batch size has been adjusted to: 512
  Total train batch size (w. paral

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.679132,0.552347
2,0.686700,0.678021,0.559567
3,0.686700,0.676253,0.563177
4,0.680800,0.674429,0.570397
5,0.680800,0.672812,0.548736
6,0.673700,0.671172,0.552347
7,0.673700,0.66926,0.563177
8,0.664100,0.667064,0.566787
9,0.664100,0.664254,0.566787
10,0.647100,0.661667,0.563177


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 277
  Batch size = 512
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Saving model checkpoint to ./results_pruned/checkpoint-5
Configuration saved in ./results_pruned/checkpoint-5/config.json
Model weights saved in ./results_pruned/checkpoint-5/model.safetensors
tokenizer config file saved in ./results_pruned/checkpoint-5/tokenizer_config.json
Special tokens file saved in ./results_pruned/checkpoint-5/special_tokens_map.json
The following columns in the evaluation set don't have a correspondin

TrainOutput(global_step=150, training_loss=0.5375990931193034, metrics={'train_runtime': 382.3186, 'train_samples_per_second': 195.387, 'train_steps_per_second': 0.392, 'total_flos': 1.9654395835392e+16, 'train_loss': 0.5375990931193034, 'epoch': 30.0})

In [None]:
from transformers import DebertaTokenizer

roberta_tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-large')

def tokenize_function(examples):
    return roberta_tokenizer(examples['sentence1'], examples['sentence2'], padding="max_length", truncation=True, max_length=256)

roberta_encoded_dataset = dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

loading file vocab.json from cache at /nethome/dsanyal7/.cache/huggingface/hub/models--microsoft--deberta-large/snapshots/a97e054da5f34feed3d26951db4a25831dfcb486/vocab.json
loading file merges.txt from cache at /nethome/dsanyal7/.cache/huggingface/hub/models--microsoft--deberta-large/snapshots/a97e054da5f34feed3d26951db4a25831dfcb486/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /nethome/dsanyal7/.cache/huggingface/hub/models--microsoft--deberta-large/snapshots/a97e054da5f34feed3d26951db4a25831dfcb486/tokenizer_config.json
loading file tokenizer.json from cache at None


config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

loading configuration file config.json from cache at /nethome/dsanyal7/.cache/huggingface/hub/models--microsoft--deberta-large/snapshots/a97e054da5f34feed3d26951db4a25831dfcb486/config.json
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.40.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}



Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
from transformers import DebertaForSequenceClassification

teacher_model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-large', num_labels=2).to(device)

loading configuration file config.json from cache at /nethome/dsanyal7/.cache/huggingface/hub/models--microsoft--deberta-large/snapshots/a97e054da5f34feed3d26951db4a25831dfcb486/config.json
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.40.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}



pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /nethome/dsanyal7/.cache/huggingface/hub/models--microsoft--deberta-large/snapshots/a97e054da5f34feed3d26951db4a25831dfcb486/pytorch_model.bin
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassi

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    log_level='info',
    load_best_model_at_end=True
)

# Initialize the Trainer
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=roberta_encoded_dataset['train'],
    eval_dataset=roberta_encoded_dataset['validation'],
    tokenizer=roberta_tokenizer,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2,490
  Num Epochs = 30
  Instantaneous batch size per device = 16
  Training with DataParallel so batch size has been adjusted to: 128
  Total train batch size (w.

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6916,0.68972,0.545126
2,0.688,0.675311,0.617329
3,0.6677,0.609893,0.689531
4,0.5145,0.556846,0.754513
5,0.3497,0.668946,0.768953
6,0.2086,0.574032,0.830325
7,0.1546,0.520581,0.855596
8,0.0743,0.675556,0.848375
9,0.0792,0.705839,0.826715
10,0.0537,0.641292,0.859206


The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 277
  Batch size = 128
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Saving model checkpoint to ./results/checkpoint-20
Configuration saved in ./results/checkpoint-20/config.json
Model weights saved in ./results/checkpoint-20/model.safetensors
tokenizer config file saved in ./results/checkpoint-20/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaFo

KeyboardInterrupt: 

In [None]:
import torch
from torch.nn import KLDivLoss, CrossEntropyLoss, Softmax, LogSoftmax
from transformers import TrainingArguments, Trainer

teacher_tokenizer = roberta_tokenizer
student_tokenizer = tokenizer

def tokenize_function(examples):
    # print(examples)
    teacher_encodings = teacher_tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=256)
    student_encodings = student_tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=256)
    answer = {
        "input_ids_teacher": teacher_encodings['input_ids'],
        "attention_mask_teacher": teacher_encodings['attention_mask'],
        "input_ids_student": student_encodings['input_ids'],
        "attention_mask_student": student_encodings['attention_mask'],
        "label": examples["label"]
    }
    # print(answer)
    return answer

# Apply tokenization to dataset
# tokenized_datasets = tokenize_function(next(dataset))
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# for a in tokenized_datasets["train"]:
#     print("-----")
#     print(a)
#     print("-----")
#     break
print()

teacher_model.eval()

def distillation_loss(teacher_logits, student_logits, labels, T=2.0, alpha=0.5):
    """ Compute the distillation loss. """
    ce_loss = CrossEntropyLoss()(student_logits, labels)
    kl_loss = KLDivLoss(reduction="batchmean")(LogSoftmax(dim=-1)(student_logits/T), Softmax(dim=-1)(teacher_logits/T))
    return alpha * kl_loss * (T * T) + (1.0 - alpha) * ce_loss

# def compute_loss(model, inputs, return_outputs=False):
#     """ Custom loss computation for distillation. """
#     outputs_student = model(**inputs)
#     with torch.no_grad():
#         outputs_teacher = teacher_model(**inputs)
#     loss = distillation_loss(outputs_teacher.logits, outputs_student.logits, inputs["labels"])
#     return (loss, outputs_student) if return_outputs else loss


training_args = TrainingArguments(
    output_dir='./results_distil',
    num_train_epochs=30,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir='./logs_distil',
    evaluation_strategy="epoch",
     save_strategy="epoch",
    load_best_model_at_end=True,
    log_level='info',
    logging_steps=10,
    warmup_steps=50,
    weight_decay=0.01,
    remove_unused_columns = False
)

from transformers import Trainer, TrainingArguments
import torch.nn.functional as F
from torch import nn
teacher_model.to(device)

class DistilTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Prepare inputs for both student and teacher models
        # print(inputs)
        student_inputs = {
            'input_ids': inputs['input_ids_student'],
            'attention_mask': inputs['attention_mask_student']
        }
        teacher_inputs = {
            'input_ids': inputs['input_ids_teacher'],
            'attention_mask': inputs['attention_mask_teacher']
        }

        # Pass the relevant keys to each model
        outputs_student = model(**student_inputs)
        with torch.no_grad():
            outputs_teacher = teacher_model(**teacher_inputs)

        # Compute the distillation loss
        # Assuming alpha and temperature are properly defined
        alpha = 0.5
        temperature = 5
        loss_logits = nn.KLDivLoss(reduction="batchmean")(
            F.log_softmax(outputs_student.logits / temperature, dim=-1),
            F.softmax(outputs_teacher.logits / temperature, dim=-1)) * (temperature ** 2)

        student_loss = CrossEntropyLoss()(outputs_student.logits, inputs["labels"])
        loss = alpha * student_loss + (1. - alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).





In [None]:
from transformers import DataCollatorWithPadding
import torch

class CustomDataCollator(DataCollatorWithPadding):
    def __init__(self, tokenizer, return_tensors="pt"):
        super().__init__(tokenizer=tokenizer, return_tensors=return_tensors)

    def __call__(self, features):
        # Use superclass to handle input_ids, attention_mask, etc.
        # print(features)
        batch = super().__call__(features)

        # Check and print to debug
        print("Batch keys after super call:", batch.keys())

        # Ensure custom handling for your specific fields
        if 'input_ids_student' in features[0]:  # Check if your key exists in the feature set
            batch['input_ids_student'] = torch.stack([f['input_ids_student'] for f in features])
            batch['attention_mask_student'] = torch.stack([f['attention_mask_student'] for f in features])
        if 'input_ids_teacher' in features[0]:
            batch['input_ids_teacher'] = torch.stack([f['input_ids_teacher'] for f in features])
            batch['attention_mask_teacher'] = torch.stack([f['attention_mask_teacher'] for f in features])

        return batch

data_collator = CustomDataCollator(tokenizer=student_tokenizer)

# for a in tokenized_datasets["train"]:
#     print("-----")
#     print(a)
#     print("-----")
#     break
# print()

trainer = DistilTrainer(
    model=pruned_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
    # data_collator=data_collator,
)

# Train the student model
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
***** Running training *****
  Num examples = 2,490
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Training with DataParallel so batch size has been adjusted to: 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 300
  Number of trainable parameters = 109,483,778


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1868,2.453014,0.570397
2,0.15,2.576062,0.566787
3,0.1168,2.789143,0.545126
4,0.0898,3.106702,0.548736
5,0.1013,3.26697,0.548736
6,0.0941,3.249267,0.541516
7,0.0858,3.22479,0.555957
8,0.1198,3.221021,0.577617
9,0.0959,3.174405,0.548736
10,0.0761,3.363716,0.516245


***** Running Evaluation *****
  Num examples = 277
  Batch size = 256
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Saving model checkpoint to ./results_distil/checkpoint-10
Configuration saved in ./results_distil/checkpoint-10/config.json
Model weights saved in ./results_distil/checkpoint-10/model.safetensors
***** Running Evaluation *****
  Num examples = 277
  Batch size = 256
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Saving model checkpoint to ./results_distil/checkpoint-20
Configuration saved in ./results_distil/checkpoint-20/config.json
Model weights saved in ./results_distil/checkpoint-20/model.safetensors
***** Running Evaluation *****
  Num examples = 277
  Batch size