In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import set_seed
import torch
from torch.optim import AdamW
from huggingface_hub import HfApi, login, hf_hub_download
from google.colab import userdata

from supplementary_file_for_sentence_similarity import *

# Loading the tokenizer and a dataset

In [2]:
# Loading the tokenizer

# Checkpoint identifier; the same name is reused by later cells to load the model
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set padding token to EOS token
# NOTE(review): presumably required because this tokenizer has no pad token of its
# own and the dataloaders pad batches — confirm against the supplementary module.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
# Helper object from the supplementary module, constructed with the tokenizer above
dataloader = sentence_similarity_dataloaders(tokenizer)

# Seed is fixed right before the loaders are requested
# (presumably so that any shuffling/splitting is reproducible — TODO confirm)
set_seed(42)
# The helper hands back the train / validation / test loaders, in that order
train_dataloader, val_dataloader, test_dataloader = dataloader.get_dataloaders()

README.md: 0.00B [00:00, ?B/s]

mrpc/train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

mrpc/validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

mrpc/test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

# Training Experiments

In [None]:
# Settings shared by every seeded sweep below
lr_list = [1e-5, 3e-5, 5e-5]  # learning rates to sweep over
num_epochs = 3                # epochs per training run
# Prefer the GPU when one is visible, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### With Random seed: 137

In [None]:
# Sweep every (learning rate, scheduler on/off) pair with seed 137.
# Pairs are visited in the same order as two nested loops would give:
# for each learning rate, first without and then with the LR scheduler.
for lr, lr_scheduler in [(rate, use_sched) for rate in lr_list for use_sched in (False, True)]:

    # Re-seed before the model is built so each run starts from the same state
    set_seed(137)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Set padding token to EOS token
    model.config.pad_token_id = tokenizer.eos_token_id

    trainer_config = dict(
        optimizer=AdamW,
        num_epochs=num_epochs,
        learning_rate=lr,
        lr_scheduler=lr_scheduler,
    )

    trainer = sentence_similarity_trainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        device=device,
        trainer_config=trainer_config,
    )

    # Banner identifying the run, followed by the training loop itself
    print("=" * 20, f"{lr=} and {lr_scheduler = }", "=" * 20)
    trainer.train()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 2.096025940574592 --validation loss: 2.0981458086593476 -- validation accuracy 0.32107843137254904
Epoch 0 Step 50 -- training loss: 0.6075662261520336 --validation loss: 0.5871494488388884 -- validation accuracy 0.6838235294117647
Epoch 0 Step 100 -- training loss: 0.6230084758925541 --validation loss: 0.6189404784464368 -- validation accuracy 0.7009803921568627
Epoch 0 Step 150 -- training loss: 0.609856319226211 --validation loss: 0.6232486072124219 -- validation accuracy 0.7058823529411765
Epoch 0 Step 200 -- training loss: 0.5549501094293491 --validation loss: 0.5656997952975479 -- validation accuracy 0.7034313725490197
Epoch 0 Step 250 -- training loss: 0.580116110381088 --validation loss: 0.5907023829572341 -- validation accuracy 0.6985294117647058
Epoch 0 Step 300 -- training loss: 0.5401024531759727 --validation loss: 0.5467386473627651 -- validation accuracy 0.7156862745098039
Epoch 0 Step 350 -- training loss: 0.5209414809292957 --validation 

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 2 Step 458 -- training loss: 0.3355970421972358 --validation loss: 0.48012871718874167 -- validation accuracy 0.7794117647058824
The best accuracy was 0.7818627450980392 after step 450 of epoch 1.
Epoch 0 Step 0 -- training loss: 2.096025940574592 --validation loss: 2.0981458086593476 -- validation accuracy 0.32107843137254904
Epoch 0 Step 50 -- training loss: 0.6089085876487179 --validation loss: 0.5882904687348534 -- validation accuracy 0.6813725490196079
Epoch 0 Step 100 -- training loss: 0.6221720403413368 --validation loss: 0.6171437612935609 -- validation accuracy 0.7009803921568627
Epoch 0 Step 150 -- training loss: 0.60873581115197 --validation loss: 0.6202989273211535 -- validation accuracy 0.7009803921568627
Epoch 0 Step 200 -- training loss: 0.5572249415718132 --validation loss: 0.5664025609399758 -- validation accuracy 0.7034313725490197
Epoch 0 Step 250 -- training loss: 0.5839446057811022 --validation loss: 0.594105182909498 -- validation accuracy 0.7009803921568627

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 1.7515341689838564 --validation loss: 1.7445332735192542 -- validation accuracy 0.3161764705882353
Epoch 0 Step 50 -- training loss: 0.608549735255231 --validation loss: 0.616916735382641 -- validation accuracy 0.6764705882352942
Epoch 0 Step 100 -- training loss: 0.6389864011138093 --validation loss: 0.6549954393915102 -- validation accuracy 0.6887254901960784
Epoch 0 Step 150 -- training loss: 0.6004718259658689 --validation loss: 0.6073870048219082 -- validation accuracy 0.6887254901960784
Epoch 0 Step 200 -- training loss: 0.57142725379119 --validation loss: 0.5803293030635983 -- validation accuracy 0.6985294117647058
Epoch 0 Step 250 -- training loss: 0.5688868812681024 --validation loss: 0.5785399391955021 -- validation accuracy 0.6985294117647058
Epoch 0 Step 300 -- training loss: 0.5460404257304269 --validation loss: 0.5543616963367836 -- validation accuracy 0.7132352941176471
Epoch 0 Step 350 -- training loss: 0.580060755310495 --validation los

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 1.7515341689838564 --validation loss: 1.7445332735192542 -- validation accuracy 0.3161764705882353
Epoch 0 Step 50 -- training loss: 0.6213669430196674 --validation loss: 0.6136673501893586 -- validation accuracy 0.6740196078431373
Epoch 0 Step 100 -- training loss: 0.6243433563732633 --validation loss: 0.625625716120589 -- validation accuracy 0.6911764705882353
Epoch 0 Step 150 -- training loss: 0.6006905606304638 --validation loss: 0.6103566896681692 -- validation accuracy 0.6887254901960784
Epoch 0 Step 200 -- training loss: 0.5689069581317486 --validation loss: 0.5832686903429967 -- validation accuracy 0.6985294117647058
Epoch 0 Step 250 -- training loss: 0.5832340672839441 --validation loss: 0.5985899842252919 -- validation accuracy 0.6911764705882353
Epoch 0 Step 300 -- training loss: 0.5512469333955665 --validation loss: 0.5675468205236921 -- validation accuracy 0.7034313725490197
Epoch 0 Step 350 -- training loss: 0.5463026744802533 --validation

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 1.4354786700477786 --validation loss: 1.4263130099165673 -- validation accuracy 0.33088235294117646
Epoch 0 Step 50 -- training loss: 0.6022826246186799 --validation loss: 0.6074196252168393 -- validation accuracy 0.6862745098039216
Epoch 0 Step 100 -- training loss: 0.5875854487967127 --validation loss: 0.5901244615807253 -- validation accuracy 0.678921568627451
Epoch 0 Step 150 -- training loss: 0.5495845113184976 --validation loss: 0.5695550219685424 -- validation accuracy 0.7034313725490197
Epoch 0 Step 200 -- training loss: 0.5851028784622554 --validation loss: 0.6023961378663194 -- validation accuracy 0.6936274509803921
Epoch 0 Step 250 -- training loss: 0.505504358405641 --validation loss: 0.5261382489812141 -- validation accuracy 0.7377450980392157
Epoch 0 Step 300 -- training loss: 0.4790444188388085 --validation loss: 0.510176428100642 -- validation accuracy 0.7475490196078431
Epoch 0 Step 350 -- training loss: 0.4833844631975253 --validation 

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 2 Step 458 -- training loss: 0.0833679464745106 --validation loss: 0.5015465018211627 -- validation accuracy 0.8063725490196079
The best accuracy was 0.8112745098039216 after step 150 of epoch 2.
Epoch 0 Step 0 -- training loss: 1.4354786700477786 --validation loss: 1.4263130099165673 -- validation accuracy 0.33088235294117646
Epoch 0 Step 50 -- training loss: 0.6030037818651054 --validation loss: 0.6080950393396265 -- validation accuracy 0.678921568627451
Epoch 0 Step 100 -- training loss: 0.5872716867975679 --validation loss: 0.5900714987633275 -- validation accuracy 0.6813725490196079
Epoch 0 Step 150 -- training loss: 0.553512653429264 --validation loss: 0.5731417478299609 -- validation accuracy 0.7034313725490197
Epoch 0 Step 200 -- training loss: 0.5832387178422059 --validation loss: 0.6001031577002769 -- validation accuracy 0.6887254901960784
Epoch 0 Step 250 -- training loss: 0.5129258544681379 --validation loss: 0.5330427394193762 -- validation accuracy 0.730392156862745

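**Observations**:

Experiments are numbered in the order they were run: experiments 1 and 2 use a learning rate of $1\times 10^{-5}$, experiments 3 and 4 use $3\times 10^{-5}$, and experiments 5 and 6 use $5\times 10^{-5}$. Within each pair, the first experiment uses a constant learning rate and the second uses the learning rate scheduler.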

In experiment 1, we observe overfitting after epoch 1. The best result in experiment 1 is at step 458 of epoch 1, where the validation accuracy is 77.7% and validation loss is 0.498.

On the other hand, we do not see overfitting in experiment 2, and the best result is at step 300 of epoch 2. The validation accuracy and validation loss are 77.9% and 0.497 at this step.

In experiment 3, we see some oscillations in validation loss in epoch 2, but overall validation loss goes down. The best result is at step 400 of epoch 2, where the validation accuracy and validation loss are 80.1% and 0.453, respectively.

In experiment 4, we do not see any signs of overfitting. The best result is at step 400 of epoch 2, where the validation accuracy is 77.9% and validation loss is 0.484.

We observe overfitting in epoch 2 in experiment 5. The best result is at step 458 of epoch 1. The validation accuracy and validation loss at this step are 78.7% and 0.439, respectively.

In experiment 6, we observe overfitting after step 300 of epoch 1. The validation accuracy at this step is 78.7% whereas the validation loss is 0.458.

Therefore, we conclude that the best results are achieved with experiment 5.

### With Random seed: 23

In [None]:
# Sweep every (learning rate, scheduler on/off) pair with seed 23.
# Pairs are visited in the same order as two nested loops would give:
# for each learning rate, first without and then with the LR scheduler.
for lr, lr_scheduler in [(rate, use_sched) for rate in lr_list for use_sched in (False, True)]:

    # Re-seed before the model is built so each run starts from the same state
    set_seed(23)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Set padding token to EOS token
    model.config.pad_token_id = tokenizer.eos_token_id

    trainer_config = dict(
        optimizer=AdamW,
        num_epochs=num_epochs,
        learning_rate=lr,
        lr_scheduler=lr_scheduler,
    )

    trainer = sentence_similarity_trainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        device=device,
        trainer_config=trainer_config,
    )

    # Banner identifying the run, followed by the training loop itself
    print("=" * 20, f"{lr=} and {lr_scheduler = }", "=" * 20)
    trainer.train()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 0.8227063777627249 --validation loss: 0.7973103222309375 -- validation accuracy 0.6666666666666666
Epoch 0 Step 50 -- training loss: 0.6336311964817296 --validation loss: 0.6284563255076315 -- validation accuracy 0.6544117647058824
Epoch 0 Step 100 -- training loss: 0.5854898885314501 --validation loss: 0.5845217669711393 -- validation accuracy 0.6985294117647058
Epoch 0 Step 150 -- training loss: 0.6027838501730256 --validation loss: 0.5974431528764612 -- validation accuracy 0.7058823529411765
Epoch 0 Step 200 -- training loss: 0.5556332859857929 --validation loss: 0.5560396164655685 -- validation accuracy 0.7156862745098039
Epoch 0 Step 250 -- training loss: 0.554062404728663 --validation loss: 0.5548606333779353 -- validation accuracy 0.7254901960784313
Epoch 0 Step 300 -- training loss: 0.52816824624741 --validation loss: 0.5331571143047482 -- validation accuracy 0.7303921568627451
Epoch 0 Step 350 -- training loss: 0.5152218327867699 --validation l

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 2 Step 458 -- training loss: 0.31397514985082975 --validation loss: 0.4372760375925139 -- validation accuracy 0.7941176470588235
The best accuracy was 0.7941176470588235 after step 400 of epoch 1.
Epoch 0 Step 0 -- training loss: 0.8227063777627249 --validation loss: 0.7973103222309375 -- validation accuracy 0.6666666666666666
Epoch 0 Step 50 -- training loss: 0.6342576138334337 --validation loss: 0.6293012067383411 -- validation accuracy 0.6544117647058824
Epoch 0 Step 100 -- training loss: 0.5869744108393301 --validation loss: 0.585925924427369 -- validation accuracy 0.7009803921568627
Epoch 0 Step 150 -- training loss: 0.5999328817092775 --validation loss: 0.5944452411403843 -- validation accuracy 0.7058823529411765
Epoch 0 Step 200 -- training loss: 0.5569603964524071 --validation loss: 0.5564296768576491 -- validation accuracy 0.7132352941176471
Epoch 0 Step 250 -- training loss: 0.5567826228074259 --validation loss: 0.5567816075156716 -- validation accuracy 0.72058823529411

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 0.8408855500921185 --validation loss: 0.8118688424720484 -- validation accuracy 0.678921568627451
Epoch 0 Step 50 -- training loss: 0.5870723023134119 --validation loss: 0.5790512041718352 -- validation accuracy 0.6887254901960784
Epoch 0 Step 100 -- training loss: 0.553913808823411 --validation loss: 0.5553118563165852 -- validation accuracy 0.6911764705882353
Epoch 0 Step 150 -- training loss: 0.5964009175340854 --validation loss: 0.5916385919440026 -- validation accuracy 0.7034313725490197
Epoch 0 Step 200 -- training loss: 0.5223947916674978 --validation loss: 0.5306053395364799 -- validation accuracy 0.7083333333333334
Epoch 0 Step 250 -- training loss: 0.5369174312272622 --validation loss: 0.5524335906786078 -- validation accuracy 0.7058823529411765
Epoch 0 Step 300 -- training loss: 0.5132735655977835 --validation loss: 0.5310401302926681 -- validation accuracy 0.7475490196078431
Epoch 0 Step 350 -- training loss: 0.49914904260167886 --validation

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 2 Step 458 -- training loss: 0.16688949753548585 --validation loss: 0.5072091576807639 -- validation accuracy 0.7990196078431373
The best accuracy was 0.8186274509803921 after step 50 of epoch 2.
Epoch 0 Step 0 -- training loss: 0.8408855500921185 --validation loss: 0.8118688424720484 -- validation accuracy 0.678921568627451
Epoch 0 Step 50 -- training loss: 0.5880549231905303 --validation loss: 0.5801327339574402 -- validation accuracy 0.6887254901960784
Epoch 0 Step 100 -- training loss: 0.5547912223081962 --validation loss: 0.5556769674899531 -- validation accuracy 0.6887254901960784
Epoch 0 Step 150 -- training loss: 0.5942247025912104 --validation loss: 0.5884826989734874 -- validation accuracy 0.6985294117647058
Epoch 0 Step 200 -- training loss: 0.5202980521977077 --validation loss: 0.5292255299932817 -- validation accuracy 0.7132352941176471
Epoch 0 Step 250 -- training loss: 0.5379114378901089 --validation loss: 0.5510509031660417 -- validation accuracy 0.705882352941176

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 2 Step 458 -- training loss: 0.24137903742232483 --validation loss: 0.46728120189087063 -- validation accuracy 0.7941176470588235
The best accuracy was 0.8186274509803921 after step 400 of epoch 2.
Epoch 0 Step 0 -- training loss: 0.8624716038797416 --validation loss: 0.8309214468680176 -- validation accuracy 0.6838235294117647
Epoch 0 Step 50 -- training loss: 0.5807699891606707 --validation loss: 0.5738563005830727 -- validation accuracy 0.7058823529411765
Epoch 0 Step 100 -- training loss: 0.5423233825591655 --validation loss: 0.5493259324746973 -- validation accuracy 0.7058823529411765
Epoch 0 Step 150 -- training loss: 0.6005775729977487 --validation loss: 0.6038244904256335 -- validation accuracy 0.7034313725490197
Epoch 0 Step 200 -- training loss: 0.5022916168883476 --validation loss: 0.5206058416880813 -- validation accuracy 0.7401960784313726
Epoch 0 Step 250 -- training loss: 0.5087184330087342 --validation loss: 0.5390056418437584 -- validation accuracy 0.720588235294

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 0.8624716038797416 --validation loss: 0.8309214468680176 -- validation accuracy 0.6838235294117647
Epoch 0 Step 50 -- training loss: 0.5809491329333362 --validation loss: 0.5737693695461049 -- validation accuracy 0.7058823529411765
Epoch 0 Step 100 -- training loss: 0.5432575709939262 --validation loss: 0.5501038379528943 -- validation accuracy 0.7083333333333334
Epoch 0 Step 150 -- training loss: 0.5962569055863715 --validation loss: 0.6011319741898892 -- validation accuracy 0.7034313725490197
Epoch 0 Step 200 -- training loss: 0.5035469085952036 --validation loss: 0.5194860971441456 -- validation accuracy 0.7352941176470589
Epoch 0 Step 250 -- training loss: 0.5113799329481873 --validation loss: 0.5387416178104925 -- validation accuracy 0.7058823529411765
Epoch 0 Step 300 -- training loss: 0.5013098988787541 --validation loss: 0.5233840650203181 -- validation accuracy 0.7524509803921569
Epoch 0 Step 350 -- training loss: 0.4775259284087516 --validatio

**Observations**:

We do not see overfitting in experiments 1 and 2, but we see signs of overfitting in epoch 2 of experiments 3–6.

The best result for experiment 1 is at step 350 of epoch 2, where the validation accuracy and validation loss are 79.4% and 0.437, respectively.

The best result for experiment 2 is at step 250 of epoch 2, where the validation accuracy and validation loss are 77.7% and 0.468, respectively.

In experiment 3, we see some oscillations in both train and validation losses in epoch 1, but the losses overall go down in epoch 1. The best result is at step 458 of epoch 1, where the validation accuracy and validation loss are 78.2% and 0.437, respectively.

The best result for experiment 4 is at step 50 of epoch 2, where the validation accuracy and validation loss are 81.6% and 0.430, respectively.

In experiment 5, the best result is at step 458 of epoch 1. The validation accuracy at this step is 81.1% whereas the validation loss is 0.408.

The best result for experiment 6 is at step 400 of epoch 1, where the validation accuracy and validation loss are 79.4% and 0.438, respectively.

Based on these observations, the best results are with experiment 5.

### With Random seed: 114

In [None]:
# Sweep every (learning rate, scheduler on/off) pair with seed 114.
# Pairs are visited in the same order as two nested loops would give:
# for each learning rate, first without and then with the LR scheduler.
for lr, lr_scheduler in [(rate, use_sched) for rate in lr_list for use_sched in (False, True)]:

    # Re-seed before the model is built so each run starts from the same state
    set_seed(114)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Set padding token to EOS token
    model.config.pad_token_id = tokenizer.eos_token_id

    trainer_config = dict(
        optimizer=AdamW,
        num_epochs=num_epochs,
        learning_rate=lr,
        lr_scheduler=lr_scheduler,
    )

    trainer = sentence_similarity_trainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        device=device,
        trainer_config=trainer_config,
    )

    # Banner identifying the run, followed by the training loop itself
    print("=" * 20, f"{lr=} and {lr_scheduler = }", "=" * 20)
    trainer.train()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 1.0465804818116762 --validation loss: 1.0321396142974788 -- validation accuracy 0.678921568627451
Epoch 0 Step 50 -- training loss: 0.6746570381918244 --validation loss: 0.6906085034795836 -- validation accuracy 0.6617647058823529
Epoch 0 Step 100 -- training loss: 0.6248471182740591 --validation loss: 0.6325051953979567 -- validation accuracy 0.6691176470588235
Epoch 0 Step 150 -- training loss: 0.5992267095445288 --validation loss: 0.6041843645712909 -- validation accuracy 0.6911764705882353
Epoch 0 Step 200 -- training loss: 0.5888089087339268 --validation loss: 0.5954521497090658 -- validation accuracy 0.7083333333333334
Epoch 0 Step 250 -- training loss: 0.5940247537783289 --validation loss: 0.5971872867906795 -- validation accuracy 0.7009803921568627
Epoch 0 Step 300 -- training loss: 0.5549838284521581 --validation loss: 0.5638869685285232 -- validation accuracy 0.7303921568627451
Epoch 0 Step 350 -- training loss: 0.576496508873366 --validation 

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 1.0465804818116762 --validation loss: 1.0321396142974788 -- validation accuracy 0.678921568627451
Epoch 0 Step 50 -- training loss: 0.6751514139842884 --validation loss: 0.6914475081013698 -- validation accuracy 0.6666666666666666
Epoch 0 Step 100 -- training loss: 0.626175361293853 --validation loss: 0.6342649097536125 -- validation accuracy 0.6666666666666666
Epoch 0 Step 150 -- training loss: 0.6019024397358136 --validation loss: 0.6075995517712013 -- validation accuracy 0.6862745098039216
Epoch 0 Step 200 -- training loss: 0.5885519529479781 --validation loss: 0.594912561715818 -- validation accuracy 0.7083333333333334
Epoch 0 Step 250 -- training loss: 0.5857074765857788 --validation loss: 0.5894725220460518 -- validation accuracy 0.7009803921568627
Epoch 0 Step 300 -- training loss: 0.5651204162043943 --validation loss: 0.5736875563275581 -- validation accuracy 0.7083333333333334
Epoch 0 Step 350 -- training loss: 0.5652372777656792 --validation l

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 0.9949173327202631 --validation loss: 0.9815634787082672 -- validation accuracy 0.6838235294117647
Epoch 0 Step 50 -- training loss: 0.6722144740339457 --validation loss: 0.6726028434201783 -- validation accuracy 0.6936274509803921
Epoch 0 Step 100 -- training loss: 0.6016157772676098 --validation loss: 0.6071371044598374 -- validation accuracy 0.6936274509803921
Epoch 0 Step 150 -- training loss: 0.583191493270444 --validation loss: 0.587233220245324 -- validation accuracy 0.7132352941176471
Epoch 0 Step 200 -- training loss: 0.605412658133538 --validation loss: 0.6073397710627201 -- validation accuracy 0.6985294117647058
Epoch 0 Step 250 -- training loss: 0.6069199190615049 --validation loss: 0.6082200942670598 -- validation accuracy 0.6936274509803921
Epoch 0 Step 300 -- training loss: 0.5320766410406899 --validation loss: 0.5462948627331677 -- validation accuracy 0.7156862745098039
Epoch 0 Step 350 -- training loss: 0.6216035968987251 --validation l

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 2 Step 458 -- training loss: 0.22317763462191964 --validation loss: 0.5035581742139423 -- validation accuracy 0.7867647058823529
The best accuracy was 0.8259803921568627 after step 100 of epoch 2.
Epoch 0 Step 0 -- training loss: 0.9949173327202631 --validation loss: 0.9815634787082672 -- validation accuracy 0.6838235294117647
Epoch 0 Step 50 -- training loss: 0.669341016244265 --validation loss: 0.6700909774677426 -- validation accuracy 0.6936274509803921
Epoch 0 Step 100 -- training loss: 0.6021291036686347 --validation loss: 0.60791508473602 -- validation accuracy 0.6911764705882353
Epoch 0 Step 150 -- training loss: 0.5862940742039733 --validation loss: 0.590306865234001 -- validation accuracy 0.7132352941176471
Epoch 0 Step 200 -- training loss: 0.5974929044449252 --validation loss: 0.6004572209774279 -- validation accuracy 0.7009803921568627
Epoch 0 Step 250 -- training loss: 0.6009607055997537 --validation loss: 0.6016820601972879 -- validation accuracy 0.6936274509803921


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 0.9480582174879534 --validation loss: 0.9370095849621529 -- validation accuracy 0.6838235294117647
Epoch 0 Step 50 -- training loss: 0.7167714424999451 --validation loss: 0.7094836449798416 -- validation accuracy 0.6887254901960784
Epoch 0 Step 100 -- training loss: 0.5970809048686931 --validation loss: 0.5961940317761665 -- validation accuracy 0.6936274509803921
Epoch 0 Step 150 -- training loss: 0.5756784478823344 --validation loss: 0.5841832733621785 -- validation accuracy 0.7009803921568627
Epoch 0 Step 200 -- training loss: 0.5966290446538032 --validation loss: 0.5982822311275146 -- validation accuracy 0.7009803921568627
Epoch 0 Step 250 -- training loss: 0.7042008689351071 --validation loss: 0.7079208710906553 -- validation accuracy 0.6862745098039216
Epoch 0 Step 300 -- training loss: 0.5505814414658058 --validation loss: 0.5656761941956538 -- validation accuracy 0.7132352941176471
Epoch 0 Step 350 -- training loss: 0.6262172723680021 --validatio

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 0.9480582174879534 --validation loss: 0.9370095849621529 -- validation accuracy 0.6838235294117647
Epoch 0 Step 50 -- training loss: 0.7159055888587873 --validation loss: 0.7087677179598341 -- validation accuracy 0.6887254901960784
Epoch 0 Step 100 -- training loss: 0.5955916290579278 --validation loss: 0.5953427670048732 -- validation accuracy 0.6936274509803921
Epoch 0 Step 150 -- training loss: 0.5809506234390284 --validation loss: 0.5897534489631653 -- validation accuracy 0.6887254901960784
Epoch 0 Step 200 -- training loss: 0.5864817282927581 --validation loss: 0.5925035026727938 -- validation accuracy 0.7034313725490197
Epoch 0 Step 250 -- training loss: 0.6546588052624192 --validation loss: 0.6614055370583254 -- validation accuracy 0.6887254901960784
Epoch 0 Step 300 -- training loss: 0.5515893860060664 --validation loss: 0.5712361914270064 -- validation accuracy 0.7132352941176471
Epoch 0 Step 350 -- training loss: 0.6290121430080701 --validatio

**Observations**:

In experiment 1, we do not observe overfitting. The best result is at step 350 of epoch 2, where the validation accuracy is 77.5% and validation loss is 0.470.

We do not see overfitting in experiment 2 either. The best result is at step 400 of epoch 2. The validation accuracy and validation loss are 75.5% and 0.497 at this step.

In experiment 3, we see signs of overfitting in later steps of epoch 2. We also see some oscillations in both train and validation losses, but overall losses go down. The best result is at step 100 of epoch 2, where the validation accuracy and validation loss are 82.6% and 0.430, respectively.

In experiment 4, we see some oscillations in both train and validation losses, but the losses go down. We do not observe any signs of overfitting. The best result is at step 400 of epoch 2, where the validation accuracy is 80.4% and validation loss is 0.451.

In experiment 5, we observe large oscillations in validation loss in epoch 2 and hence decide to ignore this epoch. There are also some oscillations in both train and validation losses in epoch 1, but they are relatively small and the losses overall go down. The best result for this experiment is at step 400 of epoch 1. The validation accuracy and validation loss at this step are 75.5% and 0.483, respectively.

In experiment 6, we observe overfitting after epoch 1. The best result is at step 450 of epoch 1. The validation accuracy at this step is 80.4% whereas the validation loss is 0.465.

Therefore, we conclude that the best results are achieved with experiment 3.

# Conclusion

We conclude that the experiment with a random seed of 23 with a constant learning rate of $5\times 10^{-5}$ leads to the best result (validation loss of 0.408) if we stop after step 458 of epoch 1.

In [5]:
# Repeating the best performing experiment
# with the stopping condition to get the
# model weights and to calculate test accuracy.

# Hyperparameters of the selected run (constant learning rate, no scheduler)
lr = 5e-5
lr_scheduler = False
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Point at which training is asked to stop; passed to the trainer below
stopping_condition = dict(step=458, epoch=1)

# Same seed as the selected sweep so the run can be reproduced
set_seed(23)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Set padding token to EOS token
model.config.pad_token_id = tokenizer.eos_token_id

trainer_config = dict(
    optimizer=AdamW,
    num_epochs=num_epochs,
    learning_rate=lr,
    lr_scheduler=lr_scheduler,
)

trainer = sentence_similarity_trainer(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    trainer_config=trainer_config,
    stopping_condition=stopping_condition,
)

# Running the training loops
trainer.train()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Step 0 -- training loss: 0.8624716038797416 --validation loss: 0.8309214468680176 -- validation accuracy 0.6838235294117647
Epoch 0 Step 50 -- training loss: 0.5807699891606707 --validation loss: 0.5738563005830727 -- validation accuracy 0.7058823529411765
Epoch 0 Step 100 -- training loss: 0.5423233825591655 --validation loss: 0.5493259324746973 -- validation accuracy 0.7058823529411765
Epoch 0 Step 150 -- training loss: 0.6005775729977487 --validation loss: 0.6038244904256335 -- validation accuracy 0.7034313725490197
Epoch 0 Step 200 -- training loss: 0.5022916168883476 --validation loss: 0.5206058416880813 -- validation accuracy 0.7401960784313726
Epoch 0 Step 250 -- training loss: 0.5087184330087342 --validation loss: 0.5390056418437584 -- validation accuracy 0.7205882352941176
Epoch 0 Step 300 -- training loss: 0.5274269219836898 --validation loss: 0.5483598083842034 -- validation accuracy 0.7549019607843137
Epoch 0 Step 350 -- training loss: 0.482880339821829 --validation

In [6]:
'''
Calculating the test loss
'''

def test_evaluation():

    trainer.model.eval()
    with torch.no_grad():

      test_losses = []
      test_accuracies = []

      for i, batch in enumerate(test_dataloader):

        # Getting the batch loss
        batch = {k: v.to(trainer.device) for k, v in batch.items()}
        outputs = trainer.model(**batch)
        test_losses.append(outputs.loss.item())
        # Getting the batch accuracy
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        test_accuracy = (predictions == batch['labels']).float().mean()
        test_accuracies.append(test_accuracy.item())

      avg_test_loss = sum(test_losses) / len(test_losses)
      avg_test_accuracy = sum(test_accuracies) / len(test_accuracies)

    trainer.model.train()

    return avg_test_loss, avg_test_accuracy

# Evaluate on the test dataloader and report both metrics, one per line
test_loss, test_acc = test_evaluation()
print(f"{test_loss=}", f"{test_acc=}", sep="\n")

test_loss=0.4346150653091846
test_acc=0.8003472222222222


In [7]:
'''
Uploading the weights to the Hugging Face Hub
'''

# Local checkpoint name and destination repository on the Hub
file_name = "model_weights.pth"
repo_id = "mudassirmoosa/sentence-similarity-transformer-comparison"

# Save the trained model to the local checkpoint file
trainer.save_model(file_name)

# Authenticate with the Hugging Face Hub using the token kept in Colab secrets
hf_token = userdata.get('hf_TOKEN')
login(token=hf_token)
api = HfApi()

# Push the checkpoint to the repository under a model-specific file name
api.upload_file(
    path_or_fileobj=file_name,
    path_in_repo="GPT2_for_sentence_similarity.pth",
    repo_id=repo_id,
    token=hf_token,
)


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  model_weights.pth                     :   0%|          |  552kB /  498MB            

CommitInfo(commit_url='https://huggingface.co/mudassirmoosa/sentence-similarity-transformer-comparison/commit/e9bdb38f09547072f4c3880cd349886dfe390feb', commit_message='Upload GPT2_for_sentence_similarity.pth with huggingface_hub', commit_description='', oid='e9bdb38f09547072f4c3880cd349886dfe390feb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mudassirmoosa/sentence-similarity-transformer-comparison', endpoint='https://huggingface.co', repo_type='model', repo_id='mudassirmoosa/sentence-similarity-transformer-comparison'), pr_revision=None, pr_num=None)