## Configuration and Setup


In [1]:
import os

# Point the Hugging Face cache at a local directory. This cell is placed before
# the cells that import the Hugging Face-backed project modules so the setting is
# already in the environment when those libraries load.
# An HF_HOME that is already configured (e.g. on another machine or in CI) is
# respected; the original local path is only used as a fallback when the
# variable is missing or empty.
if not os.environ.get('HF_HOME'):
    os.environ['HF_HOME'] = "C:/HF_CACHE/"

In [2]:
import torch

from cs324_project.datasets import GlueDatasetTask, load_glue_dataset_info
from cs324_project.models import ModelCheckpointName, load_classification_model, load_pretraining_model, load_tokenizer
from cs324_project.training import get_training_args_mlm, get_training_args_sc, get_trainer_mlm, get_trainer_sc, get_latest_checkpoint_path
from cs324_project.masking import get_whole_word_masking_data_collator, get_random_masking_data_collator

In [3]:
# Use the GPU when CUDA is usable in this process; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Experiment selection: the GLUE task to train on and the base checkpoint to
# start from. Every later cell reads these two names.
task = GlueDatasetTask.COLA
model_name = ModelCheckpointName.TINYBERT_HUAWEI

In [5]:
# Load the tokenizer for the selected checkpoint, then load the GLUE dataset
# info for `task` using that tokenizer (the tokenizer must exist first).
# NOTE(review): the recorded output of this cell warns "Asking to truncate to
# max_length but no maximum length is provided and the model has no predefined
# maximum length. Default to no truncation." -- so truncation is not applied.
# Presumably the tokenizer has no maximum length configured; confirm inside
# load_tokenizer / load_glue_dataset_info.
tokenizer = load_tokenizer(model_name)
dataset_info = load_glue_dataset_info(task, tokenizer)

Found cached dataset glue (C:/HF_CACHE/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-51f5b2c7a7ff1e58.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-02d0d32fa3df3a08.arrow


Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-f9d914923ed56f0f.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-ab84ee6f5284da84.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-63fe19e4ca4d356a.arrow


## Fine-tune with masked language modeling (whole-word masking)

In [6]:
# Load the base checkpoint with a masked-language-modeling head.
# The recorded output shows it is initialized as BertForMaskedLM and that the
# checkpoint's `fit_denses.*` and `cls.seq_relationship.*` weights are not used.
model_mlm = load_pretraining_model(model_name, dataset_info)

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertForMaskedLM: ['fit_denses.0.bias', 'fit_denses.2.weight', 'fit_denses.1.weight', 'fit_denses.3.bias', 'cls.seq_relationship.weight', 'fit_denses.3.weight', 'fit_denses.1.bias', 'fit_denses.4.weight', 'cls.seq_relationship.bias', 'fit_denses.4.bias', 'fit_denses.0.weight', 'fit_denses.2.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Masked-LM fine-tuning setup: training arguments for the chosen number of
# epochs, a whole-word-masking collator built from the tokenizer, and the
# trainer that ties them to the MLM model and the dataset.
mlm_num_epochs = 20
training_args_mlm = get_training_args_mlm(num_epochs=mlm_num_epochs)
data_collator = get_whole_word_masking_data_collator(tokenizer)
trainer_mlm = get_trainer_mlm(
    model=model_mlm,
    dataset_info=dataset_info,
    data_collator=data_collator,
    training_args=training_args_mlm,
)

Creating training arguments, model output dir: C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-19-2023 03-57-18 AM


In [8]:
# Run masked-LM fine-tuning. In the recorded run this took 10700 steps over
# 20 epochs (about 299 s); eval loss fell from 5.50 after epoch 1 to 4.08 after
# epoch 20, with the lowest recorded value (4.054) at epoch 19.
# The call is the cell's last expression, so the returned TrainOutput is displayed.
trainer_mlm.train()



  0%|          | 0/10700 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.2087, 'learning_rate': 1.9065420560747666e-05, 'epoch': 0.93}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 5.498152256011963, 'eval_runtime': 0.4398, 'eval_samples_per_second': 2371.431, 'eval_steps_per_second': 150.062, 'epoch': 1.0}
{'loss': 5.2714, 'learning_rate': 1.8130841121495328e-05, 'epoch': 1.87}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.917837619781494, 'eval_runtime': 0.4378, 'eval_samples_per_second': 2382.228, 'eval_steps_per_second': 150.745, 'epoch': 2.0}
{'loss': 4.9161, 'learning_rate': 1.7196261682242992e-05, 'epoch': 2.8}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.645493030548096, 'eval_runtime': 0.4478, 'eval_samples_per_second': 2329.163, 'eval_steps_per_second': 147.387, 'epoch': 3.0}
{'loss': 4.7168, 'learning_rate': 1.6261682242990654e-05, 'epoch': 3.74}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.590783596038818, 'eval_runtime': 0.4388, 'eval_samples_per_second': 2376.828, 'eval_steps_per_second': 150.403, 'epoch': 4.0}
{'loss': 4.5762, 'learning_rate': 1.5327102803738318e-05, 'epoch': 4.67}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.514120578765869, 'eval_runtime': 0.4398, 'eval_samples_per_second': 2371.424, 'eval_steps_per_second': 150.061, 'epoch': 5.0}
{'loss': 4.5133, 'learning_rate': 1.4392523364485981e-05, 'epoch': 5.61}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.273838996887207, 'eval_runtime': 0.5066, 'eval_samples_per_second': 2058.65, 'eval_steps_per_second': 130.269, 'epoch': 6.0}
{'loss': 4.3706, 'learning_rate': 1.3457943925233646e-05, 'epoch': 6.54}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.328298091888428, 'eval_runtime': 0.5007, 'eval_samples_per_second': 2083.25, 'eval_steps_per_second': 131.826, 'epoch': 7.0}
{'loss': 4.3086, 'learning_rate': 1.2523364485981309e-05, 'epoch': 7.48}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.262673377990723, 'eval_runtime': 0.4767, 'eval_samples_per_second': 2187.857, 'eval_steps_per_second': 138.445, 'epoch': 8.0}
{'loss': 4.2847, 'learning_rate': 1.1588785046728972e-05, 'epoch': 8.41}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.277965545654297, 'eval_runtime': 0.4727, 'eval_samples_per_second': 2206.323, 'eval_steps_per_second': 139.614, 'epoch': 9.0}
{'loss': 4.1918, 'learning_rate': 1.0654205607476635e-05, 'epoch': 9.35}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.30593204498291, 'eval_runtime': 0.5176, 'eval_samples_per_second': 2015.024, 'eval_steps_per_second': 127.509, 'epoch': 10.0}
{'loss': 4.1622, 'learning_rate': 9.7196261682243e-06, 'epoch': 10.28}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.267398357391357, 'eval_runtime': 0.4987, 'eval_samples_per_second': 2091.596, 'eval_steps_per_second': 132.354, 'epoch': 11.0}
{'loss': 4.1041, 'learning_rate': 8.785046728971963e-06, 'epoch': 11.21}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.25706148147583, 'eval_runtime': 0.4977, 'eval_samples_per_second': 2095.785, 'eval_steps_per_second': 132.619, 'epoch': 12.0}
{'loss': 4.0441, 'learning_rate': 7.850467289719627e-06, 'epoch': 12.15}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.179014205932617, 'eval_runtime': 0.5126, 'eval_samples_per_second': 2034.593, 'eval_steps_per_second': 128.747, 'epoch': 13.0}
{'loss': 4.0512, 'learning_rate': 6.91588785046729e-06, 'epoch': 13.08}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.39653205871582, 'eval_runtime': 0.4807, 'eval_samples_per_second': 2169.702, 'eval_steps_per_second': 137.297, 'epoch': 14.0}
{'loss': 4.0349, 'learning_rate': 5.981308411214953e-06, 'epoch': 14.02}
{'loss': 4.021, 'learning_rate': 5.046728971962617e-06, 'epoch': 14.95}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.191129684448242, 'eval_runtime': 0.5096, 'eval_samples_per_second': 2046.564, 'eval_steps_per_second': 129.505, 'epoch': 15.0}
{'loss': 4.0011, 'learning_rate': 4.112149532710281e-06, 'epoch': 15.89}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.138580799102783, 'eval_runtime': 0.4897, 'eval_samples_per_second': 2129.932, 'eval_steps_per_second': 134.78, 'epoch': 16.0}
{'loss': 3.9409, 'learning_rate': 3.177570093457944e-06, 'epoch': 16.82}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.122939109802246, 'eval_runtime': 0.4857, 'eval_samples_per_second': 2147.438, 'eval_steps_per_second': 135.888, 'epoch': 17.0}
{'loss': 3.9164, 'learning_rate': 2.2429906542056077e-06, 'epoch': 17.76}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.264461040496826, 'eval_runtime': 0.5296, 'eval_samples_per_second': 1969.484, 'eval_steps_per_second': 124.627, 'epoch': 18.0}
{'loss': 3.9495, 'learning_rate': 1.308411214953271e-06, 'epoch': 18.69}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.054150104522705, 'eval_runtime': 0.4588, 'eval_samples_per_second': 2273.47, 'eval_steps_per_second': 143.863, 'epoch': 19.0}
{'loss': 3.9317, 'learning_rate': 3.7383177570093457e-07, 'epoch': 19.63}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.079279899597168, 'eval_runtime': 0.4458, 'eval_samples_per_second': 2339.586, 'eval_steps_per_second': 148.047, 'epoch': 20.0}
{'train_runtime': 298.5195, 'train_samples_per_second': 572.894, 'train_steps_per_second': 35.844, 'train_loss': 4.3497840039975175, 'epoch': 20.0}


TrainOutput(global_step=10700, training_loss=4.3497840039975175, metrics={'train_runtime': 298.5195, 'train_samples_per_second': 572.894, 'train_steps_per_second': 35.844, 'train_loss': 4.3497840039975175, 'epoch': 20.0})

In [9]:
# Evaluate the fine-tuned MLM model on the evaluation split.
# NOTE(review): the recorded eval_loss here (4.0813) differs slightly from the
# epoch-20 evaluation logged during training (4.0793). Presumably the masking
# collator re-samples masked positions on every pass -- confirm, and fix a seed
# if comparable numbers are needed.
trainer_mlm.evaluate()

  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 4.08126163482666,
 'eval_runtime': 0.5396,
 'eval_samples_per_second': 1933.074,
 'eval_steps_per_second': 122.323,
 'epoch': 20.0}

In [10]:
# Resolve the most recent checkpoint written under the MLM training output dir;
# in the recorded run the next cell loads `...\checkpoint-10700` (the final step).
# NOTE(review): "latest" is not necessarily "best" -- the recorded eval loss was
# lower at epoch 19 (4.054) than at epoch 20 (4.079). Verify this is intended.
checkpoint_path = get_latest_checkpoint_path(training_args_mlm)

## Fine-tune on the sequence classification task (CoLA)

In [11]:
# Load the MLM-fine-tuned checkpoint with a sequence-classification head.
# The recorded output shows it is initialized as BertForSequenceClassification
# and that the MLM head weights (`cls.predictions.*`) are not used.
# NOTE(review): the recorded output is cut off after "Some weights of BertForSequ";
# presumably the classification head is newly initialized -- confirm.
model_sc = load_classification_model(checkpoint_path, dataset_info)

Some weights of the model checkpoint at C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-19-2023 03-57-18 AM\checkpoint-10700 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequ

In [12]:
# Sequence-classification setup: task-specific training arguments for the
# chosen number of epochs, then the trainer for the classification model.
sc_num_epochs = 1
training_args_sc = get_training_args_sc(task, num_epochs=sc_num_epochs)
trainer_sc = get_trainer_sc(
    model=model_sc,
    training_args=training_args_sc,
    dataset_info=dataset_info,
)

Creating training arguments, model output dir: C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\sc\Model 03-19-2023 04-02-19 AM


In [13]:
# Fine-tune the classifier. The recorded run covers 535 steps (1 epoch, ~15 s).
# NOTE(review): the end-of-epoch evaluation in the recorded output reports
# eval_matthews_correlation = 0.0 with eval_loss ~0.614; one epoch may be too
# short for this model to learn a non-trivial decision boundary -- confirm.
# The call is the cell's last expression, so the returned TrainOutput is displayed.
trainer_sc.train()



  0%|          | 0/535 [00:00<?, ?it/s]

{'loss': 0.609, 'learning_rate': 1.308411214953271e-06, 'epoch': 0.93}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.6142383217811584, 'eval_matthews_correlation': 0.0, 'eval_runtime': 0.4977, 'eval_samples_per_second': 2095.784, 'eval_steps_per_second': 132.619, 'epoch': 1.0}
{'train_runtime': 15.4083, 'train_samples_per_second': 554.96, 'train_steps_per_second': 34.721, 'train_loss': 0.6066856241671839, 'epoch': 1.0}


TrainOutput(global_step=535, training_loss=0.6066856241671839, metrics={'train_runtime': 15.4083, 'train_samples_per_second': 554.96, 'train_steps_per_second': 34.721, 'train_loss': 0.6066856241671839, 'epoch': 1.0})

In [14]:
# Evaluate the fine-tuned classifier; the recorded output reports eval_loss and
# eval_matthews_correlation for the evaluation split.
# NOTE(review): the recorded Matthews correlation is exactly 0.0, which usually
# indicates the model predicts a single class for every example -- inspect the
# predictions before drawing conclusions from this run.
trainer_sc.evaluate()

  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.6142383217811584,
 'eval_matthews_correlation': 0.0,
 'eval_runtime': 0.4348,
 'eval_samples_per_second': 2398.607,
 'eval_steps_per_second': 151.781,
 'epoch': 1.0}