## Configuration and Setup


In [1]:
import os

# Point the Hugging Face cache at a dedicated directory. `setdefault` keeps any
# HF_HOME that is already exported, so the notebook also runs on machines where
# "C:/HF_CACHE/" does not exist; without an override the behaviour is unchanged.
# Kept ahead of the project imports: presumably they pull in the Hugging Face
# libraries, which resolve their cache location when imported -- TODO confirm.
os.environ.setdefault('HF_HOME', "C:/HF_CACHE/")

In [2]:
import torch

from cs324_project.datasets import GlueDatasetTask, load_glue_dataset_info
from cs324_project.models import ModelCheckpointName, load_classification_model, load_pretraining_model, load_tokenizer
from cs324_project.masking import (
    get_training_args_mlm, get_trainer_mlm, RandomMaskingConfig, WholeWordMaskingConfig, TyphoonMaskingConfig)
from cs324_project.classification import get_training_args_sc, get_trainer_sc

In [3]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
# NOTE(review): `device` is not referenced by any cell visible in this notebook.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Experiment selection: the GLUE task to train/evaluate on and the pretrained
# checkpoint every later stage starts from.
task = GlueDatasetTask.MRPC
model_name = ModelCheckpointName.TINYBERT_HUAWEI

In [6]:
tokenizer = load_tokenizer(model_name)

# Running this cell logged "Asking to truncate to max_length but no maximum
# length is provided and the model has no predefined maximum length. Default to
# no truncation": the tokenizer carries no usable length limit, so truncation
# is silently disabled while the dataset is tokenized. Cap the limit so
# over-long inputs are truncated instead of being passed through at full length.
# NOTE(review): assumes `load_tokenizer` returns a Hugging Face tokenizer (the
# training log reports BertTokenizerFast) and that 512 matches the checkpoint's
# `max_position_embeddings` -- confirm both.
max_seq_length = 512
if tokenizer.model_max_length > max_seq_length:
    tokenizer.model_max_length = max_seq_length

# Tokenize the splits of the selected GLUE task with the (now length-capped) tokenizer.
dataset_info = load_glue_dataset_info(task, tokenizer)

Found cached dataset glue (C:/HF_CACHE/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

## Fine-tune with masking

In [7]:
# Load the selected checkpoint for the masking stage; the load log reports the
# model is instantiated as BertForMaskedLM, with the checkpoint's `fit_denses.*`
# and `cls.seq_relationship.*` weights left unused.
model_mlm = load_pretraining_model(model_name, dataset_info)

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertForMaskedLM: ['fit_denses.2.weight', 'cls.seq_relationship.bias', 'fit_denses.4.weight', 'fit_denses.3.bias', 'fit_denses.1.weight', 'fit_denses.2.bias', 'fit_denses.0.bias', 'fit_denses.1.bias', 'fit_denses.3.weight', 'cls.seq_relationship.weight', 'fit_denses.0.weight', 'fit_denses.4.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Assemble the masked-language-modelling trainer. TyphoonMaskingConfig selects
# the masking strategy; RandomMaskingConfig and WholeWordMaskingConfig are
# imported as well, presumably as alternatives -- verify in cs324_project.masking.
training_args_mlm = get_training_args_mlm(num_epochs=100, masking_config=TyphoonMaskingConfig())
trainer_mlm = get_trainer_mlm(model=model_mlm, mlm_args=training_args_mlm, dataset_info=dataset_info)

Creating training arguments, model output dir: C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-21-2023 11-22-37 AM


In [9]:
# Run the masked-language-modelling training loop. The per-epoch training and
# validation loss are logged, and the returned TrainOutput is the cell result.
trainer_mlm.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,6.4314,5.714137
2,5.2108,4.945677
3,4.7062,4.574192
4,4.3631,4.323779
5,4.1378,4.18539
6,3.9537,3.983167
7,3.7942,3.92047
8,3.6386,3.790572
9,3.5788,3.812777
10,3.4446,3.732371


TrainOutput(global_step=23000, training_loss=2.2991143931513247, metrics={'train_runtime': 1431.9086, 'train_samples_per_second': 256.162, 'train_steps_per_second': 16.062, 'total_flos': 786121535232432.0, 'train_loss': 2.2991143931513247, 'epoch': 100.0})

In [10]:
# Evaluate the model held by the MLM trainer after training; returns a dict of
# metrics (eval_loss, runtime/throughput figures, epoch).
trainer_mlm.evaluate()

{'eval_loss': 3.381281852722168,
 'eval_runtime': 0.4757,
 'eval_samples_per_second': 857.739,
 'eval_steps_per_second': 54.66,
 'epoch': 100.0}

In [11]:
# Show the checkpoint directory the MLM trainer recorded as best; the
# classification stage loads its starting weights from this path.
print(trainer_mlm.state.best_model_checkpoint)

C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-21-2023 11-22-37 AM\checkpoint-11270


## Fine-tune on sequence classification task

In [12]:
# Start the classifier from the best MLM checkpoint rather than from the
# original pretrained weights, so the masking stage carries over.
best_mlm_checkpoint = trainer_mlm.state.best_model_checkpoint
if best_mlm_checkpoint is None:
    # The trainer state holds no best checkpoint when training has not run or
    # best-model tracking is off; fail here with a clear message instead of
    # passing None into the model loader.
    raise RuntimeError(
        "trainer_mlm recorded no best checkpoint; run the MLM training cell "
        "with best-model tracking enabled before loading the classifier.")
model_sc = load_classification_model(best_mlm_checkpoint, dataset_info)

Some weights of the model checkpoint at C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-21-2023 11-22-37 AM\checkpoint-11270 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequ

In [13]:
# Assemble the sequence-classification trainer for the selected GLUE task,
# wrapping the classifier that was initialised from the best MLM checkpoint.
training_args_sc = get_training_args_sc(task, num_epochs=100)
trainer_sc = get_trainer_sc(training_args=training_args_sc, model=model_sc, dataset_info=dataset_info)

Creating training arguments, model output dir: C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\sc\Model 03-21-2023 11-46-31 AM


In [14]:
# Run the sequence-classification training loop. Training/validation loss,
# accuracy and F1 are logged per epoch; the returned TrainOutput is the cell result.
trainer_sc.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6656,0.631107,0.683824,0.812227
2,0.6272,0.608966,0.683824,0.812227
3,0.6129,0.60026,0.691176,0.815789
4,0.6034,0.593078,0.696078,0.818182
5,0.5954,0.586084,0.698529,0.818316
6,0.5864,0.578768,0.70098,0.819527
7,0.5797,0.570612,0.698529,0.817778
8,0.569,0.562677,0.703431,0.818591
9,0.5604,0.554392,0.715686,0.824242
10,0.5515,0.546068,0.715686,0.823708


TrainOutput(global_step=23000, training_loss=0.39363236800484036, metrics={'train_runtime': 1003.421, 'train_samples_per_second': 365.549, 'train_steps_per_second': 22.922, 'total_flos': 781014765977712.0, 'train_loss': 0.39363236800484036, 'epoch': 100.0})

In [15]:
# Evaluate the model held by the classification trainer after training; returns
# a dict of metrics including eval_loss, eval_accuracy and eval_f1.
trainer_sc.evaluate()

{'eval_loss': 0.4879816174507141,
 'eval_accuracy': 0.7867647058823529,
 'eval_f1': 0.8492201039861352,
 'eval_runtime': 0.5086,
 'eval_samples_per_second': 802.149,
 'eval_steps_per_second': 51.117,
 'epoch': 100.0}

In [16]:
# Show the checkpoint directory the classification trainer recorded as best.
print(trainer_sc.state.best_model_checkpoint)

C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\sc\Model 03-21-2023 11-46-31 AM\checkpoint-6440
