## Configuration and Setup


In [1]:
import os

# Point the Hugging Face cache at a local directory. The Hugging Face libraries
# read HF_HOME when they are imported, so this cell has to run before any cell
# that imports them (directly or through the project package).
# setdefault keeps a cache location that is already configured in the
# environment and only falls back to the original hard-coded path when the
# variable is unset, so the notebook no longer clobbers another machine's setup.
os.environ.setdefault('HF_HOME', "C:/HF_CACHE/")

In [2]:
import torch

from cs324_project.datasets import GlueDatasetTask, load_glue_dataset_info
from cs324_project.models import ModelCheckpointName, load_classification_model, load_pretraining_model, load_tokenizer
from cs324_project.masking import (
    get_training_args_mlm, get_trainer_mlm, RandomMaskingConfig, WholeWordMaskingConfig, TyphoonMaskingConfig)
from cs324_project.classification import get_training_args_sc, get_trainer_sc

In [3]:
# Choose the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
# NOTE(review): no later cell shown in this notebook references `device`;
# confirm whether it is still needed.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Experiment selection: the GLUE task to train/evaluate on (CoLA) and the
# pretrained checkpoint every later stage starts from (Huawei TinyBERT).
task = GlueDatasetTask.COLA
model_name = ModelCheckpointName.TINYBERT_HUAWEI

In [5]:
# Load the tokenizer that belongs to the selected checkpoint, then the GLUE
# dataset bundle for `task`. The tokenizer must exist first because it is an
# argument to the dataset loader.
# NOTE(review): presumably the loader tokenizes the splits up front (the
# recorded output shows cached processed datasets being reused) -- confirm in
# cs324_project.datasets.
tokenizer = load_tokenizer(model_name)
dataset_info = load_glue_dataset_info(task, tokenizer)

Found cached dataset glue (C:/HF_CACHE/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-51f5b2c7a7ff1e58.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-02d0d32fa3df3a08.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-37efad78792ce9c4.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-df985a2934c3ba2f.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-b2e1d3e57836d04c.arrow
Loading cached processed dataset at C:\HF_CACHE\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-598cd95f3828ee11.arrow

## Fine-tune with masking

In [6]:
# Build the masked-language-modelling model from the base checkpoint.
# In the recorded run this instantiates BertForMaskedLM from
# huawei-noah/TinyBERT_General_4L_312D; the warning printed by this cell lists
# checkpoint weights (`fit_denses.*`, `cls.seq_relationship.*`) that the MLM
# model does not use.
model_mlm = load_pretraining_model(model_name, dataset_info)

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertForMaskedLM: ['fit_denses.2.weight', 'fit_denses.4.bias', 'fit_denses.0.bias', 'fit_denses.3.bias', 'fit_denses.2.bias', 'fit_denses.1.bias', 'cls.seq_relationship.weight', 'fit_denses.4.weight', 'fit_denses.1.weight', 'cls.seq_relationship.bias', 'fit_denses.0.weight', 'fit_denses.3.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Stage 1 setup: create the MLM training arguments (RandomMaskingConfig with
# its defaults, 80 epochs), then the trainer that ties those arguments to the
# MLM model and the dataset.
# The recorded output shows the argument helper printing a timestamped model
# output directory, so each execution of this cell targets a new directory.
training_args_mlm = get_training_args_mlm(
    num_epochs=80,
    masking_config=RandomMaskingConfig(),
)
trainer_mlm = get_trainer_mlm(
    model=model_mlm,
    mlm_args=training_args_mlm,
    dataset_info=dataset_info,
)

Creating training arguments, model output dir: C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-21-2023 10-50-00 PM


In [8]:
# Run the MLM training stage. Kept as the cell's last expression so the
# returned TrainOutput is displayed.
# Recorded run: global_step=42800 over 80 epochs, train_runtime ~1516 s
# (about 25 minutes), so re-running this cell is expensive.
trainer_mlm.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,4.5691,3.982995
2,3.8259,3.698572
3,3.5853,3.621619
4,3.4085,3.674379
5,3.2893,3.419763
6,3.1502,3.382149
7,3.0434,3.414268
8,2.9467,3.371143
9,2.8788,3.358126
10,2.787,3.25376


TrainOutput(global_step=42800, training_loss=1.7564905284275518, metrics={'train_runtime': 1515.5566, 'train_samples_per_second': 451.372, 'train_steps_per_second': 28.24, 'total_flos': 399180875327136.0, 'train_loss': 1.7564905284275518, 'epoch': 80.0})

In [9]:
# Evaluate the MLM trainer's current model; the returned metrics dict
# (eval_loss, runtime and throughput figures, epoch) is displayed as the cell
# output.
trainer_mlm.evaluate()

{'eval_loss': 3.074653148651123,
 'eval_runtime': 0.6772,
 'eval_samples_per_second': 1540.194,
 'eval_steps_per_second': 97.462,
 'epoch': 80.0}

In [10]:
# Show which checkpoint the trainer recorded as best; the classification stage
# loads its weights from this path.
# Recorded run: checkpoint-18725, i.e. epoch 35 of 80 at 535 steps per epoch
# (42800 steps / 80 epochs).
print(trainer_mlm.state.best_model_checkpoint)

C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-21-2023 10-50-00 PM\checkpoint-18725


## Fine-tune on sequence classification task

In [11]:
# Start the sequence classifier from the best MLM checkpoint rather than from
# the base weights.
# `best_model_checkpoint` is None when the trainer has not tracked a best model
# (for example, when the MLM training cell has not been run in this kernel), so
# fail here with a clear message instead of handing None to the loader.
best_mlm_checkpoint = trainer_mlm.state.best_model_checkpoint
if best_mlm_checkpoint is None:
    raise RuntimeError(
        "trainer_mlm has no best_model_checkpoint; run the MLM training cell "
        "and make sure best-model tracking is enabled before loading the "
        "classification model.")
model_sc = load_classification_model(best_mlm_checkpoint, dataset_info)

Some weights of the model checkpoint at C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\mlm\Model 03-21-2023 10-50-00 PM\checkpoint-18725 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequ

In [12]:
# Stage 2 setup: create the sequence-classification training arguments for
# `task` (learning rate 2e-5, 120 epochs), then the trainer that ties them to
# the classifier and the dataset.
# NOTE(review): in the recorded run the validation loss rises from epoch 3
# onwards while the training loss keeps falling, and the best checkpoint
# (checkpoint-1605) is epoch 3 of 120 at 535 steps per epoch -- consider a
# much smaller epoch budget or early stopping.
training_args_sc = get_training_args_sc(
    task,
    num_epochs=120,
    learning_rate=2e-5,
)
trainer_sc = get_trainer_sc(
    training_args=training_args_sc,
    model=model_sc,
    dataset_info=dataset_info,
)

Creating training arguments, model output dir: C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\sc\Model 03-21-2023 11-15-16 PM


In [13]:
# Run the classification fine-tuning stage. Kept as the cell's last expression
# so the returned TrainOutput is displayed.
# Recorded run: global_step=64200 over 120 epochs, train_runtime ~1817 s
# (about 30 minutes).
trainer_sc.train()

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.6062,0.626239,0.0
2,0.586,0.615488,0.074576
3,0.5495,0.621807,0.150448
4,0.513,0.648292,0.141881
5,0.4753,0.704112,0.095184
6,0.4383,0.772287,0.115828
7,0.3976,0.818466,0.107498
8,0.3666,0.922398,0.083275
9,0.3374,1.0147,0.09863
10,0.3135,1.21056,0.073775


TrainOutput(global_step=64200, training_loss=0.10841144086415894, metrics={'train_runtime': 1816.6181, 'train_samples_per_second': 564.852, 'train_steps_per_second': 35.34, 'total_flos': 594747984354972.0, 'train_loss': 0.10841144086415894, 'epoch': 120.0})

In [14]:
# Evaluate the classification trainer's current model; the metrics dict
# (eval_loss, eval_matthews_correlation, runtime figures) is displayed.
# NOTE(review): the recorded metrics match the epoch-3 row of the training
# table (loss ~0.6218, MCC ~0.1504), which suggests the best checkpoint is
# reloaded when training ends -- confirm in get_training_args_sc.
trainer_sc.evaluate()

{'eval_loss': 0.6218067407608032,
 'eval_matthews_correlation': 0.15044770083461464,
 'eval_runtime': 0.5725,
 'eval_samples_per_second': 1821.942,
 'eval_steps_per_second': 115.291,
 'epoch': 120.0}

In [15]:
# Show which classification checkpoint the trainer recorded as best.
# Recorded run: checkpoint-1605, i.e. epoch 3 of 120 at 535 steps per epoch
# (64200 steps / 120 epochs).
print(trainer_sc.state.best_model_checkpoint)

C:\Users\Windows\Desktop\Shahir\cs324-final-project-2023\models\sc\Model 03-21-2023 11-15-16 PM\checkpoint-1605
