## Configuration and Setup


In [1]:
import os
# Choose where Hugging Face caches models and datasets. This cell runs before
# the project imports in the next cell so the setting is already in place when
# they load.
# "D:/HF_CACHE/" is a machine-specific fallback: an HF_HOME that is already set
# in the environment is respected, so the notebook also runs on machines that
# have no D: drive.
if 'HF_HOME' not in os.environ:
    os.environ['HF_HOME'] = "D:/HF_CACHE/"

In [2]:
from cs324_project.datasets import GlueDatasetTask, load_glue_dataset_info
from cs324_project.models import ModelCheckpointName, load_classification_model, load_pretraining_model, load_tokenizer
from cs324_project.training import get_training_args, get_trainer_mlm, get_trainer_sc, get_latest_checkpoint_path
from cs324_project.masking import get_whole_word_masking_data_collator, get_random_masking_data_collator, default_data_collator

In [3]:
# Experiment selection: the GLUE task to run and the checkpoint to start from.
# Every later cell reads these two names.
task = GlueDatasetTask.COLA
model_name = ModelCheckpointName.TINYBERT_HUAWEI

In [4]:
# Load the tokenizer for the selected checkpoint, then the GLUE task data
# prepared with that tokenizer.
tokenizer = load_tokenizer(model_name)
dataset_info = load_glue_dataset_info(
    task,
    tokenizer,
    reduce_fraction=0.1,  # presumably keeps ~10% of each split for a quick run; confirm in cs324_project.datasets
)

Found cached dataset glue (D:/HF_CACHE/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

## Fine-tune with masking

In [5]:
# Load the masked-language-modelling variant of the selected checkpoint.
# (The recorded output shows it is initialised as BertForMaskedLM.)
model_mlm = load_pretraining_model(
    model_name,
    dataset_info,
)

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertForMaskedLM: ['fit_denses.0.bias', 'fit_denses.4.bias', 'fit_denses.1.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'fit_denses.2.weight', 'fit_denses.1.weight', 'fit_denses.3.weight', 'fit_denses.0.weight', 'fit_denses.4.weight', 'fit_denses.2.bias', 'fit_denses.3.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Assemble the masked-LM trainer from one-epoch training arguments, a
# random-masking data collator, and the model loaded above.
training_args_mlm = get_training_args(task, num_epochs=1)
data_collator = get_random_masking_data_collator(tokenizer)
trainer_mlm = get_trainer_mlm(
    model=model_mlm,
    dataset_info=dataset_info,
    data_collator=data_collator,
    training_args=training_args_mlm,
)

In [7]:
# Run the masked-LM training pass; the bare name on the last line keeps the
# returned value as the cell's displayed output.
mlm_train_result = trainer_mlm.train()
mlm_train_result



  0%|          | 0/54 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Evaluate the masked-LM model after training and display the result.
mlm_eval_result = trainer_mlm.evaluate()
mlm_eval_result

In [None]:
# Locate the most recent checkpoint for the MLM training arguments; the
# classification model in the next section is loaded from this path.
checkpoint_path = get_latest_checkpoint_path(
    training_args_mlm,
)

## Fine-tune on sequence classification task

In [None]:
# Load the sequence-classification model from the MLM checkpoint found above
# rather than from the original pretrained checkpoint.
model_sc = load_classification_model(
    checkpoint_path,
    dataset_info,
)

In [None]:
# Build the sequence-classification trainer with its own one-epoch training
# arguments.
training_args_sc = get_training_args(task, num_epochs=1)
# Pass `training_args_sc`: it is the training-arguments object defined for this
# stage. No variable named `training_args` exists in this notebook, so using
# that name raises NameError on a fresh kernel.
trainer_sc = get_trainer_sc(dataset_info, model_sc, training_args_sc)

In [None]:
# Fine-tune on the classification task; the bare name on the last line keeps
# the returned value as the cell's displayed output.
sc_train_result = trainer_sc.train()
sc_train_result

In [None]:
# Evaluate the fine-tuned classifier and display the result.
sc_eval_result = trainer_sc.evaluate()
sc_eval_result