# Let's Train ChefGPT

Let's fine-tune our own ChefGPT model.

## Load the pre-tokenized datasets

In [None]:
# Load the pre-tokenized training and validation datasets from local disk.

from datasets import load_from_disk

# Both splits go through the same loader, in order: training first, then the
# "test" directory, which is used as the validation set in the rest of the notebook.
train_tokenized, val_tokenized = map(
    load_from_disk,
    ('./tokenized_train_dataset_5k_v1', './tokenized_test_dataset_5k_v1'),
)

In [2]:
# Show the shape of the training dataset as (rows, columns);
# being the last expression of the cell, it is displayed as the cell's result.

train_tokenized.shape

(5000, 3)

In [3]:
# Show the shape of the evaluation (validation) dataset as (rows, columns);
# being the last expression of the cell, it is displayed as the cell's result.

val_tokenized.shape

(500, 3)

## Let's Train

In [4]:
# Load the model with the same Hugging Face wrappers used in the previous labs.
# With these wrappers we fine-tune the entire model on the dataset loaded above.
# Because the model is not very large, training does not take long.

from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, TrainingArguments, Trainer

In [5]:
# Pretrained checkpoint that both the configuration and the weights are taken from
PRETRAINED_CHECKPOINT = 't5-base'

# Fetch the checkpoint's configuration first, then build the model from the
# pretrained weights using that configuration.
config = T5Config.from_pretrained(PRETRAINED_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_CHECKPOINT, config=config)

# Caching is switched off on the model's own config; this prevents the cache
# error message that would otherwise show up during training.
model.config.use_cache = False

In [6]:
# Training setup built with the `TrainingArguments` and `Trainer` wrappers from the HuggingFace Transformers library.
# The arguments follow the HuggingFace T5 training documentation:
# https://huggingface.co/docs/transformers/model_doc/t5#training
# The values were chosen for the GPU shape used in this lab (dual A10 GPUs).
training_args = TrainingArguments(
    # --- Output locations ---
    output_dir='./results',
    logging_dir='./logs',

    # --- Per-epoch bookkeeping: evaluate, checkpoint and log once per epoch, and
    #     reload the best checkpoint when training finishes ---
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,

    # --- Schedule and batch sizes (per device; gradients are accumulated over 2 steps) ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,

    # --- Optimizer ---
    optim="adamw_torch",
    learning_rate=1e-4,
    warmup_steps=100,
    weight_decay=0.01,
    adam_beta1=0.85,

    # --- Mixed precision ---
    # NOTE(review): T5 checkpoints are reported to be sensitive to fp16 overflow;
    # bf16 may be the safer choice on GPUs that support it -- confirm before changing.
    fp16=True,
)

# The trainer ties together the model, the arguments above and the two tokenized datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_tokenized,
    train_dataset=train_tokenized,
)

Using cuda_amp half precision backend


In [7]:
# Run the fine-tuning loop with the trainer configured above.
# Per-epoch checkpoints are written under the trainer's output directory.
trainer.train()

# Write the fine-tuned model (configuration and weights) to a local directory
# so it can be reloaded later without retraining.
model.save_pretrained(save_directory="fine_tuned_t5_recipes_base_5k_v1")

***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 468
  Number of trainable parameters = 222903552


Epoch,Training Loss,Validation Loss
0,3.9147,2.105792
1,2.2824,1.998505
2,2.1916,1.975185


***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-156
Configuration saved in ./results/checkpoint-156/config.json
Model weights saved in ./results/checkpoint-156/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-312
Configuration saved in ./results/checkpoint-312/config.json
Model weights saved in ./results/checkpoint-312/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-468
Configuration saved in ./results/checkpoint-468/config.json
Model weights saved in ./results/checkpoint-468/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-468 (score: 1.9751847982406616).
Configuration saved in fine_tuned_t5_recipes_base_5k_v1/config.json
Model weights saved in f