In [72]:
# Clone the repo
!git clone https://github.com/ljeong072/TCSS456

Cloning into 'TCSS456'...
remote: Enumerating objects: 8, done.
remote: Counting objects: 100% (8/8), done.
remote: Compressing objects: 100% (7/7), done.
remote: Total 8 (delta 0), reused 5 (delta 0), pack-reused 0 (from 0)
Receiving objects: 100% (8/8), done.


In [73]:
import os

# Move into the cloned repo and list its contents as a sanity check.
# The kernel keeps its working directory between runs, so the chdir is guarded:
# re-running the cell from inside the checkout must not look for a nested
# "TCSS456" directory that does not exist.
if os.path.basename(os.getcwd()) != "TCSS456":
    os.chdir("TCSS456")
os.listdir()

['README.md', '.git']

In [84]:
# See available branches
!git branch -a

* Dev
  main
  remotes/origin/Dev
  remotes/origin/HEAD -> origin/main
  remotes/origin/mahri
  remotes/origin/main


In [86]:
# Check out the Dev branch (git reports "Already on 'Dev'" when it is already the current branch)
!git checkout Dev

Already on 'Dev'
Your branch is up to date with 'origin/Dev'.


In [88]:
# Show the current branch, how it relates to its upstream, and any uncommitted changes
!git status

On branch Dev
Your branch is up to date with 'origin/Dev'.

nothing to commit, working tree clean


In [79]:
# Push to Github (Change the message and check that this is the correct branch
!git add .
!git commit -m "First commit"
!git push
!git push origin Dev

On branch Dev
Your branch is up to date with 'origin/Dev'.

nothing to commit, working tree clean
Everything up-to-date
Everything up-to-date


In [9]:
# 1.1 Prerequisites
!pip install transformers datasets pandas scikit-learn torch torchvision torchaudio ipywidgets accelerate>=0.26.0 --quiet

In [10]:
import os
import pandas as pd
import transformers
import torch
import joblib
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2Model
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import pipeline

In [3]:
# Fetch the diet-planning dataset from the Hugging Face Hub.
# (Authenticate first, e.g. with `huggingface-cli login`, to access this dataset.)
dataset = load_dataset("thomasat/diet-planning")

# Hold out 20% of the original "train" split. The held-out rows are stored under
# the "test" key and serve as the validation set; the fixed seed keeps the split
# repeatable.
split_dataset = dataset["train"].train_test_split(
    test_size=0.2,
    seed=42,
)

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['Prompt', 'Formal specification', 'Max per-meal sodium (mg)', 'Max daily saturated fat (g)', 'Max daily calories (kcal)', 'Min daily fiber (g)', 'Min daily servings of vegetables', 'Min daily servings of fruit', 'Max daily percentage carbohydrates', 'Min daily percentage carbohydrates', 'Max daily percentage fat', 'Min daily percentage fat', 'Max daily percentage protein', 'Min daily percentage protein', 'Max daily servings of whole grains', 'Min daily servings of whole grains', 'Max servings of sweets per week', 'Max servings of red meat per week', 'Min servings of nuts, seeds, legumes per week', 'Max servings of low-fat dairy per week', 'Min servings of low-fat dairy per week', 'Max servings of fish per week', 'Min servings of fish per week', 'Dietary preference', 'Flavor preference', 'Cooking preference', 'Cuisine preference'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['Prompt', 'Formal specification', 

In [4]:
# 1.3 Step 2: Tokenize the Text
# Note: We have used distilbert-base-uncased tokenizer in Tutorial_1

model_name = "gpt2"

# Load the GPT-2 tokenizer. The model itself is loaded in Step 4
# (AutoModelForCausalLM), so no headless GPT2Model is instantiated here.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the "Prompt" column of a batch of examples.

    Args:
        example: Batch mapping handed over by `map(..., batched=True)`; only
            its "Prompt" entry is read.

    Returns:
        Whatever the tokenizer returns for those prompts (the mapped dataset
        ends up with `input_ids` and `attention_mask` columns).
    """
    return tokenizer(example["Prompt"])


# Drop every original column after tokenization. The list is taken from the
# dataset itself rather than spelled out, so it cannot drift from the schema.
tokenized_dataset = split_dataset.map(
    tokenize_function,
    batched = True,
    remove_columns = split_dataset["train"].column_names,
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [5]:
# 1.4 Step 3: Group Tokens for Language Modeling
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping with "input_ids" and "attention_mask", each a list of
            per-example lists.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels". Every entry is a
        list of blocks of exactly `block_size` items; tokens left over after the
        last full block are dropped. "labels" is a copy of the "input_ids" list.
    """
    # Flatten with a comprehension: `sum(list_of_lists, [])` re-copies the growing
    # accumulator on every addition, which is quadratic in the number of tokens.
    flat_ids = [token for sequence in examples["input_ids"] for token in sequence]
    flat_mask = [flag for sequence in examples["attention_mask"] for flag in sequence]

    # Largest multiple of block_size that fits into the concatenated batch.
    total_length = (len(flat_ids) // block_size) * block_size
    block_starts = range(0, total_length, block_size)

    result = {
        "input_ids": [flat_ids[start:start + block_size] for start in block_starts],
        "attention_mask": [flat_mask[start:start + block_size] for start in block_starts],
    }

    # For causal language modeling the targets are the input tokens themselves.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup every split into fixed-size blocks, then display the first training block.
lm_dataset = tokenized_dataset.map(function=group_texts, batched=True)
lm_dataset["train"][0]

{'input_ids': [1639,
  389,
  281,
  5887,
  5496,
  6749,
  13,
  4222,
  7716,
  257,
  5496,
  1410,
  329,
  530,
  1285,
  11,
  543,
  12991,
  4340,
  262,
  2836,
  447,
  247,
  82,
  14676,
  981,
  635,
  10941,
  9573,
  13,
  4222,
  1826,
  262,
  1708,
  17778,
  25,
  198,
  9806,
  583,
  12,
  28208,
  21072,
  286,
  38123,
  10527,
  11,
  3509,
  4445,
  24725,
  3735,
  286,
  1511,
  308,
  11,
  3509,
  4445,
  14653,
  286,
  4751,
  11,
  949,
  4445,
  13608,
  286,
  2608,
  308,
  11,
  949,
  718,
  4445,
  43096,
  286,
  13701,
  11,
  220,
  949,
  362,
  4445,
  43096,
  286,
  8234,
  11,
  3509,
  4445,
  5873,
  32328,
  5996,
  11,
  949,
  4445,
  5873,
  32328,
  4153,
  11,
  220,
  3509,
  4445,
  5873,
  7532,
  1679,
  11,
  949,
  4445,
  5873,
  7532,
  1315,
  11,
  3509,
  4445,
  5873,
  3735,
  1542,
  11,
  220,
  949,
  4445,
  5873,
  3735,
  1160,
  11,
  3509,
  4445,
  43096,
  286,
  2187,
  21824,
  807,
  11],
 'attention_mask'

In [6]:
# 1.5 Step 4: Load a Pretrained Model for Language Modeling
# Binds `model` to the causal-LM variant of the checkpoint named by `model_name`;
# this is the object handed to the Trainer in the next cell.
model = AutoModelForCausalLM.from_pretrained(model_name) # Note: we are using gpt2

In [7]:
# Fine-tune the causal LM on the grouped dataset.
# Some hyperparameters were adjusted so that one epoch finishes in a reasonable
# amount of time on CPU (Google Colab's TPU and GPU allocation had expired).

# Request mixed precision only when a CUDA GPU is visible.
# NOTE(review): depending on the installed torch/transformers versions, fp16 on a
# CPU-only host is either rejected or brings no speed-up - confirm for this environment.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir = "./lm_checkpoints",

    # Evaluate and checkpoint on the same 100-step schedule;
    # load_best_model_at_end needs the save strategy to match the eval strategy.
    eval_strategy = "steps",
    eval_steps = 100,
    save_strategy = "steps",
    save_steps = 100,
    save_total_limit = 2, # Keep at most 2 checkpoints on disk

    # Optimizer schedule
    learning_rate = 5e-5,
    warmup_ratio = 0.1, # Warm up over the first 10% of training
    weight_decay = 0.01,

    # Batching: 16 examples per device, accumulated over 2 steps per optimizer update
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 2,

    # Mixed precision (see use_fp16 above)
    fp16 = use_fp16,

    # Track the checkpoint with the lowest evaluation loss and restore it at the end
    load_best_model_at_end = True,
    metric_for_best_model = "loss",
    greater_is_better = False,

    # Logging and data loading
    logging_steps = 50,
    report_to = "none",
    dataloader_num_workers = 4, # Worker processes used for data loading

    num_train_epochs = 1,
)

# Stop early once the tracked metric has failed to improve by more than 0.01
# for 3 consecutive evaluations.
early_stopping_callback = transformers.EarlyStoppingCallback(
    early_stopping_patience = 3,
    early_stopping_threshold = 0.01,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = lm_dataset["train"],
    eval_dataset = lm_dataset["test"],
    callbacks = [early_stopping_callback],
)

# Start training
trainer.train()

# Persist the fine-tuned model for the generation cell, which loads model.pkl.
# NOTE(review): a pickle ties the file to the installed library versions and must
# only be loaded from a trusted source; trainer.save_model(...) together with
# from_pretrained(...) is the portable alternative.
joblib.dump(model, "model.pkl")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
100,0.1652,0.134233
200,0.1275,0.119927
300,0.1197,0.113403
400,0.1155,0.110726


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=457, training_loss=0.3204675093782287, metrics={'train_runtime': 3797.8549, 'train_samples_per_second': 3.854, 'train_steps_per_second': 0.12, 'total_flos': 955283668992000.0, 'train_loss': 0.3204675093782287, 'epoch': 0.9989071038251366})

In [15]:
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load a
# model.pkl that this notebook wrote itself.
model = joblib.load("model.pkl")
generator = pipeline("text-generation", model = model, tokenizer = tokenizer)

# Seed the RNGs so that, when the pipeline samples, repeated runs of this cell
# produce the same continuations.
transformers.set_seed(42)

# (prompt, max_length) pairs used for generation testing.
GENERATION_PROMPTS = [
    ("A healthy diet", 40),
    ("A diet consisting of 100 mg of salt, 200 mg of fat, and", 60),
    ("I prefer Chinese cuisine but must lower my sodium. A diet", 60),
    ("I am a pescatarian and enjoy spicy food. ", 60),
    ("I need to consume more fruit and carbohydrates a day.", 60),
    ("My maximum daily calories is 1,200 so I should consume ", 60),
    ("I am a vegetarian and I enjoy food that is sweet and salty.", 60),
]

outputs = [
    generator(prompt, max_length = max_length, num_return_sequences = 1)
    for prompt, max_length in GENERATION_PROMPTS
]

# Per-prompt names kept for any later cell that refers to them.
output1, output2, output3, output4, output5, output6, output7 = outputs

# Print the single returned sequence for each prompt, in prompt order.
for output in outputs:
    print(output[0]["generated_text"])

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A healthy diet plan for one week, which maximizes the user’s satisfaction while also maintaining diversity. Please meet the following constraints:
max per-meal sodium of 2100 mg, max daily
A diet consisting of 100 mg of salt, 200 mg of fat, and min of 2 servings per week,  max low-fat dairy of 3 servings per week, min of 2 servings per week, max 2 servings of fish per week,  and min of 1 per week. Diet preference:
I prefer Chinese cuisine but must lower my sodium. A diet plan for one week, which maximizes the user’s satisfaction while also maintaining diversity. Please meet the following constraints:
max per-meal sodium of 2300 mg, max daily saturated fat of 13 g, max daily calories of
I am a pescatarian and enjoy spicy food.  min 4 daily servings of fruit, max daily percentage carbohydrates 55, min daily percentage carbohydrates 45,  max daily percentage protein 25, min daily percentage protein 15, max daily percentage fat 30,  min daily percentage fat 20, max daily
I need to consume 