# SET UPS

- installing the dependencies and packages neccessary for GPT2 (This may take ~ 1 min)

- initialize environment variables



In [None]:
# install GPT 2 via pip command 
!pip install --upgrade pip  # ensures that pip is current
!pip install git+https://github.com/huggingface/transformers@main # install GPT2 and related dependencies
!pip install git+https://github.com/google-research/bleurt # install bluert score and related dependencies

In [12]:
# imports
import os
import numpy as np
import pandas as pd
import random
from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

In [22]:
DEBUG           = False
APEX_OPT_LEVEL  = 'O1'
MODEL           = 'gpt2'

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 768 # largest possible length given colab's capacities

TRAIN_SIZE      = 0.8

TRAIN_DATA_CKPT = 0      # position of last trained data
TRAIN_DATA_LIMIT = 30 # ~ 1 - 1.5 hr training time for 10000 of data pts

TRAIN_BATCHSIZE = 4   # largest possible batCh given colab's capacities
BATCH_UPDATE    = 16

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 1234


In [14]:
def SETSEED(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

SETSEED(SEED)

# LOAD DATA

Modify the following paths according to your directory structure

In [15]:
# path to data file
DATA_PATH = 'recipes.csv'

# path to model, 'None' if there is no model or you would like to train new model
MODEL_PATH = 'model.bin'

# directory to save model in
SAVE_MODEL_DIR = ''

# directory to save generated data in
SAVE_DATA_DIR = 'generated_results/'

In [16]:
# Mount google drive if using colab, else skip
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
all_data = pd.read_csv(DATA_PATH)
data = all_data[TRAIN_DATA_CKPT: TRAIN_DATA_CKPT + TRAIN_DATA_LIMIT]

In [24]:
class RecipeDataset(Dataset):
    def __init__(self, data, tokenizer, randomize=True):
        self.randomize = randomize
        self.tokenizer = tokenizer 
        self.ingredients = list(data['ingredients'])
        self.instructions = list(data['instructions'])

    def __len__(self):
        return len(self.ingredients)
    
    def __getitem__(self, i):
        
        input = SPECIAL_TOKENS['bos_token'] + self.ingredients[i] + SPECIAL_TOKENS['sep_token'] + \
                self.instructions[i] + SPECIAL_TOKENS['eos_token']

        encodings_dict = self.tokenizer(input,                                   
                                   truncation=True, 
                                   max_length=MAXLEN, 
                                   padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

In [25]:
def get_tokenier(special_tokens):
    tokenizer = AutoTokenizer.from_pretrained(MODEL)  
    tokenizer.add_special_tokens(special_tokens)
    return tokenizer

def get_model(tokenizer, special_tokens, load_model_path=None):

    
    config = AutoConfig.from_pretrained(MODEL, 
                                        bos_token_id=tokenizer.bos_token_id,
                                        eos_token_id=tokenizer.eos_token_id,
                                        sep_token_id=tokenizer.sep_token_id,
                                        pad_token_id=tokenizer.pad_token_id,
                                        output_hidden_states=False)
   

    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)
    model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

# Model Loading

In [26]:
tokenizer = get_tokenier(SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  SPECIAL_TOKENS,
                  load_model_path= MODEL_PATH
                  )

# Model Training



In [27]:
train_data = data.iloc[0:int(len(data)*TRAIN_SIZE), :]
val_data = data.iloc[int(len(data)*TRAIN_SIZE):, :]

train_dataset = RecipeDataset(train_data, tokenizer)
val_dataset = RecipeDataset(val_data, tokenizer, randomize=False)

f'TRAINING SMAPLES: {len(train_dataset) :,} || TESTING SAMPLES: {len(val_dataset) :,}'

'TRAINING SMAPLES: 80 || TESTING SAMPLES: 20'

In [28]:
torch.cuda.empty_cache()
training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model()    

Using amp half precision backend
***** Running training *****
  Num examples = 80
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 4


Epoch,Training Loss,Validation Loss
0,No log,0.459299
1,No log,0.459
2,No log,0.458438
3,No log,0.457684


***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/
Configuration saved in /content/config.json
Model weights saved in /content/pytorch_model.bin
tokenizer config file saved in /content/tokenizer_config.json
Special tokens file saved in /content/special_tokens_map.json


# Model Saving

In [29]:
MODEL_NAME = 'TEST-model.bin'
path = SAVE_MODEL_DIR + MODEL_NAME
torch.save(model.state_dict(), path)

# Generation

In [30]:
#ingredient = list(data['ingredients'])[0]
ingredient = "Ingredients: 1. 2 pound of beef 2. 1 cup of water 3. 1 cup of coffee 4. 1 cup of cream 5. 1 tablespoon of salt 6. 2 tablespoon of sugar 7. 2 cup of cheese 8. 1 onion 9. 1 cup of flour"

prompt = SPECIAL_TOKENS['bos_token']  + ingredient + SPECIAL_TOKENS['sep_token']
         
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

In [43]:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=1.5,
                                repetition_penalty=1.0,
                                num_return_sequences=10
                                )

In [53]:
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    if i == 0:
      print(text.split('Instructions:')[0], '\n', '-'*10, '\n', sep = '')
    print('Instructions:', text.split('Instructions:')[1], '\n', sep = '')

Ingredients: 1. 2 cups flour 2. 1 tablespoon cinnamon 3. 2 teaspoons baking soda 4. 1 teaspoon salt 5. 14 teaspoon baking powder 6. 3 eggs 7. 2 cups sugar 8. 1 cup vegetable oil 9. 1 tablespoon vanilla 10. 2 cups zucchini 11. 1 cup walnuts 
----------

Instructions: 1. Sift flour, cinnamon, baking soda, salt, and baking powder together into mixing bowl. 2. Make a well in the center, and pour in the eggs and sugar. 3. Stir together until blended. 4. Fold in vanilla and zucchini, nuts, and flour until well mixed. 5. Spoon batter into two greased and floured loaf pans, and bake at 350 degrees for one hour, or until done. 6. Let cool for 20 minutes before removing from pans. 

Instructions: 1. Combine first 8 ingredients in large bowl. 2. Beat with electric mixer on medium speed 3 minutes or until fluffy. 3. Stir in zucchini and nuts. 4. Pour into greased 8-inch square baking pan. 5. Bake at 350 degrees for 30 minutes or until toothpick inserted in center comes out clean. 6. Cool completel

# Generate Validation Dataframe


In [33]:
SAVE_DATA_FILE_NAME = 'validation_data.csv'
test_data = all_data[:10] 
validation_df = pd.DataFrame(columns = ["truth", "prediction"])

In [34]:
for i, row in test_data.iterrows():

  prompt = SPECIAL_TOKENS['bos_token']  + row["ingredients"] + SPECIAL_TOKENS['sep_token']
        
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  device = torch.device("cuda")
  generated = generated.to(device)

  model.eval()
  sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=1.5,
                                repetition_penalty=1.0,
                                num_return_sequences=3)
  for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    text = text.split("Instructions:")[1]
    #print(text)
    validation_df = validation_df.append({
        'truth' : row["instructions"],
        'prediction': "Instructions:" + text
    }, ignore_index=True)

#validation_df.to_csv(SAVE_DATA_DIR + SAVE_DATA_FILE_NAME)  

# Evaluation

In [36]:
from bleurt import score

references = validation_df['truth']
candidates = validation_df['prediction']

scorer = score.BleurtScorer()
scores = scorer.score(references=references, candidates=candidates)

INFO:tensorflow:No checkpoint specified, defaulting to BLEURT-tiny.
INFO:tensorflow:Reading checkpoint /usr/local/lib/python3.7/dist-packages/bleurt/test_checkpoint.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint dbleurt_tiny
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:dbleurt_tiny
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


In [37]:
pd.DataFrame(scores).describe()

Unnamed: 0,0
count,30.0
mean,-0.453432
std,0.28842
min,-1.071038
25%,-0.604296
50%,-0.395737
75%,-0.296551
max,0.094377
