# SET UPS

- installing the dependencies and packages neccessary for GPT2 (This may take ~ 1 min)

- initialize environment variables



In [1]:
# install GPT 2 via pip command 
!pip install --upgrade pip  # ensures that pip is current
!pip install git+https://github.com/huggingface/transformers@main # install GPT2 and related dependencies
!pip install git+https://github.com/google-research/bleurt # install bluert score and related dependencies

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 8.3 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers@main
  Cloning https://github.com/huggingface/transformers (to revision main) to /tmp/pip-req-build-24ywwnzy
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-24ywwnzy
  Resolved https://github.com/huggingface/transformers to commit 66e8656778392609e1fb769f1a0d0839af3cd76a
  Installing build dependencies ... [?25l[?25hdone
  Ge

In [2]:
# imports
import os
import numpy as np
import pandas as pd
import random
from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

In [3]:
DEBUG           = False
APEX_OPT_LEVEL  = 'O1'
MODEL           = 'gpt2'

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 768 # largest possible length given colab's capacities

TRAIN_SIZE      = 0.8

TRAIN_DATA_CKPT = 0      # position of last trained data
TRAIN_DATA_LIMIT = 50 # ~ 1 - 1.5 hr training time for 10000 of data pts

TRAIN_BATCHSIZE = 4   # largest possible batCh given colab's capacities
BATCH_UPDATE    = 16

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 1234


In [4]:
def SETSEED(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

SETSEED(SEED)

# LOAD DATA

Modify the following paths according to your directory structure

In [5]:
# path to data file
DATA_PATH = 'sample_data.csv'

# path to model, 'None' if there is no model or you would like to train new model
MODEL_PATH = 'model.bin'

# DIRECTORY to save model in (you don't have to change this unless you want to train model)
SAVE_MODEL_DIR = ''

# DIRECTORY to save generated data in (you don't have to change this unless you want to save generated results)
SAVE_DATA_DIR = 'generated_results/'

In [9]:
# Mount google drive if using colab, else skip
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [33]:
all_data = pd.read_csv(DATA_PATH)
data = all_data[TRAIN_DATA_CKPT: TRAIN_DATA_CKPT + TRAIN_DATA_LIMIT]

In [10]:
class RecipeDataset(Dataset):
    def __init__(self, data, tokenizer, randomize=True):
        self.randomize = randomize
        self.tokenizer = tokenizer 
        self.ingredients = list(data['ingredients'])
        self.instructions = list(data['instructions'])

    def __len__(self):
        return len(self.ingredients)
    
    def __getitem__(self, i):
        
        input = SPECIAL_TOKENS['bos_token'] + self.ingredients[i] + SPECIAL_TOKENS['sep_token'] + \
                self.instructions[i] + SPECIAL_TOKENS['eos_token']

        encodings_dict = self.tokenizer(input,                                   
                                   truncation=True, 
                                   max_length=MAXLEN, 
                                   padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

In [11]:
def get_tokenier(special_tokens):
    tokenizer = AutoTokenizer.from_pretrained(MODEL)  
    tokenizer.add_special_tokens(special_tokens)
    return tokenizer

def get_model(tokenizer, special_tokens, load_model_path=None):

    
    config = AutoConfig.from_pretrained(MODEL, 
                                        bos_token_id=tokenizer.bos_token_id,
                                        eos_token_id=tokenizer.eos_token_id,
                                        sep_token_id=tokenizer.sep_token_id,
                                        pad_token_id=tokenizer.pad_token_id,
                                        output_hidden_states=False)
   

    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)
    model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

# Model Loading

In [12]:
tokenizer = get_tokenier(SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  SPECIAL_TOKENS,
                  load_model_path= MODEL_PATH
                  )

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

# Model Training



In [None]:
train_data = data.iloc[0:int(len(data)*TRAIN_SIZE), :]
val_data = data.iloc[int(len(data)*TRAIN_SIZE):, :]

train_dataset = RecipeDataset(train_data, tokenizer)
val_dataset = RecipeDataset(val_data, tokenizer, randomize=False)

f'TRAINING SMAPLES: {len(train_dataset) :,} || TESTING SAMPLES: {len(val_dataset) :,}'

'TRAINING SMAPLES: 80 || TESTING SAMPLES: 20'

In [None]:
torch.cuda.empty_cache()
training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model()    

Using amp half precision backend
***** Running training *****
  Num examples = 80
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 4


Epoch,Training Loss,Validation Loss
0,No log,0.459299
1,No log,0.459
2,No log,0.458438
3,No log,0.457684


***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/
Configuration saved in /content/config.json
Model weights saved in /content/pytorch_model.bin
tokenizer config file saved in /content/tokenizer_config.json
Special tokens file saved in /content/special_tokens_map.json


# Model Saving

In [None]:
MODEL_NAME = 'TEST-model.bin'
path = SAVE_MODEL_DIR + MODEL_NAME
torch.save(model.state_dict(), path)

# Generation

In [13]:
#ingredient = list(data['ingredients'])[0]
ingredient = "Ingredients: 1. 2 pound of beef 2. 1 cup of water 3. 1 cup of coffee 4. 1 cup of cream 5. 1 tablespoon of salt 6. 2 tablespoon of sugar 7. 2 cup of cheese 8. 1 onion 9. 1 cup of flour"

prompt = SPECIAL_TOKENS['bos_token']  + ingredient + SPECIAL_TOKENS['sep_token']
         
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

### Optimal Temperature: Best Results

In [14]:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=1.5,
                                repetition_penalty=1.0,
                                num_return_sequences=10
                                )

In [None]:
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    if i == 0:
      print(text.split('Instructions:')[0], '\n', '-'*10, '\n', sep = '')
    print('Instructions:', text.split('Instructions:')[1], '\n', sep = '')

Ingredients: 1. 2 cups flour 2. 1 tablespoon cinnamon 3. 2 teaspoons baking soda 4. 1 teaspoon salt 5. 14 teaspoon baking powder 6. 3 eggs 7. 2 cups sugar 8. 1 cup vegetable oil 9. 1 tablespoon vanilla 10. 2 cups zucchini 11. 1 cup walnuts 
----------

Instructions: 1. Sift flour, cinnamon, baking soda, salt, and baking powder together into mixing bowl. 2. Make a well in the center, and pour in the eggs and sugar. 3. Stir together until blended. 4. Fold in vanilla and zucchini, nuts, and flour until well mixed. 5. Spoon batter into two greased and floured loaf pans, and bake at 350 degrees for one hour, or until done. 6. Let cool for 20 minutes before removing from pans. 

Instructions: 1. Combine first 8 ingredients in large bowl. 2. Beat with electric mixer on medium speed 3 minutes or until fluffy. 3. Stir in zucchini and nuts. 4. Pour into greased 8-inch square baking pan. 5. Bake at 350 degrees for 30 minutes or until toothpick inserted in center comes out clean. 6. Cool completel

### Low Temperature: High Bias

All the generated results will look very similar to each other if not completely identical

In [16]:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=0.1,
                                repetition_penalty=1.0,
                                num_return_sequences=10
                                )
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    if i == 0:
      print(text.split('Instructions:')[0], '\n', '-'*10, '\n', sep = '')
    print('Instructions:', text.split('Instructions:')[1], '\n', sep = '')

Ingredients: 1. 2 pound of beef 2. 1 cup of water 3. 1 cup of coffee 4. 1 cup of cream 5. 1 tablespoon of salt 6. 2 tablespoon of sugar 7. 2 cup of cheese 8. 1 onion 9. 1 cup of flour
----------

Instructions: 1. Brown the ground beef in a skillet. 2. Add the water, coffee, cream, salt, sugar, and cheese. 3. Simmer for 20 minutes. 4. Add the onion and cook for another 10 minutes. 5. Add the flour and cook for 5 minutes. 6. Serve with mashed potatoes. 

Instructions: 1. Brown the ground beef in a skillet. 2. Add the water, coffee, cream, salt, sugar, and cheese. 3. Simmer for 20 minutes. 4. Add the onion and cook for another 10 minutes. 5. Add the flour and cook for another 10 minutes. 6. Serve with mashed potatoes. 

Instructions: 1. Brown the ground beef in a skillet. 2. Add the water, coffee, cream, salt, sugar, and cheese. 3. Simmer for 20 minutes. 4. Add the onion and cook for another 10 minutes. 5. Add the flour and cook for 5 minutes. 6. Serve with mashed potatoes. 

Instructions

### High Temperature: High Variance

starts to use ingreidents out side of input; loses structure

emm... Don't try this at home

In [19]:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=2.0,
                                repetition_penalty=1.0,
                                num_return_sequences=10
                                )
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    if i == 0:
      print(text.split('Instructions:')[0], '\n', '-'*10, '\n', sep = '')
    print('Instructions:', text.split('Instructions:')[1], '\n', sep = '')

Ingredients: 1. 2 pound of beef 2. 1 cup of water 3. 1 cup of coffee 4. 1 cup of cream 5. 1 tablespoon of salt 6. 2 tablespoon of sugar 7. 2 cup of cheese 8. 1 onion 9. 1 cup of flour
----------

Instructions: 1. Combine first 6 ingredients. 2. Roll meat in flour, brown. 3. Remove and repeat. 4. Pour boiling water over all, mix well, and simmer 15 minute Put on wax paper or wax paper lined baking sheet, and freeze 1 to 1 1/2 months. 

Instructions: 1. Put beef and water in crock pot 2. Mix sugar, salt, sugar and cornstarch in bowl. 3. Add beef mixture and stir until smooth. 4. Layer crockpot with cheese layer, beef mix with onion and then with flour mixture in same manner. 5. Cook for 6 hours on Low. 6. Stir before serving, top with additional grated cheese. 

Instructions: 1. Brown hamburger in butter. 2. Stir in water. 3. Simmer till the sauce thickens. 4. Then add coffee. 5. Combine salt and 1 T sugar, bring to boil then stir into thickening sauce, stir in onions. 6. Put a thin laye

### VERY High Temperature: No, seriously, don't try this at home.

starts to lose even the basic structure of having "Instructions" infront. Completely insensible output, does not have an association with prompt

In [30]:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=10.0,
                                repetition_penalty=1.0,
                                num_return_sequences=10
                                )
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    if i == 0:
      print(ingredient, '\n', '-'*10, '\n', sep = '')
    print(text[len(ingredient):], '\n')

Ingredients: 1. 2 pound of beef 2. 1 cup of water 3. 1 cup of coffee 4. 1 cup of cream 5. 1 tablespoon of salt 6. 2 tablespoon of sugar 7. 2 cup of cheese 8. 1 onion 9. 1 cup of flour
----------

 salt water dred the beef 9 pound flour salt all 10 urchkins roll up and tie the roast 12 slices day rolling 12 slices cheese and onions roll 1 egg white and 1 slice each fresh the chowder beef or turkey 9 pounds all season the rolls season flour seal seal well cut butter 10 1 cup oil deep the can fill 3x5 3 quart line fill center and rim 1 or 2 3/50 inch full make cup the pan fill holes 2 fill center and add fill 2 cover  cover oil 2 to 350 utes 9 or 12 hours if not serving over meat heat deep 12 1 x 25 cup flour dred ds water 15 x 12 rolls bread ices pre make bread holes 2 fill tops add fill the meatballs fill out the chuggets wrap cheese cut rolls slice iced in a deep hot pot to the center hole make cup and top fill them place slices put water over filling add some butter seal lid seal lid 

# Evaluation

In [None]:
SAVE_DATA_FILE_NAME = 'validation_data.csv'
test_data = all_data[:10] 
validation_df = pd.DataFrame(columns = ["truth", "prediction"])
TEMPERATURE = 1.5

In [None]:
for i, row in test_data.iterrows():

  prompt = SPECIAL_TOKENS['bos_token']  + row["ingredients"] + SPECIAL_TOKENS['sep_token']
        
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  device = torch.device("cuda")
  generated = generated.to(device)

  model.eval()
  sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=TEMPERATURE,
                                repetition_penalty=1.0,
                                num_return_sequences=3)
  for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    text = text.split("Instructions:")[1]
    #print(text)
    validation_df = validation_df.append({
        'truth' : row["instructions"],
        'prediction': "Instructions:" + text
    }, ignore_index=True)

#validation_df.to_csv(SAVE_DATA_DIR + SAVE_DATA_FILE_NAME)  

In [None]:
from bleurt import score

references = validation_df['truth']
candidates = validation_df['prediction']

scorer = score.BleurtScorer()
scores = scorer.score(references=references, candidates=candidates)

INFO:tensorflow:No checkpoint specified, defaulting to BLEURT-tiny.
INFO:tensorflow:Reading checkpoint /usr/local/lib/python3.7/dist-packages/bleurt/test_checkpoint.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint dbleurt_tiny
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:dbleurt_tiny
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


In [None]:
pd.DataFrame(scores).describe()

Unnamed: 0,0
count,30.0
mean,-0.453432
std,0.28842
min,-1.071038
25%,-0.604296
50%,-0.395737
75%,-0.296551
max,0.094377
