# Let's Tokenize the Dataset

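To fine-tune our model, we first have to tokenize the dataset: the ingredient names (`NER`) become the model input and the full recipe text (`merged_text`) becomes the target.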

## Let's Load the Data

In [1]:
from datasets import load_dataset
# CSV file for each split, keyed by split name.
csv_splits = {
    'train': "finalset/train_recipes.002.clean.csv",
    'test': "finalset/test_recipes.002.clean.csv",
}
dataset = load_dataset('csv', data_files=csv_splits)

# Alternative (not used): load only the train CSV and carve a 90/10 split out of it.
# dataset = load_dataset('csv', data_files="finalset/train_recipes.002.clean.csv", split='train').train_test_split(train_size=0.9, test_size=0.1)

Found cached dataset csv (/home/datascience/.cache/huggingface/datasets/csv/default-4cf2398602f53a9e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Show the (rows, columns) shape of every split.
dataset.shape

{'train': (65781, 2), 'test': (16609, 2)}

In [3]:
# Shuffle each split with a fixed seed and keep only the first rows,
# so the rest of the pipeline runs quickly on a smaller subset.
shuffle_seed = 42
n_train_rows, n_eval_rows = 5000, 500

shuffled_train = dataset["train"].shuffle(seed=shuffle_seed)
shuffled_test = dataset["test"].shuffle(seed=shuffle_seed)

small_train_dataset = shuffled_train.select(range(n_train_rows))
small_eval_dataset = shuffled_test.select(range(n_eval_rows))

Loading cached shuffled indices for dataset at /home/datascience/.cache/huggingface/datasets/csv/default-4cf2398602f53a9e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-3c77251dcc9c8caa.arrow
Loading cached shuffled indices for dataset at /home/datascience/.cache/huggingface/datasets/csv/default-4cf2398602f53a9e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-2e9069c5a2a46689.arrow


In [4]:
# Confirm the training subset has the 5000 rows selected above.
small_train_dataset.shape

(5000, 2)

In [5]:
# Confirm the evaluation subset has the 500 rows selected above.
small_eval_dataset.shape

(500, 2)

In [6]:
# Work on the small subsets for this run. To tokenize the full splits instead,
# assign dataset['train'] and dataset['test'] here.
train_dataset, val_dataset = small_train_dataset, small_eval_dataset

In [7]:
# Shape of the split that will be tokenized for training.
train_dataset.shape

(5000, 2)

In [8]:
# Shape of the split that will be tokenized for validation.
val_dataset.shape

(500, 2)

In [9]:
# Look at one row's 'NER' field: the comma-separated ingredient names used as model input.
train_dataset[10]['NER']

'Salt, Pepper, FILLING, Olive Oil, Sausage, Fresh Spinach Leaves, Eggs, Egg Whites, Coconut Milk, Colby Cheese, Salt, Pepper, Hot Sauce, Green Onions'

In [10]:
# Same row's 'merged_text' field: title, ingredients and directions in one string (the target).
train_dataset[10]['merged_text']

"title: Country Breakfast Skillet Quiche With Spaghetti Squash Crust ingredients: FOR THE CRUST:, 1 Large Spaghetti Squash, 1/2 teaspoons Salt, 1/8 teaspoons Pepper, FOR THE FILLING:, 1 teaspoon Olive Oil, 16 ounces, weight Pork Sausage, 3 cups Fresh Spinach Leaves, 4 Eggs, 5 Egg Whites, 1 cup Coconut Milk, Unsweetened, 1 cup Shredded Colby Cheese, 1/2 teaspoons Salt, 1/8 teaspoons Pepper, Hot Sauce, Optional, 4 Green Onions, Sliced, Optional For Garnish directions: Make the sausage before starting on the crust. Add the oil into a 10\\ cast iron skillet and heat over medium heat. Add sausage and break it into pieces while it cooks. Brown completely. Drain grease. Spoon sausage into a medium-sized bowl and set aside. Don't wipe out the skillet! For the crust: Put the spaghetti squash in a large microwavable bowl. Prick the squash all over with a fork. Microwave on high for 15 minutes. Be careful removing it from the microwave because there will be very hot juice from the squash in the b

In [11]:
# Slicing returns a dict of columns, each holding a list with one entry per selected row.
train_dataset[:3]

{'NER': ['olive oil, white wine, onion flakes, lemon, garlic, thyme, rosemary, ground sage, marjoram, salt, ground black pepper, hot pepper, chicken',
  'Butter, flour, milk, tuna, eggs',
  'condensed milk, lemon juice, whipped cream, peaches, cake'],
 'merged_text': ["title: Roasted Bone-In Chicken Breasts With Herbs ingredients: 3 tablespoons olive oil, 1/2 cup white wine, 1 tablespoon dried onion flakes, 1 lemon, zested, 1 fresh garlic clove, 1 teaspoon dried thyme, 1/2 teaspoon dried rosemary, crushed, 1/4 teaspoon ground sage, 1/4 teaspoon dried marjoram, 1/2 teaspoon salt, 1/2 teaspoon ground black pepper, 1/8 teaspoon hot pepper sauce, 2 chicken breast halves, bone-in with skin directions: Preheat oven to 425 degrees. In the baking dish you are going to use prepare the sauce/marinade by combining all of the ingredients. NOTE: you can also use a good poultry seasoning that isn't completely ground in place of the rosemary sage and marjoram since these are usually in this mix. If y

In [12]:
# Show one full validation row (both 'NER' and 'merged_text').
val_dataset[10]

{'NER': 'onion, soy sauce, rice vinegar, water, sugar, salt, freshly ground black pepper, sesame oil',
 'merged_text': 'title: Japanese Salad Dressing  ingredients: 1/4 cup finely chopped onion, 2 Tbsp. plus 2 teaspoons soy sauce, 1 Tbsp. rice vinegar, 2 teaspoons water, 1/2 teaspoon granulated sugar, A pinch sea salt, A pinch freshly ground black pepper, 4 teaspoons sesame oil directions: Combine all the ingredients except the oil. When the salt is fully dissolved add oil.'}

In [13]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, TrainingArguments, Trainer

# Load the tokenizer of the pretrained checkpoint and set its `model_max_length` to 512.
checkpoint_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name, model_max_length=512)

In [14]:
import torch

def tokenize_dataset(example, max_length=512, tok=None):
    """Tokenize one batch of recipes into model inputs and loss-ready labels.

    Intended for ``Dataset.map(..., batched=True)``: ``example`` is a dict of
    columns, each a list with one entry per row.

    Args:
        example: Batch with an ``'NER'`` column (ingredient names, used as the
            model input) and a ``'merged_text'`` column (full recipe text, used
            as the target). Individual entries may be ``None``.
        max_length: Length every input and target sequence is truncated and
            padded to. Defaults to 512, the value previously hard-coded.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``;
            can be supplied through ``Dataset.map(fn_kwargs=...)``.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        tokenized inputs, and ``'labels'``: a ``torch`` tensor of target token
        ids in which every padding position is replaced by -100.
    """
    if tok is None:
        tok = tokenizer

    # A missing ingredient list becomes an empty prompt (no task prefix).
    input_text = ['generate recipe: ' + doc if doc is not None else '' for doc in example['NER']]
    # Guard targets the same way as inputs: a None entry would otherwise be
    # handed to the tokenizer as-is.
    target_text = [doc if doc is not None else '' for doc in example['merged_text']]

    input_tokenized = tok(input_text, truncation=True, padding='max_length', max_length=max_length)
    target_tokenized = tok(target_text, truncation=True, padding='max_length', max_length=max_length)

    # Every sequence is padded to the same length, so the id lists form a
    # rectangular tensor.
    labels = torch.tensor(target_tokenized['input_ids'])
    # Replace padding token ids in the labels by -100 so the loss ignores them.
    labels[labels == tok.pad_token_id] = -100

    return {
        'input_ids': input_tokenized['input_ids'],
        'attention_mask': input_tokenized['attention_mask'],
        'labels': labels,
    }

# Tokenize both splits in batches. Each split drops its *own* original columns
# (the validation call previously reused train_dataset's column list), leaving
# only the fields returned by tokenize_dataset.
train_tokenized = train_dataset.map(tokenize_dataset, batched=True, remove_columns=list(train_dataset.features))
val_tokenized = val_dataset.map(tokenize_dataset, batched=True, remove_columns=list(val_dataset.features))

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [15]:
# Inspect the label ids of one example; padding positions should appear as -100.
# Row-first indexing reads only this example instead of the whole 'labels' column.
train_tokenized[1]['labels']

[2233,
 10,
 14024,
 16693,
 7,
 275,
 16008,
 9,
 461,
 30884,
 3018,
 10,
 14698,
 41,
 1071,
 346,
 201,
 505,
 3,
 17,
 7,
 102,
 5,
 7055,
 6,
 314,
 3,
 75,
 5,
 3702,
 6,
 204,
 54,
 7,
 5240,
 9,
 6,
 586,
 5875,
 6,
 16544,
 7943,
 10,
 1491,
 173,
 5875,
 5,
 23291,
 11,
 25669,
 5875,
 5,
 5049,
 17,
 4194,
 617,
 7055,
 5,
 13522,
 552,
 4126,
 35,
 7,
 5,
 2334,
 3702,
 7831,
 5,
 2334,
 5240,
 9,
 5,
 2334,
 8,
 5875,
 5,
 13522,
 552,
 4126,
 35,
 7,
 5,
 5306,
 30,
 16544,
 5,
 1,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 

In [16]:
# Write both tokenized splits to disk, training split first.
for tokenized_split, out_dir in (
    (train_tokenized, './tokenized_train_dataset_5k_v4'),
    (val_tokenized, './tokenized_test_dataset_5k_v4'),
):
    tokenized_split.save_to_disk(out_dir)

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

In [17]:
# Write the tokenizer files into this directory; the returned tuple lists the files written.
# NOTE(review): the directory is tagged "v1" while the tokenized datasets are "v4" — confirm this is intended.
tokenizer.save_pretrained("fine_tuned_t5_recipes_base_5k_v1")

('fine_tuned_t5_recipes_base_5k_v1/tokenizer_config.json',
 'fine_tuned_t5_recipes_base_5k_v1/special_tokens_map.json',
 'fine_tuned_t5_recipes_base_5k_v1/spiece.model',
 'fine_tuned_t5_recipes_base_5k_v1/added_tokens.json')