In [1]:
import pandas as pd
import numpy as np
import math
import random
import csv
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import get_linear_schedule_with_warmup


In [2]:
import pandas as pd
import numpy as np
import math
import random
import csv
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import get_linear_schedule_with_warmup

# some parameters
epochs = 1
learning_rate = 5e-5  #default 1e-3 5e-5
epsilon = 1e-8  #default
model_name = "gpt2"
batch_size = 4

# this produces sample output every 100 steps
sample_every = 100
# save the model every 1000 step
save_every = 1000
# save the model to this file name
save_model = "models/topsmallest_epochs"
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize, pos_tag
import re

# Ensure NLTK downloads
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

stop_words = set(stopwords.words('english'))


def preprocess_ingredients(ingredients):
    ingredients_list = eval(ingredients)
    processed_ingredients = []
    regex = re.compile('[^a-zA-Z ]')
    lemmatizer = WordNetLemmatizer()
    #stemmer = PorterStemmer()

    # POS tags that represent nouns
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']

    # Define the words to be dropped
    #words_to_drop = {"powder", "brown", "salt", "water", "sugar", "onion", "butter", "pepper", "ground", "cream"}

    for ingr in ingredients_list:
        ingr = regex.sub(' ', ingr.lower()).strip()
        components = [comp.strip() for comp in ingr.split('and')]

        for comp in components:

            sentence = ""
            tokens = word_tokenize(comp)  # Tokenize each component
            tagged_tokens = pos_tag(tokens)  # Perform POS tagging

            # Extract main nouns while handling compound nouns
            nouns = []
            current_noun = ""
            for word, tag in tagged_tokens:
                word = lemmatizer.lemmatize(word.strip())
                if len(word) > 2 and word not in stop_words and tag in noun_tags:  # and word not in words_to_drop
                    if current_noun:
                        nouns.append(current_noun)
                        current_noun = ""
                    current_noun = word

            # Add last current noun if exists
            if current_noun:
                nouns.append(current_noun)

            for word in nouns:
                singular_comp = lemmatizer.lemmatize(word.strip())
                #stemmed_word = stemmer.stem(singular_comp)

                if singular_comp not in stop_words and len(singular_comp) > 2:
                    sentence += singular_comp + " "

            if sentence.strip():
                processed_ingredients.append(sentence.strip())

    return list(set(processed_ingredients))


# Funzione di preprocessing per le tecniche
def preprocess_techniques(techniques):
    techniques_list = eval(techniques)
    processed_techniques = []

    for technique in techniques_list:
        technique = technique.lower().strip()
        tokens = word_tokenize(technique)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
        processed_techniques.append(' '.join(tokens))

    return processed_techniques


# Funzione per preprocessare ogni riga della ricetta
def process_recipe(row):
    ingredients = preprocess_ingredients(row['ingredients'])
    #ingredients = preprocess_techniques(row['ingredients'])
    techniques = preprocess_techniques(row['techniques_list'])
    instructions = row['steps'].lower().replace('\'', '').replace('[', '').replace(']', '').replace('"', '')

    ingredients_str = ''.join([f", {ingr}" for ingr in ingredients])
    ingredients_str = ingredients_str.replace(", ", "", 1)

    techniques_str = ''.join([f", {tec}" for tec in techniques])
    techniques_str = techniques_str.replace(", ", "", 1)

    recipe_instance = f"[BOS] [INGREDIENTS] {ingredients_str} [TECHNIQUES] {techniques_str} [STEPS] {instructions} [EOS]"
    return recipe_instance


from joblib import Parallel, delayed
import multiprocessing


# Funzione per preprocessare i dati in parallelo
def load_preprocess_raw_data_parallel(raw_data):
    with open(raw_data, 'r', encoding='utf-8') as f:
        reader = list(csv.DictReader(f))  # Convertiamo reader in una lista
        num_cores = multiprocessing.cpu_count()  # Otteniamo il numero di core della CPU
        print(f"Number of CPU cores: {num_cores}")

        # Eseguiamo il preprocessing in parallelo
        recipe_instances = Parallel(n_jobs=num_cores)(
            delayed(process_recipe)(row) for row in tqdm(reader, desc="Processing recipes", unit="recipes")
        )

    return recipe_instances


# create text list for dataset
# https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions/data
#recipe_list = load_preprocess_raw_data("dataset/RAW_merged.csv")
recipe_list = load_preprocess_raw_data_parallel("dataset/RAW_merged_top_smallest.csv")
recipe_list = recipe_list[:100]

train_list, test_list = np.split(recipe_list, [int(.8 * len(recipe_list))])
print('\nNumber of train data: ', len(train_list))
print('Number of test data: ', len(test_list))
print(train_list[0])
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(model_name, bos_token='[BOS]', eos_token='[EOS]', pad_token='[PAD]')
# add special tokens for title, ingredients and instruction seperator
special_tokens_dict = {'additional_special_tokens': ['[INGREDIENTS]', '[TECHNIQUES]', '[STEPS]']}  # '[INGR]', '[STEP]'
# check the number of special tokens
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
# Verifica gli ID dei token speciali
special_tokens = ['[BOS]', '[EOS]', '[PAD]', '[INGREDIENTS]', '[TECHNIQUES]', '[STEPS]']  # '[INGR]'
for token in special_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"Token: {token}, ID: {token_id}")
# Esempio di testi
texts = [
    train_list[0]
]

# Tokenizzazione
encodings = tokenizer.batch_encode_plus(
    texts,
    truncation=True,
    max_length=320,
    padding='max_length',
    return_tensors='pt',
    add_special_tokens=True  # Assicurati che i token speciali siano inclusi
)

# Visualizza gli input_ids e attention_mask
print("Input IDs:")
print(encodings['input_ids'])
print("\nAttention Mask:")
print(encodings['attention_mask'])
# Decodifica dei token
decoded_inputs = [tokenizer.decode(ids, skip_special_tokens=False) for ids in encodings['input_ids']]
for i, decoded in enumerate(decoded_inputs):
    print(f"Decoded Input {i}: {decoded}")
lengths = [len(tokenizer.encode(recipe)) for recipe in recipe_list]
max_length_in_data = max(lengths)
avg_length_in_data = sum(lengths) / len(lengths)
print(f"Lunghezza massima: {max_length_in_data}, Lunghezza media: {avg_length_in_data}")


# Lunghezza massima: 312, Lunghezza media: 136.59567387687187
class GPT2Dataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length=320):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = [text.lower() for text in txt_list]  # Preprocess texts to lowercase

        # Tokenize all texts in a batch
        encodings = tokenizer.batch_encode_plus(
            self.texts,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors='pt',
            add_special_tokens=True
        )

        self.input_ids = encodings['input_ids']
        self.attn_masks = encodings['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


dataset = GPT2Dataset(train_list, tokenizer, max_length=320)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,  # The training samples.
    sampler=RandomSampler(train_dataset),  # Select batches randomly
    batch_size=batch_size  # Trains with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,  # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size  # Evaluate with this batch size.
)
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained(model_name, output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
else:
    device = torch.device("cpu")

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if device == "cuda":
    torch.cuda.manual_seed_all(seed_val)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate,
                              eps=epsilon
                              )
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print('Total number of steps: ', total_steps)

warmup_steps = int(0.05 * total_steps)  # 5% del totale degli step
# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)
training_stats = []
print("Currently using device type: ", device)

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    losses = []

    total_train_loss = 0

    model.train()

    loop = tqdm(train_dataloader, leave=True)
    for step, batch in enumerate(loop):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss
        losses.append(batch_loss)

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:
            print('Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.'.format(step, len(train_dataloader), batch_loss))

        loss.backward()

        optimizer.step()

        scheduler.step()

        if step % save_every == 0:
            model.save_pretrained(save_model)

        loop.set_postfix(loss=batch_loss)

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Calculate perplexity.
    losses = torch.tensor(losses)
    train_perplexity = math.exp(torch.mean(losses))

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Perplexity: {0:.2f}".format(train_perplexity))
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    model.eval()

    losses = []
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        losses.append(batch_loss)
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Calculate perplexity.
    losses = torch.tensor(losses)
    val_perplexity = math.exp(torch.mean(losses))

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation perplexity: {0:.2f}".format(val_perplexity))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Perplexity': train_perplexity,
            'Valid. Perplexity': val_perplexity,
        }
    )

print("")
print("Training complete!")
#model.save_pretrained(save_model)
# # prepare datasets for dev_list and test_list
# test_dataset = GPT2Dataset(test_list, tokenizer, max_length=320)
# # load the datasets
# test_dataloader = DataLoader(
#     test_dataset,  # The validation samples.
#     sampler=SequentialSampler(test_dataset),  # Pull out batches sequentially.
#     batch_size=batch_size  # Evaluate with this batch size.
# )
# def evaluate_model(model, dataloaded):
#     model = model.to(device)
#     model.eval()
#
#     losses = []
#     perplexity = []
#     total_eval_loss = 0
#
#     # Evaluate data for one epoch
#     for batch in dataloaded:
#         b_input_ids = batch[0].to(device)
#         b_labels = batch[0].to(device)
#         b_masks = batch[1].to(device)
#
#         with torch.no_grad():
#             outputs = model(b_input_ids,
#                             attention_mask=b_masks,
#                             labels=b_labels)
#
#             loss = outputs[0]
#
#         batch_loss = loss.item()
#         losses.append(batch_loss)
#         total_eval_loss += batch_loss
#
#     avg_val_loss = total_eval_loss / len(dataloaded)
#
#     # Calculate perplexity.
#     losses = torch.tensor(losses)
#     val_perplexity = math.exp(torch.mean(losses))
#     perplexity.append(val_perplexity)
#
#     print("  Validation Loss: {0:.2f}".format(avg_val_loss))
#     print("  Validation perplexity: {0:.2f}".format(val_perplexity))
#     return avg_val_loss, val_perplexity
# print('Testing...')
# test_loss, test_perplexity = evaluate_model(model, test_dataloader)
# test_eval_df = pd.DataFrame(columns=["test_loss", "test_perplexity"])
# test_eval_df['test_loss'] = test_loss
# test_eval_df['test_perplexity'] = test_perplexity
# test_eval_df.to_csv("test_eval.csv")
# this produces sample output every 100 steps
sample_every = 100
# save the model every 1000 step
save_every = 1000
# save the model to this file name
save_model = "models/topsmallest_epochs"

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize, pos_tag
import re

# Ensure NLTK downloads
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

stop_words = set(stopwords.words('english'))

def preprocess_ingredients(ingredients):
    ingredients_list = eval(ingredients)
    processed_ingredients = []
    regex = re.compile('[^a-zA-Z ]')
    lemmatizer = WordNetLemmatizer()
    #stemmer = PorterStemmer()
    
    # POS tags that represent nouns
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    
    # Define the words to be dropped
    #words_to_drop = {"powder", "brown", "salt", "water", "sugar", "onion", "butter", "pepper", "ground", "cream"} 

    for ingr in ingredients_list:
        ingr = regex.sub(' ', ingr.lower()).strip()
        components = [comp.strip() for comp in ingr.split('and')]
        

        for comp in components:
                        
            sentence = ""
            tokens = word_tokenize(comp)  # Tokenize each component
            tagged_tokens = pos_tag(tokens)  # Perform POS tagging
            
            # Extract main nouns while handling compound nouns
            nouns = []
            current_noun = ""
            for word, tag in tagged_tokens:
                word = lemmatizer.lemmatize(word.strip())
                if len(word) > 2 and word not in stop_words and tag in noun_tags: # and word not in words_to_drop
                    if current_noun:
                        nouns.append(current_noun)
                        current_noun = ""
                    current_noun = word
            
            # Add last current noun if exists
            if current_noun:
                nouns.append(current_noun)            
            
            for word in nouns:
                singular_comp = lemmatizer.lemmatize(word.strip())
                #stemmed_word = stemmer.stem(singular_comp)
            
                if singular_comp not in stop_words and len(singular_comp) > 2:
                    sentence += singular_comp + " "
                    
            if sentence.strip():
                processed_ingredients.append(sentence.strip())

    return list(set(processed_ingredients))

# Funzione di preprocessing per le tecniche
def preprocess_techniques(techniques):
    techniques_list = eval(techniques)
    processed_techniques = []

    for technique in techniques_list:
        technique = technique.lower().strip()
        tokens = word_tokenize(technique)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
        processed_techniques.append(' '.join(tokens))

    return processed_techniques

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [4]:
# Funzione per preprocessare ogni riga della ricetta
def process_recipe(row):
    ingredients = preprocess_ingredients(row['ingredients'])
    #ingredients = preprocess_techniques(row['ingredients'])
    techniques = preprocess_techniques(row['techniques_list'])
    instructions = row['steps'].lower().replace('\'', '').replace('[', '').replace(']', '').replace('"', '')
    
    ingredients_str = ''.join([f", {ingr}" for ingr in ingredients])
    ingredients_str = ingredients_str.replace(", ", "", 1)
    
    techniques_str = ''.join([f", {tec}" for tec in techniques])
    techniques_str = techniques_str.replace(", ", "", 1)

    recipe_instance = f"[BOS] [INGREDIENTS] {ingredients_str} [TECHNIQUES] {techniques_str} [STEPS] {instructions} [EOS]"
    return recipe_instance

In [5]:
from joblib import Parallel, delayed
import multiprocessing

# Funzione per preprocessare i dati in parallelo
def load_preprocess_raw_data_parallel(raw_data):
    with open(raw_data, 'r', encoding='utf-8') as f:
        reader = list(csv.DictReader(f))  # Convertiamo reader in una lista
        num_cores = multiprocessing.cpu_count()  # Otteniamo il numero di core della CPU
        print(f"Number of CPU cores: {num_cores}")
        
        # Eseguiamo il preprocessing in parallelo
        recipe_instances = Parallel(n_jobs=num_cores)(
            delayed(process_recipe)(row) for row in tqdm(reader, desc="Processing recipes", unit="recipes")
        )

    return recipe_instances

In [6]:
# create text list for dataset
# https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions/data
#recipe_list = load_preprocess_raw_data("dataset/RAW_merged.csv")
recipe_list = load_preprocess_raw_data_parallel("dataset/RAW_merged_top_smallest.csv")
recipe_list = recipe_list[:100]

train_list, test_list = np.split(recipe_list, [int(.8 * len(recipe_list))])
print('\nNumber of train data: ', len(train_list))
print('Number of test data: ', len(test_list))

Number of CPU cores: 8


Processing recipes: 100%|██████████| 6010/6010 [00:04<00:00, 1227.87recipes/s]



Number of train data:  80
Number of test data:  20


In [7]:
print(train_list[0])

[BOS] [INGREDIENTS] virgin oil, rom tomato, clove, juice, basil, salt [TECHNIQUES] boil, grill, marinate, simmer, toss [STEPS] mix all ingredients together and set on counter stirring occasionally, the salt will bring out the juice of the tomatoes, taste and add more seasonings to your personal taste, you can use fresh basil or oregano if preferred , just remember to chopped finely and use a little less than dried, uses: serve with almost any meal as a side, on top of a green salad, use as a topping for bruschetta--just chop tomatoes instead of slicing, use for caprese salad--add sliced buffalo mozzarella and youre done, have some leftover juice: place some sliced chicken breasts and marinate over night , then carefully remove the chicken and quickly saut them until done, add remaining juice and tomatoes , boil for 1-2 minutes and then add cooked penne pasta , toss , and simmer for 5-6 minutes, juice uses: you can reuse the marinade with new tomatoes, just add some more sliced tomatoes

In [8]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(model_name, bos_token='[BOS]', eos_token='[EOS]', pad_token='[PAD]')
# add special tokens for title, ingredients and instruction seperator
special_tokens_dict = {'additional_special_tokens': ['[INGREDIENTS]', '[TECHNIQUES]', '[STEPS]']} # '[INGR]', '[STEP]'
# check the number of special tokens
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

We have added 3 tokens


In [9]:
# Verifica gli ID dei token speciali
special_tokens = ['[BOS]', '[EOS]', '[PAD]', '[INGREDIENTS]', '[TECHNIQUES]', '[STEPS]'] # '[INGR]'
for token in special_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"Token: {token}, ID: {token_id}")

Token: [BOS], ID: 50257
Token: [EOS], ID: 50258
Token: [PAD], ID: 50259
Token: [INGREDIENTS], ID: 50260
Token: [TECHNIQUES], ID: 50261
Token: [STEPS], ID: 50262


In [10]:
# Esempio di testi
texts = [
    train_list[0]
]

# Tokenizzazione
encodings = tokenizer.batch_encode_plus(
    texts,
    truncation=True,
    max_length=320,
    padding='max_length',
    return_tensors='pt',
    add_special_tokens=True  # Assicurati che i token speciali siano inclusi
)

# Visualizza gli input_ids e attention_mask
print("Input IDs:")
print(encodings['input_ids'])
print("\nAttention Mask:")
print(encodings['attention_mask'])

Input IDs:
tensor([[50257,   220, 50260, 21772,  3056,    11,  9267, 24240,    11,   537,
           659,    11, 13135,    11, 37792,    11,  8268,   220, 50261, 20667,
            11, 29901,    11,  1667,  4559,    11, 32857,    11, 12153,   220,
         50262,  5022,   477,  9391,  1978,   290,   900,   319,  3753, 26547,
         10491,    11,   262,  8268,   481,  2222,   503,   262, 13135,   286,
           262, 23972,    11,  6938,   290,   751,   517,  1622,   654,   284,
           534,  2614,  6938,    11,   345,   460,   779,  4713, 37792,   393,
         23751,  1030,    78,   611,  9871,   837,   655,  3505,   284, 20720,
         32566,   290,   779,   257,  1310,  1342,   621, 16577,    11,  3544,
            25,  4691,   351,  2048,   597,  9799,   355,   257,  1735,    11,
           319,  1353,   286,   257,  4077, 20698,    11,   779,   355,   257,
         34366,   329,   865,   385,  2395, 25854,   438,  3137, 30506, 23972,
          2427,   286, 49289,    11,   77

In [11]:
# Decodifica dei token
decoded_inputs = [tokenizer.decode(ids, skip_special_tokens=False) for ids in encodings['input_ids']]
for i, decoded in enumerate(decoded_inputs):
    print(f"Decoded Input {i}: {decoded}")

Decoded Input 0: [BOS]  [INGREDIENTS]  virgin oil, rom tomato, clove, juice, basil, salt  [TECHNIQUES]  boil, grill, marinate, simmer, toss  [STEPS]  mix all ingredients together and set on counter stirring occasionally, the salt will bring out the juice of the tomatoes, taste and add more seasonings to your personal taste, you can use fresh basil or oregano if preferred, just remember to chopped finely and use a little less than dried, uses: serve with almost any meal as a side, on top of a green salad, use as a topping for bruschetta--just chop tomatoes instead of slicing, use for caprese salad--add sliced buffalo mozzarella and youre done, have some leftover juice: place some sliced chicken breasts and marinate over night, then carefully remove the chicken and quickly saut them until done, add remaining juice and tomatoes, boil for 1-2 minutes and then add cooked penne pasta, toss, and simmer for 5-6 minutes, juice uses: you can reuse the marinade with new tomatoes, just add some mo

In [12]:
lengths = [len(tokenizer.encode(recipe)) for recipe in recipe_list]
max_length_in_data = max(lengths)
avg_length_in_data = sum(lengths) / len(lengths)
print(f"Lunghezza massima: {max_length_in_data}, Lunghezza media: {avg_length_in_data}")

# Lunghezza massima: 312, Lunghezza media: 136.59567387687187

Lunghezza massima: 293, Lunghezza media: 137.77


In [13]:
class GPT2Dataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length=320):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = [text.lower() for text in txt_list]  # Preprocess texts to lowercase
        
        # Tokenize all texts in a batch
        encodings = tokenizer.batch_encode_plus(
            self.texts,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors='pt',
            add_special_tokens=True
        )
        
        self.input_ids = encodings['input_ids']
        self.attn_masks = encodings['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [14]:
dataset = GPT2Dataset(train_list, tokenizer, max_length=320)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

   64 training samples
   16 validation samples


In [15]:
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
    train_dataset,  # The training samples.
    sampler=RandomSampler(train_dataset),  # Select batches randomly
    batch_size=batch_size  # Trains with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,  # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size  # Evaluate with this batch size.
)

In [16]:
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained(model_name, output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
else:
    device = torch.device("cpu")

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if device == "cuda":
    torch.cuda.manual_seed_all(seed_val)

In [17]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate,
                              eps=epsilon
                              )
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print('Total number of steps: ', total_steps)

warmup_steps = int(0.05 * total_steps) # 5% del totale degli step
# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

Total number of steps:  16


In [18]:
training_stats = []
print("Currently using device type: ", device)

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    losses = []

    total_train_loss = 0

    model.train()

    loop = tqdm(train_dataloader, leave=True)
    for step, batch in enumerate(loop):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss
        losses.append(batch_loss)

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:
            print('Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.'.format(step, len(train_dataloader), batch_loss))

        loss.backward()

        optimizer.step()

        scheduler.step()

        if step % save_every == 0:
            model.save_pretrained(save_model)

        loop.set_postfix(loss=batch_loss)

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Calculate perplexity.
    losses = torch.tensor(losses)
    train_perplexity = math.exp(torch.mean(losses))

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Perplexity: {0:.2f}".format(train_perplexity))
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    model.eval()

    losses = []
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        losses.append(batch_loss)
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Calculate perplexity.
    losses = torch.tensor(losses)
    val_perplexity = math.exp(torch.mean(losses))

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation perplexity: {0:.2f}".format(val_perplexity))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Perplexity': train_perplexity,
            'Valid. Perplexity': val_perplexity,
        }
    )

print("")
print("Training complete!")

Currently using device type:  cpu

Training...


100%|██████████| 16/16 [00:49<00:00,  3.11s/it, loss=3.96]



  Average training loss: 11.65
  Perplexity: 115188.20

Running Validation...
  Validation Loss: 4.49
  Validation perplexity: 89.30

Training complete!


In [19]:
#model.save_pretrained(save_model)

In [20]:
# # prepare datasets for dev_list and test_list
# test_dataset = GPT2Dataset(test_list, tokenizer, max_length=320)
# # load the datasets
# test_dataloader = DataLoader(
#     test_dataset,  # The validation samples.
#     sampler=SequentialSampler(test_dataset),  # Pull out batches sequentially.
#     batch_size=batch_size  # Evaluate with this batch size.
# )

In [21]:
# def evaluate_model(model, dataloaded):
#     model = model.to(device)
#     model.eval()
# 
#     losses = []
#     perplexity = []
#     total_eval_loss = 0
# 
#     # Evaluate data for one epoch
#     for batch in dataloaded:
#         b_input_ids = batch[0].to(device)
#         b_labels = batch[0].to(device)
#         b_masks = batch[1].to(device)
# 
#         with torch.no_grad():
#             outputs = model(b_input_ids,
#                             attention_mask=b_masks,
#                             labels=b_labels)
# 
#             loss = outputs[0]
# 
#         batch_loss = loss.item()
#         losses.append(batch_loss)
#         total_eval_loss += batch_loss
# 
#     avg_val_loss = total_eval_loss / len(dataloaded)
# 
#     # Calculate perplexity.
#     losses = torch.tensor(losses)
#     val_perplexity = math.exp(torch.mean(losses))
#     perplexity.append(val_perplexity)
# 
#     print("  Validation Loss: {0:.2f}".format(avg_val_loss))
#     print("  Validation perplexity: {0:.2f}".format(val_perplexity))
#     return avg_val_loss, val_perplexity

In [22]:
# print('Testing...')
# test_loss, test_perplexity = evaluate_model(model, test_dataloader)
# test_eval_df = pd.DataFrame(columns=["test_loss", "test_perplexity"])
# test_eval_df['test_loss'] = test_loss
# test_eval_df['test_perplexity'] = test_perplexity
# test_eval_df.to_csv("test_eval.csv")