In [49]:
import numpy as np
import pandas as pd
from sklearn import metrics
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

In [50]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [51]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

In [52]:
df = pd.read_csv("dataset/RAW_recipes.csv")

new_df = df[['ingredients', 'steps']].copy()
sample_size = int(len(df) * 0.01)
new_df = new_df.sample(n=sample_size).reset_index(drop=True)
# Preprocess the ingredients and steps columns
separator = ", "
new_df['ingredients'] = new_df['ingredients'].apply(lambda x: separator.join(eval(x)))
new_df['steps'] = new_df['steps'].apply(lambda x: separator.join(eval(x)))

print(new_df.shape)
print(new_df.head())

(2316, 2)
                                         ingredients  \
0  all-purpose flour, boneless beef chuck steaks,...   
1  sun-dried tomato packed in oil, asparagus, shr...   
2  link sausage, potatoes, onion, water, salt, pe...   
3  fresh strawberries, granulated sugar, cottage ...   
4  dry breadcrumbs, semisweet chocolate, sugar, m...   

                                               steps  
0  cut rib steaks or roast into 3 / 8" cubes, pla...  
1  heat oil reserved from tomatoes in heavy large...  
2  use raw link sausage- the kind that is in a ca...  
3  slice strawberries and add granulated sugar, i...  
4  combine breadcrumbs , chocolate chunks , sugar...  


In [53]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.ingredients = dataframe.ingredients
        self.steps = dataframe.steps
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        ingredients = str(self.ingredients[index])
        steps = str(self.steps[index])
        
        # Tokenize ingredients
        inputs = self.tokenizer.encode_plus(
            ingredients,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        
        # Tokenize steps
        outputs = self.tokenizer.encode_plus(
            steps,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'labels': torch.tensor(inputs['input_ids'], dtype=torch.long),
        }

In [55]:
from torch.utils.data import random_split

# Creating the dataset and dataloader for the neural network
train_dataset, val_dataset, test_dataset = random_split(new_df, [0.8, 0.1, 0.1])

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

AttributeError: 'Subset' object has no attribute 'ingredients'

In [None]:
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False)

batch = next(iter(train_loader))
print(batch["input_ids"].size())

In [None]:
def train(model, epoch, optimizer, device, dataloader, val_loader):
    for i in range(1, epoch + 1):
        model.train()
        acc_loss = 0
        for batch in tqdm(dataloader, desc='Training Epoch ' + str(i)):
            input_ids = batch['input_ids'].to(device)
            input_ids = torch.squeeze(input_ids)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            acc_loss += float(loss) * BATCH_SIZE

        print("EPOCH training loss: " + str(acc_loss / len(dataloader.dataset)))

        # Evaluate the model on the validation set
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                input_ids = torch.squeeze(input_ids)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

        print(f'Epoch {i+1}/{epoch}, Train Loss: {str(acc_loss / len(dataloader.dataset))} Val Loss: {val_loss/len(val_loader)}')

In [None]:
from transformers import BertForMaskedLM

LEARNING_RATE = 0.00002
EPOCH = 5

model = BertForMaskedLM.from_pretrained('bert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [None]:
train(model, EPOCH, optimizer, device, train_loader, val_loader)

In [None]:
model.save_pretrained("fine-tuned-bert")

In [None]:
model.eval()
input_text = "artichoke"
num_ingredients_to_generate = 4
input_ids = dataset.tokenizer.encode(input_text, add_special_tokens=True, return_tensors='pt')
print(input_ids)

In [None]:
model.to(device)
for i in range(num_ingredients_to_generate):
    print("Input: ", input_text)
    input_ids = new_df.tokenizer.encode(input_text + " [MASK]", add_special_tokens=True, return_tensors='pt').to(device)
    attention_mask = input_ids.ne(new_df.tokenizer.pad_token_id).long().to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    generated_text = new_df.tokenizer.decode(torch.argmax(logits, dim=-1).squeeze(), skip_special_tokens=True)
    print(generated_text)
    input_text = generated_text