In [None]:
import torch
import torch.nn as nn
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

from transformers import Trainer, TrainingArguments

from tqdm.auto import tqdm

import pandas as pd
import numpy as np

In [None]:
# Load the recipe CSV and keep only its first 1000 rows.
clean = pd.read_csv('/content/Cleaned_Indian_Food_Dataset.csv').head(1000)

In [None]:
# Checkpoint name handed to `from_pretrained` for both the tokenizer and the model.
model_name = 'gpt2'

In [None]:
# Directory that receives the tokenizer files and is used as the training output_dir.
model_save_path = './100GPT'

In [None]:
# Load the GPT-2 fast tokenizer and register custom BOS/EOS/UNK/PAD special tokens.
tokenizer = GPT2TokenizerFast.from_pretrained(model_name,
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              unk_token='<|unknown|>',
                                              pad_token='<|pad|>'
                                             )
# Load the pretrained LM-head model, then resize its token embeddings to the
# tokenizer's length so the newly registered special tokens have embedding rows.
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

In [None]:
# Fraction of rows kept by the slicing cell below (reassigned to 0.85 later in the notebook).
train_size = 0.7

In [None]:
# Token budget shared by generate() (generation cap) and RecipeDataset (truncation/padding length).
max_length = 512

In [None]:
# Write the tokenizer files (vocabulary, merges, special/added tokens) into model_save_path.
tokenizer.save_pretrained(model_save_path)

('./100GPT/tokenizer_config.json',
 './100GPT/special_tokens_map.json',
 './100GPT/vocab.json',
 './100GPT/merges.txt',
 './100GPT/added_tokens.json',
 './100GPT/tokenizer.json')

In [None]:
# Look up the vocabulary id assigned to the padding token.
tokenizer.convert_tokens_to_ids(['<|pad|>'])

[50259]

In [None]:
def generate(prompt):
    """Sample a continuation of `prompt` from the global `model` and print it.

    Args:
        prompt: Text to condition generation on.

    The prompt is encoded with the global `tokenizer`, generation samples
    (``do_sample=True``) with the global `max_length` as its length cap, and
    the decoded sequence (special tokens included) is printed. Nothing is
    returned.
    """
    # Encode, then move the tensors to wherever the model's weights live;
    # otherwise generate() fails with a device mismatch once the model sits on a GPU.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    output = model.generate(**inputs,
                            max_length=max_length,
                            do_sample=True,
                            # A dedicated '<|pad|>' token is registered on the tokenizer, so pad with it.
                            pad_token_id=tokenizer.pad_token_id)
    print(tokenizer.decode(output[0]))

In [None]:
# Shuffle every row (frac=1 samples the whole frame) and renumber the index from 0.
clean = clean.sample(frac=1).reset_index(drop=True)


In [None]:
# Keep only the leading `train_size` fraction of the rows.
# NOTE(review): this rebinds `clean` to the truncated frame, so re-running the cell shrinks it again.
train_len = int(train_size * len(clean))
clean = clean[:train_len]

In [None]:
# Display the special tokens currently registered on the tokenizer.
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|unknown|>',
 'pad_token': '<|pad|>'}

In [None]:
# Look up the vocabulary id of the beginning-of-text token.
tokenizer.convert_tokens_to_ids(['<|startoftext|>'])

[50257]

In [None]:
# Reload the first 1000 recipes, shuffle them and renumber the index from 0.
# A fixed random_state makes the shuffle, and therefore the positional
# train/validation split made from it later, reproducible across runs.
clean = (
    pd.read_csv('/content/Cleaned_Indian_Food_Dataset.csv')
    .head(1000)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
def print_recipe(idx):
    """Print the cleaned ingredients and the instructions of one recipe.

    Args:
        idx: Index label of the row to show in the global `clean` DataFrame.
    """
    # The frame has no 'ingredients' / 'instructions' columns (those lookups
    # raise KeyError); read the columns the dataset actually provides.
    ingredients = clean.loc[idx, 'Cleaned-Ingredients']
    instructions = clean.loc[idx, 'TranslatedInstructions']
    print(f"{ingredients}\n\n{instructions}")

In [None]:
def form_string(ingredient, instruction, bos_token='<|startoftext|>', eos_token='<|endoftext|>'):
    """Format one recipe as a single training string.

    Args:
        ingredient: Ingredient text; surrounding whitespace is stripped.
        instruction: Instruction text; surrounding whitespace is stripped.
        bos_token: Marker placed before the recipe. Pass '' to omit it.
        eos_token: Marker placed after the recipe. Pass '' to omit it.

    Returns:
        The string ``<bos>Ingredients:\\n...\\n\\nInstructions:\\n...<eos>``.
    """
    # Wrap each example in the BOS/EOS markers registered on the tokenizer:
    # generation prompts start with '<|startoftext|>', and without a trailing
    # end marker in the training text the model sees no signal for where a
    # recipe ends.
    body = f"Ingredients:\n{ingredient.strip()}\n\nInstructions:\n{instruction.strip()}"
    return f"{bos_token}{body}{eos_token}"

In [None]:
# Show the column names available in the loaded recipe frame.
print(clean.columns)


Index(['TranslatedRecipeName', 'TranslatedIngredients', 'TotalTimeInMins',
       'Cuisine', 'TranslatedInstructions', 'URL', 'Cleaned-Ingredients',
       'image-url', 'Ingredient-count'],
      dtype='object')


In [None]:
# Build one formatted training string per recipe from its ingredient and instruction columns.
data = [
    form_string(ingredient, instruction)
    for ingredient, instruction in zip(clean['Cleaned-Ingredients'], clean['TranslatedInstructions'])
]


In [None]:
# Positional split of the recipe strings: the first 85% train, the remainder validate.
train_size = 0.85
train_len = int(len(data) * train_size)
train_data, val_data = data[:train_len], data[train_len:]

In [None]:
class RecipeDataset:
    """Map-style dataset of pre-tokenized recipe strings.

    Every text is encoded up front with the global `tokenizer`, truncated and
    padded to the global `max_length`. Indexing yields an
    ``(input_ids, attention_mask)`` pair of tensors.
    """

    def __init__(self, data):
        """Tokenize each string in `data`, showing a tqdm progress bar."""
        self.data = data
        self.input_ids = []
        self.attn_masks = []

        for text in tqdm(data):
            encoded = tokenizer.encode_plus(
                text,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt',
            )
            # Drop the size-1 leading axis so each stored sample is a flat sequence.
            self.input_ids.append(encoded['input_ids'].squeeze(0))
            self.attn_masks.append(encoded['attention_mask'].squeeze(0))

    def __len__(self):
        """Return the number of source texts."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored for `idx`."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask

In [None]:
# NOTE(review): this tokenizes the whole `data` list (training and validation rows together);
# `train_ds` is rebuilt from `train_data` in a later cell, so this result gets overwritten.
train_ds = RecipeDataset(data)


  0%|          | 0/700 [00:00<?, ?it/s]

In [None]:
# Per-device batch size used for both training and evaluation in TrainingArguments.
batch_size = 2

In [None]:
def collate_fn(batch):
    """Stack ``(input_ids, attention_mask)`` samples into a causal-LM batch.

    Args:
        batch: Sequence of ``(input_ids, attention_mask)`` tensor pairs, all
            of equal length.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' tensors. Labels
        copy the input ids, except that positions whose attention mask is 0
        (padding) are set to -100 so the language-modeling loss skips them
        instead of training the model to predict pad tokens.
    """
    input_ids = torch.stack([sample[0] for sample in batch])
    attention_mask = torch.stack([sample[1] for sample in batch])
    # masked_fill returns a new tensor, so input_ids keeps its original pad ids.
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }


In [None]:
# Tokenize the training and validation strings into separate datasets.
train_ds = RecipeDataset(train_data)
val_ds = RecipeDataset(val_data)

  0%|          | 0/5047 [00:00<?, ?it/s]

  0%|          | 0/891 [00:00<?, ?it/s]

In [None]:
pip install accelerate>=0.21.0

In [None]:
pip install transformers[torch]



In [None]:
pip install --upgrade pip

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0


In [None]:
pip install accelerate>=0.21.0


[0m

In [None]:
pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.39.1
[0m

In [None]:
# Training configuration: one epoch, `batch_size` samples per device with
# gradients accumulated over 2 steps, no logging integrations and no
# intermediate checkpoints.
# NOTE(review): use_ipex=True presumably requires Intel Extension for PyTorch
# to be installed in this environment — no install of it is visible in this
# notebook; confirm it is intended.
args = TrainingArguments(output_dir=model_save_path,
                         per_device_train_batch_size=batch_size,
                         per_device_eval_batch_size=batch_size,
                         gradient_accumulation_steps=2,
                         report_to='none',
                         num_train_epochs=1,
                         save_strategy='no',
                         use_ipex=True
                        )

In [None]:
# AdamW over all model parameters, paired with a cosine warm-restart schedule
# (first cycle of 20 scheduler steps, learning-rate floor of 1e-7).
optim = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optim,
    T_0=20,
    eta_min=1e-7,
)

In [None]:
# Assemble the Trainer with the custom collator and the hand-built
# optimizer/scheduler pair.
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,  # held-out split, so evaluation is not measured on training rows
                  data_collator=collate_fn,
                  optimizers=(optim, scheduler)
                 )

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=175, training_loss=1.3976852852957589, metrics={'train_runtime': 10661.6491, 'train_samples_per_second': 0.066, 'train_steps_per_second': 0.016, 'total_flos': 182904422400000.0, 'train_loss': 1.3976852852957589, 'epoch': 1.0})

In [None]:
# Save the fine-tuned model; no path is given, so presumably it goes to the
# output_dir configured in TrainingArguments — TODO confirm.
trainer.save_model()

In [None]:
# NOTE(review): "/path/to/save/model/" looks like a placeholder, and this rebinds the
# shared `model_save_path` setting used earlier — confirm the intended destination.
model_save_path = "/path/to/save/model/"
trainer.save_model(model_save_path )

In [None]:


# Save another copy of the model under a Google Drive path.
# NOTE(review): presumably Drive must be mounted first — no mount call is visible in this notebook.
trainer.save_model("/content/drive/My Drive/path/to/save")

In [None]:
from transformers import pipeline

In [None]:
# Build a text-generation pipeline from the files saved under /content/100GPT.
pl = pipeline(task='text-generation',model='/content/100GPT')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def create_prompt(ingredients):
    """Turn a comma-separated ingredient string into a generation prompt.

    Each comma-separated entry is stripped and lower-cased, then the entries
    are listed one per line under an ``Ingredients:`` header that follows the
    ``<|startoftext|>`` marker.
    """
    entries = (part.strip().lower() for part in ingredients.split(','))
    listing = '\n'.join(entries)
    return f"<|startoftext|>Ingredients:\n{listing}\n"

In [None]:
# Comma-separated ingredient lists; each one becomes a separate generation prompt.
ingredients = ['Rice,Potatoes,Tomatoes,Spinach,red bell peppers','chicken,tomatoes,aloo,jeera,curry powder']

In [None]:
# Generate one recipe per ingredient list and print the generated text.
for ing in ingredients:
    prompt = create_prompt(ing)
    result = pl(prompt,
                max_new_tokens=512,
                penalty_alpha=0.6,
                top_k=4,
                # Read the pad id from the pipeline's own tokenizer instead of
                # hard-coding 50259, so it stays right if the vocabulary changes.
                pad_token_id=pl.tokenizer.pad_token_id
               )
    print(result[0]['generated_text'])

<|startoftext|>Ingredients:
rice
potatoes
tomatoes
spinach
red bell peppers
coriander (dhania) leaves,coriander powder
water,salt,cumin seeds (jeera),green chillies,onion,green bell peppers

Instructions:
To begin making Spinach & Onion Pesto Recipe, first prep all the ingredients.Heat a pan, add onion and saute for a few seconds till it turns golden brown and turns golden brown.
Once the onions are golden brown add spinach, green chilies, red bell peppers and cook for a few seconds and then turn off the heat.
Add the cooked spinach to the spinach and mix well.
Add water if required and cook until the spinach is cooked through.
Turn off the heat and serve.Serve Spinach & Onion Pesto Recipe along with Vegetable Feta, Vegetable Pudding and a Coffee Coffee to help my my personal list my my personal list my personal my personal list my personal list my personal list my personal list my personal list my personal list my personal list my personal list my personal list my personal list my fam