### Model: GPT-2 Medium — https://huggingface.co/docs/transformers/model_doc/gpt2

### Dataset: Cooking Recipes — https://huggingface.co/datasets/CodeKapital/CookingRecipes

### This code is adapted from https://tuanatran.medium.com/fine-tuning-large-language-model-with-hugging-face-pytorch-adce80dce2ad


In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from transformers import get_linear_schedule_with_warmup

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split, RandomSampler, SequentialSampler

import pandas as pd

# model_name: ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
model_name = "gpt2-medium"

# Run on the GPU whenever PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

  from .autonotebook import tqdm as notebook_tqdm


#### Generate a Hugging Face API token first: https://huggingface.co/docs/hub/security-tokens


In [None]:
import os

from huggingface_hub import login

# Never paste the token into the notebook: it would be saved with the file.
# Read it from the HF_TOKEN environment variable instead; when the variable is
# unset, `token` is None and `login` asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load the recipes CSV straight from the Hugging Face Hub dataset repository.
df_recipes = pd.read_csv(
    filepath_or_buffer="hf://datasets/CodeKapital/CookingRecipes/Data.csv"
)

In [None]:
# Preview the first rows to see which columns were loaded.
df_recipes.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [None]:
# Drop rows with any missing value, keep the first 20,000 remaining rows, and
# renumber last so the resulting index is contiguous (resetting before the drop
# would leave gaps). Building a new frame instead of mutating in place keeps
# the cell safe to re-run.
df_recipes = df_recipes.dropna().head(20000).reset_index(drop=True)


def form_string(name, ingredient, instruction):
    """Build one training example for a recipe.

    Args:
        name: Recipe title.
        ingredient: Ingredient list as stored in the dataset, i.e. a string
            holding a JSON array of strings.
        instruction: Cooking steps, stored the same way as ``ingredient``.

    Returns:
        A single string wrapped in ``<|startoftext|>`` / ``<|endoftext|>`` with
        the sections ``Question:``, ``Name:``, ``Ingredients:`` and
        ``Instructions:``.
    """
    import json  # local import so this cell does not depend on another cell

    def _flatten(raw):
        # Decode the JSON array so escape sequences (for example an escaped
        # degree sign) become real characters instead of leaking into the
        # training text as a literal backslash sequence.
        try:
            items = json.loads(raw)
        except ValueError:
            items = None
        if isinstance(items, list):
            return ", ".join(str(item) for item in items).replace('"', "")
        # Not a JSON array: fall back to plain bracket/quote stripping.
        return raw.strip("[]").replace('"', "")

    title = name.strip()
    ingredients_text = _flatten(ingredient)
    # NOTE(review): this removes every comma, including those inside a
    # sentence, not just the separators between steps. Kept unchanged so the
    # training text format stays the same; confirm whether that is intended.
    instructions_text = _flatten(instruction).replace(",", " ").strip()

    return (
        f"<|startoftext|>Question: How to cook {title}. "
        f"Name: {title}. "
        f"Ingredients: {ingredients_text}. "
        f"Instructions: {instructions_text}<|endoftext|>"
    )


# Turn every recipe row into one formatted training string.
data = [
    form_string(title, ingredients, directions)
    for title, ingredients, directions in zip(
        df_recipes["title"], df_recipes["ingredients"], df_recipes["directions"]
    )
]
data[0]

'<|startoftext|>Question: How to cook No-Bake Nut Cookies. Name: No-Bake Nut Cookies. Ingredients: 1 c. firmly packed brown sugar, 1/2 c. evaporated milk, 1/2 tsp. vanilla, 1/2 c. broken nuts (pecans), 2 Tbsp. butter or margarine, 3 1/2 c. bite size shredded rice biscuits. Instructions: In a heavy 2-quart saucepan  mix brown sugar  nuts  evaporated milk and butter or margarine.  Stir over medium heat until mixture bubbles all over top.  Boil and stir 5 minutes more. Take off heat.  Stir in vanilla and cereal; mix well.  Using 2 teaspoons  drop and shape into 30 clusters on wax paper.  Let stand until firm  about 30 minutes.<|endoftext|>'

In [None]:
# Special tokens used by the recipe prompt format.
special_tokens = {
    "bos_token": "<|startoftext|>",
    "eos_token": "<|endoftext|>",
    "unk_token": "<|unknown|>",
    "pad_token": "<|pad|>",
}
tokenizer = GPT2TokenizerFast.from_pretrained(model_name, **special_tokens)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]



In [None]:
batch_size = 4
max_length = 512


# standard PyTorch approach of loading data in using a Dataset class.
class RecipeDataset(Dataset):
    """Recipe strings tokenized once, up front, to a fixed length.

    Args:
        data: Sequence of already formatted recipe strings.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_length)``.
        max_length: Length every example is truncated/padded to. The default
            is captured when the class is defined, so reassigning the global
            ``max_length`` later does not change how new instances tokenize.

    Each item is the tuple ``(input_ids, attention_mask)`` for one recipe.
    """

    def __init__(self, data, tokenizer, max_length=max_length):
        self.data = data
        self.input_ids = []
        self.attn_masks = []

        for recipe in self.data:
            # Calling the tokenizer directly replaces the legacy encode_plus.
            encodings = tokenizer(
                recipe,
                truncation=True,
                padding="max_length",
                max_length=max_length,
                # return a PyTorch tensor
                return_tensors="pt",
            )
            # Drop the leading batch dimension of size 1.
            self.input_ids.append(torch.squeeze(encodings["input_ids"], 0))
            self.attn_masks.append(torch.squeeze(encodings["attention_mask"], 0))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


# Tokenize every recipe, then show the first encoded example as a sanity check.
dataset = RecipeDataset(data, tokenizer)
first_input_ids, first_attn_mask = dataset[0]
print(f"input_ids: {first_input_ids} attn_masks: {first_attn_mask}")

input_ids: tensor([50257, 24361,    25,  1374,   284,  4255,  1400,    12,    33,   539,
        11959, 45305,    13,  6530,    25,  1400,    12,    33,   539, 11959,
        45305,    13, 33474,    25,   352,   269,    13, 14245, 11856,  7586,
         7543,    11,   352,    14,    17,   269,    13, 28959,   515,  7545,
           11,   352,    14,    17, 23053,    13, 16858,    11,   352,    14,
           17,   269,    13,  5445, 14380,   357, 43106,   504,   828,   362,
          309, 24145,    13,  9215,   393,  6145, 34569,    11,   513,   352,
           14,    17,   269,    13, 13197,  2546, 37624, 11464, 50128,    13,
        27759,    25,   554,   257,  4334,   362,    12, 36008, 10746,  6839,
          220,  5022,  7586,  7543,   220, 14380,   220, 28959,   515,  7545,
          290,  9215,   393,  6145, 34569,    13,   220, 33689,   625,  7090,
         4894,  1566, 11710, 25037,   477,   625,  1353,    13,   220,  3248,
          346,   290, 11240,   642,  2431,   517,    

In [None]:
# shuffle=True makes the DataLoader draw batches through a RandomSampler.
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Training hyperparameters.
epochs = 2
learning_rate = 2e-5
warmup_steps = 1e2
epsilon = 1e-8

# Load the pretrained weights, move them to the target device, and grow the
# embedding matrix to cover the special tokens added to the tokenizer.
configuration = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration).to(device)
model.resize_token_embeddings(len(tokenizer))

optim = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# One scheduler step per batch, over all epochs.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# NOTE: reassigning batch_size here does not affect the DataLoader built earlier.
batch_size = 2
max_length = 256


def generate_response(prompt, max_new_tokens=None):
    """Sample a recipe-style completion for ``prompt``.

    Args:
        prompt: Free-form user question; surrounding whitespace is stripped.
        max_new_tokens: Upper bound on generated tokens. When omitted, the
            current value of the global ``max_length`` is used.

    Returns:
        The decoded text (prompt plus completion) with special tokens removed.
    """
    if max_new_tokens is None:
        max_new_tokens = max_length

    # `encoded` rather than `input`, so the builtin is not shadowed.
    encoded = tokenizer(
        f"<|startoftext|>Question: {prompt.strip()}", return_tensors="pt"
    )

    generated = model.generate(
        encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.85,
        # Same value generate() falls back to on its own; passing it explicitly
        # silences the "Setting `pad_token_id` to `eos_token_id`" warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
for epoch_i in range(0, epochs):
    total_train_loss = 0.0
    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Labels are the inputs themselves (the model shifts them internally),
        # except that padding positions are set to -100 so the loss ignores
        # them; otherwise most of the loss is spent on predicting <|pad|>.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()
        outputs = model(
            input_ids=b_input_ids,
            labels=b_labels,
            attention_mask=b_masks,
            token_type_ids=None,
        )

        loss = outputs[0]
        total_train_loss += loss.item()

        loss.backward()
        optim.step()
        scheduler.step()

        # Periodic progress report, done after the optimizer step so that
        # sampling does not run while the autograd graph is still held.
        if step % 100 == 0 and not step == 0:
            model.eval()
            print(
                f"Step: {step}, Loss {loss.item()}, total_len {len(train_dataloader)}"
            )
            print(generate_response("how to make pizza?"))
            model.train()

    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch_i + 1}/{epochs}, average training loss {avg_train_loss:.4f}")

In [None]:
# Sample one completion for a free-form prompt; the returned text is the cell output.
generate_response("I want to cook taco")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Question: I want to cook taco dip. Name: Taco Dip. Ingredients: 1 can mushrooms, 1 can tomato sauce, 1 can corn, drained, 1 can black beans, 1 can sour cream. Instructions: In casserole dish  combine ingredients and put in greased skillet.  Bake at 350\\u00b0 for 1 1/2 hours.  Add salsa if needed.  Serves 4.'

In [None]:
# Save the tokenizer alongside the weights: it carries the special tokens that
# were added before training, and the resized embedding matrix only makes sense
# with that exact vocabulary.
tokenizer.save_pretrained("model")
model.save_pretrained("model")

# Use the fine-tuned model


In [None]:
from zipfile import ZipFile

# Change the path to the model.zip from Google Drive.
# The context manager closes the archive even if extraction fails.
with ZipFile("drive/MyDrive/LLM project/model.zip", "r") as zf:
    zf.extractall("model")

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split, RandomSampler, SequentialSampler
import pandas as pd

# Upper bound on the number of tokens generated per prompt.
max_length = 512

# Run on the GPU whenever PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


def infer(prompt, max_new_tokens=None):
    """Sample a recipe-style completion for ``prompt`` from the loaded model.

    Args:
        prompt: Free-form user question; surrounding whitespace is stripped.
        max_new_tokens: Upper bound on generated tokens. When omitted, the
            current value of the global ``max_length`` is used.

    Returns:
        The decoded text (prompt plus completion) with special tokens removed.
    """
    if max_new_tokens is None:
        max_new_tokens = max_length

    # `encoded` rather than `input`, so the builtin is not shadowed.
    encoded = tokenizer(
        f"<|startoftext|>Question: {prompt.strip()}", return_tensors="pt"
    )

    generated = model.generate(
        encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        temperature=0.7,
        # Same value generate() falls back to on its own; passing it explicitly
        # silences the "Setting `pad_token_id` to `eos_token_id`" warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
model = GPT2LMHeadModel.from_pretrained("model")
# Rebuild the tokenizer with the same special tokens that were used for
# fine-tuning. The stock "gpt2-medium" vocabulary lacks them, so the
# "<|startoftext|>" prefix would be split into ordinary sub-word pieces and
# would not match what the model saw during training.
tokenizer = GPT2TokenizerFast.from_pretrained(
    "gpt2-medium",
    bos_token="<|startoftext|>",
    eos_token="<|endoftext|>",
    unk_token="<|unknown|>",
    pad_token="<|pad|>",
)
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50260, bias=False)
)

In [None]:
# Use `infer`, the generation helper defined in this section, so the cell also
# works on a fresh kernel where the training cells were never run.
output = infer("How to cook cake")  # Add your prompt

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Parse the generated output into a more structured format


In [None]:
import re

SECTION_LABELS = ("Question", "Name", "Ingredients", "Instructions")


def _decode_unicode_escapes(text):
    r"""Replace literal ``\uXXXX`` sequences with the characters they encode.

    Only those sequences are rewritten; everything else, including characters
    that are already non-ASCII, is left untouched.
    """
    return re.sub(
        r"\\u([0-9a-fA-F]{4})", lambda match: chr(int(match.group(1), 16)), text
    )


def parse_recipe(text):
    """Split generated text into its labelled sections.

    Returns a dict keyed by the entries of ``SECTION_LABELS``. A section that
    does not appear in ``text`` maps to an empty string, so a malformed
    generation no longer raises ``IndexError``.
    """
    sections = dict.fromkeys(SECTION_LABELS, "")
    pieces = re.split(
        r"(Question|Name|Ingredients|Instructions):",
        text.replace("<|startoftext|>", ""),
    )
    # With a capturing group, re.split returns the text before the first label
    # followed by alternating (label, body) entries.
    seen = set()
    for label, body in zip(pieces[1::2], pieces[2::2]):
        if label in seen:
            continue  # keep the first occurrence of each section
        seen.add(label)
        sections[label] = _decode_unicode_escapes(body).strip()
    return sections


recipe = parse_recipe(output)
question = recipe["Question"]
name = recipe["Name"]
ingredients = recipe["Ingredients"]
instructions = recipe["Instructions"]

print("Question:", question)
print("Name:", name)
print("Ingredients:", ingredients)
print("Instructions:", instructions)

Question: How to cook cake.
Name: Cake.
Ingredients: 3/4 c. margarine, 1/2 c. sugar, 3 eggs, 3 c. flour, 1/2 tsp. baking soda, 1 tsp. salt, 1 c. chopped nuts, 1 c. butter.
Instructions: Cream margarine and sugar together. Add eggs and beat well.  Add flour  baking soda and salt and mix well.  Add nuts and butter and mix well.  Gradually add egg mixture until well blended.  Drop batter by teaspoon onto ungreased cake pan.  Bake at 350° for 45 to 60 minutes.
