# GPT-2 Fine-Tuning with German Recipes

## Import Libraries

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Defaulting to user installation because normal site-packages is not writeable


DEPRECATION: Loading egg at c:\programdata\anaconda3\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..


In [1]:
import json
import re

import numpy as np
import torch
import transformers
from sklearn.model_selection import train_test_split

## Read Dataset

In [2]:
# Folder that contains recipes.json.
data_path = 'folder/'

# JSON is decoded as UTF-8 explicitly so that non-ASCII characters in the
# recipes are read the same way regardless of the platform's locale encoding.
with open(data_path + 'recipes.json', encoding='utf-8') as f:
    data = json.load(f)

def build_text_files(data_json, dest_path):
    """Write the 'Instructions' text of every recipe into one plain-text file.

    Each recipe's instructions are converted to ``str``, stripped, and every
    whitespace character (newline, tab, ...) is replaced by a single space.
    The recipes are written one after another, each followed by two spaces.

    Args:
        data_json: Iterable of recipe mappings; each must provide an
            'Instructions' entry.
        dest_path: Path of the text file to create (overwritten if it exists).

    Raises:
        KeyError: If a recipe has no 'Instructions' entry.
    """
    summaries = (
        re.sub(r"\s", " ", str(recipe['Instructions']).strip())
        for recipe in data_json
    )
    # The context manager flushes and closes the file even if a recipe is
    # malformed. UTF-8 is explicit so non-ASCII characters (e.g. German umlauts)
    # are written identically on every platform instead of depending on the
    # locale's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as f:
        # A single join avoids repeatedly copying a growing string.
        f.write("".join(summary + "  " for summary in summaries))

In [3]:
type(data) 
# list of dict {'Url', 'Instructions', 'Ingredients', 'Day', 'Name', 'Year', 'Month', 'Weekday'}

list

In [4]:
print(data[0].keys())

dict_keys(['Url', 'Instructions', 'Ingredients', 'Day', 'Name', 'Year', 'Month', 'Weekday'])


In [5]:
print(data[0])

{'Url': 'https://www.chefkoch.de/rezepte/185441079701305/', 'Instructions': 'Die Eier hart kochen. Dann pellen und mit einem Eierschneider in Scheiben schneiden. Den Reis halbgar kochen und zur Seite stellen. Die Wurst (Kolbász) in dünne Scheiben schneiden.Den Knoblauch abziehen und fein würfeln. Die Zwiebel schälen, fein hacken und in etwas Fett glasig braten. Knoblauch und Hackfleisch dazu geben und so lange braten, bis das Hackfleisch schön krümelig wird. Den eigenen Saft nicht ganz verkochen lassen. Die Fleischmasse mit Salz, Pfeffer und Paprikapulver würzen.Das Sauerkraut kurz durchspülen, ausdrücken und abtropfen lassen (damit es nicht zu sauer wird). Das Sauerkraut in einen Topf geben und mit dem Kümmel und den Lorbeerblättern vermischen. Ca. 30 Minuten unter Zugabe von wenig Wasser bei niedriger Stufe dünsten.Eine feuerfeste Form mit etwas Öl einfetten und den Boden dünn mit Sauerkraut belegen. Darauf Kolbász und die Hälfte der in Scheiben geschnittene Eier verteilen, dann eine

In [6]:
# Subset of the corpus: only the first 100 recipes are split into train/test below.
train_data =  data[0:100]

In [7]:
type(train_data)

list

### Split Dataset

In [8]:
# Hold out 15% of the recipes for evaluation. The fixed random_state makes the
# split (and therefore both text files) identical on every Restart & Run All.
train, test = train_test_split(train_data, test_size=0.15, random_state=42)

# Persist each split as one plain-text file for the dataset loader.
build_text_files(train, 'train_dataset.txt')
build_text_files(test, 'test_dataset.txt')

print(f"Train dataset length: {len(train)}")
print(f"Test dataset length: {len(test)}")

Train dataset length: 85
Test dataset length: 15


## Tokenize Input Text

In [9]:
from transformers import AutoTokenizer

# Plain-text files written by build_text_files() in the split step.
train_path, test_path = "train_dataset.txt", "test_dataset.txt"

# Tokenizer of the German GPT-2 checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained(
    "anonymous-german-nlp/german-gpt2"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Load Dataset

In [10]:
from transformers import TextDataset,DataCollatorForLanguageModeling

# Once created, a TextDataset object can be used as input to a Trainer object for training a language model
# TextDataset {Tokenization, Batching, Block Size}
def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test ``TextDataset`` objects and the LM data collator.

    Args:
        train_path: Path of the plain-text training file.
        test_path: Path of the plain-text evaluation file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` passed to both ``TextDataset`` objects.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    # NOTE(review): TextDataset is flagged as deprecated in recent transformers
    # releases; consider migrating to the `datasets` library.
    # Both splits are built the same way; only the input file differs.
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_size)
        for path in (train_path, test_path)
    )

    # The collator batches sequences of token ids; mlm=False disables
    # masked-language-modeling, which is what a causal LM such as GPT-2 needs.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [11]:
train_dataset

<transformers.data.datasets.language_modeling.TextDataset at 0x19fa34a0d90>

## Train Model
### Initialize Trainer with TrainingArguments with German-GPT2 model

In [12]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# AutoModelForCausalLM is the auto class for GPT-2 style (causal) language
# models; it replaces AutoModelWithLMHead, which transformers has deprecated.
model = AutoModelForCausalLM.from_pretrained("anonymous-german-nlp/german-gpt2")

# training configuration
training_args = TrainingArguments(
    output_dir="folder",  # directory where checkpoints and logs are saved
    overwrite_output_dir=True,  # overwrite the output directory if it already exists
    num_train_epochs=2,  # number of training epochs
    per_device_train_batch_size=32,  # each training batch contains 32 sequences
    per_device_eval_batch_size=64,  # each evaluation batch contains 64 sequences
    # NOTE(review): no evaluation strategy is set in this call, so eval_steps
    # presumably has no effect - confirm.
    eval_steps=400,  # number of update steps between two evaluations
    save_steps=800,  # a checkpoint is saved every 800 update steps
    # NOTE(review): the recorded run below finishes after 8 update steps, so a
    # 500-step warmup is never completed on this subset - confirm the value
    # once the full dataset is used.
    warmup_steps=500,  # number of warmup steps for the learning-rate scheduler
    # NOTE(review): the recorded evaluation output contains only the loss, so
    # with prediction_loss_only=True the compute_metrics function passed to the
    # Trainer appears not to be called - confirm before relying on accuracy.
    prediction_loss_only=True,
    report_to="tensorboard",
)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [13]:
def compute_accuracy(eval_pred):
    """Compute token-level accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair ``(logits, labels)``. Each element may be a NumPy array
            or anything ``np.asarray`` accepts. ``logits`` carries the class /
            vocabulary scores on its last axis. If ``logits`` is a tuple, its
            first element is used.

    Returns:
        ``{"accuracy": float}``: the fraction of counted positions whose
        arg-max prediction equals the label; ``0.0`` if nothing is counted.

    Notes:
        * For sequence inputs (predictions with two or more axes) the score at
          position ``i`` is compared with the label at position ``i + 1``,
          because a causal language model predicts the *next* token.
        * Positions whose label is ``-100`` are excluded from the count.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # assumes the first element holds the LM logits when a model returns
        # several outputs - TODO confirm for other model types
        logits = logits[0]

    # NumPy is used instead of tensor-only methods (.numel(), .item()), so the
    # function works with the NumPy arrays an evaluation loop hands over.
    predictions = np.asarray(logits).argmax(axis=-1)
    labels = np.asarray(labels)

    if predictions.ndim >= 2:
        # Align each prediction with the token it is supposed to predict.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]

    counted = labels != -100
    total_predictions = int(counted.sum())
    if total_predictions == 0:
        return {"accuracy": 0.0}

    correct_predictions = int(((predictions == labels) & counted).sum())
    return {"accuracy": correct_predictions / total_predictions}

In [14]:
# Bundle the model, its training arguments, both datasets, the batch collator
# and the metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_accuracy,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [15]:
import os
# Sets the WANDB_DISABLED environment variable to "true" for this process.
# NOTE(review): presumably meant to switch off Weights & Biases logging; the
# training arguments already use report_to="tensorboard" and the Trainer has
# been created before this cell runs - confirm this is still needed.
os.environ["WANDB_DISABLED"] = "true"

In [16]:
trainer.train()

  0%|          | 0/8 [00:00<?, ?it/s]

{'train_runtime': 392.6843, 'train_samples_per_second': 0.652, 'train_steps_per_second': 0.02, 'train_loss': 4.077993392944336, 'epoch': 2.0}


TrainOutput(global_step=8, training_loss=4.077993392944336, metrics={'train_runtime': 392.6843, 'train_samples_per_second': 0.652, 'train_steps_per_second': 0.02, 'train_loss': 4.077993392944336, 'epoch': 2.0})

In [17]:
# trainer.evaluate() runs on the eval_dataset given to the Trainer (the held-out
# test split) and returns a dict of metrics, so the result is named and
# labelled as evaluation metrics rather than "training accuracy".
eval_metrics = trainer.evaluate()
print("Evaluation metrics:", eval_metrics)

  0%|          | 0/1 [00:00<?, ?it/s]

Training Accuracy: {'eval_loss': 4.081147193908691, 'eval_runtime': 6.2097, 'eval_samples_per_second': 2.899, 'eval_steps_per_second': 0.161, 'epoch': 2.0}


In [None]:
# trainer.save_model()

In [18]:
import pandas as pd

# Read the training text back as UTF-8 so non-ASCII characters are decoded the
# same way on every platform, then split it into rough sentences at each '.'.
with open("train_dataset.txt", "r", encoding="utf-8") as file:
    train_text = file.read().split(".")

# Show only the first few sentences instead of dumping the whole list.
train_text[:10]

In [19]:
import pandas as pd

# Read the test text back as UTF-8 so non-ASCII characters are decoded the
# same way on every platform, then split it into rough sentences at each '.'.
with open("test_dataset.txt", "r", encoding="utf-8") as file:
    test_text = file.read().split(".")

# Show only the first few sentences instead of dumping the whole list.
test_text[:10]

## Test Model

In [20]:
from transformers import pipeline

# Generate with the in-memory `model` that was fine-tuned by trainer.train()
# and its matching `tokenizer`. Passing the hub name here would download the
# original checkpoint again and test the model *without* the fine-tuning.
generation = pipeline('text-generation', model=model, tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
from translate import Translator

In [None]:
# Example prompts:
#   Add 2 cups of water/vegetable broth to the rice and add 1 cup for reserve
#   Dice the onions
#   5 min kräftig anbraten, so dass sich viel Flüssigkeit entzogen hat
#   Chop the lovage and parsley finely
input_text = input("enter your text:")  # input() already returns a str

# English -> German: the prompt is translated before it is given to the model.
en_to_de = Translator(to_lang="de", from_lang="en")
translated_text = en_to_de.translate(input_text)
print(f"from E to D: \n {translated_text} \n")

# `output` is kept as a name because later cells read output[0]['generated_text'].
output = generation(translated_text)
generated_text = output[0]["generated_text"]
print(f"output of generation in german: \n{generated_text}\n")

# German -> English: translate the generated continuation back for reading.
de_to_en = Translator(to_lang="en", from_lang="de")
translated_text = de_to_en.translate(generated_text)
print(translated_text)

In [None]:
generation("Geben Sie 2 Tassen Wasser/Gemüsebrühe in den Reis und fügen Sie 1 Tasse als Reserve hinzu")

In [None]:
# generation('Den Reis mit 2 Tassen Wasser/Gemüsebrühe aufsetzen und 1 Tasse als Reserve dazu stellen')


In [None]:
# generation('Zuerst Hähnchen')

In [None]:
# Encode the text generated from the translated user input (`output`) into
# token ids; return_tensors='pt' requests a PyTorch tensor.
input_ids = tokenizer.encode(output[0]['generated_text'], return_tensors='pt')
input_ids

In [None]:
# Perplexity of the generation pipeline's model on the text it generated.
# Requires `torch` from the import cell at the top of the notebook.
tokenizer = generation.tokenizer
model = generation.model

# Tokenize the generated text and place it on the same device as the model.
input_ids = tokenizer.encode(output[0]['generated_text'], return_tensors='pt')
input_ids = input_ids.to(model.device)

# Set model to evaluation mode
model.eval()

# Pass the input through the model without tracking gradients.
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs.logits

# A causal LM's logits at position i score the token at position i + 1, so the
# last logit and the first token are dropped before comparing them. Without
# this shift every prediction is scored against the wrong token.
shift_logits = logits[:, :-1, :]
shift_labels = input_ids[:, 1:]

# Cross-entropy over all next-token predictions; reshape (not view) because the
# sliced tensors are no longer contiguous.
loss = torch.nn.functional.cross_entropy(
    shift_logits.reshape(-1, shift_logits.size(-1)),
    shift_labels.reshape(-1),
)

# Perplexity is the exponential of the mean cross-entropy.
perplexity = torch.exp(loss)

print("Perplexity:", perplexity.item())
print(loss)