In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments, DataCollatorForLanguageModeling,pipeline
from datasets import load_dataset,DatasetDict,Dataset
import os
import torch
import numpy as np

In [2]:
# # Disable Weights & Biases logging
# os.environ['WANDB_DISABLED'] = 'TRUE'

In [3]:
# Check device availability and set the model to use GPU/CPU accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


## Loading model

In [4]:
model=GPT2LMHeadModel.from_pretrained('gpt2').to(device)
tokenizer=GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token=tokenizer.eos_token # set eos as pad token
model.config.pad_token_id=tokenizer.pad_token_id

## Loading dataset

In [5]:
dataset_dict=load_dataset('csv',data_files={'train':'training.csv','test':'testing.csv'})
train_dataset=dataset_dict['train']
test_dataset=dataset_dict['test']

In [6]:
train_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 2457
})

In [7]:
test_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 274
})

## concat question and answer into single text string

In [8]:
def format_example(example):
    formatted_text = f"Question: {example['question']} Answer: {example['answer']} <|endoftext|>"
    # return dictionary with new column 'text'
    return {'text': formatted_text}

In [9]:
train_dataset=train_dataset.map(format_example)
test_dataset=test_dataset.map(format_example)

In [10]:
train_dataset['question'][1]

'tell be about lying cable biceps curl'

In [11]:
train_dataset['answer'][1]

'the lying cable biceps curl is a cable exercise that targets the biceps it is performed lying on the back either on a bench or the ground with the feet facing toward a cable stack this movement is usually performed for moderate to high reps for a burn and pump as part of an armfocused workout'

In [12]:
test_dataset['question'][1]

'can injury prevention for shoulders help with injury prevention'

In [13]:
test_dataset['answer'][1]

'yes injury prevention for shoulders can prevent injuries by improving muscle imbalances and joint stability'

In [14]:
train_dataset['text'][1]

'Question: tell be about lying cable biceps curl Answer: the lying cable biceps curl is a cable exercise that targets the biceps it is performed lying on the back either on a bench or the ground with the feet facing toward a cable stack this movement is usually performed for moderate to high reps for a burn and pump as part of an armfocused workout <|endoftext|>'

## combine split dataset into a single datasetdictm

In [15]:
split_datasets=DatasetDict({
    'train':train_dataset,
    'test':test_dataset,
    'val':test_dataset
})

In [16]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 2457
    })
    test: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 274
    })
    val: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 274
    })
})

In [17]:
split_datasets['train']['question'][1]

'tell be about lying cable biceps curl'

In [18]:
split_datasets['train']['answer'][1]

'the lying cable biceps curl is a cable exercise that targets the biceps it is performed lying on the back either on a bench or the ground with the feet facing toward a cable stack this movement is usually performed for moderate to high reps for a burn and pump as part of an armfocused workout'

In [19]:
split_datasets['train']['text'][1]

'Question: tell be about lying cable biceps curl Answer: the lying cable biceps curl is a cable exercise that targets the biceps it is performed lying on the back either on a bench or the ground with the feet facing toward a cable stack this movement is usually performed for moderate to high reps for a burn and pump as part of an armfocused workout <|endoftext|>'

## Tokenizing question and answer

In [20]:
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], max_length=512, truncation=True, padding="max_length")
    # Use numpy to efficiently replace pad_token_id with -100
    labels = np.array(inputs['input_ids'], dtype=np.int64)
    labels[labels == tokenizer.pad_token_id] = -100
    inputs['labels'] = labels.tolist()
    return inputs

In [21]:
tokenized_datasets={}

## Tokenizing training, testing and validation dataset

In [22]:
tokenized_datasets['train']= split_datasets['train'].map(tokenize_function, batched=True,
                                    remove_columns=['text', 'question', 'answer'])

In [23]:
tokenized_datasets['test']= split_datasets['test'].map(tokenize_function, batched=True,
                                    remove_columns=['text', 'question', 'answer'])

In [24]:
tokenized_datasets['validation']= split_datasets['val'].map(tokenize_function, batched=True,
                                    remove_columns=['text', 'question', 'answer'])

In [25]:
tokenized_datasets

{'train': Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 2457
 }),
 'test': Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 274
 }),
 'validation': Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 274
 })}

In [26]:
tokenized_datasets.keys()

dict_keys(['train', 'test', 'validation'])

## Training parameter

In [27]:
training_args = TrainingArguments(
    output_dir="./gpt2-fitness-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if CUDA is available
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)



## Initializing trainer

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,2.081,1.932665
1,1.3868,1.626459
2,1.5602,1.562353


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=921, training_loss=1.8979219083030112, metrics={'train_runtime': 10468.268, 'train_samples_per_second': 0.704, 'train_steps_per_second': 0.088, 'total_flos': 1924677107712000.0, 'train_loss': 1.8979219083030112, 'epoch': 2.997558991049634})

In [30]:
model.save_pretrained('./chatbot_model_2')
tokenizer.save_pretrained('./chatbot_model_2')

('./chatbot_model_2/tokenizer_config.json',
 './chatbot_model_2/special_tokens_map.json',
 './chatbot_model_2/vocab.json',
 './chatbot_model_2/merges.txt',
 './chatbot_model_2/added_tokens.json',
 './chatbot_model_2/tokenizer.json')