In [3]:
!pip install transformers



In [4]:
!pip install transformers torch



## Necessary Libraries

In [13]:
import pandas as pd
from transformers import AutoTokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

## Loading and Preprocessing the Dataset

In [9]:
# Load the dataset
df = pd.read_csv('Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')

# Inspect the dataset
df.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [11]:
# Preprocess the data
def preprocess_text(text):
    # Implement text cleaning steps
    return text

df['instruction'] = df['instruction'].apply(preprocess_text)
df['response'] = df['response'].apply(preprocess_text)

# Tokenization
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def tokenize_function(examples):
    return tokenizer(examples, padding='max_length', truncation=True, max_length=128)

train_encodings = tokenize_function(train_df['instruction'].tolist())
train_labels = tokenize_function(train_df['response'].tolist())

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = CustomDataset(train_encodings, train_labels)

In [14]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Step,Training Loss
500,4.868
1000,4.431
1500,4.3079
2000,4.1407
2500,4.0955
3000,3.9985
3500,3.9832
4000,3.9583
4500,3.9422
5000,3.8646


TrainOutput(global_step=16125, training_loss=3.8736620412427327, metrics={'train_runtime': 52546.6801, 'train_samples_per_second': 1.227, 'train_steps_per_second': 0.307, 'total_flos': 4212746108928000.0, 'train_loss': 3.8736620412427327, 'epoch': 3.0})