In [3]:
!pip install transformers



In [4]:
!pip install transformers torch



## Necessary Libraries

In [46]:
import pandas as pd
from transformers import AutoTokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

## Loading and Preprocessing the Dataset

In [35]:
# Load the dataset
df = pd.read_csv('Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')

# Inspect the dataset
df.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [47]:
# Preprocess the data
def preprocess_text(text):
    # Implement text cleaning steps
    return text

df['instruction'] = df['instruction'].apply(preprocess_text)
df['response'] = df['response'].apply(preprocess_text)

# Tokenization
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def tokenize_function(examples):
    return tokenizer(examples, padding='max_length', truncation=True, max_length=128)

train_encodings = tokenize_function(train_df['instruction'].tolist())
train_labels = tokenize_function(train_df['response'].tolist())

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = CustomDataset(train_encodings, train_labels)

In [14]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Step,Training Loss
500,4.868
1000,4.431
1500,4.3079
2000,4.1407
2500,4.0955
3000,3.9985
3500,3.9832
4000,3.9583
4500,3.9422
5000,3.8646


TrainOutput(global_step=16125, training_loss=3.8736620412427327, metrics={'train_runtime': 52546.6801, 'train_samples_per_second': 1.227, 'train_steps_per_second': 0.307, 'total_flos': 4212746108928000.0, 'train_loss': 3.8736620412427327, 'epoch': 3.0})

## Evaluating the Model on the Test Set

In [15]:
test_encodings = tokenize_function(test_df['instruction'].tolist())
test_labels = tokenize_function(test_df['response'].tolist())

test_dataset = CustomDataset(test_encodings, test_labels)

# Evaluate the model
eval_results = trainer.evaluate(test_dataset)
print(eval_results)

{'eval_loss': 7.887006759643555, 'eval_runtime': 147.6278, 'eval_samples_per_second': 36.409, 'eval_steps_per_second': 4.552, 'epoch': 3.0}


## Creating Response Generation

In [48]:
# Ensure the model is on the correct device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [56]:
# Function to generate responses
def generate_response(prompt, model, tokenizer, max_length=150):
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    
    # Move the input IDs and attention mask to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Ensure max_length does not exceed model's max position embeddings
    max_length = min(max_length, model.config.n_positions)
    
    # Generate a response using beam search
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,              # Use beam search with 5 beams
        early_stopping=True,      # Stop when at least num_beams sentences are finished per batch
        no_repeat_ngram_size=2,   # Prevents repeating n-grams
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode the generated tokens to text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

# Test the function
query = "I want a refund for my recent purchase."
response = generate_response(query, model, tokenizer)
print(f"Query: {query}\nResponse: {response}")

Query: I want a refund for my recent purchase.
Response: I want a refund for my recent purchase.

Rated 5 out of 5 by Anonymous from Great product! I bought this product because I was looking for a way to get a better quality product. I have been using it for about a month now and it works great. The only thing I would change is the size of the tip. It is a little smaller than the one on the back of my phone, but it is still a good size for me. If you have any questions, please don't hesitate to call us. We will be happy to help you with any issues you may have.


In [61]:
# Function to generate responses
def generate_response(prompt, model, tokenizer, max_length=150):
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    
    # Move the input IDs and attention mask to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Ensure max_length does not exceed model's max position embeddings
    max_length = min(max_length, model.config.n_positions)
    
    # Generate a response using beam search
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,               # Use beam search with 5 beams
        early_stopping=True,       # Stop when at least num_beams sentences are finished per batch
        no_repeat_ngram_size=2,    # Prevents repeating n-grams
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode the generated tokens to text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

# Load the dataset
df = pd.read_csv('Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')

# Sample a subset for testing
test_sample = df.sample(10, random_state=42)

# Function to test the response generation
def test_response_generation(test_sample, model, tokenizer):
    results = []
    for idx, row in test_sample.iterrows():
        prompt = row['instruction']
        expected_response = row['response']
        generated_response = generate_response(prompt, model, tokenizer)
        
        results.append({
            'prompt': prompt,
            'expected_response': expected_response,
            'generated_response': generated_response
        })
    
    return pd.DataFrame(results)

# Run the test
test_results = test_response_generation(test_sample, model, tokenizer)

# Display the results
for idx, row in test_results.iterrows():
    print(f"Prompt: {row['prompt']}")
    print(f"Expected Response: {row['expected_response']}")
    print(f"Generated Response: {row['generated_response']}")
    print("-" * 50)

Prompt: I can't talk with  a human agent
Expected Response: Thank you for reaching out! I'm sorry to hear that you're facing difficulties in connecting with a human agent. Please rest assured that I'm here to assist you and resolve this issue. Could you please provide me with some more details about the problem you're experiencing? This will help me find the best solution for you.
Generated Response: I can't talk with  a human agent, but I can tell you that I'm not going to lie to you. I don't know what you're talking about."

"I'm sorry," she said. "I just want you to know that this is not the first time I've heard you say something like that. It's not like you've ever done anything like this before. You're just a normal human being, you know? I mean, I know it's hard to believe that you'd say that to someone like me. But I have to admit, it was hard for me when I first met you, because I didn't really know you at all, and I just wanted to make sure you were okay.
--------------------