In [None]:
!pip install datasets pandas matplotlib seaborn transformers accelerate peft bitsandbytes --quiet


[0m

In [None]:
!pip install --upgrade numpy datasets transformers

Collecting numpy
  Downloading numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━

In [70]:
import pandas as pd
from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt
import seaborn as sns
import random
import textwrap

In [84]:
import pandas as pd
import random

categories = ["Billing", "Technical Issue", "Product Info", "Account Help"]

sample_queries = {
    "Billing": [
        "Why was I charged twice this month?",
        "How can I update my billing address?",
        "I didn't receive my invoice for June.",
        "What is this unknown charge on my statement?",
        "How do I change my payment method?"
    ],
    "Technical Issue": [
        "The website is not loading.",
        "My login screen doesn't load.",
        "App crashes every time I open it.",
        "I can't upload my documents.",
        "Why is the system so slow today?"
    ],
    "Product Info": [
        "Does this product come in blue?",
        "What is the warranty period?",
        "Is this item available in store?",
        "How do I use this product?",
        "Can you tell me the specifications of this model?"
    ],
    "Account Help": [
        "How do I reset my password?",
        "I want to delete my account.",
        "Can I change my email address?",
        "How do I activate two-factor authentication?",
        "My account got locked. What should I do?"
    ]
}

sample_responses = {
    "Billing": [
        "You were likely charged twice due to a duplicate transaction. We've refunded the extra charge.",
        "You can update your billing address from the 'Account Settings' > 'Billing Info' section.",
        "Apologies! You can download your invoice from the 'Billing History' section.",
        "That charge is for your premium subscription. Let us know if it wasn't authorized.",
        "Go to 'Payment Options' and add or remove your preferred method."
    ],
    "Technical Issue": [
        "Try clearing your cache and reloading the page.",
        "Ensure your browser is updated and try again. If it persists, contact support.",
        "Please reinstall the app and try again. If it crashes, let us know your device model.",
        "Try refreshing the page or check your internet connection.",
        "We're aware of the issue and working to resolve it. Thanks for your patience."
    ],
    "Product Info": [
        "Yes, it’s available in blue and other colors. Please check the product page.",
        "This product comes with a 1-year manufacturer warranty.",
        "Yes, check in-store availability by entering your zip code on the product page.",
        "Instructions are included in the box. You can also find a video tutorial online.",
        "The specifications are listed under the 'Details' tab of the product page."
    ],
    "Account Help": [
        "Click on 'Forgot Password' to reset your password via email.",
        "We’re sorry to see you go! Use the 'Delete Account' button in settings.",
        "You can update your email under 'Account Information'.",
        "Go to 'Security Settings' and enable two-factor authentication.",
        "Please wait 15 minutes and try again, or contact support to unlock your account."
    ]
}

# Expand the pools by adding numeric suffixes
N = 50  # Number of variants per base query/response
expanded_queries = {cat: [] for cat in categories}
expanded_responses = {cat: [] for cat in categories}

for cat in categories:
    for base in sample_queries[cat]:
        for i in range(1, N+1):
            expanded_queries[cat].append(f"{base} (case {i})")
    for base in sample_responses[cat]:
        for i in range(1, N+1):
            expanded_responses[cat].append(f"{base} [ref {i}]")

# Now generate unique combinations
unique_rows = set()
while len(unique_rows) < 1000:
    cat = random.choice(categories)
    query = random.choice(expanded_queries[cat])
    response = random.choice(expanded_responses[cat])
    # Avoid query == response
    if query.lower().strip() == response.lower().strip():
        continue
    unique_rows.add((cat, query, response))

# Convert to DataFrame
df = pd.DataFrame(list(unique_rows), columns=["category", "customer_query", "agent_response"])
df.to_csv("customer_support_1000_unique.csv", index=False)
print(f"✅ Generated customer_support_1000_unique.csv with {len(df)} unique rows.")


✅ Generated customer_support_1000_unique.csv with 1000 unique rows.


In [85]:

# Load dataset from KaggleHub path
csv_path = "customer_support_1000.csv"
df = pd.read_csv(csv_path)


In [86]:
df.shape

(1000, 3)

In [87]:
# Fix: keep a copy before filtering if needed for analysis
original_count = len(df)
print(original_count)
# Fix: only drop *exact* full-row duplicates
df.drop_duplicates(subset=["category", "customer_query", "agent_response"], inplace=True)



# Remove repetitive responses
df = df[df["agent_response"].str.lower().str.strip() != df["customer_query"].str.lower().str.strip()]

# Optional: print how many were removed
print(f"Removed {original_count - len(df)} repetitive or duplicate rows.")



1000
Removed 900 repetitive or duplicate rows.


In [69]:
df.shape

(100, 3)

In [33]:
category_mapping = {cat: i for i, cat in enumerate(sorted(df["category"].unique()))}
df["category_id"] = df["category"].map(category_mapping)

# Save category mapping
import json
with open("category_encoding.json", "w") as f:
    json.dump({k: int(v) for k, v in category_mapping.items()}, f)

print("Category encoding:", category_mapping)



Category encoding: {'Account Help': 0, 'Billing': 1, 'Product Info': 2, 'Technical Issue': 3}


In [45]:
df

Unnamed: 0,category,customer_query,agent_response,category_id,instruction,response,text
10,Product Info,Does your plan include unlimited storage?,Only our premium plan includes unlimited storage.,2,You are a helpful customer support agent.\nCat...,Only our premium plan includes unlimited storage.,You are a helpful customer support agent.\nCat...
9,Technical Issue,My login screen doesn't load.,Try clearing the cache and cookies or switchin...,3,You are a helpful customer support agent.\nCat...,Try clearing the cache and cookies or switchin...,You are a helpful customer support agent.\nCat...
0,Product Info,What are the benefits of upgrading?,"You'll get priority support, more features, an...",2,You are a helpful customer support agent.\nCat...,"You'll get priority support, more features, an...",You are a helpful customer support agent.\nCat...
8,Account Help,How can I update my email address?,Go to account settings and click 'Edit' next t...,0,You are a helpful customer support agent.\nCat...,Go to account settings and click 'Edit' next t...,You are a helpful customer support agent.\nCat...
5,Technical Issue,Why is the site so slow?,We are currently facing high traffic. Please t...,3,You are a helpful customer support agent.\nCat...,We are currently facing high traffic. Please t...,You are a helpful customer support agent.\nCat...
2,Product Info,Is there a free trial?,"Yes, we offer a 14-day free trial with full ac...",2,You are a helpful customer support agent.\nCat...,"Yes, we offer a 14-day free trial with full ac...",You are a helpful customer support agent.\nCat...
1,Technical Issue,The app crashes when I open it.,Please try reinstalling the app. Let me know i...,3,You are a helpful customer support agent.\nCat...,Please try reinstalling the app. Let me know i...,You are a helpful customer support agent.\nCat...
11,Account Help,Can I delete my account permanently?,"Yes, please send a request through our contact...",0,You are a helpful customer support agent.\nCat...,"Yes, please send a request through our contact...",You are a helpful customer support agent.\nCat...
4,Billing,I was charged twice for my last order.,I'm sorry for the duplicate charge. I’ll issue...,1,You are a helpful customer support agent.\nCat...,I'm sorry for the duplicate charge. I’ll issue...,You are a helpful customer support agent.\nCat...
7,Billing,Can I change my billing cycle?,"Yes, you can switch to a monthly or yearly cyc...",1,You are a helpful customer support agent.\nCat...,"Yes, you can switch to a monthly or yearly cyc...",You are a helpful customer support agent.\nCat...


In [34]:
def build_instruction(row):
    return (
        f"You are a helpful customer support agent.\n"
        f"Category: {row['category']}\n"
        f"Customer Query: \"{row['customer_query']}\"\n"
        f"Please provide a clear and concise response."
    )

def build_response(row):
    return row['agent_response']

# Apply to dataset
df["instruction"] = df.apply(build_instruction, axis=1)
df["response"] = df.apply(build_response, axis=1)
df["text"] = df["instruction"] + "\n" + df["response"]


In [35]:
from datasets import Dataset, DatasetDict

# Shuffle and sample
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df.sample(min(len(df), 2000), random_state=42)

# Split
train_size = int(0.8 * len(df))
val_size = int(0.1 * len(df))

train_df = df[:train_size]
val_df = df[train_size:train_size+val_size]
test_df = df[train_size+val_size:]

# Convert to HuggingFace Dataset
dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df[["instruction", "response", "text"]], preserve_index=False),
    "validation": Dataset.from_pandas(val_df[["instruction", "response", "text"]], preserve_index=False),
    "test": Dataset.from_pandas(test_df[["instruction", "response", "text"]], preserve_index=False),
})


In [44]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response', 'text'],
        num_rows: 9
    })
    validation: Dataset({
        features: ['instruction', 'response', 'text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['instruction', 'response', 'text'],
        num_rows: 2
    })
})

In [39]:
from transformers import AutoTokenizer

model_checkpoint = "gpt2"  # or any other small model for CPU fine-tuning
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Fix padding token for GPT-style models
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)

# Apply tokenization
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df[["instruction", "response", "text"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["instruction", "response", "text"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[["instruction", "response", "text"]], preserve_index=False)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

# Tokenize the dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Remove original text columns
tokenized_datasets = tokenized_datasets.remove_columns(["instruction", "response", "text"])

# Add labels and convert to torch tensors
def format_for_trainer(example):
    return {
        "input_ids": example["input_ids"],
        "attention_mask": example["attention_mask"],
        "labels": example["input_ids"]  # labels = input_ids for Causal LM
    }

tokenized_datasets = tokenized_datasets.map(format_for_trainer)
tokenized_datasets.set_format(type="torch")

print("✅ Tokenization complete. Ready for training.")


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

✅ Tokenization complete. Ready for training.


In [43]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})

In [40]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

model = AutoModelForCausalLM.from_pretrained("gpt2")
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [41]:
training_args = TrainingArguments(
    output_dir="./lora-gpt2",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=200,
    logging_steps=50,
    learning_rate=2e-4,
    fp16=True,  # If your GPU supports it
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [42]:
trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=9, training_loss=7.29911634657118, metrics={'train_runtime': 144.4673, 'train_samples_per_second': 0.187, 'train_steps_per_second': 0.062, 'total_flos': 3561076555776.0, 'train_loss': 7.29911634657118, 'epoch': 3.0})

In [49]:
results = trainer.evaluate(tokenized_datasets["test"])
print("Test set evaluation:", results)


Test set evaluation: {'eval_loss': 7.328610897064209, 'eval_runtime': 3.5921, 'eval_samples_per_second': 0.557, 'eval_steps_per_second': 0.278, 'epoch': 3.0}


In [50]:
model.save_pretrained("./lora-gpt2-adapter")
tokenizer.save_pretrained("./lora-gpt2-adapter")

('./lora-gpt2-adapter/tokenizer_config.json',
 './lora-gpt2-adapter/special_tokens_map.json',
 './lora-gpt2-adapter/vocab.json',
 './lora-gpt2-adapter/merges.txt',
 './lora-gpt2-adapter/added_tokens.json',
 './lora-gpt2-adapter/tokenizer.json')

In [52]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("gpt2")
model = PeftModel.from_pretrained(base_model, "./lora-gpt2-adapter")
tokenizer = AutoTokenizer.from_pretrained("./lora-gpt2-adapter")

prompt = "My login screen doesn't load."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.95,
    temperature=0.1,
    repetition_penalty=1.2
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My login screen doesn't load.
I'm not sure if I should have done this, but it's a good idea to check your browser settings before you log in and then restart the game (it will be fine). If so try again later on or just wait for an update
