# Auto Tagging Support Tickets Using LLM

This notebook demonstrates auto-tagging of support tickets using zero-shot, few-shot and fine-tuned approaches with Hugging Face transformers.

We define categories based on the dataset, manually label for evaluation/fine-tuning, compare performances and save a fine-tuned model for app.py.

In [2]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch

# Load dataset
df = pd.read_csv('dataset/support_ticket_data.csv')
print(df.head())

  support_tick_id                                support_ticket_text
0      ST2023-006  My internet connection has significantly slowe...
1      ST2023-007  Urgent help required! My laptop refuses to sta...
2      ST2023-008  I've accidentally deleted essential work docum...
3      ST2023-009  Despite being in close proximity to my Wi-Fi r...
4      ST2023-010  My smartphone battery is draining rapidly, eve...


In [3]:
# Define categories based on dataset analysis
categories = [
    "Connectivity Issue",
    "Hardware Malfunction",
    "Data Recovery",
    "Battery Issue",
    "Account Access",
    "Performance Issue",
    "Software Issue"
]

# Manual labels for evaluation and fine-tuning
labels_dict = {
    "ST2023-006": "Connectivity Issue",
    "ST2023-007": "Hardware Malfunction",
    "ST2023-008": "Data Recovery",
    "ST2023-009": "Connectivity Issue",
    "ST2023-010": "Battery Issue",
    "ST2023-011": "Account Access",
    "ST2023-012": "Performance Issue",
    "ST2023-013": "Hardware Malfunction",
    "ST2023-014": "Data Recovery",
    "ST2023-015": "Hardware Malfunction",
    "ST2023-016": "Data Recovery",
    "ST2023-017": "Hardware Malfunction",
    "ST2023-018": "Hardware Malfunction",
    "ST2023-019": "Data Recovery",
    "ST2023-020": "Hardware Malfunction",
    "ST2023-021": "Connectivity Issue",
    "ST2023-022": "Connectivity Issue",
    "ST2023-023": "Data Recovery",
    "ST2023-024": "Data Recovery",
    "ST2023-025": "Connectivity Issue",
    "ST2023-026": "Software Issue"
}
df['label'] = df['support_tick_id'].map(labels_dict)

# Label IDs for fine-tuning
label2id = {cat: i for i, cat in enumerate(categories)}
df['label_id'] = df['label'].map(label2id)

In [4]:
# Zero-shot classification
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def get_top3_zero_shot(text):
    result = zero_shot_classifier(text, candidate_labels=categories, multi_label=False)
    sorted_labels = [label for _, label in sorted(zip(result['scores'], result['labels']), reverse=True)]
    return sorted_labels[:3]

# Apply and compute accuracy (top-1)
df['zero_shot_top3'] = df['support_ticket_text'].apply(get_top3_zero_shot)
df['zero_shot_pred'] = df['zero_shot_top3'].apply(lambda x: x[0])
zero_shot_acc = accuracy_score(df['label'], df['zero_shot_pred'])
print(f"Zero-shot Accuracy: {zero_shot_acc}")

Device set to use cpu


Zero-shot Accuracy: 0.9047619047619048


In [5]:
# Few-shot using generative model (FLAN-T5 for local use)
# Load FLAN-T5 for text2text-generation
generator = pipeline("text2text-generation", model="google/flan-t5-base")

# Few-shot examples (one per category)
examples = [
    (df[df['support_tick_id'] == 'ST2023-006']['support_ticket_text'].values[0], "Connectivity Issue"),
    (df[df['support_tick_id'] == 'ST2023-007']['support_ticket_text'].values[0], "Hardware Malfunction"),
    (df[df['support_tick_id'] == 'ST2023-008']['support_ticket_text'].values[0], "Data Recovery"),
    (df[df['support_tick_id'] == 'ST2023-010']['support_ticket_text'].values[0], "Battery Issue"),
    (df[df['support_tick_id'] == 'ST2023-011']['support_ticket_text'].values[0], "Account Access"),
    (df[df['support_tick_id'] == 'ST2023-012']['support_ticket_text'].values[0], "Performance Issue"),
    (df[df['support_tick_id'] == 'ST2023-026']['support_ticket_text'].values[0], "Software Issue")
]

# Build few-shot prompt base
few_shot_base = "You are a support ticket classifier. Given a ticket, return the top 3 most probable categories from: " + ", ".join(categories) + ".\n\n"

for text, cat in examples:
    few_shot_base += f"Ticket: {text}\nTop category: {cat}\n\n"

def get_top3_few_shot(text):
    prompt = few_shot_base + f"Ticket: {text}\nTop 3 categories:"
    output = generator(prompt, max_new_tokens=50)[0]['generated_text']
    tags = [t.strip() for t in output.split(",")[:3]]
    return tags


# Apply and compute accuracy (excluding example tickets for fairness)
example_ids = ["ST2023-006", "ST2023-007", "ST2023-008", "ST2023-010", "ST2023-011", "ST2023-012", "ST2023-026"]
eval_df = df[~df['support_tick_id'].isin(example_ids)]
eval_df['few_shot_top3'] = eval_df['support_ticket_text'].apply(get_top3_few_shot)
eval_df['few_shot_pred'] = eval_df['few_shot_top3'].apply(lambda x: x[0] if len(x) > 0 else "")
few_shot_acc = accuracy_score(eval_df['label'], eval_df['few_shot_pred'])
print(f"Few-shot Accuracy (FLAN-T5): {few_shot_acc}")

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors


Few-shot Accuracy (FLAN-T5): 0.7142857142857143


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['few_shot_top3'] = eval_df['support_ticket_text'].apply(get_top3_few_shot)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['few_shot_pred'] = eval_df['few_shot_top3'].apply(lambda x: x[0] if len(x) > 0 else "")


In [6]:
# Fine-tuning
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # Removed stratify

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["support_ticket_text"], padding="max_length", truncation=True, max_length=512)

train_ds = Dataset.from_pandas(train_df[['support_ticket_text', 'label_id']]).rename_column("label_id", "labels")
test_ds = Dataset.from_pandas(test_df[['support_ticket_text', 'label_id']]).rename_column("label_id", "labels")

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_test = test_ds.map(tokenize_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(categories))

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)

trainer.train()

# Save model
trainer.save_model("./fine_tuned_model")

Map: 100%|██████████| 16/16 [00:00<00:00, 1574.44 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 903.79 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.780836
2,No log,1.641653
3,1.795600,1.595633




In [7]:
# Evaluate fine-tuned model
def get_top3_fine_tuned(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)[0]
    top3_idx = torch.topk(probs, 3).indices.tolist()
    return [categories[i] for i in top3_idx]

test_df['fine_tuned_top3'] = test_df['support_ticket_text'].apply(get_top3_fine_tuned)
test_df['fine_tuned_pred'] = test_df['fine_tuned_top3'].apply(lambda x: x[0])
fine_tuned_acc = accuracy_score(test_df['label'], test_df['fine_tuned_pred'])
print(f"Fine-tuned Accuracy: {fine_tuned_acc}")

# Comparison
print(f"\nPerformance Comparison:\nZero-shot Acc: {zero_shot_acc}\nFew-shot Acc: {few_shot_acc}\nFine-tuned Acc: {fine_tuned_acc}")

Fine-tuned Accuracy: 0.4

Performance Comparison:
Zero-shot Acc: 0.9047619047619048
Few-shot Acc: 0.7142857142857143
Fine-tuned Acc: 0.4
