In [26]:
# Step 1: Install required libs (run once)
!pip install transformers datasets scikit-learn torch pandas --quiet

# Step 2: Imports
import pandas as pd
import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline,
)
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Step 3: Load the ticket CSV (expected in the working directory).
df = pd.read_csv("customer_support_tickets.csv")

# Step 4: Prepare input text and labels.
# The model input is "<subject>. <description>"; a missing subject or
# description becomes an empty string instead of turning the whole text into NaN.
df['text'] = df['Ticket Subject'].fillna('') + ". " + df['Ticket Description'].fillna('')
# Rows without a ticket type have no target to learn from.
df = df.dropna(subset=['Ticket Type', 'text'])

# Encode each ticket type as an integer id; label_encoder.classes_[id] gives
# the type name back (used when printing predictions).
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Ticket Type'])
print(f"Classes (tags): {list(label_encoder.classes_)}")

# preserve_index=False: dropna can leave gaps in the DataFrame index, and
# from_pandas would otherwise carry that index along as an extra
# "__index_level_0__" column.
dataset = Dataset.from_pandas(df[['text', 'label']], preserve_index=False)

# Step 5: Load the tokenizer that matches the checkpoint being fine-tuned.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(batch):
    """Tokenize a batch of ticket texts.

    Sequences are truncated to 256 tokens but deliberately left unpadded.
    Padding here would stretch every row to the longest text in its map
    batch and store those pad tokens in the dataset; padding is instead
    applied per training batch by the DataCollatorWithPadding handed to
    the Trainer.

    Args:
        batch: Mapping with a "text" entry holding the texts to encode.

    Returns:
        The tokenizer's encoding for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=256)

# Run tokenize_fn over the whole dataset; batched=True passes it many rows per call.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# Step 6: Hold out 20% of the rows for evaluation (fixed seed => same split every run).
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = split['train'], split['test']

# Step 7: Pretrained checkpoint with a classification head sized to the number of ticket types.
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Step 8: Training configuration.
# Evaluation and checkpointing both happen once per epoch; report_to="none"
# is set to silence warnings.
training_args = TrainingArguments(
    output_dir="./ticket_classifier",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
)

# Step 9: Metrics function
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with ``"accuracy"`` and the weighted-average ``"f1"`` of the
        arg-max predictions against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average='weighted'),
    }

# Step 10: Collator that pads each batch at collation time (dynamic padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire model, data, collator and metrics together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Step 11: Fine-tune the model on train_dataset.
trainer.train()

# Step 12: Print the most probable tags for every ticket in the test set.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # switch the model to evaluation mode before inference

# Never ask topk for more entries than there are classes (it raises otherwise).
top_k = min(3, len(label_encoder.classes_))

print("Top 3 tags per ticket (test set):")
for example in test_dataset:
    # Re-tokenize the raw text so the tensors can be built directly on demand.
    inputs = tokenizer(example['text'], return_tensors="pt", truncation=True, padding=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # inputs must live on the model's device
    with torch.no_grad():
        outputs = model(**inputs)
        # torch.softmax is reachable through the existing `import torch`;
        # the `F` alias used here before is never imported in this file.
        probs = torch.softmax(outputs.logits, dim=-1)[0]
    top3 = torch.topk(probs, k=top_k)
    # Class ids index straight into label_encoder.classes_ to recover tag names.
    tags = [label_encoder.classes_[i] for i in top3.indices.cpu().numpy()]
    scores = top3.values.cpu().numpy()
    print(f"Ticket text (first 60 chars): {example['text'][:60]!r}")
    for tag, score in zip(tags, scores):
        print(f"  {tag}: {score:.3f}")
    print()


# Step 13 (optional): Zero-shot baseline using the same tag names as candidate labels.
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = list(label_encoder.classes_)

print("Zero-shot classification example:")
# Three tickets drawn with a fixed seed so the examples are repeatable.
sampled_texts = df['text'].sample(3, random_state=42)
for ticket_text in sampled_texts:
    prediction = zero_shot_classifier(ticket_text, candidate_labels, multi_label=True)
    print(f"Text (first 60 chars): {ticket_text[:60]!r}")
    # Show only the first three label/score pairs of the result.
    leading_pairs = zip(prediction['labels'][:3], prediction['scores'][:3])
    for tag_name, tag_score in leading_pairs:
        print(f"  {tag_name}: {tag_score:.3f}")
    print()


Classes (tags): ['Billing inquiry', 'Cancellation request', 'Product inquiry', 'Refund request', 'Technical issue']


Map:   0%|          | 0/8469 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.6181,1.608834,0.212515,0.124649
2,1.6175,1.609197,0.207792,0.071498
3,1.6123,1.608951,0.207202,0.084645


Streaming output truncated to the last 5000 lines.
Ticket text (first 60 chars): "Data loss. I'm having an issue with the {product_purchased}."
  Refund request: 0.207
  Technical issue: 0.206
  Cancellation request: 0.199

Ticket text (first 60 chars): 'Payment issue. There seems to be a hardware problem with my '
  Refund request: 0.209
  Technical issue: 0.206
  Cancellation request: 0.197

Ticket text (first 60 chars): "Payment issue. I'm having an issue with the {product_purchas"
  Refund request: 0.208
  Technical issue: 0.207
  Cancellation request: 0.198

Ticket text (first 60 chars): "Battery life. I'm having trouble connecting my {product_purc"
  Refund request: 0.208
  Technical issue: 0.207
  Cancellation request: 0.198

Ticket text (first 60 chars): "Account access. I'm having an issue with the {product_purcha"
  Refund request: 0.208
  Technical issue: 0.207
  Cancellation request: 0.199

Ticket text (first 60 chars): "Cancellation request. I'm facing issues

Device set to use cuda:0


Zero-shot classification example:
Text (first 60 chars): "Product setup. I'm having an issue with the {product_purchas"
  Technical issue: 0.917
  Product inquiry: 0.872
  Refund request: 0.251

Text (first 60 chars): "Battery life. I'm having trouble connecting my {product_purc"
  Product inquiry: 0.872
  Technical issue: 0.815
  Billing inquiry: 0.273

Text (first 60 chars): "Refund request. I'm having an issue with the {product_purcha"
  Refund request: 0.993
  Product inquiry: 0.849
  Technical issue: 0.656

