In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from transformers import pipeline

from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk everything under /kaggle and print the full path of each file found.
for root, _subdirs, files in os.walk('/kaggle'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the customer-support tickets from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/customer-support-ticket-dataset/customer_support_tickets.csv')

# Drop rows that are missing any field this notebook actually reads.
# DataFrame.dropna() returns a new frame, so the result has to be assigned back.
# Limiting the check to `subset` avoids discarding tickets because of gaps in
# columns the notebook never uses.
df = df.dropna(subset=['Ticket Description', 'Product Purchased', 'Ticket Type'])

# Rename certain columns for consistency and easier access
df = df.rename(columns={'Ticket Status': 'Ticket_Status', 'Customer Gender': 'Customer_Gender',
                        'Ticket Priority': 'Ticket_Priority', 'Ticket Type': 'Ticket_Type'})

In [None]:
# Show the first ticket transposed, so every column is listed next to its value.
df.head(1).T

In [None]:
# Substitute the literal '{product_purchased}' placeholder in each description
# with the product named on the same row.
df['Ticket Description'] = [
    description.replace('{product_purchased}', product)
    for description, product in zip(df['Ticket Description'], df['Product Purchased'])
]

In [None]:
# Bar chart of the number of tickets per ticket type in the full dataset.
ticket_type_counts = df["Ticket_Type"].value_counts()
ticket_type_counts.plot(kind='bar', title='Value Counts of Ticket Type', xlabel='Ticket Type', ylabel='Count')


In [None]:
# Candidate labels for classification: the distinct values of the Ticket_Type column.
ticket_type_column = df["Ticket_Type"]
labels = ticket_type_column.unique()
labels

In [None]:
# Zero-shot classification scores each candidate label through an NLI entailment
# head, so the checkpoint must be fine-tuned on NLI. A plain masked-LM checkpoint
# such as bert-base-uncased has no trained entailment head and produces
# near-random, almost identical scores.
model = "facebook/bart-large-mnli"

In [None]:
# Build the zero-shot classification pipeline on top of the checkpoint chosen in `model`.
pipe = pipeline(task="zero-shot-classification", model=model)

# Zero-shot prompting

## 4 out of 5 are correct, but the scores differ only minutely between the candidate labels

#### Sample

In [None]:
# Classify the first five ticket descriptions as a quick sanity check.
inference_sample_ip = df["Ticket Description"].iloc[:5].tolist()
# Only arguments the zero-shot pipeline understands are passed: the candidate
# labels and `multi_label` (each label is scored independently of the others).
inference_sample_op = pipe(inference_sample_ip, labels, multi_label=True)

In [None]:
# Inspect the raw pipeline output for the five sample tickets.
inference_sample_op

In [None]:
# Keep only the first entry of each result's 'labels' list, i.e. the predicted label per sample.
[result['labels'][0] for result in inference_sample_op]

#### Sample Data

In [None]:
sample_len = 10   # maximum number of tickets drawn per ticket type
sample_seed = 42  # fixed seed so the same tickets are drawn on every run

# Draw up to `sample_len` rows from each Ticket_Type group without replacement.
# Groups smaller than `sample_len` contribute all of their rows. The index is
# reset so that rows are addressable by position 0..N-1 afterwards.
sampled_df = df.groupby('Ticket_Type', group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), sample_len), replace=False, random_state=sample_seed)
).reset_index(drop=True)

In [None]:
# Bar chart of the number of sampled tickets per ticket type.
sampled_type_counts = sampled_df["Ticket_Type"].value_counts()
sampled_type_counts.plot(kind='bar', title='Value Counts of Ticket Type', xlabel='Ticket Type', ylabel='Count')

In [None]:
from sklearn.model_selection import train_test_split

def create_train_test_split(df, test_size=0.2, random_state=None):
    """
    Split the tickets into stratified train and test Hugging Face datasets.

    The split is stratified on 'Ticket_Type', so each ticket type keeps roughly the
    same proportion in both parts. Only the 'Ticket Description' and 'Ticket_Type'
    columns are carried over; they are renamed to 'text' and 'label'.

    Args:
    df (pd.DataFrame): Input DataFrame containing 'Ticket Description' and 'Ticket_Type'.
    test_size (float): Proportion of the dataset to include in the test split.
    random_state (int | None): Seed forwarded to train_test_split. The default None
        keeps the split unseeded.

    Returns:
    train_dataset (datasets.Dataset): Training dataset with 'text' and 'label' columns.
    test_dataset (datasets.Dataset): Testing dataset with 'text' and 'label' columns.
    """
    # Imported here under an alias because this notebook binds the bare name
    # `Dataset` to torch.utils.data.Dataset as well as to datasets.Dataset;
    # which one the global refers to depends on cell execution order.
    from datasets import Dataset as HFDataset

    column_names = {"Ticket Description": "text", "Ticket_Type": "label"}

    # Perform stratified train-test split
    train_df, test_df = train_test_split(
        df[list(column_names)],
        test_size=test_size,
        stratify=df['Ticket_Type'],
        random_state=random_state,
    )

    def _to_dataset(frame):
        # Convert one split to a Hugging Face dataset and apply the column renames.
        dataset = HFDataset.from_pandas(frame)
        for old_name, new_name in column_names.items():
            dataset = dataset.rename_column(old_name, new_name)
        return dataset

    return _to_dataset(train_df), _to_dataset(test_df)

In [None]:
from torch.utils.data import Dataset

class ListDataset(Dataset):
    """Expose a plain Python list through the torch ``Dataset`` interface."""

    def __init__(self, original_list):
        """Keep a reference to ``original_list``; the list is not copied."""
        self.original_list = original_list

    def __len__(self):
        """Return the number of items in the wrapped list."""
        return len(self.original_list)

    def __getitem__(self, i):
        """Return the item stored at position ``i`` of the wrapped list."""
        return self.original_list[i]

In [None]:
# Wrap the sampled ticket descriptions in a ListDataset before handing them to the pipeline.
sampled_descriptions = sampled_df["Ticket Description"].tolist()
text_data = ListDataset(sampled_descriptions)
text_data

In [None]:
# Preview the first rows of the per-type sample.
sampled_df.head()

In [None]:

def calculate_accuracy(df, ip_data):
    """
    Measure zero-shot classification accuracy against the true ticket types.

    Runs the notebook-level `pipe` over `ip_data` with the notebook-level `labels`
    as candidate labels, takes the first entry of each result's 'labels' list as
    the prediction, and compares the predictions with `df.Ticket_Type`.

    Args:
    df (pd.DataFrame): Frame whose 'Ticket_Type' column holds the true labels.
        Its rows are expected to be in the same order as `ip_data`.
    ip_data: The texts to classify, passed to `pipe` unchanged.

    Returns:
    The value returned by sklearn's accuracy_score for true vs. predicted labels.
    """
    true_labels = df.Ticket_Type.tolist()

    # Give tqdm a total when the input has a length, so the bar shows real progress.
    total = len(ip_data) if hasattr(ip_data, "__len__") else None

    # Only zero-shot arguments are passed; text-generation options such as
    # sampling or top-k have no meaning for this pipeline.
    outputs = pipe(ip_data, labels, multi_label=True)
    predicted_labels = [output['labels'][0] for output in tqdm(outputs, total=total)]

    return accuracy_score(true_labels, predicted_labels)

In [None]:
# Zero-shot accuracy on the per-type sample; text_data holds sampled_df's descriptions in the same row order.
accuracy_sample = calculate_accuracy(sampled_df, text_data)

In [None]:
# Display the zero-shot accuracy computed for the sample.
accuracy_sample

# Few-shot learning

In [None]:
!pip install setfit

In [None]:
from datasets import Dataset, get_dataset_split_names
from transformers import AutoTokenizer
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer


In [None]:
# Stratified 80/20 split of the per-type sample into train and test datasets.
train_dataset, test_dataset =  create_train_test_split(sampled_df, test_size=0.2)

In [None]:
# Display both datasets to check their columns and sizes.
train_dataset,test_dataset

In [None]:
# Load SetFit model from Hub
setfit_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Create trainer
trainer = SetFitTrainer(
    model=setfit_model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=5, # Number of text pairs to generate for contrastive learning
    num_epochs=5 # Number of epochs to use for contrastive learning
)


In [None]:
# Fine-tune on the trainer's train_dataset, then evaluate on its eval_dataset.
trainer.train()
metrics = trainer.evaluate()

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; needed before the model is pushed below.
notebook_login()

In [None]:
# NOTE(review): the `store` credential helper saves git credentials on disk for reuse — confirm that is acceptable in this environment.
!git config --global credential.helper store

In [None]:
# Hub repository id under which the fine-tuned model is published.
trained_model_name = "neha-duggirala/customer-support-classifier_setfit-trainer"
trainer.push_to_hub(trained_model_name)

In [None]:
# Load the model back from the Hub repository named by trained_model_name.
trained_model = SetFitModel.from_pretrained(trained_model_name)

In [None]:
# Build an evaluation dataset from every ticket in `df`, with the 'text' / 'label'
# column names used for the train and test datasets.
# NOTE(review): sampled_df was drawn from this same frame, so the rows used for
# training are scored again here — confirm whether they should be excluded.
evaluation_columns = df[['Ticket Description', 'Ticket_Type']]
validation_dataset = (
    Dataset.from_pandas(evaluation_columns)
    .rename_column('Ticket Description', 'text')
    .rename_column('Ticket_Type', 'label')
)
predicted = trained_model.predict(validation_dataset['text'])

In [None]:
# True ticket types, in the same order as the texts that were passed to predict().
actual = validation_dataset['label']

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the fine-tuned SetFit model over validation_dataset.
accuracy_score(actual, predicted)

## Weights & Biases logging

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

# Read the W&B API key from Kaggle's secret store instead of hardcoding it in the notebook.
user_secrets = UserSecretsClient()

my_secret = user_secrets.get_secret("wandb_api_key") 

# Authenticate the wandb client with the retrieved key.
wandb.login(key=my_secret)

In [None]:
# Start a W&B run in the project used for this classification work.
run = wandb.init(project="customer-support-classification")

## Prompt

In [None]:
from langchain.prompts import PromptTemplate

# Few-shot prompt: task description, four labelled examples, and a final
# description whose label is left blank for the model to complete.
template = '''
Classify the following ticket description into one of the five issue types: ['Technical issue', 'Billing inquiry', 'Cancellation request', 'Product inquiry', 'Refund request']. Carefully read the ticket description and identify key phrases and context that indicate the nature of the issue. Match the identified key phrases and context to one of the five issue types. Consider the specific characteristics of each issue type:

Technical issue: Problems related to software, hardware, or system functionality.
Billing inquiry: Questions or concerns about charges, payments, or invoices.
Cancellation request: Requests to terminate a subscription or service.
Product inquiry: Questions about product features, usage, or availability.
Refund request: Requests for reimbursement for a purchased product or service.

Here are a few examples:
description: {description_billing_1}
label: {label_1}

description: {description_cancellation_1}
label: {label_2}

description: {description_product_1}
label: {label_3}

description: {description_refund_1}
label: {label_4}

description: {description_technical_1}
label: 
'''

# The declared variables must match the placeholders in the template exactly.
# The last "label:" is intentionally empty, so there is no label_5 variable.
prompt = PromptTemplate(
    input_variables=[
        "description_billing_1", "label_1",
        "description_cancellation_1", "label_2",
        "description_product_1", "label_3",
        "description_refund_1", "label_4",
        "description_technical_1",
    ],
    template=template
)

# Rows 0, 11, 21, 31 and 41 are meant to pick one ticket of each type.
# NOTE(review): this assumes sampled_df holds exactly 10 rows per ticket type,
# grouped by type in the order billing, cancellation, product, refund, technical — confirm.
filled_prompt = prompt.format(
    description_billing_1=sampled_df["Ticket Description"][0],
    label_1=sampled_df["Ticket_Type"][0],
    description_cancellation_1=sampled_df["Ticket Description"][11],
    label_2=sampled_df["Ticket_Type"][11],
    description_product_1=sampled_df["Ticket Description"][21],
    label_3=sampled_df["Ticket_Type"][21],
    description_refund_1=sampled_df["Ticket Description"][31],
    label_4=sampled_df["Ticket_Type"][31],
    description_technical_1=sampled_df["Ticket Description"][41],
)

# True label of the unlabelled query ticket, kept out of the prompt so it can be
# compared with the model's answer.
expected_label = sampled_df["Ticket_Type"][41]

# print(filled_prompt)
