In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the Bitext customer-support dataset with kagglehub; `path` is the local
# directory the download call returns, and `files` is its directory listing.
path = kagglehub.dataset_download("bitext/bitext-gen-ai-chatbot-customer-support-dataset")
files = os.listdir(path)
print("Files in dataset:", files)

# os.listdir returns entries in arbitrary order, so sort the candidates to make
# "the first CSV" the same file on every run and platform.
csv_files = sorted(f for f in files if f.endswith(".csv"))
if not csv_files:
    # Stop here with a clear error: merely printing a message would leave `df`
    # undefined and surface later as an unrelated NameError.
    raise FileNotFoundError(f"No CSV files found in the dataset directory: {path}")

df = pd.read_csv(os.path.join(path, csv_files[0]))  # Load the first CSV file
print(df.head())

Files in dataset: ['Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv']
   flags                                        instruction category  \
0      B   question about cancelling order {{Order Number}}    ORDER   
1    BQZ  i have a question about cancelling oorder {{Or...    ORDER   
2   BLQZ    i need help cancelling puchase {{Order Number}}    ORDER   
3     BL         I need to cancel purchase {{Order Number}}    ORDER   
4  BCELN  I cannot afford this order, cancel purchase {{...    ORDER   

         intent                                           response  
0  cancel_order  I've understood you have a question regarding ...  
1  cancel_order  I've been informed that you have a question ab...  
2  cancel_order  I can sense that you're seeking assistance wit...  
3  cancel_order  I understood that you need assistance with can...  
4  cancel_order  I'm sensitive to the fact that you're facing f...  


In [3]:
# Features are the text of the `instruction` column; labels are the `intent` column.
X, y = df['instruction'], df['intent']

In [4]:
# First cut: keep 80% for training and set 20% aside, stratified on the intent
# label so every split keeps the same class proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Second cut: divide the held-out 20% evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Print dataset sizes
print(
    f"Training set: {len(X_train)}",
    f"Validation set: {len(X_val)}",
    f"Test set: {len(X_test)}",
    sep="\n",
)

Training set: 21497
Validation set: 2687
Test set: 2688


In [5]:

# Encode intent labels: learn the intent -> integer mapping from the training
# labels only, then apply that same fitted mapping to all three splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_val_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_val, y_test)
)


In [6]:

# TF-IDF vectorizer with the vocabulary capped at 5000 terms for efficiency.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vocabulary / IDF weights on the training text only, then reuse the
# fitted vectorizer to project the validation and test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_val, X_test)
)


In [7]:
# Fit a logistic regression (library defaults) on the TF-IDF training matrix.
model_tfidf = LogisticRegression().fit(X_train_tfidf, y_train_encoded)

# Evaluate: predict the held-out test split and report plain accuracy.
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
tfidf_accuracy = accuracy_score(y_test_encoded, y_pred_tfidf)
print("TF-IDF Accuracy:", tfidf_accuracy)


TF-IDF Accuracy: 0.9951636904761905


In [8]:

# Generate classification report
# List every encoded class id explicitly (LabelEncoder assigns 0..n_classes-1).
# Without `labels`, the report only covers ids that actually occur in
# y_test_encoded / y_pred_tfidf, and sklearn raises a ValueError when that count
# differs from len(target_names) -- i.e. as soon as the evaluated split is
# missing one intent.
class_ids = list(range(len(label_encoder.classes_)))
report_tfidf = classification_report(
    y_test_encoded,
    y_pred_tfidf,
    labels=class_ids,
    target_names=label_encoder.classes_,
)
print("TF-IDF Model Performance:\n", report_tfidf)


TF-IDF Model Performance:
                           precision    recall  f1-score   support

            cancel_order       1.00      1.00      1.00       100
            change_order       0.97      0.98      0.97        99
 change_shipping_address       1.00      1.00      1.00        97
  check_cancellation_fee       1.00      1.00      1.00        95
           check_invoice       1.00      1.00      1.00       100
   check_payment_methods       1.00      1.00      1.00       100
     check_refund_policy       1.00      1.00      1.00       100
               complaint       1.00      1.00      1.00       100
contact_customer_service       1.00      1.00      1.00       100
     contact_human_agent       1.00      1.00      1.00       100
          create_account       0.98      0.98      0.98       100
          delete_account       0.98      1.00      0.99        99
        delivery_options       1.00      1.00      1.00       100
         delivery_period       1.00      1.00   

In [10]:
# Smoke-test the pipeline on one hand-written query.
sentence = 'what is your return policy?'

# The vectorizer expects an iterable of documents, hence the one-element list.
sentence_tfidf = vectorizer.transform([sentence])
predicted_label = model_tfidf.predict(sentence_tfidf)
# Map the predicted integer id back to its intent name.
intent = label_encoder.inverse_transform(predicted_label)
print(
    f"Sentence: {sentence}",
    f"Predicted Intent: {intent[0]}",
    sep="\n",
)

Sentence: what is your return policy?
Predicted Intent: check_refund_policy
