In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\APPLE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load dataset
data_path = 'data/Bitext_Sample_Customer_Service_Training_Dataset.csv'
df = pd.read_csv(data_path)

In [5]:
df.head(5)

Unnamed: 0,flags,utterance,category,intent
0,BM,I have problems with canceling an order,ORDER,cancel_order
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order
2,B,I need help with canceling the last order,ORDER,cancel_order
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order
4,B,problem with cancelling an order I made,ORDER,cancel_order


In [6]:
# Sample cleaning function
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
    text = text.lower()  # Lowercase the text
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df['cleaned_text'] = df['utterance'].apply(clean_text)

In [7]:
df.head(4)

Unnamed: 0,flags,utterance,category,intent,cleaned_text
0,BM,I have problems with canceling an order,ORDER,cancel_order,problems canceling order
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order,find information canceling orders
2,B,I need help with canceling the last order,ORDER,cancel_order,need help canceling last order
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order,could help cancelling last order made


In [8]:
# Feature Extraction

# extract featuyres and labels

utterances = df['utterance'].values
intents = df['intent'].values
categories=df['category'].values

## Create a dictionary to display the queries and intents for inspection
for utterance, intent in zip(utterances[:5], intents[:5]):
    print(f"User Query: {utterance} -> Intent: {intent}")

User Query: I have problems with canceling an order -> Intent: cancel_order
User Query: how can I find information about canceling orders? -> Intent: cancel_order
User Query: I need help with canceling the last order -> Intent: cancel_order
User Query: could you help me cancelling the last order I made? -> Intent: cancel_order
User Query: problem with cancelling an order I made -> Intent: cancel_order


In [9]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['intent'], test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 6540
Testing set size: 1635


In [10]:
# Tokenization and vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=3000)

# Fit and transform the training data into TF-IDF vectors
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

TF-IDF matrix shape: (6540, 578)


In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels for training
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"Encoded labels: {y_train_encoded[:5]}")

Encoded labels: [11  7  1 14 19]


In [12]:
#Retrieval - based model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fit the TF-IDF vectorizer on the training data
vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()

# Define a function to retrieve the most similar response
def get_response(query):
    # Transform the user query to TF-IDF
    query_tfidf = vectorizer.transform([query]).toarray()
    
    # Compute cosine similarity between the query and all training utterances
    similarities = cosine_similarity(query_tfidf, X_train_tfidf)
    
    # Get the index of the most similar utterance
    max_sim_index = np.argmax(similarities)
    
    # Return the corresponding response (intent or action)
    return y_train.iloc[max_sim_index]

# Test the retrieval-based model
query = "could I check if there is anything wrong with my refund?"
response = get_response(query)
print(f"User Query: {query} \nPredicted Response/Intent: {response}")


User Query: could I check if there is anything wrong with my refund? 
Predicted Response/Intent: track_refund


In [15]:
# Transformerws -based models

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Replace 'your_token_here' with your actual Hugging Face token

# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize input and generate a response
input_text = "I do not know howI can change to a different account?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate response using GPT-2
response_ids = model.generate(input_ids, max_length=50)
response = tokenizer.decode(response_ids[0], skip_special_tokens=True)

print(f"User Query: {input_text} \nGenerated Response: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


User Query: I do not know howI can change to a different account? 
Generated Response: I do not know howI can change to a different account?

I am not sure how to change to a different account.

I am not sure how to change to a different account.

I am not sure how to change


In [13]:
# Quantitative Evaluation mertics
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Reference response (correct response)
reference = [['I', 'can', 'help', 'you', 'cancel', 'your', 'order']]

# Generated response from the chatbot
candidate = ['I', 'can', 'help', 'cancel', 'your', 'order']

# Compute BLEU score
bleu_score = sentence_bleu(reference, candidate)
print(f"BLEU Score: {bleu_score}")

BLEU Score: 8.221833772233233e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [14]:
# Save the model and tokenizer
model.save_pretrained('model/gpt2_model')
tokenizer.save_pretrained('model/gpt2_model')


('model/gpt2_model\\tokenizer_config.json',
 'model/gpt2_model\\special_tokens_map.json',
 'model/gpt2_model\\vocab.json',
 'model/gpt2_model\\merges.txt',
 'model/gpt2_model\\added_tokens.json')