In [4]:

import nltk

# Fetch the NLTK resources 'punkt', 'stopwords' and 'wordnet'. The captured
# output of this cell reports each package as already up to date.
# The value of the last call is what the notebook displays as the cell result.
# NOTE(review): none of the cells visible here call into nltk afterwards —
# confirm these downloads are still needed before keeping them.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /home/gaurav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/gaurav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gaurav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
import re
import string

# Load the dataset
# The path is relative, so it is resolved against the kernel's working directory.
# Later statements in this cell read the 'Variation' and 'CAT_A' columns.
file_path = 'updated_banking_faq_queries.csv'
df = pd.read_csv(file_path)

# Step 1: Preprocessing function to clean the text
def preprocess_text(text):
    """Normalize a raw query before it is vectorized.

    Characters in ``string.punctuation`` are stripped, the text is
    lowercased, and runs of digits are removed. Whitespace is left as-is.

    Args:
        text: The raw query. Missing values (``None``, ``NaN``, ``pd.NA``)
            are treated as an empty string; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text.
    """
    if not isinstance(text, str):
        # Empty CSV cells come back from pandas as NaN (a float), which has
        # no .translate(); map missing values to '' instead of raising.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove digits
    text = re.sub(r'\d+', '', text)
    return text

# Apply the preprocessing function to the 'Variation' column
df['cleaned_variation'] = df['Variation'].apply(preprocess_text)

# Step 2: Encode the target variable 'CAT_A' using LabelEncoder
le = LabelEncoder()
df['CAT_A_encoded'] = le.fit_transform(df['CAT_A'])

# Step 3: Train-test split
X = df['cleaned_variation']
y = df['CAT_A_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Create a pipeline with TF-IDF Vectorizer and SVM classifier
model = make_pipeline(
    TfidfVectorizer(),
    SVC(kernel='linear', random_state=42)
)

# Step 5: Train the model
model.fit(X_train, y_train)

# Step 6: Make predictions and evaluate the model
y_pred = model.predict(X_test)
# The split is not stratified, so a small class can be absent from both y_test
# and y_pred. Without an explicit `labels` list, classification_report infers
# the labels from the data and raises when their count differs from
# len(target_names). Passing every encoded label keeps the two aligned; the
# report is unchanged when all classes are present.
report = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
)

# Display the classification report
print(report)


                                                                precision    recall  f1-score   support

                            Can I deposit checks using an ATM?       1.00      1.00      1.00        11
                      Can I get a refund for an overdraft fee?       1.00      0.89      0.94         9
 Can I withdraw money from my savings account before maturity?       0.92      0.85      0.88        13
                           How can I block or unblock my card?       1.00      1.00      1.00        12
     How can I change my transaction limits in the mobile app?       1.00      0.75      0.86         8
                    How can I check my account balance online?       0.92      0.92      0.92        12
            How can I check the status of a scheduled payment?       1.00      1.00      1.00         8
             How can I enable two-factor authentication (2FA)?       1.00      1.00      1.00         9
           How can I protect myself from online banking fraud? 

In [8]:
import joblib

# Destination files for the fitted pipeline and for the LabelEncoder
# (the encoder is needed later to decode the predictions)
model_filename = 'svm_cat_a_model.pkl'
label_encoder_filename = 'label_encoder.pkl'

# Write each object to its file, model first and encoder second
for artifact, destination in ((model, model_filename), (le, label_encoder_filename)):
    joblib.dump(artifact, destination)

print(f"Model and Label Encoder saved to {model_filename} and {label_encoder_filename}")


Model and Label Encoder saved to svm_cat_a_model.pkl and label_encoder.pkl


In [10]:
import joblib

# Load the saved model and label encoder
# joblib.load unpickles the file contents, which can execute arbitrary code,
# so only load artifacts from a trusted source.
# These assignments rebind the notebook-level names `model` and `le`.
model = joblib.load('svm_cat_a_model.pkl')
le = joblib.load('label_encoder.pkl')

# Function to predict CAT_A based on a new input
def predict_cat_a(new_input):
    """Predict the CAT_A category for a single free-text query.

    Relies on the notebook-level names ``preprocess_text``, ``model`` and
    ``le`` being defined before the call.

    Args:
        new_input: The raw query text to classify.

    Returns:
        The original (decoded) CAT_A label predicted for the query.
    """
    # Clean the query the same way as the training text, then classify it
    # as a one-element batch.
    encoded_batch = model.predict([preprocess_text(new_input)])
    # Map the single encoded prediction back to its original category.
    decoded_batch = le.inverse_transform([encoded_batch[0]])
    return decoded_batch[0]

# Example usage:
# Classify one free-text query and print the CAT_A label returned for it.
new_query = "money transferred wrongly?"
predicted_category = predict_cat_a(new_query)
print(f"The predicted CAT_A for '{new_query}' is: {predicted_category}")


The predicted CAT_A for 'money transferred wrongly?' is: I made a transfer to the wrong account. Can I reverse it?
