In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string


In [None]:
# Location of the FAQ query CSV (absolute path; adjust if the file lives elsewhere).
file_path = '/content/updated_banking_faq_queries.csv'
# Read the whole CSV into a DataFrame used by the rest of the notebook.
df = pd.read_csv(file_path)

In [None]:
def preprocess_text(text):
    """Normalize a raw query string for vectorization.

    Steps, in order: strip ASCII punctuation, lowercase, remove digit runs.
    Whitespace is left untouched, so removing a number can leave extra spaces.

    Args:
        text: The raw query. Anything that is not a ``str`` (for example the
            NaN/None pandas yields for an empty CSV cell) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None and have no .translate();
    # return an empty string instead of raising inside DataFrame.apply.
    if not isinstance(text, str):
        return ""
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

In [None]:
# Add a cleaned copy of the 'Variation' text column by running preprocess_text on every row.
df['cleaned_variation'] = df['Variation'].apply(preprocess_text)

In [None]:
# Collect the distinct values of the 'CAT_A' column (the classification targets) and display them.
distinct_values = df['CAT_A'].unique()
distinct_values


array(['How do I open a new bank account?',
       'What documents are required for account verification?',
       'How do I close my account?',
       'How do I update my personal details?',
       'I forgot my username/password. How can I recover it?',
       'How do I reset my online banking password?',
       'What should I do if my account is locked?',
       'How can I enable two-factor authentication (2FA)?',
       'Why hasn’t my transfer gone through?',
       'How can I track my transaction history?',
       'I made a transfer to the wrong account. Can I reverse it?',
       'What are the daily transfer limits?',
       'How do I activate my new credit/debit card?',
       'What should I do if my card is lost/stolen?',
       'How do I request a credit limit increase?',
       'How can I block or unblock my card?',
       'How do I apply for a personal loan?',
       'What is the interest rate on loans?',
       'How do I repay my loan early?',
       'What are the eligibilit

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode each 'CAT_A' string as an integer class id.
# NOTE(review): the following cell creates and fits the encoder again, so this cell is redundant.
le = LabelEncoder()
df['CAT_A_encoded'] = le.fit_transform(df['CAT_A'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh encoder on the category column and store the integer codes.
le = LabelEncoder()
df['CAT_A_encoded'] = le.fit_transform(df['CAT_A'])

# Pair every category string with the integer code the encoder assigned to it.
label_mapping = {
    category: code
    for category, code in zip(le.classes_, le.transform(le.classes_))
}

print(label_mapping)

{'Can I deposit checks using an ATM?': 0, 'Can I get a refund for an overdraft fee?': 1, 'Can I withdraw money from my savings account before maturity?': 2, 'How can I block or unblock my card?': 3, 'How can I change my transaction limits in the mobile app?': 4, 'How can I check my account balance online?': 5, 'How can I check the status of a scheduled payment?': 6, 'How can I enable two-factor authentication (2FA)?': 7, 'How can I protect myself from online banking fraud?': 8, 'How can I stop or modify a recurring payment?': 9, 'How can I track my transaction history?': 10, 'How do I activate my new credit/debit card?': 11, 'How do I apply for a new service or product?': 12, 'How do I apply for a personal loan?': 13, 'How do I avoid monthly maintenance fees?': 14, 'How do I close my account?': 15, 'How do I contact customer service?': 16, 'How do I dispute a transaction?': 17, 'How do I download the mobile banking app?': 18, 'How do I open a fixed deposit account?': 19, 'How do I open

In [None]:
# Pull the cleaned text and the encoded labels out of the DataFrame as NumPy arrays.
# Despite the "train_" prefix these hold every row; the train/test split happens in a later cell.
train_sentences = df["cleaned_variation"].to_numpy()
train_labels = df["CAT_A_encoded"].to_numpy()

In [None]:
# Display the shape of the label array (one entry per sample).
train_labels.shape

(2400,)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_sentences,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Display the integer-encoded training labels.
y_train

array([15, 24,  2, ...,  6, 17, 39])

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization


In [None]:
# TextVectorization with every option spelled out: no vocabulary cap, no fixed output length.
# NOTE(review): `text_vectorizer` is reassigned in a later cell with max_tokens and
# output_sequence_length set, so this instance is never adapted or used by the model.
text_vectorizer = TextVectorization(max_tokens=None, standardize="lower_and_strip_punctuation", split="whitespace",  ngrams=None, output_mode="int", output_sequence_length=None)

In [None]:
# Mean number of whitespace-separated tokens per training sentence, rounded to an integer.
round(sum(len(sentence.split()) for sentence in X_train) / len(X_train))

11

In [None]:
# Upper bound on vocabulary size; passed to TextVectorization(max_tokens=...) and Embedding(input_dim=...).
max_vocab_length = 10000
# Every sequence is padded/truncated to this many tokens (the average sentence above is about 11 tokens).
max_length = 20

In [None]:
# The vectorizer actually used by the model: maps each string to a fixed-length
# sequence of integer token ids, capped at max_vocab_length distinct tokens.
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [None]:
# Learn the vocabulary from the training split only, so test sentences do not influence it.
text_vectorizer.adapt(X_train)

In [None]:
# Sanity-check the vectorizer on a sentence unrelated to the training data.
# In the output, id 0 is padding up to max_length and id 1 marks out-of-vocabulary tokens.
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[838,   8,   1,  27,   3,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]])>

In [None]:
# Seed TensorFlow's global random generator before any layers are created.
tf.random.set_seed(42)
from tensorflow.keras import layers

In [None]:
# Trainable lookup table turning each token id into a 128-dimensional vector.
# input_dim matches the vectorizer's max_tokens so every emitted id has a row.
# The former `input_length` argument is dropped: Keras 3 deprecates it (it only
# triggers a warning) and the sequence length is inferred from the input anyway.
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             name="embedding_1")



In [None]:
import random
# Pick an arbitrary training sentence and show it next to its embedded form.
random_sentence = random.choice(X_train)
# Adjacent string literals replace the old backslash line-continuation, which
# leaked the next source line's indentation into the printed text as trailing
# spaces after the sentence.
print(
    f"Original text:\n{random_sentence}"
    "\n\nEmbedded version:"
)

# Vectorize to token ids, then look up the embedding vector for each id.
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
how do i check my payment history for issues      

Embedded version:


<tf.Tensor: shape=(1, 20, 128), dtype=float32, numpy=
array([[[-0.04900318, -0.04708189, -0.02478098, ..., -0.01102585,
         -0.03193197,  0.01119925],
        [-0.01910412,  0.00801634,  0.0207254 , ..., -0.03519938,
         -0.00616542,  0.03229127],
        [-0.04346534,  0.0112267 , -0.04433036, ...,  0.03350014,
         -0.01906863, -0.02327411],
        ...,
        [ 0.02816925, -0.01936547,  0.0268073 , ..., -0.01825867,
          0.03880591,  0.00524354],
        [ 0.02816925, -0.01936547,  0.0268073 , ..., -0.01825867,
          0.03880591,  0.00524354],
        [ 0.02816925, -0.01936547,  0.0268073 , ..., -0.01825867,
          0.03880591,  0.00524354]]], dtype=float32)>

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """Summarize classification quality as a dict of four metrics.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        A dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Accuracy is scaled to a percentage (0-100); the other three
        are weighted averages left on the 0-1 scale.
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100

    # The fourth element (support) is not needed for the summary.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    return dict(
        accuracy=accuracy_pct,
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2

# Size the softmax layer from the fitted label encoder instead of a hard-coded
# 48, so the output always has exactly one unit per category in the data.
num_classes = len(le.classes_)

# End-to-end model: raw string -> token ids -> embeddings -> LSTM -> dense head.
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
# The LSTM returns only its final state, i.e. one 128-d vector per sentence.
x = layers.LSTM(128)(x)
x = layers.Dense(100, activation="relu")(x)
x = layers.Dense(50, activation="relu")(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_LSTM")

In [None]:
# Sparse categorical cross-entropy is used because the targets are integer class ids
# (from LabelEncoder), not one-hot vectors. Adam runs with its default settings.
model_1.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)


In [None]:
# Print the layer-by-layer architecture and parameter counts.
model_1.summary()

In [None]:
# Train for 50 epochs on the training split and keep the History object.
# NOTE(review): no validation data is passed, so the logged accuracy/loss are training-set only.
model_1_history = model_1.fit(X_train,
                              y_train,
                              epochs=50,
                              )

Epoch 1/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.0238 - loss: 3.8727
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.0422 - loss: 3.7301
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 65ms/step - accuracy: 0.1424 - loss: 2.8970
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.2677 - loss: 2.2914
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.3520 - loss: 1.9707
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.3880 - loss: 1.7380
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.4405 - loss: 1.5418
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.5463 - loss: 1.2483
Epoch 9/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━

In [None]:
# Per-class probabilities (softmax outputs) for every held-out test sentence.
model_1_pred_probs = model_1.predict(X_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [None]:
import numpy as np
# Index of the highest-probability class in each row becomes the predicted label.
model_1_preds = model_1_pred_probs.argmax(axis=1)

In [None]:
# Display the predicted class ids for the test split.
model_1_preds

array([44, 40, 39,  2, 45, 18,  8,  2, 17, 25,  2, 15,  9, 47, 18, 28, 26,
       21, 40,  1, 45, 31,  6,  0,  2, 30, 37, 40, 37, 27, 28,  6, 44,  9,
        9, 27, 29, 17, 30, 34, 46, 37, 43, 33, 36,  6, 44, 27, 16,  6, 25,
       19, 30, 21, 18, 16, 40, 47,  8,  1, 28, 22, 19, 34, 38, 23, 17, 46,
        5, 25, 47, 18, 46, 19,  7, 24, 36, 43,  3, 15, 24,  3, 21,  3, 33,
       39,  6, 10, 15, 20, 10, 11, 29, 32, 35, 40, 23, 37,  8,  0, 17,  2,
       45, 42, 18, 17, 31, 39, 19, 10, 23, 28, 32, 38, 13, 46, 19, 28,  9,
       11, 42, 39,  9, 38, 18, 31, 12, 32, 34, 26, 40,  6, 16,  7,  6, 47,
       44, 29,  5, 13, 25,  4, 31, 14,  9, 31,  1, 31, 27, 10,  1, 40, 45,
       29, 38,  3,  3,  6, 26, 37, 21, 22, 11, 20, 11,  4, 35, 29, 20, 26,
       46, 31, 38, 29, 10, 42, 24, 11, 40, 13, 28, 24, 46, 40, 19, 17,  8,
       40, 36, 46, 37, 46, 35, 34, 12, 36, 24, 16, 18, 38, 31,  9, 19,  8,
        0, 45,  8, 10, 46, 24, 28, 46, 25, 47, 24, 43, 28, 44,  5,  3, 22,
       46, 24, 47, 25,  2

In [None]:
# Display the true class ids for the test split, for comparison with the predictions.
y_test

array([44,  4, 39,  2, 44, 15,  8,  2, 17, 25,  2, 15,  9, 35, 18, 28, 26,
       21, 35,  1, 45,  0, 45,  0,  8, 30, 37,  2, 37, 27, 28,  6, 44,  9,
        9, 27, 29, 10, 30, 34, 46, 37,  3, 33, 36,  6, 44, 40, 16,  6, 25,
       19, 46, 21, 18, 16, 12, 47,  8,  1, 28, 22, 19, 34, 28, 23, 17, 46,
        5, 25, 47, 18, 43, 19, 29, 24, 32, 26,  3, 14, 29,  3, 21,  3, 33,
       39,  6, 10, 15, 20, 31, 11, 29, 32, 35, 40, 23, 37,  8,  0, 17, 47,
       39, 37, 18,  4, 31, 39, 19, 47, 23, 28, 32, 28, 13, 46, 19, 28,  9,
       11, 42,  1,  9, 38, 18, 30, 13, 32, 34, 26, 40,  6, 16,  7,  6, 47,
       44, 29,  5, 13, 25,  4, 37, 14,  9, 43,  1, 12, 27, 10,  1, 40, 45,
       24, 38,  3,  3, 22, 26, 31, 21, 22, 11, 20, 11, 44, 35, 29, 46, 26,
       46, 32, 28, 29, 10, 42, 24, 11, 40, 13,  2, 24, 46, 40, 19, 22,  8,
        2, 36, 32, 37, 46, 35, 34, 12, 36, 24, 16, 32, 38, 31,  9, 22,  8,
        0, 45, 31, 10, 46, 24, 38, 46, 25, 14, 24, 43, 28, 44, 14,  3, 22,
       46, 12,  8, 25,  2

In [None]:
# Score the test-set predictions. Note that calculate_results reports accuracy as a
# percentage (0-100) while precision, recall and f1 stay on the 0-1 scale.
model_1_results = calculate_results(y_true=y_test,
                                    y_pred=model_1_preds)
model_1_results

{'accuracy': 77.08333333333334,
 'precision': 0.7910764736299275,
 'recall': 0.7708333333333334,
 'f1': 0.767641522052219}

In [None]:
# Translate the predicted class ids back to their original category strings.
mapped_labels = le.inverse_transform(model_1_preds)


In [None]:
# Display the predicted category strings.
mapped_labels

array(['Where is the nearest bank branch or ATM?',
       'What is the maximum amount I can withdraw from an ATM?',
       'What is the interest rate on loans?',
       'Can I withdraw money from my savings account before maturity?',
       'Why did my payment fail?',
       'How do I download the mobile banking app?',
       'How can I protect myself from online banking fraud?',
       'Can I withdraw money from my savings account before maturity?',
       'How do I dispute a transaction?',
       'How do I set up auto-payments for my bills?',
       'Can I withdraw money from my savings account before maturity?',
       'How do I close my account?',
       'How can I stop or modify a recurring payment?',
       'Why was I charged a fee on my account?',
       'How do I download the mobile banking app?',
       'How is interest calculated on savings accounts?',
       'How do I update my personal details?',
       'How do I register for online banking?',
       'What is the maximum am

In [None]:
# Save in the native Keras format rather than legacy HDF5. The model embeds a
# TextVectorization layer whose adapted vocabulary is stored as an asset, which
# the legacy .h5 format does not reliably persist (and Keras 3 warns that .h5
# is legacy). Anything that loads this model must point at the .keras file.
model_1.save('model_path.keras')

