In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string


In [None]:
# Dataset location. The notebook's wget cell saves with a numeric suffix
# (".2") only when earlier copies already exist, so this hardcoded local path
# is missing on a fresh runtime.
import os

DATA_URL = 'https://raw.githubusercontent.com/omsharma-001/CS683_NLP_Project/refs/heads/main/updated_banking_faq_queries.csv'
file_path = '/content/updated_banking_faq_queries.csv.2'

# Fall back to reading straight from the source URL when the local copy is
# absent, so the notebook survives "Restart & Run All".
df = pd.read_csv(file_path if os.path.exists(file_path) else DATA_URL)

In [None]:
def preprocess_text(text):
    """Normalise a raw query string before vectorisation.

    Strips ASCII punctuation (``string.punctuation``), lower-cases the text
    and deletes every run of digits. Whitespace is left untouched.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents a missing
            cell) are treated as an empty string; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

In [None]:
# Clean every query in the 'Variation' column with preprocess_text.
df['cleaned_variation'] = [preprocess_text(raw) for raw in df['Variation']]

In [None]:
# Collect the distinct category strings found in the CAT_A column and show them.
distinct_values = df['CAT_A'].unique()
distinct_values


array(['How do I open a new bank account?',
       'What documents are required for account verification?',
       'How do I close my account?',
       'How do I update my personal details?',
       'I forgot my username/password. How can I recover it?',
       'How do I reset my online banking password?',
       'What should I do if my account is locked?',
       'How can I enable two-factor authentication (2FA)?',
       'Why hasn’t my transfer gone through?',
       'How can I track my transaction history?',
       'I made a transfer to the wrong account. Can I reverse it?',
       'What are the daily transfer limits?',
       'How do I activate my new credit/debit card?',
       'What should I do if my card is lost/stolen?',
       'How do I request a credit limit increase?',
       'How can I block or unblock my card?',
       'How do I apply for a personal loan?',
       'What is the interest rate on loans?',
       'How do I repay my loan early?',
       'What are the eligibilit

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit a label encoder on the CAT_A strings and store the integer ids
# alongside the original column.
le = LabelEncoder()
_cat_a_codes = le.fit_transform(df['CAT_A'])
df['CAT_A_encoded'] = _cat_a_codes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on CAT_A (again) and store the integer ids as a new column.
le = LabelEncoder()
df['CAT_A_encoded'] = le.fit_transform(df['CAT_A'])

# Category text -> integer id, as assigned by the fitted encoder.
label_mapping = {
    category: code
    for category, code in zip(le.classes_, le.transform(le.classes_))
}

print(label_mapping)

{'Can I deposit checks using an ATM?': 0, 'Can I get a refund for an overdraft fee?': 1, 'Can I withdraw money from my savings account before maturity?': 2, 'How can I block or unblock my card?': 3, 'How can I change my transaction limits in the mobile app?': 4, 'How can I check my account balance online?': 5, 'How can I check the status of a scheduled payment?': 6, 'How can I enable two-factor authentication (2FA)?': 7, 'How can I protect myself from online banking fraud?': 8, 'How can I stop or modify a recurring payment?': 9, 'How can I track my transaction history?': 10, 'How do I activate my new credit/debit card?': 11, 'How do I apply for a new service or product?': 12, 'How do I apply for a personal loan?': 13, 'How do I avoid monthly maintenance fees?': 14, 'How do I close my account?': 15, 'How do I contact customer service?': 16, 'How do I dispute a transaction?': 17, 'How do I download the mobile banking app?': 18, 'How do I open a fixed deposit account?': 19, 'How do I open

In [None]:
# Pull the cleaned text and the encoded labels out of the frame as NumPy arrays.
train_sentences, train_labels = (
    df[column].to_numpy() for column in ("cleaned_variation", "CAT_A_encoded")
)

In [None]:
# Check how many labels were extracted.
train_labels.shape

(2400,)

In [None]:
# Hold out 20% of the examples for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_sentences,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Peek at the encoded training labels.
y_train

array([15, 24,  2, ...,  6, 17, 39])

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization


In [None]:
# Vectoriser with every option spelled out: no vocabulary cap, lower-case and
# strip punctuation, split on whitespace, no n-grams, integer token ids, and
# no fixed output length.
text_vectorizer = TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
)

In [None]:
# Mean number of whitespace-separated tokens per training sentence, rounded.
round(
    sum(len(sentence.split()) for sentence in X_train) / len(X_train)
)

11

In [None]:
# Vocabulary cap passed to TextVectorization as `max_tokens` and to the
# Embedding layer as `input_dim`.
max_vocab_length = 10000
# Output sequence length passed to TextVectorization
# (`output_sequence_length`).
max_length = 20

In [None]:
# The vectoriser actually used by the model: capped vocabulary, integer token
# ids, fixed output length.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
)

In [None]:
# Build the vectoriser's vocabulary from the training sentences only.
text_vectorizer.adapt(X_train)

In [None]:
# Smoke-test the adapted vectoriser on a single hand-written sentence.
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[838,   8,   1,  27,   3,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]])>

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that both the
# model initialisation and the later `random.choice` sampling are repeatable;
# `tf.random.set_seed` alone only seeds TensorFlow.
tf.keras.utils.set_random_seed(42)
from tensorflow.keras import layers

In [None]:
# Trainable lookup table mapping each token id to a 128-dimensional vector.
# `input_length` is no longer passed: the argument is deprecated in Keras 3,
# and the sequence length is already fixed by the TextVectorization layer's
# `output_sequence_length`.
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             name="embedding_1")



In [None]:
import random
# Pick one training sentence at random and show its embedded representation.
random_sentence = random.choice(X_train)
# Single-line f-string: a backslash continuation inside the literal would
# splice the next line's leading indentation into the printed text.
print(f"Original text:\n{random_sentence}\n\nEmbedded version:")

sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
what protections do i have against fraud on my account      

Embedded version:


<tf.Tensor: shape=(1, 20, 128), dtype=float32, numpy=
array([[[-0.04966151, -0.01337762, -0.03872795, ...,  0.04911056,
          0.02110047, -0.01044925],
        [ 0.00264834,  0.03271407,  0.01709182, ..., -0.0046545 ,
         -0.02428919,  0.01326612],
        [-0.02545686,  0.0101047 ,  0.00307274, ...,  0.04726168,
          0.01722542,  0.00299506],
        ...,
        [-0.01642863,  0.03795857, -0.01505759, ...,  0.03567597,
         -0.0482098 ,  0.02025864],
        [-0.01642863,  0.03795857, -0.01505759, ...,  0.03567597,
         -0.0482098 ,  0.02025864],
        [-0.01642863,  0.03795857, -0.01505759, ...,  0.03567597,
         -0.0482098 ,  0.02025864]]], dtype=float32)>

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """Summarise classification quality for a set of predictions.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned with ``y_true``.

  Returns:
    A dict with keys "accuracy", "precision", "recall" and "f1". Accuracy is
    the ``accuracy_score`` multiplied by 100; the other three are the
    weighted-average values returned by ``precision_recall_fscore_support``
    and are not rescaled.
  """
  # The fourth element (support) is not reported.
  precision, recall, f1, _support = precision_recall_fscore_support(
      y_true, y_pred, average="weighted")
  return {
      "accuracy": accuracy_score(y_true, y_pred) * 100,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2

# Size the softmax head from the fitted label encoder instead of hardcoding
# the class count, so the model keeps matching the labels if categories are
# added to or removed from the dataset.
num_classes = len(le.classes_)

# Raw strings in -> token ids -> embeddings -> LSTM -> dense head.
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(128)(x)
x = layers.Dense(100, activation="relu")(x)
x = layers.Dense(50, activation="relu")(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_LSTM")

In [None]:
# Targets are integer class ids, hence the sparse categorical cross-entropy.
model_1.compile(optimizer=tf.keras.optimizers.Adam(),
                loss="sparse_categorical_crossentropy",
                metrics=["accuracy"])


In [None]:
# Print the layer-by-layer architecture of the compiled model.
model_1.summary()

In [None]:
# Train on the training split for 50 epochs; no validation data is supplied.
model_1_history = model_1.fit(x=X_train, y=y_train, epochs=50)

Epoch 1/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 51ms/step - accuracy: 0.0099 - loss: 3.8727
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - accuracy: 0.0541 - loss: 3.5922
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - accuracy: 0.1445 - loss: 2.7539
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.2820 - loss: 2.1869
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 58ms/step - accuracy: 0.4332 - loss: 1.7270
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - accuracy: 0.4893 - loss: 1.4932
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.5613 - loss: 1.2367
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - accuracy: 0.6633 - loss: 0.9722
Epoch 9/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━

In [None]:
# Per-class probabilities for every held-out test sentence.
model_1_pred_probs = model_1.predict(X_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [None]:
import numpy as np
# Predicted class id = index of the largest probability in each row.
model_1_preds = model_1_pred_probs.argmax(axis=1)

In [None]:
# Show the predicted class ids for the test set.
model_1_preds

array([44, 40, 39,  2,  0, 26,  8,  2, 30, 12,  2, 15,  6, 45, 18, 28, 26,
       21, 23,  1, 45, 24, 45,  0, 21, 30,  3, 40, 37, 27,  0,  6, 44,  9,
        9, 27, 29, 10, 30, 34, 46, 37,  3, 33, 36,  6, 44, 27, 16,  6, 25,
       19, 30, 21, 18, 16, 12, 47, 41,  0, 28, 22, 19, 34, 38, 23, 17, 46,
        5, 25, 47, 18, 42, 19, 29, 24, 29, 43,  3,  7, 24,  3, 21,  3, 33,
       39,  6, 10, 15, 20, 43, 11, 29, 32, 35, 40, 40, 37,  8,  0, 17, 47,
        6, 16, 18,  4, 43, 39, 19, 47, 23, 38, 32, 38, 13, 46, 19, 28,  9,
       11, 42,  1,  9, 38, 18, 42, 12, 32, 34, 26, 40,  6, 16,  7,  6, 47,
       44, 29, 21, 13, 25,  4, 31, 47,  9, 43,  1, 12, 27, 10,  1, 40, 45,
       24, 38,  3,  3, 21, 26, 37, 21, 22, 11, 20, 11, 44, 35, 29, 33, 26,
       46, 29,  0, 29, 10, 42, 24, 11, 40, 13, 39, 24, 46, 40, 19,  6, 21,
       40, 36, 32, 37, 46, 35, 25, 21, 36, 24, 26, 44, 22,  3,  6, 22, 21,
        0, 45, 31, 10, 46, 24, 28, 46, 25, 47, 24, 43, 28, 44, 21,  3, 22,
       46, 13,  8, 47,  2

In [None]:
# Show the true class ids for the test set, for comparison with the predictions.
y_test

array([44,  4, 39,  2, 44, 15,  8,  2, 17, 25,  2, 15,  9, 35, 18, 28, 26,
       21, 35,  1, 45,  0, 45,  0,  8, 30, 37,  2, 37, 27, 28,  6, 44,  9,
        9, 27, 29, 10, 30, 34, 46, 37,  3, 33, 36,  6, 44, 40, 16,  6, 25,
       19, 46, 21, 18, 16, 12, 47,  8,  1, 28, 22, 19, 34, 28, 23, 17, 46,
        5, 25, 47, 18, 43, 19, 29, 24, 32, 26,  3, 14, 29,  3, 21,  3, 33,
       39,  6, 10, 15, 20, 31, 11, 29, 32, 35, 40, 23, 37,  8,  0, 17, 47,
       39, 37, 18,  4, 31, 39, 19, 47, 23, 28, 32, 28, 13, 46, 19, 28,  9,
       11, 42,  1,  9, 38, 18, 30, 13, 32, 34, 26, 40,  6, 16,  7,  6, 47,
       44, 29,  5, 13, 25,  4, 37, 14,  9, 43,  1, 12, 27, 10,  1, 40, 45,
       24, 38,  3,  3, 22, 26, 31, 21, 22, 11, 20, 11, 44, 35, 29, 46, 26,
       46, 32, 28, 29, 10, 42, 24, 11, 40, 13,  2, 24, 46, 40, 19, 22,  8,
        2, 36, 32, 37, 46, 35, 34, 12, 36, 24, 16, 32, 38, 31,  9, 22,  8,
        0, 45, 31, 10, 46, 24, 38, 46, 25, 14, 24, 43, 28, 44, 14,  3, 22,
       46, 12,  8, 25,  2

In [None]:
# Score the test-set predictions against the held-out labels.
model_1_results = calculate_results(y_test, model_1_preds)
model_1_results

{'accuracy': 76.45833333333333,
 'precision': 0.7939104963323713,
 'recall': 0.7645833333333333,
 'f1': 0.7609430720231661}

In [None]:
# Translate the predicted class ids back to their category strings.
mapped_labels = le.inverse_transform(model_1_preds)


In [None]:
# Show the decoded category strings.
mapped_labels

array(['Where is the nearest bank branch or ATM?',
       'What is the maximum amount I can withdraw from an ATM?',
       'What is the interest rate on loans?',
       'Can I withdraw money from my savings account before maturity?',
       'Can I deposit checks using an ATM?',
       'How do I update my personal details?',
       'How can I protect myself from online banking fraud?',
       'Can I withdraw money from my savings account before maturity?',
       'I made a transfer to the wrong account. Can I reverse it?',
       'How do I apply for a new service or product?',
       'Can I withdraw money from my savings account before maturity?',
       'How do I close my account?',
       'How can I check the status of a scheduled payment?',
       'Why did my payment fail?',
       'How do I download the mobile banking app?',
       'How is interest calculated on savings accounts?',
       'How do I update my personal details?',
       'How do I register for online banking?',
       

In [None]:
# Persist the trained model to 'model_path.h5', relative to the current
# working directory.
# NOTE(review): the ".h5" suffix selects the legacy HDF5 format; confirm that
# the adapted TextVectorization vocabulary survives a save/load round trip,
# or consider the native ".keras" format.
model_1.save('model_path.h5')



In [None]:
import tensorflow as tf
import numpy as np

# Classify one hand-written query: apply the same cleaning step used for the
# training text, wrap the result in a length-1 string tensor, and ask the
# model for its per-class probabilities.
custom_message = "How can I create a new bank account?"
cleaned_text = preprocess_text(custom_message)

custom_message_array = np.array([cleaned_text])

custom_message_tensor = tf.convert_to_tensor(custom_message_array, dtype=tf.string)

prediction_probs = model_1.predict(custom_message_tensor)

print(prediction_probs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[[4.4351380e-13 1.6389726e-13 1.6836075e-03 3.8635766e-19 7.8487272e-14
  4.2292608e-10 1.1607899e-13 4.6821960e-06 7.5300704e-07 4.5420765e-15
  4.1983782e-08 1.1791831e-06 3.7116126e-06 1.1038336e-12 2.6234648e-10
  5.1265115e-06 5.8389521e-08 2.3707118e-11 1.9879453e-08 5.5379977e-17
  9.9379987e-01 2.7647731e-03 1.2369738e-15 3.7152569e-17 8.6810799e-09
  4.0470134e-04 1.2890004e-03 2.1081842e-19 1.6417875e-18 1.4191388e-08
  1.4056562e-11 4.2549447e-05 1.1314153e-14 3.3318345e-17 5.8160049e-10
  1.5294935e-15 8.4390984e-12 6.3026749e-17 1.8607032e-12 2.0457576e-17
  1.0993642e-09 7.7457166e-16 1.2395671e-15 1.2431662e-11 3.4795492e-19
  9.7993453e-17 4.0875117e-19 6.6910880e-11]]


In [None]:
# Class id with the highest probability for the custom message.
# NOTE: this rebinds `model_1_preds`, replacing the test-set predictions
# computed earlier in the notebook.
model_1_preds = np.argmax(prediction_probs, axis = 1)

In [None]:
# Decode the predicted class id back to its category string
# (rebinds `mapped_labels`).
mapped_labels = le.inverse_transform(model_1_preds)


In [None]:
# Show the decoded category for the custom message.
mapped_labels

array(['How do I open a new bank account?'], dtype=object)

In [None]:
import tensorflow as tf

# Reload the model from the same relative path it was saved to
# (`model_1.save('model_path.h5')`) rather than assuming the working
# directory is /content.
model = tf.keras.models.load_model('model_path.h5')




In [None]:
# Download the banking FAQ CSV from GitHub into the current working directory.
!wget https://raw.githubusercontent.com/omsharma-001/CS683_NLP_Project/refs/heads/main/updated_banking_faq_queries.csv

--2024-11-03 10:25:04--  https://raw.githubusercontent.com/omsharma-001/CS683_NLP_Project/refs/heads/main/updated_banking_faq_queries.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 305396 (298K) [text/plain]
Saving to: ‘updated_banking_faq_queries.csv.2’


2024-11-03 10:25:04 (2.46 MB/s) - ‘updated_banking_faq_queries.csv.2’ saved [305396/305396]

