In [53]:
import pandas as pd
import numpy as np
import re
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [54]:
# Step 1: Load the Dataset
file_path = "/Users/mohammadadnaan/Downloads/Prodigal_Tech/intelligent-debt-recovery-genai/src/ml_models/privacy_compliance_detection/conversations_with_labels.xlsx"
df = pd.read_excel(file_path)

In [55]:
# Step 2: Preprocess the Text Data
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    lemmatized_text = []
    for word in text.split():
        if word.endswith('ing'):
            word = word[:-3]
        elif word.endswith('ed'):
            word = word[:-2]
        elif word.endswith('s'):
            word = word[:-1]
        lemmatized_text.append(word)
    text = " ".join(lemmatized_text)
    stop_words = set(["the", "and", "is", "in", "it", "to", "of", "for", "with", "on", "at", "by", "this", "that", "are", "as", "be", "was", "were", "you", "your", "we", "our"])
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

df["Processed_Text"] = df["Text"].apply(preprocess_text)


In [56]:
# Step 3: Tokenization and Padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["Processed_Text"])

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

sequences = tokenizer.texts_to_sequences(df["Processed_Text"])
max_len = 100
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post")


In [57]:
# Step 4: Prepare Labels
labels = df["Label"].values

# Check class distribution
print("Class Distribution:", Counter(labels))


Class Distribution: Counter({0: 220, 1: 29})


In [58]:
# Step 5: Handle Data Imbalance
# Option 1: Oversampling the Minority Class
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(padded_sequences, labels)

# Check new class distribution
print("Resampled Class Distribution:", Counter(y_resampled))


Resampled Class Distribution: Counter({0: 220, 1: 220})


In [59]:
# Step 6: Build the LSTM Model
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

def build_model():
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_len,)),
        Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3, kernel_regularizer=l2(0.01))),
        Dense(64, activation="relu", kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(1, activation="sigmoid")
    ])
    # Option 2: Class Weighting
    class_weights = {0: 1, 1: 5}  # Adjust weights based on class imbalance
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])
    return model, class_weights

In [61]:
# Step 7: Train the Model
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

model, class_weights = build_model()

early_stopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

history = model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2, callbacks=[early_stopping, reduce_lr], class_weight=class_weights)


Epoch 1/200
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 85ms/step - accuracy: 0.5353 - loss: 6.1593 - val_accuracy: 0.6479 - val_loss: 4.0466 - learning_rate: 0.0010
Epoch 2/200
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.5401 - loss: 4.6509 - val_accuracy: 0.6479 - val_loss: 3.2850 - learning_rate: 0.0010
Epoch 3/200
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.6404 - loss: 3.7038 - val_accuracy: 0.6479 - val_loss: 2.8129 - learning_rate: 0.0010
Epoch 4/200
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - accuracy: 0.6474 - loss: 3.0083 - val_accuracy: 0.6620 - val_loss: 2.2862 - learning_rate: 0.0010
Epoch 5/200
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - accuracy: 0.6749 - loss: 2.3808 - val_accuracy: 0.7465 - val_loss: 1.7034 - learning_rate: 0.0010
Epoch 6/200
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/s

In [62]:
# Step 8: Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        49
           1       0.87      1.00      0.93        39

    accuracy                           0.93        88
   macro avg       0.93      0.94      0.93        88
weighted avg       0.94      0.93      0.93        88

Accuracy: 93.18%


In [63]:
# Step 8: Save the Model
model.save("privacy_compliance_lstm_model.keras")
print("Model saved successfully.")

Model saved successfully.


In [64]:
message = ['Hello, this is Mark from XYZ Collections. How are you today? I am okay, thanks. What is this about? I am calling regarding your outstanding debt with Global Bank. Can you confirm your name for me? It is Jordan Smith. Thank you, Jordan. Your account shows a balance of $750. How would you like to settle this today? I need to check my finances before deciding. I understand. Just so you know, we can set up a payment plan if that helps. That could work. Can you send me the details? Absolutely, I can send that via email. Can you confirm your email address? It is jordan.smith@email.com. Great! I will send the details right away. Is there anything else you need? No, that is all for now. Thanks. You are welcome! Have a great day. You too. Bye!']

# Step 1: Tokenize and pad the message
seq = tokenizer.texts_to_sequences(message)  # Convert text to sequence of integers
padded = pad_sequences(seq, maxlen=max_len, dtype='int32', value=0)  # Pad the sequence

# Step 2: Make a prediction with the model
pred = model.predict(padded)  # Get the prediction probability

# Define labels
labels = ['No issue (Label 0)', 'Sensitive info shared without verification (Label 1)']

# Get the predicted class (0 or 1)
predicted_class = int(pred[0][0] > 0.5)  # Threshold at 0.5 for binary classification

# Step 4: Show the predicted class with its probability
print(f"Prediction: {labels[predicted_class]} with probability: {pred[0][0] * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction: Sensitive info shared without verification (Label 1) with probability: 54.50%


In [65]:
message = ['Hello this is Max from ABC credit bank. Am I talking to Adam? Yes you are. You son of bitch why did not you pay the last installment of your debt. I would like to talk for this matter professionally and respectfully. Okay Whats your Date of Birth and address for respecting your  privacy? its 14th January 1990 and I am living on Adam block, high rise apartment , new york. Thanks your due amount is 500$.']

# Step 1: Tokenize and pad the message
seq = tokenizer.texts_to_sequences(message)  # Convert text to sequence of integers
padded = pad_sequences(seq, maxlen=max_len, dtype='int32', value=0)  # Pad the sequence

# Step 2: Make a prediction with the model
pred = model.predict(padded)  # Get the prediction probability

# Define labels
labels = ['No issue (Label 0)', 'Sensitive info shared without verification (Label 1)']

# Get the predicted class (0 or 1)
predicted_class = int(pred[0][0] > 0.5)  # Threshold at 0.5 for binary classification

# Step 4: Show the predicted class with its probability
print(f"Prediction: {labels[predicted_class]} with probability: {pred[0][0] * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Prediction: No issue (Label 0) with probability: 0.09%


In [66]:
message = ['Hello, this is Mark from XYZ Collections. How are you today? What the hell do you want? I have told you assholes to leave me alone! I understand that you may be upset. I am calling about your account with ABC Bank. Can I have your account number to assist you? Why the fuck would I give you anything? You are just some low-life calling me! I assure you, I am here to help resolve this matter. Your current balance is $500. How do you prefer to settle this? Settle this? You guys are a bunch of crooks! I do not owe you shit! I understand your frustration. However, it is important for us to discuss this. The account is past due. Fuck you! You think I am gonna pay you just because you say so? I appreciate your honesty. My goal is simply to find a solution. Would you be open to setting up a payment plan? A payment plan? You are dreaming! Not a dime from me! I respect your decision. Just know I am here to assist whenever you are ready to discuss options. Yeah, whatever. Just get the hell off my phone! Thank you for your time. Have a good day.']

# Step 1: Tokenize and pad the message
seq = tokenizer.texts_to_sequences(message)  # Convert text to sequence of integers
padded = pad_sequences(seq, maxlen=max_len, dtype='int32', value=0)  # Pad the sequence

# Step 2: Make a prediction with the model
pred = model.predict(padded)  # Get the prediction probability

# Define labels
labels = ['No issue (Label 0)', 'Sensitive info shared without verification (Label 1)']

# Get the predicted class (0 or 1)
predicted_class = int(pred[0][0] > 0.5)  # Threshold at 0.5 for binary classification

# Step 4: Show the predicted class with its probability
print(f"Prediction: {labels[predicted_class]} with probability: {pred[0][0] * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Prediction: Sensitive info shared without verification (Label 1) with probability: 99.92%
