In [1]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Step 1: Load the Dataset
file_path = "/Users/mohammadadnaan/Downloads/Prodigal_Tech/intelligent-debt-recovery-genai/src/ml_models/privacy_compliance_detection/conversations_with_labels.xlsx"
df = pd.read_excel(file_path)

# Display the first few rows of the dataset
print(df.head())


                                Call ID  \
0  2db2965e-54fa-41fa-823b-ed79b943f0b1   
1  8a9655a7-be88-4921-b0ad-04aa1b9953d1   
2  19169ec6-213f-48e9-8ec2-c24ee2e6eb20   
3  c86714f0-3aeb-4628-8cfa-ee0a46839508   
4  0c8297aa-ced1-414e-a175-bf29a9763d30   

                                                Text  Label  \
0  Hello, this is Emma from XYZ Collections, how ...      0   
1  Hello, this is Mark from XYZ Collections. How ...      0   
2  Hello, this is Mark calling from XYZ Collectio...      0   
3  Hello, this is Mark calling from XYZ Collectio...      0   
4  Hello, this is Mike from XYZ Collections. Am I...      0   

   Double Verification  
0                    0  
1                    1  
2                    0  
3                    1  
4                    1  


In [3]:
# Step 2: Preprocess the Text Data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # Lemmatization (using simple rule-based approach)
    lemmatized_text = []
    for word in text.split():
        if word.endswith('ing'):
            word = word[:-3]  # Remove 'ing'
        elif word.endswith('ed'):
            word = word[:-2]  # Remove 'ed'
        elif word.endswith('s'):
            word = word[:-1]  # Remove 's'
        lemmatized_text.append(word)
    text = " ".join(lemmatized_text)
    
    # Remove stop words (using a custom list)
    stop_words = set(["the", "and", "is", "in", "it", "to", "of", "for", "with", "on", "at", "by", "this", "that", "are", "as", "be", "was", "were", "you", "your", "we", "our"])
    text = " ".join([word for word in text.split() if word not in stop_words])
    
    return text

# Apply preprocessing to the text column
df["Processed_Text"] = df["Text"].apply(preprocess_text)


In [4]:
# Step 3: Tokenization and Padding
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary to 5000 words
tokenizer.fit_on_texts(df["Processed_Text"])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df["Processed_Text"])

# Pad sequences to ensure uniform input size
max_len = 100  # Maximum sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post")

import pickle
with open('/Users/mohammadadnaan/Downloads/Prodigal_Tech/intelligent-debt-recovery-genai/src/ml_models/privacy_compliance_detection/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [5]:
# Step 4: Prepare Labels
labels = df["Label"].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [6]:
# Step 5: Build the LSTM Model
embedding_dim = 100  # Dimension of word embeddings
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_len,)),  # Add input_shape
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")  # Binary classification
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Display the model summary
model.summary()

  super().__init__(**kwargs)


In [7]:
# Step 6: Train the Model
history = model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2)

Epoch 1/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 89ms/step - accuracy: 0.7475 - loss: 0.6828 - val_accuracy: 0.9500 - val_loss: 0.6287
Epoch 2/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.8745 - loss: 0.5936 - val_accuracy: 0.9500 - val_loss: 0.3507
Epoch 3/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.8775 - loss: 0.4267 - val_accuracy: 0.9500 - val_loss: 0.1820
Epoch 4/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.8389 - loss: 0.4613 - val_accuracy: 0.9500 - val_loss: 0.2954
Epoch 5/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8662 - loss: 0.3761 - val_accuracy: 0.9500 - val_loss: 0.2685
Epoch 6/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8610 - loss: 0.3451 - val_accuracy: 0.9500 - val_loss: 0.1998
Epoch 7/200
[1m5/5[0m [32m━━━━━━━━━━━

In [8]:
# Step 7: Evaluate the Model
# Predict on the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        45
           1       0.60      0.60      0.60         5

    accuracy                           0.92        50
   macro avg       0.78      0.78      0.78        50
weighted avg       0.92      0.92      0.92        50

Accuracy: 92.00%


In [9]:
# Step 8: Save the Model
model.save("privacy_compliance_lstm_model.keras")
print("Model saved successfully.")

Model saved successfully.


In [10]:
import numpy as np

# Example message to test
message = ['Your account has been flagged for suspicious activity. Please call us immediately to resolve the issue.']

# Step 1: Tokenize and pad the message
seq = tokenizer.texts_to_sequences(message)  # Convert text to sequence of integers
padded = pad_sequences(seq, maxlen=max_len, dtype='int32', value=0)  # Pad the sequence

# Step 2: Make a prediction with the model
pred = model.predict(padded)  # Get the prediction probability

# Define labels
labels = ['No issue (Label 0)', 'Sensitive info shared without verification (Label 1)']

# Get the predicted class (0 or 1)
predicted_class = int(pred[0][0] > 0.5)  # Threshold at 0.5 for binary classification

# Step 4: Show the predicted class with its probability
print(f"Prediction: {labels[predicted_class]} with probability: {pred[0][0] * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Prediction: No issue (Label 0) with probability: 0.00%


In [11]:
message = ['Hello, this is Mark from XYZ Collections. How are you today? What the hell do you want? I have told you assholes to leave me alone! I understand that you may be upset. I am calling about your account with ABC Bank. Can I have your account number to assist you? Why the fuck would I give you anything? You are just some low-life calling me! I assure you, I am here to help resolve this matter. Your current balance is $500. How do you prefer to settle this? Settle this? You guys are a bunch of crooks! I do not owe you shit! I understand your frustration. However, it is important for us to discuss this. The account is past due. Fuck you! You think I am gonna pay you just because you say so? I appreciate your honesty. My goal is simply to find a solution. Would you be open to setting up a payment plan? A payment plan? You are dreaming! Not a dime from me! I respect your decision. Just know I am here to assist whenever you are ready to discuss options. Yeah, whatever. Just get the hell off my phone! Thank you for your time. Have a good day.']

# Step 1: Tokenize and pad the message
seq = tokenizer.texts_to_sequences(message)  # Convert text to sequence of integers
padded = pad_sequences(seq, maxlen=max_len, dtype='int32', value=0)  # Pad the sequence

# Step 2: Make a prediction with the model
pred = model.predict(padded)  # Get the prediction probability

# Define labels
labels = ['No issue (Label 0)', 'Sensitive info shared without verification (Label 1)']

# Get the predicted class (0 or 1)
predicted_class = int(pred[0][0] > 0.5)  # Threshold at 0.5 for binary classification

# Step 4: Show the predicted class with its probability
print(f"Prediction: {labels[predicted_class]} with probability: {pred[0][0] * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Prediction: Sensitive info shared without verification (Label 1) with probability: 99.97%


In [12]:
message = ['Hello, this is Mark from XYZ Collections. How are you today? I am okay, thanks for asking. What is this about? I am calling to discuss your outstanding balance with ABC Credit Union. Can I have your account number to proceed? I do not have that right now, can you give me more details first? Sure! You have an overdue balance of $450. How do you wish to handle this? That sounds high. When was this due? The payment was due on June 15th. I can set up a payment plan if needed. I may need some time, can you reduce the amount? Unfortunately, I can not adjust the balance. However, I can help with the payment options available. What are the options then? You can either pay in full or set up monthly payments. Would you like to discuss that? I need to think about it before making a decision. Of course! Just call us back when you are ready. Thank you for your time. Thank you, goodbye.']

# Step 1: Tokenize and pad the message
seq = tokenizer.texts_to_sequences(message)  # Convert text to sequence of integers
padded = pad_sequences(seq, maxlen=max_len, dtype='int32', value=0)  # Pad the sequence

# Step 2: Make a prediction with the model
pred = model.predict(padded)  # Get the prediction probability

# Define labels
labels = ['No issue (Label 0)', 'Sensitive info shared without verification (Label 1)']

# Get the predicted class (0 or 1)
predicted_class = int(pred[0][0] > 0.5)  # Threshold at 0.5 for binary classification

# Step 4: Show the predicted class with its probability
print(f"Prediction: {labels[predicted_class]} with probability: {pred[0][0] * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Prediction: Sensitive info shared without verification (Label 1) with probability: 99.98%
