In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Step 1: Load the Dataset
file_path = '/Users/mohammadadnaan/Downloads/Prodigal_Tech/intelligent-debt-recovery-genai/src/ml_models/profanity_detection/profanity_data.csv'
df = pd.read_csv(file_path)

# Display few rows
print(df.head())
print(df.tail())

                                                text  label
0  hello this is emma from xyz collections how ar...      0
1  i am fine but i really do not have time for th...      0
2  i understand i will be brief i am calling abou...      0
3  look i have told you before i want my number o...      0
4  i apologize for that i will ensure your number...      0
                                                   text  label
3182  time you think you can just sit on your ass an...      1
3183  i assure you that i am taking this seriously a...      0
3184  you better do it fast or i am going to escalat...      1
3185          i understand thank you for your call mike      0
3186  whatever just make sure you do not screw this ...      1


In [3]:
# Step 2: Preprocess the Text Data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # Lemmatization (using simple rule-based approach)
    lemmatized_text = []
    for word in text.split():
        if word.endswith('ing'):
            word = word[:-3]  # Remove 'ing'
        elif word.endswith('ed'):
            word = word[:-2]  # Remove 'ed'
        elif word.endswith('s'):
            word = word[:-1]  # Remove 's'
        lemmatized_text.append(word)
    text = " ".join(lemmatized_text)
    
    # Remove stop words (using a custom list)
    stop_words = set(["the", "and", "is", "in", "it", "to", "of", "for", "with", "on", "at", "by", "this", "that", "are", "as", "be", "was", "were", "you", "your", "we", "our"])
    text = " ".join([word for word in text.split() if word not in stop_words])
    
    return text

# Apply preprocessing to the text column
df["Processed_Text"] = df["text"].apply(preprocess_text)


In [4]:
# Step 3: Convert Text to Numerical Features
vectorizer = TfidfVectorizer(max_features=5000)  # Limit vocabulary to 5000 words
X = vectorizer.fit_transform(df["Processed_Text"]).toarray()  # Convert text to TF-IDF features
y = df["label"]

# Save the TF-IDF vectorizer for future use
import pickle
with open('/Users/mohammadadnaan/Downloads/Prodigal_Tech/intelligent-debt-recovery-genai/src/ml_models/profanity_detection/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [5]:
# Step 4: Handle Class Imbalance
# Use SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [6]:
# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [7]:
# Step 6: Build the Model
model = Sequential([
    Dense(128, activation="relu", input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)),  # Input layer
    Dropout(0.5),  # Add dropout to prevent overfitting
    Dense(64, activation="relu", kernel_regularizer=l2(0.01)),  # Hidden layer
    Dropout(0.5),  # Add dropout to prevent overfitting
    Dense(1, activation="sigmoid")  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


# Display the model summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# Step 7: Train the Model with Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/200
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7378 - loss: 2.1855 - val_accuracy: 0.9665 - val_loss: 0.4804
Epoch 2/200
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9620 - loss: 0.4248 - val_accuracy: 0.9717 - val_loss: 0.3129
Epoch 3/200
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9738 - loss: 0.3016 - val_accuracy: 0.9780 - val_loss: 0.2749
Epoch 4/200
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 990us/step - accuracy: 0.9727 - loss: 0.2784 - val_accuracy: 0.9822 - val_loss: 0.2536
Epoch 5/200
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1000us/step - accuracy: 0.9830 - loss: 0.2466 - val_accuracy: 0.9811 - val_loss: 0.2479
Epoch 6/200
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 996us/step - accuracy: 0.9797 - loss: 0.2451 - val_accuracy: 0.9801 - val_loss: 0.2330
Epoch 7/200
[1

In [9]:
# Step 8: Evaluate the Model
# Predict on the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 713us/step
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       614
           1       0.99      1.00      1.00       579

    accuracy                           1.00      1193
   macro avg       1.00      1.00      1.00      1193
weighted avg       1.00      1.00      1.00      1193

Accuracy: 99.66%


In [10]:
# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.66%


In [11]:
def predict_profanity(text):
    # Step 1: Preprocess the input text
    processed_text = preprocess_text(text)
    
    # Step 2: Convert the preprocessed text to numerical features using the same vectorizer
    text_vectorized = vectorizer.transform([processed_text]).toarray()
    
    # Step 3: Make a prediction using the trained model
    pred = model.predict(text_vectorized)
    
    # Step 4: Interpret the prediction
    label = "profane" if pred[0][0] > 0.5 else "not profane"
    probability = pred[0][0] * 100 if label == "profane" else (1 - pred[0][0]) * 100
    
    # Step 5: Print the result
    print(f"Text: {text}")
    print(f"Prediction: {label} with probability: {probability:.2f}%")
    print()

# Test examples
predict_profanity("Your account has been flagged for suspicious activity. Please call us immediately to resolve the issue.")
predict_profanity("Fuck you! Go to hell!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Text: Your account has been flagged for suspicious activity. Please call us immediately to resolve the issue.
Prediction: not profane with probability: 99.69%

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Text: Fuck you! Go to hell!
Prediction: profane with probability: 99.75%



In [12]:
# Save the model after evaluation
model.save('profanity.keras')