In [10]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import re

In [73]:
# 1. Load the datasets
try_data_path = 'annotated_bad_words.csv'

badwordlist_data_path = 'BadWordListUpdated.csv'

In [74]:
try_df = pd.read_csv(try_data_path)
badwordlist_df = pd.read_csv(badwordlist_data_path)

In [75]:
try_df.dropna(inplace=True)

In [76]:
# 2. Extract bad words and sentences
badwords_devanagari = badwordlist_df['Devanagari'].dropna().tolist()
sentences = try_df['Filtered Sentences']

In [77]:
len(badwords_devanagari)

453

In [15]:
# Function to check if a sentence contains any bad words
def contains_badword(sentence, badword_list):
    for word in badword_list:
        if re.search(r'\b' + re.escape(word) + r'\b', sentence):
            return 1  # Contains a bad word
    return 0  # No bad word found

In [16]:
# 3. Annotate sentences with bad word labels
try_df['contains_badword'] = sentences.apply(lambda x: contains_badword(x, badwords_devanagari))

In [17]:
# 4. Tokenize and pad sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(try_df['Filtered Sentences'])

In [18]:
# Convert sentences to sequences of token IDs
sequences = tokenizer.texts_to_sequences(try_df['Filtered Sentences'])

In [19]:
# Pad the sequences to ensure uniform length
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [20]:
# 5. Prepare the labels
y = try_df['contains_badword'].values

In [21]:
# 6. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# 7. Build the LSTM model
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size for embedding layer

In [23]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification (badword or not)
])



In [24]:
# 8. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [25]:
# 9. Train the model
model.fit(X_train, y_train, epochs=5, batch_size=8, validation_data=(X_test, y_test))

Epoch 1/5
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 182ms/step - accuracy: 0.6851 - loss: 0.5592 - val_accuracy: 0.9195 - val_loss: 0.1882
Epoch 2/5
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 193ms/step - accuracy: 0.9741 - loss: 0.0821 - val_accuracy: 0.9368 - val_loss: 0.1674
Epoch 3/5
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 161ms/step - accuracy: 0.9964 - loss: 0.0180 - val_accuracy: 0.9210 - val_loss: 0.2606
Epoch 4/5
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 163ms/step - accuracy: 0.9955 - loss: 0.0146 - val_accuracy: 0.9402 - val_loss: 0.2829
Epoch 5/5
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 156ms/step - accuracy: 0.9989 - loss: 0.0030 - val_accuracy: 0.9417 - val_loss: 0.3729


<keras.src.callbacks.history.History at 0x1cfd91a2610>

In [26]:
# 10. Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 100ms/step - accuracy: 0.9450 - loss: 0.3818
Test Accuracy: 94.17%


In [86]:
# Function to highlight bad words in the sentence
def highlight_badwords(sentence, badword_list):
    # Find all bad words in the sentence first without nesting
    badword_set = set()  # To store unique bad words found in the sentence
    for word in badword_list:
        if re.search(r'\b' + re.escape(word) + r'\b', sentence):
            badword_set.add(word)
    
    # Replace each found bad word with tags, avoiding repeated replacements
    for word in badword_set:
        print(word)
        sentence = sentence.replace(word, f"<badword>{word}</badword>")
    
    return sentence

In [87]:
new_test_sentence = "तू भड़वे है, तेरी माँ की चूत!"

In [91]:
new_test_sentence = "मादरचोद ये फोटोशोप किया हवा हेध्यानसे देख चुतियो कुछ तो सही तरीके से करो"

In [92]:
# Tokenize and pad the test sentence
max_sequence_length = 200  # Set the same max length used during training
test_sequence = tokenizer.texts_to_sequences([new_test_sentence])
test_sequence_padded = pad_sequences(test_sequence, maxlen=max_sequence_length)

In [93]:
# Predict if it contains a bad word
prediction = model.predict(test_sequence_padded)
predicted_label = (prediction > 0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


In [94]:
# Output the prediction result
if predicted_label[0][0] == 1:
    highlighted_sentence = highlight_badwords(new_test_sentence, badwords_devanagari)
    print(f"Predicted: The sentence contains bad words.\nHighlighted: {highlighted_sentence}")
else:
    print("Predicted: The sentence does not contain bad words.")

मादरचोद
Predicted: The sentence contains bad words.
Highlighted: <badword>मादरचोद</badword> ये फोटोशोप किया हवा हेध्यानसे देख चुतियो कुछ तो सही तरीके से करो


In [81]:
import re

def find_badwords(sentence, badword_list):
    # Create a regular expression pattern to match all bad words
    pattern = r'\b(' + '|'.join(map(re.escape, badword_list)) + r')\b'
    print(f"Regex pattern: {pattern}")  # Debug output for regex pattern

    # Find all bad words in the sentence
    found_badwords = re.findall(pattern, sentence)
    unique_badwords = set(found_badwords)  # Remove duplicates if needed

    return list(unique_badwords)  # Return the list of found bad words

# Example usage
new_test_sentence = "तू भड़वे है, तेरी माँ की चूत!"
  # The test sentence

# Tokenize and pad the test sentence
max_sequence_length = 200  # Set the same max length used during training
test_sequence = tokenizer.texts_to_sequences([new_test_sentence])
test_sequence_padded = pad_sequences(test_sequence, maxlen=max_sequence_length)

# Predict if it contains a bad word
prediction = model.predict(test_sequence_padded)
predicted_label = (prediction > 0.5).astype(int)

# Output the prediction result
if predicted_label[0][0] == 1:
    badwords_found = find_badwords(new_test_sentence, badwords_devanagari)
    print(f"Predicted: The sentence contains bad words: {badwords_found}")
else:
    print("Predicted: The sentence does not contain bad words.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Regex pattern: \b(आंड़|आंड|बहनचोद|बेहेनचोद|बेहेन्का\ लौडा|बेहेन्का\ लौडा|बहनके\ लौडे|बहन\ के\ लौडे|भेनचोद|भेनचोद|बहनचोद|बहनचोद|बकचोद|बकचोद|बकचोदी|बेवड़ा|बेवड़ा|बेवड़े|बेवड़े|बेवकूफ|बेवकूफ|बेवकूफ|बेवकूफ|बेवकूफ|बेवकूफ|भड़ुआ|भड़ुआ|भड़वा|भड़वा|भड़वा|भड़वा|भोसड़ा|भोसड़ा|भोसड़ा|भोसड़ीके|भोसड़ीके|भोसड़ीके|भोसड़ीके|भोसड़ीकी|भोसड़ीवाला|भोसड़ीवाले|भोसरचोदल|भोसदचोद|बब्बे|बब्बे|बूबे|बूबे|बुर|बुर|बुर|बुर|चरसी|चूचे|चूची|चुची|चोद|चोद|चोद|चुदने|चुदने|चुदवा|चुदवा|चुदवाने|चुदवाने|चूत|चूत|चूत|चूतिया|चुटिया|चूतिये|चुत्तड़|चूत्तड़|दलाल|दलाल|दलले|दलले|फट्टू|गधा|गधे|गधालंड|गांड|गांड|गांडू|गंडफट|गंडफट|गंडिया|गंडिये|गू|गू|गोटे|गोटे|गोटे|हग|हग्गू|हगने|हगने|हरामी|हरामजादा|हरामजादा|हरामज़ादा|हरामज़ादा|हरामजादे|हरामज़ादे|हरामखोर|हरामखोर|झाट|झाट|झाटू|झाटू|कुत्ता|कुत्ते|कुत्ते|कुतिया|कुतिया|कुतिया|कुत्ती|लेंडी|लेंडी|लोड़े|लौड़े|लौड़ा|लोड़ा|लौडा|लिंग|लोडा|लोडे|लंड|लौंडा|लौंडे|लौंडे|लौंडी|लौंडी|लौंडिया|लौंडिया|लुल्ली|मार|माँ\ का\ लौड़ा|माँ\ का\ लो