In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('annotated_bad_words.csv')

# Preprocess the text data
def extract_labels(sentence):
    words = sentence.split()
    tokens = []
    labels = []
    
    for word in words:
        if "<bad_word>" in word:
            word = word.replace("<bad_word>", "").replace("</bad_word>", "")
            labels.append(1)  # Label '1' for bad word
        else:
            labels.append(0)  # Label '0' for normal word
        tokens.append(word)
    
    return " ".join(tokens), labels

# Apply label extraction
data['Processed Sentences'], data['Labels'] = zip(*data['Annotated Sentences'].apply(extract_labels))

# Split the data into training and test sets
sentences = data['Processed Sentences'].tolist()
labels = data['Labels'].tolist()
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Tokenize the sentences
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(sentences)

# Convert text to sequences and pad them
max_length = max(len(seq) for seq in sequences)  
print(max_length) # Define max length for padding
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Pad the labels as well to match max_length
train_labels_padded = pad_sequences(train_labels, maxlen=max_length, padding='post', truncating='post')
test_labels_padded = pad_sequences(test_labels, maxlen=max_length, padding='post', truncating='post')

# Load pre-trained fastText Hindi embeddings
embedding_index = {}
embedding_dim = 300  # fastText embeddings are typically 300-dimensional

# Path to the fastText Hindi embedding file (e.g., "cc.hi.300.vec")
with open("cc.hi.300.vec", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector

# Prepare the embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Build the model with pre-trained embeddings
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False  # Set to True if you want to fine-tune embeddings
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(32, activation='relu')),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='sigmoid'))
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(train_padded, train_labels_padded, epochs=5, validation_data=(test_padded, test_labels_padded))

# Evaluate the model
loss, accuracy = model.evaluate(test_padded, test_labels_padded)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


In [None]:
def preprocess_sentence(sentence):
    """
    Preprocesses a single sentence by tokenizing and padding.
    """
    # Tokenize the sentence
    sequence = tokenizer.texts_to_sequences([sentence])
    
    # Pad the sequence to match the max_length
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    
    return padded_sequence

def predict_bad_words(sentence):
    """
    Predicts bad words in a new sentence.
    """
    # Preprocess the sentence
    padded_sequence = preprocess_sentence(sentence)
    
    # Get predictions from the model
    predictions = model.predict(padded_sequence)
    
    # Threshold to classify as bad word or not
    threshold = 0.5
    predicted_labels = (predictions > threshold).astype(int)[0]
    
    # Print out each word with its prediction
    words = sentence.split()
    for i, word in enumerate(words[:max_length]):
        is_bad_word = "Bad Word" if predicted_labels[i] == 1 else "Normal Word"
        print(f"{word}: {is_bad_word}")

# Example sentence in Hindi containing some abusive words
new_sentence = "तू खुद को क्या समझता है, चूतिया साला? हर बार बकवास करता है और दूसरों को परेशान करता है। कमीने, तमीज से पेश आ।"



# Predict on the new sentence
predict_bad_words(new_sentence)


# ---------------------------------------------------------------------------------------------------------------

In [11]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the tokenizers
with open('w2v_word_tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('w2v_char_tokenizer.pkl', 'rb') as handle:
    char_tokenizer = pickle.load(handle)

# Load the saved model
model = load_model('word2vec_bad_model.h5')

def preprocess_sentence(sentence):
    """
    Preprocesses a single sentence by tokenizing and padding for both word and character levels.
    """
    char_max_length = 15
    max_length = 475
    
    # Tokenize the sentence for words
    word_sequence = tokenizer.texts_to_sequences([sentence])
    padded_word_sequence = pad_sequences(word_sequence, maxlen=max_length, padding='post', truncating='post')
    
    # Tokenize and pad character sequences for each word
    char_sequence = [[char_tokenizer.word_index.get(char, 0) for char in word] for word in sentence.split()]
    char_sequence = pad_sequences(char_sequence, maxlen=char_max_length, padding="post")
    
    # Pad the entire sequence of words to ensure uniformity
    padded_char_sequence = pad_sequences([char_sequence], maxlen=max_length, padding='post', dtype='int32')

    return padded_word_sequence, padded_char_sequence

def predict_bad_words(sentence):
    """
    Predicts bad words in a new sentence using the trained model and returns a list of detected bad words.
    """
    max_length = 475
    
    # Preprocess the sentence
    padded_word_sequence, padded_char_sequence = preprocess_sentence(sentence)
    
    # Get predictions from the model
    predictions = model.predict([padded_word_sequence, padded_char_sequence])
    
    # Threshold to classify as bad word or not
    threshold = 0.5
    predicted_labels = (predictions > threshold).astype(int)[0]  # Flatten the predictions array
    
    # Extract and print only bad words
    words = sentence.split()
    bad_words = [word for i, word in enumerate(words[:max_length]) if predicted_labels[i] == 1]
    # Print the bad words
    if bad_words:
        print("Detected Bad Words:", ", ".join(bad_words))
    else:
        print("No bad words detected.")

    return bad_words

# Example sentence in Hindi containing some abusive words
new_sentence = "तू हमेशा दूसरों को चुतिये और गांडू बोलता है, खुद हर बात में फुद्दी की बातें करता है, मादरचोद"
# Predict and get the list of bad words
bad_words_list = predict_bad_words(new_sentence)

print(bad_words_list)







[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20s/step
Detected Bad Words: गांडू, मादरचोद
['गांडू', 'मादरचोद']


In [None]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import subprocess

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)  # Use PCM codec for export

def transcribe_audio_with_timestamps(audio_file, model_path):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return
    
    model = Model(model_path)

    # Check if the audio file is in WAV format
    if not audio_file.lower().endswith('.wav'):
        # Convert MP3 to WAV if necessary
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return

    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # This line ensures word-level timestamps are included

    print("Processing audio...")

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()

    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    for result in results:
        if 'result' in result:  
            for word_info in result['result']:
                word = word_info.get('word', 'Unknown')
                start_time = word_info.get('start', 0)
                end_time = word_info.get('end', 0)
                print(f"Word: {word}, Start: {start_time:.2f}s, End: {end_time:.2f}s")
        else:
            print("No word-level results in this segment")

transcribe_audio_with_timestamps(r'myfile.wav', 'vosk-model-small-hi-0.22')


In [20]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
import subprocess

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)

def transcribe_audio_with_timestamps(audio_file, model_path):
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return None, {}

    model = Model(model_path)

    if not audio_file.lower().endswith('.wav'):
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return None, {}

    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return None, {}
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return None, {}
    
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)

    print("Processing audio...")

    results = []
    transcribed_text = ""
    word_timestamps = {}

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()

    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    for result in results:
        if 'result' in result:  
            for word_info in result['result']:
                word = word_info.get('word', 'Unknown')
                start_time = word_info.get('start', 0)
                end_time = word_info.get('end', 0)
                
                transcribed_text += word + " "
                
                word_timestamps[word] = {'start': start_time, 'end': end_time}

    transcribed_text = transcribed_text.strip()

    return transcribed_text, word_timestamps

transcribed_sentence, timestamps = transcribe_audio_with_timestamps(r"M:\Coding\NLP_Project\AudioData\Audio_7618.wav", 'vosk-model-small-hi-0.22')
if transcribed_sentence:
    print("Transcribed Sentence:", transcribed_sentence)
    

    print("Word Timestamps (JSON):", timestamps)


Processing audio...
Transcribed Sentence: इस रंडी साली जिससे कोई भी मुल्ला मौलवी और घोडों ने चोदने से नहीं छोडा यह मंदिर को अपवित्र करने की गई और किस दोगले पंडित ने इसकी पूजा कराई यह हिन्दू नहीं है पता नहीं कितने कुछ बातों से सोनिया भोसड़ी छुडवाकर इस हाइब्रिड और राहुल गांधी पप्पू को पैदा किया इसके चूत की प्यास नहीं बुझी है सालों को जेल में रहना चाहिए ये देशद्रोही गद्दार हिंदू विरोधी और देश के सबसे बड़े दुश्मन है अगर जरा भी देश से प्रेम होता तो प्रियंका की शादी किसी हिन्दू से करती यह राजीव गांधी फैंस का चौथा पता नहीं इंदिरा गाँधी ने कितने लोगों से चुदवा कर इस नमूने को पैदा किया था जिस मात्र चोट को इस देश में झंडी नहीं मिली जाकर इन तीनों मारिया जैसी महानदी को ले आया ये देश के गद्दार और देशद्रोही और पाकिस्तान पर थे जय भारत जय भीम वंदे मातरम जय हिंद नेहरू गांधी और इंदिरा भोसड़ी के इस परिवार का समूल नाश में ही देश की भलाई है
Word Timestamps (JSON): {'इस': {'start': 52.32, 'end': 52.56}, 'रंडी': {'start': 0.27, 'end': 0.57}, 'साली': {'start': 0.57, 'end': 0.9}, 'जिससे': {'start': 0.9, 'end': 

In [19]:
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import wave
import json
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import subprocess

# Load the tokenizers and model (assuming they're saved as shown earlier)
with open('w2v_word_tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('w2v_char_tokenizer.pkl', 'rb') as handle:
    char_tokenizer = pickle.load(handle)
model = load_model('word2vec_bad_model.h5')

def preprocess_sentence(sentence):
    char_max_length = 15
    max_length = 475
    word_sequence = tokenizer.texts_to_sequences([sentence])
    padded_word_sequence = pad_sequences(word_sequence, maxlen=max_length, padding='post', truncating='post')
    char_sequence = [[char_tokenizer.word_index.get(char, 0) for char in word] for word in sentence.split()]
    char_sequence = pad_sequences(char_sequence, maxlen=char_max_length, padding="post")
    padded_char_sequence = pad_sequences([char_sequence], maxlen=max_length, padding='post', dtype='int32')
    return padded_word_sequence, padded_char_sequence

def predict_bad_words(sentence):
    max_length = 475
    padded_word_sequence, padded_char_sequence = preprocess_sentence(sentence)
    predictions = model.predict([padded_word_sequence, padded_char_sequence])
    threshold = 0.5
    predicted_labels = (predictions > threshold).astype(int)[0]
    words = sentence.split()
    bad_words = [word for i, word in enumerate(words[:max_length]) if predicted_labels[i] == 1]
    return bad_words

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)

def transcribe_audio_with_timestamps(audio_file, model_path):
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return []
    
    model = Model(model_path)
    if not audio_file.lower().endswith('.wav'):
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return []
    
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return []
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return []
    
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)
    
    word_timestamps = []
    for result in results:
        if 'result' in result:
            for word_info in result['result']:
                word_timestamps.append({
                    'word': word_info.get('word', ''),
                    'start': word_info.get('start', 0),
                    'end': word_info.get('end', 0)
                })
    return word_timestamps

def mute_bad_words_in_audio(audio_file, bad_words, word_timestamps):
    audio = AudioSegment.from_wav(audio_file)
    for item in word_timestamps:
        if item['word'] in bad_words:
            start_ms = item['start'] * 1000
            end_ms = item['end'] * 1000
            audio = audio[:start_ms] + AudioSegment.silent(duration=(end_ms - start_ms)) + audio[end_ms:]
    muted_file = "muted_" + os.path.basename(audio_file)
    audio.export(muted_file, format="wav")
    return muted_file

# Integrate all steps
audio_file = r"M:\Coding\NLP_Project\AudioData\Audio_3009.wav"
model_path = 'vosk-model-small-hi-0.22'

# Step 1: Transcribe audio with timestamps
word_timestamps = transcribe_audio_with_timestamps(audio_file, model_path)

# Step 2: Detect bad words from transcribed text
transcribed_text = " ".join([item['word'] for item in word_timestamps])  # Create sentence from transcribed words
bad_words = predict_bad_words(transcribed_text)
print(bad_words)

# Step 3: Mute bad words in audio
muted_file = mute_bad_words_in_audio(audio_file, bad_words, word_timestamps)
print(f"Muted audio saved to: {muted_file}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19s/step
['हमारे', 'चूत', 'भोसड़ी', 'मारते', 'रंडी', 'चूत', 'मार', 'दलाल', 'रंडी', 'गांड', 'गांडू', 'लंड', 'लंड', 'गांड', 'चोदने', 'लंड', 'चूत', 'लंड', 'लौड़ा', 'लंड', 'लंड', 'हमारा', 'लंड', 'गांड', 'लंड', 'चूत']
Muted audio saved to: muted_Audio_3009.wav


In [18]:
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import wave
import json
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import subprocess

# Load the tokenizers and model (assuming they're saved as shown earlier)
with open('ft_word_tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('ft_char_tokenizer.pkl', 'rb') as handle:
    char_tokenizer = pickle.load(handle)
model = load_model('fast_text_bad_word_detection_model.h5')

def preprocess_sentence(sentence):
    char_max_length = 15
    max_length = 475
    word_sequence = tokenizer.texts_to_sequences([sentence])
    padded_word_sequence = pad_sequences(word_sequence, maxlen=max_length, padding='post', truncating='post')
    char_sequence = [[char_tokenizer.word_index.get(char, 0) for char in word] for word in sentence.split()]
    char_sequence = pad_sequences(char_sequence, maxlen=char_max_length, padding="post")
    padded_char_sequence = pad_sequences([char_sequence], maxlen=max_length, padding='post', dtype='int32')
    return padded_word_sequence, padded_char_sequence

def predict_bad_words(sentence):
    max_length = 475
    padded_word_sequence, padded_char_sequence = preprocess_sentence(sentence)
    predictions = model.predict([padded_word_sequence, padded_char_sequence])
    threshold = 0.5
    predicted_labels = (predictions > threshold).astype(int)[0]
    words = sentence.split()
    bad_words = [word for i, word in enumerate(words[:max_length]) if predicted_labels[i] == 1]
    return bad_words

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)

def transcribe_audio_with_timestamps(audio_file, model_path):
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return []
    
    model = Model(model_path)
    if not audio_file.lower().endswith('.wav'):
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return []
    
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return []
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return []
    
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)
    
    word_timestamps = []
    for result in results:
        if 'result' in result:
            for word_info in result['result']:
                word_timestamps.append({
                    'word': word_info.get('word', ''),
                    'start': word_info.get('start', 0),
                    'end': word_info.get('end', 0)
                })
    return word_timestamps

def mute_bad_words_in_audio(audio_file, bad_words, word_timestamps):
    audio = AudioSegment.from_wav(audio_file)
    for item in word_timestamps:
        if item['word'] in bad_words:
            start_ms = item['start'] * 1000
            end_ms = item['end'] * 1000
            audio = audio[:start_ms] + AudioSegment.silent(duration=(end_ms - start_ms)) + audio[end_ms:]
    muted_file = "muted_" + os.path.basename(audio_file)
    audio.export(muted_file, format="wav")
    return muted_file

# Integrate all steps
audio_file = r"M:\Coding\NLP_Project\AudioData\Audio_3009.wav"
model_path = 'vosk-model-small-hi-0.22'

# Step 1: Transcribe audio with timestamps
word_timestamps = transcribe_audio_with_timestamps(audio_file, model_path)

# Step 2: Detect bad words from transcribed text
transcribed_text = " ".join([item['word'] for item in word_timestamps])  # Create sentence from transcribed words
bad_words = predict_bad_words(transcribed_text)
print(bad_words)

# Step 3: Mute bad words in audio
muted_file = mute_bad_words_in_audio(audio_file, bad_words, word_timestamps)
print(f"Muted audio saved to: {muted_file}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20s/step
['हमारे', 'चूत', 'भोसड़ी', 'मूठ', 'मारते', 'रंडी', 'चूत', 'मार', 'दलाल', 'रंडी', 'गांड', 'गांडू', 'लंड', 'लंड', 'गांड', 'चोदने', 'लंड', 'चूत', 'लंड', 'लौड़ा', 'लंड', 'लंड', 'हमारा', 'लंड', 'गांड', 'चूतड़ों', 'लंड', 'चूत']
Muted audio saved to: muted_Audio_3009.wav
