In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import re

In [2]:
# 1. Load the datasets
try_data_path = r'try.csv'
badwordlist_data_path = r'BadWordList.csv'

In [3]:
try_df = pd.read_csv(try_data_path)
badwordlist_df = pd.read_csv(badwordlist_data_path)

In [4]:
# 2. Extract bad words and sentences
badwords_devanagari = badwordlist_df['Devanagari'].dropna().tolist()
sentences = try_df['Filtered Sentences']

In [5]:
# Function to check if a sentence contains any bad words
def contains_badword(sentence, badword_list):
    for word in badword_list:
        if re.search(r'\b' + re.escape(word) + r'\b', sentence):
            return 1  # Contains a bad word
    return 0  # No bad word found

In [6]:
# 3. Annotate sentences with bad word labels
try_df['contains_badword'] = sentences.apply(lambda x: contains_badword(x, badwords_devanagari))

In [7]:
# 4. Tokenize and pad sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(try_df['Filtered Sentences'])

In [8]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Convert sentences to sequences of token IDs
sequences = tokenizer.texts_to_sequences(try_df['Filtered Sentences'])

In [9]:
# Pad the sequences to ensure uniform length
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [10]:
# 5. Prepare the labels
y = try_df['contains_badword'].values

In [11]:
# 6. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# 7. Build the LSTM model
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size for embedding layer

In [13]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification (badword or not)
])



In [14]:
# 8. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
# 9. Train the model
model.fit(X_train, y_train, epochs=5, batch_size=8, validation_data=(X_test, y_test))

Epoch 1/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.5892 - loss: 0.6864 - val_accuracy: 0.5737 - val_loss: 0.6648
Epoch 2/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.7576 - loss: 0.5262 - val_accuracy: 0.6653 - val_loss: 0.7810
Epoch 3/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9527 - loss: 0.1545 - val_accuracy: 0.6773 - val_loss: 0.7840
Epoch 4/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9984 - loss: 0.0206 - val_accuracy: 0.6932 - val_loss: 1.0132
Epoch 5/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9817 - loss: 0.0406 - val_accuracy: 0.7570 - val_loss: 1.2287


<keras.src.callbacks.history.History at 0x7a7ddb18b5e0>

In [16]:
# 10. Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7779 - loss: 1.2037 
Test Accuracy: 75.70%


In [21]:
test_sentence = "तेरी रंडी माँ की चूत"

In [22]:
# Tokenize and pad the test sentence
test_sequence = tokenizer.texts_to_sequences([test_sentence])
test_sequence_padded = pad_sequences(test_sequence, maxlen=max_sequence_length)

In [23]:
# Predict if it contains a bad word
prediction = model.predict(test_sequence_padded)
predicted_label = (prediction > 0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


In [24]:
if predicted_label[0][0] == 1:
    print("The sentence contains bad words.")
else:
    print("The sentence does not contain bad words.")

The sentence contains bad words.


In [1]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment

def transcribe_audio_with_timestamps(audio_file, model_path):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return
    
    model = Model(model_path)

    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # This line ensures word-level timestamps are included

    print("Processing audio...")

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()

    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    # Debug: print the full structure of the results
    # print(json.dumps(results, indent=4))

    # Extract and print transcription with timestamps
    for result in results:
        if 'result' in result:  # Checking if word-level info is present
            for word_info in result['result']:
                word = word_info.get('word', 'Unknown')
                start_time = word_info.get('start', 0)
                end_time = word_info.get('end', 0)
                print(f"Word: {word}, Start: {start_time:.2f}s, End: {end_time:.2f}s")
        else:
            print("No word-level results in this segment")

transcribe_audio_with_timestamps(r'Audio_Data\Audio_9.mp3', 'vosk-model-hi-0.22')

Error opening audio file: file does not start with RIFF id


In [5]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment

def convert_mp3_to_wav(mp3_file, wav_file):
    # Convert MP3 file to WAV using pydub
    audio = AudioSegment.from_mp3(mp3_file)
    audio.export(wav_file, format="wav")

def transcribe_audio_with_timestamps(audio_file, model_path):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return
    
    model = Model(model_path)

    # Check if the audio file is in WAV format
    if not audio_file.lower().endswith('.wav'):
        # Convert MP3 to WAV if necessary
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return

    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # This line ensures word-level timestamps are included

    print("Processing audio...")

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()

    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    # Extract and print transcription with timestamps
    for result in results:
        if 'result' in result:  # Checking if word-level info is present
            for word_info in result['result']:
                word = word_info.get('word', 'Unknown')
                start_time = word_info.get('start', 0)
                end_time = word_info.get('end', 0)
                print(f"Word: {word}, Start: {start_time:.2f}s, End: {end_time:.2f}s")
        else:
            print("No word-level results in this segment")

transcribe_audio_with_timestamps(r'audio.mp3', 'vosk-model-hi-0.22')


Audio file must be WAV format mono PCM.


In [4]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)  # Use PCM codec for export

def transcribe_audio_with_timestamps(audio_file, model_path):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return
    
    model = Model(model_path)

    # Check if the audio file is in WAV format
    if not audio_file.lower().endswith('.wav'):
        # Convert MP3 to WAV if necessary
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return

    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # This line ensures word-level timestamps are included

    print("Processing audio...")

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()

    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    # Extract and print transcription with timestamps
    for result in results:
        if 'result' in result:  # Checking if word-level info is present
            for word_info in result['result']:
                word = word_info.get('word', 'Unknown')
                start_time = word_info.get('start', 0)
                end_time = word_info.get('end', 0)
                print(f"Word: {word}, Start: {start_time:.2f}s, End: {end_time:.2f}s")
        else:
            print("No word-level results in this segment")

transcribe_audio_with_timestamps(r'myfile.wav', 'vosk-model-small-hi-0.22')


Processing audio...
Word: अबे, Start: 0.60s, End: 0.90s
Word: साले, Start: 0.90s, End: 1.23s
Word: तेरी, Start: 1.23s, End: 1.47s
Word: माँ, Start: 1.47s, End: 1.62s
Word: की, Start: 1.62s, End: 1.77s
Word: चूत, Start: 1.83s, End: 2.16s


In [3]:
import subprocess

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)

convert_mp3_to_wav("Test2.wav", "myfile.wav")