In [None]:
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import wave
import json
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import subprocess

# Load the tokenizers and model (assuming they're saved as shown earlier).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so these .pkl files must come from a trusted source.
# Word-level tokenizer; preprocess_sentence() calls its texts_to_sequences().
with open('ft_word_tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Character-level tokenizer; only its word_index mapping is read below.
with open('ft_char_tokenizer.pkl', 'rb') as handle:
    char_tokenizer = pickle.load(handle)
# Keras model that predict_bad_words() feeds with [word_input, char_input].
model = load_model('fast_text_bad_word_detection_model.h5')

def preprocess_sentence(sentence, max_length=475, char_max_length=15):
    """Encode a sentence as the (word, character) inputs fed to the model.

    Args:
        sentence: Whitespace-separated text to encode.
        max_length: Number of word positions each output is padded or
            truncated to.
        char_max_length: Number of character ids kept per word.

    Returns:
        A ``(padded_word_sequence, padded_char_sequence)`` tuple. The first
        has shape ``(1, max_length)`` and the second
        ``(1, max_length, char_max_length)``.
    """
    word_sequence = tokenizer.texts_to_sequences([sentence])
    padded_word_sequence = pad_sequences(
        word_sequence, maxlen=max_length, padding='post', truncating='post'
    )

    # Characters missing from the character vocabulary map to 0, the same id
    # used for padding.
    char_ids = [
        [char_tokenizer.word_index.get(char, 0) for char in word]
        for word in sentence.split()
    ]
    if not char_ids:
        # With no words at all the outer padding cannot infer the per-word
        # dimension; a single empty word keeps the result three-dimensional.
        char_ids = [[]]

    # NOTE(review): words longer than char_max_length keep their LAST
    # characters here (pad_sequences truncates from the front by default) --
    # confirm this matches how the model was trained.
    per_word_chars = pad_sequences(char_ids, maxlen=char_max_length, padding="post")

    # truncating='post' keeps the FIRST max_length words, matching the word
    # input above; the default ('pre') would keep the last ones and misalign
    # the two inputs for long sentences.
    padded_char_sequence = pad_sequences(
        [per_word_chars],
        maxlen=max_length,
        padding='post',
        truncating='post',
        dtype='int32',
    )
    return padded_word_sequence, padded_char_sequence

def predict_bad_words(sentence, threshold=0.5):
    """Return the words of *sentence* that the model flags as bad.

    Args:
        sentence: Whitespace-separated text, e.g. a joined transcript.
        threshold: Score above which a word position counts as flagged.

    Returns:
        The flagged words in sentence order (repeats are kept). An empty
        list is returned, without calling the model, when the sentence
        contains no words.
    """
    # Only the first max_length words are scored; this has to agree with the
    # padding length used by preprocess_sentence.
    max_length = 475

    words = sentence.split()
    if not words:
        # Nothing to classify (e.g. transcription produced no words).
        return []

    padded_word_sequence, padded_char_sequence = preprocess_sentence(sentence)
    predictions = model.predict([padded_word_sequence, padded_char_sequence])
    # The batch holds a single sentence, so row 0 carries its labels.
    predicted_labels = (predictions > threshold).astype(int)[0]

    # NOTE(review): this pairs the i-th whitespace token with the i-th label;
    # it assumes the word tokenizer yields one id per whitespace token --
    # confirm against the tokenizer's filter/OOV settings.
    return [
        word
        for word, label in zip(words[:max_length], predicted_labels)
        if label == 1
    ]

def convert_mp3_to_wav(mp3_file, wav_file):
    """Convert an MP3 file to a mono, 16 kHz WAV file by running ffmpeg.

    Failures are printed rather than raised, so callers can carry on and
    decide what to do based on the return value.

    Args:
        mp3_file: Path of the input file handed to ffmpeg via ``-i``.
        wav_file: Path of the output file ffmpeg should write.

    Returns:
        True if ffmpeg ran and exited successfully, False otherwise.
    """
    command = ['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: ffmpeg exited with a non-zero status.
        # OSError (e.g. FileNotFoundError): the ffmpeg executable itself
        # could not be launched.
        print("Error during MP3 to WAV conversion:", e)
        return False
    return True

def transcribe_audio_with_timestamps(audio_file, model_path):
    """Transcribe an audio file with Vosk and return per-word timings.

    Args:
        audio_file: Path to a ``.wav`` or ``.mp3`` file (extension matched
            case-insensitively). An MP3 is first converted to a WAV file
            with the same name and a ``.wav`` suffix.
        model_path: Path of the Vosk model to load.

    Returns:
        A list of ``{'word': ..., 'start': ..., 'end': ...}`` dicts in
        recognition order. An empty list is returned, after printing the
        reason, when the model path does not exist, the file type is
        unsupported, the WAV file cannot be opened, or it is not mono
        16-bit uncompressed PCM.
    """
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return []

    # Deliberately not called `model`, to avoid shadowing the module-level
    # classifier of that name.
    vosk_model = Model(model_path)

    lowered = audio_file.lower()
    if not lowered.endswith('.wav'):
        if not lowered.endswith('.mp3'):
            print("Audio file must be in WAV format or MP3 format.")
            return []
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        convert_mp3_to_wav(audio_file, wav_file)
        # If the conversion failed, opening the WAV below reports the error.
        audio_file = wav_file

    try:
        wf = wave.open(audio_file, "rb")
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return []

    results = []
    # The context manager closes the file on every exit path, including the
    # early return for an unsupported format.
    with wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return []

        rec = KaldiRecognizer(vosk_model, wf.getframerate())
        rec.SetWords(True)  # ask for per-word entries in each result

        while True:
            data = wf.readframes(4000)
            if not data:
                break
            # AcceptWaveform returning true means a finished result is ready.
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result()))
        # Collect whatever is still buffered after the last chunk.
        results.append(json.loads(rec.FinalResult()))

    word_timestamps = []
    for result in results:
        # Results without recognised words carry no 'result' key.
        for word_info in result.get('result', ()):
            word_timestamps.append({
                'word': word_info.get('word', ''),
                'start': word_info.get('start', 0),
                'end': word_info.get('end', 0),
            })
    return word_timestamps

def mute_bad_words_in_audio(audio_file, bad_words, word_timestamps):
    """Replace every flagged word in a WAV file with silence.

    A word is muted wherever it occurs: matching is by the word's text, so
    every timestamp entry whose ``'word'`` is in *bad_words* is silenced.

    Args:
        audio_file: Path to the WAV file to process.
        bad_words: Collection of word strings to mute.
        word_timestamps: Iterable of dicts with ``'word'``, ``'start'`` and
            ``'end'`` keys; start/end are in seconds.

    Returns:
        The path of the exported file: ``"muted_"`` + the input's base name,
        written relative to the current working directory.
    """
    audio = AudioSegment.from_wav(audio_file)
    flagged = set(bad_words)  # O(1) membership tests inside the loop

    for item in word_timestamps:
        if item['word'] not in flagged:
            continue
        # pydub slices in milliseconds.
        start_ms = int(round(item['start'] * 1000))
        end_ms = int(round(item['end'] * 1000))
        if end_ms <= start_ms:
            # Zero-length or inverted span: nothing to silence.
            continue
        # Generate the silence at the clip's own frame rate so it does not
        # have to be resampled when concatenated.
        silence = AudioSegment.silent(
            duration=end_ms - start_ms, frame_rate=audio.frame_rate
        )
        audio = audio[:start_ms] + silence + audio[end_ms:]

    muted_file = "muted_" + os.path.basename(audio_file)
    audio.export(muted_file, format="wav")
    return muted_file

# Integrate all steps
audio_file = r"M:\Coding\NLP_Project\AudioData\Audio_3009.wav"
model_path = 'vosk-model-small-hi-0.22'

# Step 1: Transcribe audio with timestamps
word_timestamps = transcribe_audio_with_timestamps(audio_file, model_path)

if word_timestamps:
    # Step 2: Detect bad words from transcribed text
    # Create sentence from transcribed words
    transcribed_text = " ".join(item['word'] for item in word_timestamps)
    bad_words = predict_bad_words(transcribed_text)
    print(bad_words)

    # Step 3: Mute bad words in audio
    muted_file = mute_bad_words_in_audio(audio_file, bad_words, word_timestamps)
    print(f"Muted audio saved to: {muted_file}")
else:
    # An empty list is what the transcription step returns on every failure
    # path (and for audio with no recognised words), so there is nothing to
    # classify or mute. Keep the result names defined for later cells.
    transcribed_text = ""
    bad_words = []
    muted_file = None
    print("No words were transcribed; skipping bad-word detection and muting.")
