## Transcribe all words

In [None]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import subprocess

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)  # Use PCM codec for export

def transcribe_audio_with_timestamps(audio_file, model_path):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return
    
    model = Model(model_path)

    # Check if the audio file is in WAV format
    if not audio_file.lower().endswith('.wav'):
        # Convert MP3 to WAV if necessary
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return

    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # This line ensures word-level timestamps are included

    print("Processing audio...")

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()

    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    # Extract and print transcription with timestamps
    for result in results:
        if 'result' in result:  # Checking if word-level info is present
            for word_info in result['result']:
                word = word_info.get('word', 'Unknown')
                start_time = word_info.get('start', 0)
                end_time = word_info.get('end', 0)
                print(f"Word: {word}, Start: {start_time:.2f}s, End: {end_time:.2f}s")
        else:
            print("No word-level results in this segment")

transcribe_audio_with_timestamps(r'Audio_Data\Audio_28.mp3', 'vosk-model-small-hi-0.22')


## Trascribe only Bad Words

In [2]:
import pandas as pd
bad_words = pd.read_csv("BadWordListUpdated.csv", encoding='utf-8')
bad_words.head()

Unnamed: 0,Transliteration,Devanagari
0,aad,आंड़
1,aand,आंड
2,bahenchod,बहनचोद
3,behenchod,बेहेनचोद
4,bahen ka lauda,बेहेन्का लौडा


In [3]:
bad_word_list = bad_words['Devanagari'].to_list()

In [8]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import subprocess

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e) # Use PCM codec for export

def transcribe_audio_with_timestamps(audio_file, model_path, allowed_words):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return
    
    model = Model(model_path)

    # Check if the audio file is in WAV format
    if not audio_file.lower().endswith('.wav'):
        # Convert MP3 to WAV if necessary
        wav_file = audio_file.rsplit('.', 1)[0] + '.wav'
        if audio_file.lower().endswith('.mp3'):
            convert_mp3_to_wav(audio_file, wav_file)
            audio_file = wav_file
        else:
            print("Audio file must be in WAV format or MP3 format.")
            return

    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return
    except Exception as e:
        print(f"Error opening audio file: {e}")
        return
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # This line ensures word-level timestamps are included

    print("Processing audio...")

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
        else:
            rec.PartialResult()

    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    # Extract and print transcription with timestamps, filtering by allowed words
    for result in results:
        if 'result' in result:  # Checking if word-level info is present
            for word_info in result['result']:
                word = word_info.get('word', 'Unknown').lower()
                if word in allowed_words:  # Only keep allowed words
                    start_time = word_info.get('start', 0)
                    end_time = word_info.get('end', 0)
                    print(f"Word: {word}, Start: {start_time:.2f}s, End: {end_time:.2f}s")
        else:
            print("No word-level results in this segment")

# Define the set of allowed words
allowed_words = set(bad_word_list)

# Run transcription with filtering
transcribe_audio_with_timestamps(r"Audio_Data\Audio_4.mp3", 'vosk-model-small-hi-0.22', allowed_words)


Processing audio...
Word: कुत्ते, Start: 0.12s, End: 0.39s


In [5]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
import subprocess

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)

def transcribe_audio_with_timestamps(audio_file, model, allowed_words, filename):
    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print(f"Audio file {audio_file} must be in WAV format mono PCM.")
            return []
    except Exception as e:
        print(f"Error opening audio file {audio_file}: {e}")
        return []
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
    
    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    # Extract bad words with timestamps
    bad_word_entries = []
    for result in results:
        if 'result' in result:
            for word_info in result['result']:
                word = word_info.get('word', '').lower()
                if word in allowed_words:
                    bad_word_entries.append({
                        "file": filename,
                        "word": word,
                        "start": word_info.get("start", 0),
                        "end": word_info.get("end", 0)
                    })
    return bad_word_entries

def process_audio_folder(folder_path, model_path, allowed_words):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return

    model = Model(model_path)
    all_bad_words = []

    # Process each file in the folder
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.wav', '.mp3')):
            audio_file = os.path.join(folder_path, filename)
            
            # Convert MP3 to WAV if needed
            if filename.lower().endswith('.mp3'):
                wav_file = os.path.splitext(audio_file)[0] + '.wav'
                convert_mp3_to_wav(audio_file, wav_file)
                audio_file = wav_file

            print(f"Processing file: {audio_file}")
            bad_words = transcribe_audio_with_timestamps(audio_file, model, allowed_words, filename)
            all_bad_words.extend(bad_words)
    
    # Save all bad words with timestamps to a JSON file
    with open("bad_words_timestamps.json", "w") as json_file:
        json.dump(all_bad_words, json_file, indent=4)
    print("Bad words with timestamps saved to bad_words_timestamps.json")

# Define the set of allowed words
allowed_words = set(bad_word_list)

# Run transcription with filtering on a folder of audio files
process_audio_folder("TempConvert", 'vosk-model-small-hi-0.22', allowed_words)


Processing file: TempConvert\Audio_0.wav
Processing file: TempConvert\Audio_1.wav
Processing file: TempConvert\Audio_2.wav
Processing file: TempConvert\Audio_3.wav
Processing file: TempConvert\Audio_4.wav
Bad words with timestamps saved to bad_words_timestamps.json


In [4]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
import subprocess
from tqdm import tqdm

def convert_mp3_to_wav(mp3_file, wav_file):
    try:
        subprocess.run(['ffmpeg', '-i', mp3_file, '-ac', '1', '-ar', '16000', wav_file], check=True)
    except subprocess.CalledProcessError as e:
        print("Error during MP3 to WAV conversion:", e)

def transcribe_audio_with_timestamps(audio_file, model, allowed_words, filename):
    # Open the audio file
    try:
        wf = wave.open(audio_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print(f"Audio file {audio_file} must be in WAV format mono PCM.")
            return []
    except Exception as e:
        print(f"Error opening audio file {audio_file}: {e}")
        return []
    
    # Initialize recognizer with sample rate
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)

    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            results.append(result)
    
    # Get the final result
    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    # Extract bad words with timestamps
    bad_word_entries = []
    for result in results:
        if 'result' in result:
            for word_info in result['result']:
                word = word_info.get('word', '').lower()
                if word in allowed_words:
                    bad_word_entries.append({
                        "file": filename,
                        "word": word,
                        "start": word_info.get("start", 0),
                        "end": word_info.get("end", 0)
                    })
    return bad_word_entries

def process_audio_folder(folder_path, model_path, allowed_words):
    # Load the Vosk model
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path}")
        return

    model = Model(model_path)
    all_bad_words = []

    # Get a list of audio files in the folder
    audio_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.wav', '.mp3'))]

    # Initialize the progress bar
    with tqdm(total=len(audio_files), desc="Processing audio files") as pbar:
        for filename in audio_files:
            audio_file = os.path.join(folder_path, filename)
            
            # Convert MP3 to WAV if needed
            if filename.lower().endswith('.mp3'):
                wav_file = os.path.splitext(audio_file)[0] + '.wav'
                convert_mp3_to_wav(audio_file, wav_file)
                audio_file = wav_file

            bad_words = transcribe_audio_with_timestamps(audio_file, model, allowed_words, filename)
            all_bad_words.extend(bad_words)
            
            # Update the progress bar
            pbar.update(1)
    
    # Save all bad words with timestamps to a JSON file
    with open("bad_words_timestamps2.json", "w") as json_file:
        json.dump(all_bad_words, json_file, indent=4)
    print("Bad words with timestamps saved to bad_words_timestamps.json")

# Define the set of allowed words
allowed_words = set(bad_word_list)

# Run transcription with filtering on a folder of audio files
process_audio_folder("TempAudioData", 'vosk-model-small-hi-0.22', allowed_words)


Processing audio files: 100%|██████████| 2500/2500 [1:03:13<00:00,  1.52s/it]

Bad words with timestamps saved to bad_words_timestamps.json



