In [2]:
import os
import librosa
import whisper
import soundfile as sf  # To save the processed audio as .wav

# Input folder scanned for .wav files by process_audio_files()
audio_directory = "../wav_data/"  # Replace with your actual path
# Output folder that receives the 16 kHz copies written by transcribe_audio()
processed_audio_directory = "../dataset/preprocessed/"  # Folder to save preprocessed audio

# Create the output folder up front; exist_ok=True makes re-running this cell harmless
os.makedirs(processed_audio_directory, exist_ok=True)

# Load the Whisper model once at module level; transcribe_audio() reads this global
model = whisper.load_model("base")  # You can choose 'small', 'medium', or 'large' for better accuracy

# Function to transcribe audio using Whisper ASR
def transcribe_audio(wav_file_path, processed_audio_path):
    """Resample one recording to 16 kHz, store the resampled copy, and transcribe it.

    Args:
        wav_file_path: Path of the source audio file, read with ``librosa.load``.
        processed_audio_path: Destination path for the resampled audio, written
            with ``soundfile.write``.

    Returns:
        The ``'text'`` entry of the result returned by the module-level Whisper
        ``model`` when transcribing with the language fixed to Kannada ("kn").
    """
    target_rate = 16000  # sampling rate requested from librosa for both the saved copy and Whisper

    samples, sample_rate = librosa.load(wav_file_path, sr=target_rate)

    # Persist exactly the samples that are handed to the model below.
    sf.write(processed_audio_path, samples, sample_rate)

    return model.transcribe(samples, language="kn")['text']

# Function to save transcription to a text file
def save_transcription(wav_file, transcription):
    """Write ``transcription`` to a UTF-8 text file named after ``wav_file``.

    The output path is ``wav_file`` with its final extension removed and
    ``_transcription.txt`` appended, e.g. ``clip.wav`` -> ``clip_transcription.txt``.

    Args:
        wav_file: Name or path of the audio file the transcription belongs to.
        transcription: Text to store.
    """
    # splitext only strips the last extension. The previous
    # str.replace(".wav", ...) rewrote every ".wav" occurrence in the path and,
    # when none was present (e.g. "clip.WAV"), left the path unchanged so the
    # audio file itself was overwritten with text.
    stem, _extension = os.path.splitext(wav_file)
    transcription_file = stem + "_transcription.txt"

    # Explicit UTF-8 so non-ASCII (Kannada) text is written regardless of locale.
    with open(transcription_file, "w", encoding="utf-8") as file:
        file.write(transcription)

# Function to process each audio file in the directory
def process_audio_files():
    """Transcribe every ``.wav`` entry found in ``audio_directory``.

    For each matching entry the audio is passed to ``transcribe_audio`` (which
    also writes the processed copy under ``processed_audio_directory``) and the
    returned text is passed to ``save_transcription``. Note that
    ``save_transcription`` receives the bare file name, not the joined path.
    Progress is reported with ``print``.
    """
    for wav_file in os.listdir(audio_directory):
        # Skip anything that is not a .wav entry (the suffix check is case-sensitive).
        if not wav_file.endswith(".wav"):
            continue

        source_path = os.path.join(audio_directory, wav_file)
        processed_audio_path = os.path.join(processed_audio_directory, wav_file)

        print(f"Processing file: {wav_file}")

        text = transcribe_audio(source_path, processed_audio_path)
        save_transcription(wav_file, text)

        print(f"Transcription saved for: {wav_file}")
        print(f"Processed audio saved to: {processed_audio_path}")

# Run the batch: transcribes every .wav in audio_directory using the model loaded above
process_audio_files()


  checkpoint = torch.load(fp, map_location=device)


Processing file: SandalWoodNewsStories_1.wav
Transcription saved for: SandalWoodNewsStories_1.wav
Processed audio saved to: ../dataset/preprocessed/SandalWoodNewsStories_1.wav
Processing file: SandalWoodNewsStories_107.wav
Transcription saved for: SandalWoodNewsStories_107.wav
Processed audio saved to: ../dataset/preprocessed/SandalWoodNewsStories_107.wav
Processing file: SandalWoodNewsStories_112.wav
Transcription saved for: SandalWoodNewsStories_112.wav
Processed audio saved to: ../dataset/preprocessed/SandalWoodNewsStories_112.wav
Processing file: SandalWoodNewsStories_144.wav
Transcription saved for: SandalWoodNewsStories_144.wav
Processed audio saved to: ../dataset/preprocessed/SandalWoodNewsStories_144.wav
Processing file: SandalWoodNewsStories_146.wav
Transcription saved for: SandalWoodNewsStories_146.wav
Processed audio saved to: ../dataset/preprocessed/SandalWoodNewsStories_146.wav
Processing file: SandalWoodNewsStories_148.wav
Transcription saved for: SandalWoodNewsStories_14

In [9]:
import os
import speech_recognition as sr
from googletrans import Translator
from pydub import AudioSegment

def convert_wav_to_text(file_path, target_language='en'):
    """Recognize Kannada speech in a WAV file and translate the recognized text.

    The audio is re-exported through pydub to a scratch WAV file, recognized
    with ``recognize_google`` (language ``kn-IN``), and the text is translated
    from Kannada to ``target_language`` with googletrans.

    Args:
        file_path: Path of the WAV file to process.
        target_language: Destination language code for the translation
            (default ``'en'``).

    Returns:
        str: The translated text on success. On failure a non-empty diagnostic
        string is returned instead of raising: ``"Could not understand audio"``,
        ``"Speech Recognition error: ..."`` or ``"Translation error: ..."``.
    """
    # Local import keeps this function self-contained within its notebook cell.
    import tempfile

    # Initialize recognizer and translator
    recognizer = sr.Recognizer()
    translator = Translator()

    # Convert audio file to a format compatible with SpeechRecognition
    audio = AudioSegment.from_wav(file_path)

    # Use a unique scratch file instead of a fixed "temp.wav" in the working
    # directory, which would clobber an existing file of that name and was
    # never cleaned up.
    fd, temp_wav_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # only the path is needed; pydub reopens it for writing
    try:
        audio.export(temp_wav_file, format="wav")

        # Convert speech to Kannada text
        with sr.AudioFile(temp_wav_file) as source:
            print("Processing audio...")
            audio_data = recognizer.record(source)
            try:
                kannada_text = recognizer.recognize_google(audio_data, language="kn-IN")
                print("Recognized Kannada text:", kannada_text)
            except sr.UnknownValueError:
                return "Could not understand audio"
            except sr.RequestError as e:
                return f"Speech Recognition error: {e}"
    finally:
        # Best-effort cleanup; runs on the early returns above as well.
        try:
            os.remove(temp_wav_file)
        except OSError:
            pass

    # Translate Kannada text to English
    try:
        translated_text = translator.translate(kannada_text, src="kn", dest=target_language).text
        print("Translated English text:", translated_text)
        return translated_text
    except Exception as e:
        # Kept broad on purpose: any translation failure is reported as a string.
        return f"Translation error: {e}"

# Path to your Kannada WAV file (one of the files written by the preprocessing cell)
wav_file_path = "../dataset/preprocessed/SandalWoodNewsStories_1.wav"

# Convert audio to text and translate
translated_text = convert_wav_to_text(wav_file_path)
# NOTE(review): convert_wav_to_text returns non-empty diagnostic strings on
# failure ("Could not understand audio", "Speech Recognition error: ...",
# "Translation error: ..."), so this truthiness check also passes in those
# cases and the error message is what gets written to output.txt.
if translated_text:
    # Save to a text file
    with open("output.txt", "w", encoding="utf-8") as f:
        f.write(translated_text)
    print("Translation saved to output.txt")


AttributeError: module 'httpcore' has no attribute 'SyncHTTPTransport'

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the EncoderRNN and AttnDecoderRNN models (ensure these match the original definitions)

class EncoderRNN(nn.Module):
    """GRU encoder that embeds and consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of both the embedding and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run one encoder step.

        Args:
            input: Tensor holding a single token index.
            hidden: Previous hidden state of shape ``(1, 1, hidden_size)``.

        Returns:
            ``(output, hidden)`` as returned by the GRU for this single step.
        """
        # Reshape the single embedding to (seq_len=1, batch=1, hidden_size) for the GRU.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, 1, hidden_size)``.

        The tensor is created on the same device as the module's parameters so
        it can be fed to ``forward`` after the module has been moved with
        ``.to(device)``; previously it was always allocated on the CPU.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder outputs.

    Args:
        hidden_size: Width of the embedding and the GRU hidden state.
        output_size: Size of the output vocabulary.
        dropout_p: Dropout probability applied to the embedded input.
        max_length: Number of attention slots; ``encoder_outputs`` passed to
            ``forward`` must have exactly this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Attribute names double as state_dict keys, so they must stay as they are.
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoder step.

        Args:
            input: Tensor holding a single token index.
            hidden: Previous hidden state of shape ``(1, 1, hidden_size)``.
            encoder_outputs: Tensor of shape ``(max_length, hidden_size)``.

        Returns:
            ``(output, hidden, attn_weights)`` where ``output`` holds
            log-probabilities of shape ``(1, output_size)`` and
            ``attn_weights`` has shape ``(1, max_length)``.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention scores come from the current input embedding and hidden state.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of encoder outputs: (1, 1, max_length) x (1, max_length, hidden).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, 1, hidden_size)``.

        The tensor is created on the same device as the module's parameters so
        it can be fed to ``forward`` after the module has been moved with
        ``.to(device)``; previously it was always allocated on the CPU.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

# Device configuration: use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define input and hidden sizes (ensure these match the original)
# NOTE(review): these are placeholders; load_state_dict below will only succeed
# if they match the shapes stored in the checkpoints.
input_size = 5000  # Example input size, replace with actual value
output_size = 5000  # Example output size, replace with actual value
hidden_size = 256  # Example hidden size, replace with actual value

# Instantiate the models
# AttnDecoderRNN is built with its default max_length=10, which fixes the width
# of its attention layer; presumably the checkpoint was trained with the same
# value -- TODO confirm.
encoder_eng = EncoderRNN(input_size, hidden_size).to(device)
attn_decoder_eng = AttnDecoderRNN(hidden_size, output_size).to(device)

# Load the state dictionaries with map_location to handle CPU-only environments
# SECURITY NOTE(review): torch.load is pickle-based; only load checkpoint files
# from a trusted source (consider weights_only=True if the installed PyTorch
# version supports it).
encoder_eng.load_state_dict(torch.load("/content/model_enc_eng.dict", map_location=torch.device('cpu')))
attn_decoder_eng.load_state_dict(torch.load("/content/model_dec_eng.dict", map_location=torch.device('cpu')))

# Now the models encoder_eng and attn_decoder_eng are ready to be used
