In [3]:
!pip install deep_translator 

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [11]:
import os

# Keep a token that the environment already provides (e.g. a notebook secret);
# only fall back to an empty placeholder so later `os.getenv("HF_TOKEN")`
# lookups always return a string. Never paste a real token into this cell.
os.environ.setdefault("HF_TOKEN", "")

In [None]:
import torch
import os
import requests
import torchaudio
from transformers import pipeline, AutoProcessor, VitsModel
from deep_translator import GoogleTranslator
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
from threading import Thread
from queue import Queue
from IPython.display import Audio
import numpy as np

In [None]:
# Prefer the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Audio-classification pipeline (speech-commands checkpoint).
classifier = pipeline(
    "audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
    device=device,
)

# Speech-to-text pipeline used by `transcribe_audio_from_wav`.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
)

# Marathi text-to-speech: text processor plus the VITS model moved to `device`.
processor = AutoProcessor.from_pretrained("facebook/mms-tts-mar")
model = VitsModel.from_pretrained("facebook/mms-tts-mar").to(device)


In [None]:
def transcribe_audio_from_wav(wav_file):
    """Transcribe a WAV file with the global `transcriber` pipeline.

    Args:
        wav_file: Path (or file-like object) accepted by `torchaudio.load`.

    Returns:
        The transcribed text (also printed).
    """
    waveform, sample_rate = torchaudio.load(wav_file)

    # torchaudio returns (channels, frames); the pipeline wants one 1-D channel.
    # Averaging over the channel axis equals the old squeeze(0) for mono files
    # and additionally handles stereo input.
    mono = waveform.mean(dim=0)

    # Hand the file's real sampling rate to the pipeline so it can resample to
    # the rate the model expects. A bare array would be assumed to already be at
    # the model's rate, which garbles e.g. 48 kHz recordings.
    transcription = transcriber(
        {"raw": mono.numpy(), "sampling_rate": sample_rate},
        generate_kwargs={"max_new_tokens": 64},
    )
    text = transcription["text"]
    print(text)
    return text

In [None]:
def query_model(text, model_id="mistralai/Mistral-7B-Instruct-v0.1", timeout=60):
    """Send `text` to a hosted Hugging Face inference model and return its reply.

    Args:
        text: Prompt sent as the request's ``inputs``.
        model_id: Model repository id appended to the inference API URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The generated text with the echoed prompt removed, or one of the
        sentinel strings ``"API Error"``, ``"Invalid response"``,
        ``"No response from model"`` or ``"No text generated"`` on failure.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"

    # Only send an Authorization header when a token is actually configured;
    # an empty token would otherwise produce a meaningless "Bearer " header.
    token = os.getenv("HF_TOKEN")
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        response = requests.post(
            api_url, headers=headers, json={"inputs": text}, timeout=timeout
        )
    except requests.exceptions.RequestException as exc:
        # Connection failures and timeouts are reported like any other API error.
        print(f"Error: request failed, {exc}")
        return "API Error"

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return "API Error"

    try:
        payload = response.json()
    except ValueError:
        # Every JSON decode error raised by requests is a ValueError subclass,
        # including on versions without requests.exceptions.JSONDecodeError.
        print("Invalid JSON response from API")
        return "Invalid response"

    if not payload:
        return "No response from model"

    # The success payload is a list of dicts; tolerate a bare dict as well.
    first = payload[0] if isinstance(payload, list) else payload
    generated = first.get("generated_text") if isinstance(first, dict) else None
    if not generated:
        return "No text generated"

    # Drop the echoed prompt only when it is really there, then the separator.
    if generated.startswith(text):
        generated = generated[len(text):]
    return generated.lstrip()

# def query_model(text, model_id="mistralai/Mistral-7B-Instruct-v0.1"):
#     api_url = f"https://api-inference.huggingface.co/models/{model_id}"
#     headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
#     response = requests.post(api_url, headers=headers, json={"inputs": text})
#     print(response)
#     return response.json()[0]["generated_text"][len(text) + 1 :]

In [None]:
def synthesise(text):
    """Turn `text` into a speech waveform with the global TTS `processor`/`model`.

    Args:
        text: Text to speak.

    Returns:
        The model's `waveform` output moved to the CPU.
    """
    encoded = processor(text=text, return_tensors="pt").to(device)
    token_ids = encoded["input_ids"]

    # Inference only: no gradients are needed for synthesis.
    with torch.no_grad():
        output = model(input_ids=token_ids)

    return output.waveform.cpu()

In [None]:
def assistant_pipeline(wav_file):
    """Run the full voice round trip for one WAV file and play the spoken reply.

    Steps: transcribe the audio, translate Marathi -> English, query the
    language model, translate the answer English -> Marathi, synthesise speech
    and show an autoplaying audio widget.

    Args:
        wav_file: Path to the input WAV file.
    """
    # Local import: only `Audio` is imported at the top of the notebook.
    from IPython.display import display

    transcription = transcribe_audio_from_wav(wav_file)
    english_text = GoogleTranslator(source="mr", target="en").translate(transcription)
    response_text = query_model(english_text)
    marathi_response = GoogleTranslator(source="en", target="mr").translate(response_text)
    audio = synthesise(marathi_response)

    # An Audio object created inside a function is never rendered on its own
    # (only a cell's last expression is), so it has to be displayed explicitly.
    display(Audio(audio, rate=16000, autoplay=True))

In [None]:
# Entry point: run the whole assistant round trip on the sample recording.
if __name__ == "__main__":
    wav_file_path = "/kaggle/input/wav-exa/sample.wav"
    assistant_pipeline(wav_file_path)


In [None]:
import torch
import torchaudio
import numpy as np
from faster_whisper import WhisperModel
from transformers import AutoProcessor, VitsModel
import time
import gc
import os

In [None]:
class MarathiWavProcessor:
    """Transcribe a Marathi WAV file and speak an echo of it back.

    Uses a faster-whisper ``tiny`` model for transcription and the
    ``facebook/mms-tts-mar`` VITS model for speech synthesis.
    """

    # Sample rate the audio is converted to before transcription.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self):
        """Report memory, pick a device and load both models.

        Raises:
            Exception: Re-raised from model loading after being logged.
        """
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print(f"Initial CUDA memory allocated: {torch.cuda.memory_allocated()/1e9:.2f} GB")
        gc.collect()

        # psutil is optional; it is only used for this informational message.
        try:
            import psutil
            print(f"Available system memory: {psutil.virtual_memory().available/1e9:.2f} GB")
        except ImportError:
            print("psutil not installed, skipping memory check")

        # Use the GPU only when it reports more than 4e9 bytes of total memory.
        total_gpu_memory = torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 0
        self.device = "cuda" if total_gpu_memory > 4e9 else "cpu"
        print(f"Using device: {self.device}")

        print("Loading models...")
        try:
            self.load_whisper()
            self.load_tts()
            print("Models loaded successfully!")
        except Exception as e:
            print(f"Error during model loading: {str(e)}")
            raise

    def load_whisper(self):
        """Load the faster-whisper ``tiny`` model into ``self.transcriber``."""
        try:
            self.transcriber = WhisperModel(
                "tiny",
                device=self.device,
                compute_type="float32",
                download_root="./models",
                num_workers=1,
            )
            print("Whisper model loaded")
        except Exception as e:
            print(f"Error loading Whisper model: {str(e)}")
            raise

    def load_tts(self):
        """Load the Marathi TTS processor and model (set to eval mode)."""
        try:
            self.processor = AutoProcessor.from_pretrained(
                "facebook/mms-tts-mar",
                local_files_only=False,
            )
            self.tts_model = VitsModel.from_pretrained(
                "facebook/mms-tts-mar"
            ).to(self.device)
            self.tts_model.eval()
            print("TTS model loaded")
        except Exception as e:
            print(f"Error loading TTS model: {str(e)}")
            raise

    def process_audio(self, audio_data, sample_rate):
        """Convert a loaded waveform to mono, 16 kHz, peak-normalised numpy audio.

        Args:
            audio_data: Tensor indexed as (channels, frames).
            sample_rate: Sample rate of `audio_data` in Hz.

        Returns:
            Tuple ``(audio_np, sample_rate)`` with a flat numpy array and the
            (possibly updated) sample rate.
        """
        try:
            # Downmix first so that only a single channel has to be resampled.
            if audio_data.shape[0] > 1:
                audio_data = torch.mean(audio_data, dim=0, keepdim=True)

            # Resample if needed
            if sample_rate != self.TARGET_SAMPLE_RATE:
                resampler = torchaudio.transforms.Resample(sample_rate, self.TARGET_SAMPLE_RATE)
                audio_data = resampler(audio_data)
                sample_rate = self.TARGET_SAMPLE_RATE

            audio_np = audio_data.numpy().flatten()

            # Peak-normalise, but leave silent or empty audio untouched:
            # dividing by a zero peak would fill the signal with NaNs, and
            # np.max raises on an empty array.
            peak = float(np.max(np.abs(audio_np))) if audio_np.size else 0.0
            if peak > 0.0:
                audio_np = audio_np / peak

            return audio_np, sample_rate
        except Exception as e:
            print(f"Error processing audio: {str(e)}")
            raise

    def process_wav(self, wav_path, output_path="response.wav"):
        """Transcribe `wav_path`, build an echo response and save it as speech.

        Args:
            wav_path: Path of the input WAV file.
            output_path: Where the synthesised response is written.

        Returns:
            `output_path`.

        Raises:
            FileNotFoundError: If `wav_path` does not exist.
            Exception: Any error from loading, transcription or synthesis is
                logged and re-raised.
        """
        print(f"Processing {wav_path}")
        start_time = time.time()

        try:
            if not os.path.exists(wav_path):
                raise FileNotFoundError(f"Audio file not found: {wav_path}")

            audio_data, sample_rate = torchaudio.load(wav_path)
            print(f"Loaded audio: {audio_data.shape}, {sample_rate}Hz")

            audio_np, sample_rate = self.process_audio(audio_data, sample_rate)

            # Release the raw tensor before the models run.
            del audio_data
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

            print("Transcribing...")
            try:
                segments, _ = self.transcriber.transcribe(
                    audio_np,
                    language="mr",
                    beam_size=1,  # Reduce beam size
                    vad_filter=True,  # Filter out non-speech
                    initial_prompt="मराठी",
                )
                # Joining the segments inside the try keeps any error raised
                # while iterating them under the same handler.
                transcription = " ".join([segment.text for segment in segments])
                print(f"Transcription: {transcription}")
            except Exception as e:
                print(f"Transcription error: {str(e)}")
                raise

            response = "तुम्ही म्हणालात: " + transcription
            print(f"Response: {response}")

            print("Synthesizing speech...")
            try:
                with torch.no_grad():
                    inputs = self.processor(text=response, return_tensors="pt")
                    speech = self.tts_model(
                        input_ids=inputs["input_ids"].to(self.device)
                    ).waveform
            except Exception as e:
                print(f"Speech synthesis error: {str(e)}")
                raise

            # Write the file at the rate the TTS model reports instead of a
            # hard-coded constant.
            torchaudio.save(output_path, speech.cpu(), self.tts_model.config.sampling_rate)

            del speech
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

            end_time = time.time()
            print(f"Total processing time: {end_time - start_time:.2f} seconds")
            return output_path

        except Exception as e:
            print(f"Error in process_wav: {str(e)}")
            raise

In [None]:
# Use a dedicated name for the pipeline object: rebinding `processor` here would
# replace the global TTS AutoProcessor that `synthesise` depends on.
try:
    wav_processor = MarathiWavProcessor()
    output_file = wav_processor.process_wav("/kaggle/input/wav-exa/sample.wav")
    print(f"Response saved to: {output_file}")
except Exception as e:
    print(f"Main execution error: {str(e)}")

Initial CUDA memory allocated: 0.00 GB
Available system memory: 31.46 GB
Using device: cuda
Loading models...
Whisper model loaded
TTS model loaded
Models loaded successfully!
Processing /kaggle/input/wav-exa/sample.wav
Loaded audio: torch.Size([1, 142848]), 48000Hz


In [None]:
import torch
import torchaudio
from faster_whisper import WhisperModel
from transformers import AutoProcessor, VitsModel
import time

class MarathiWavProcessor:
    """Minimal Marathi speech round trip: transcribe a WAV file, echo it as speech.

    Uses a faster-whisper ``tiny`` model for transcription and the
    ``facebook/mms-tts-mar`` VITS model for synthesis.
    """

    def __init__(self):
        """Pick CUDA when available and load the transcription and TTS models."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Loading models...")

        self.transcriber = WhisperModel(
            "tiny",
            device=self.device,
            # Half precision on the GPU, full precision on the CPU.
            compute_type="float16" if torch.cuda.is_available() else "float32",
        )

        self.processor = AutoProcessor.from_pretrained("facebook/mms-tts-mar")
        self.tts_model = VitsModel.from_pretrained("facebook/mms-tts-mar").to(self.device)
        self.tts_model.eval()

        print("Models loaded!")

    def process_wav(self, wav_path, output_path="response.wav"):
        """Transcribe `wav_path`, build an echo response and save it as speech.

        Args:
            wav_path: Input audio handed to the transcriber (normally a path).
            output_path: Where the synthesised response is written.

        Returns:
            `output_path`.

        Raises:
            FileNotFoundError: If `wav_path` is a path that does not exist.
        """
        # Local import: this cell's import block does not bring `os` into scope.
        import os

        # Fail early with a clear message instead of deep inside the decoder.
        # Only path-like inputs are checked so other input types still pass through.
        if isinstance(wav_path, (str, os.PathLike)) and not os.path.exists(wav_path):
            raise FileNotFoundError(f"Audio file not found: {wav_path}")

        print(f"Processing {wav_path}")
        start_time = time.time()

        print("Transcribing...")
        segments, _ = self.transcriber.transcribe(wav_path, language="mr")
        transcription = " ".join([segment.text for segment in segments])
        print(f"Transcription: {transcription}")

        print("Generating response...")
        response = "हो, मी तुमचे ऐकले आहे. तुम्ही काय म्हणालात: " + transcription
        print(f"Response: {response}")

        print("Synthesizing speech...")
        with torch.no_grad():
            inputs = self.processor(text=response, return_tensors="pt")
            speech = self.tts_model(
                input_ids=inputs["input_ids"].to(self.device)
            ).waveform

        # Write the file at the rate the TTS model reports instead of a
        # hard-coded constant.
        torchaudio.save(output_path, speech.cpu(), self.tts_model.config.sampling_rate)

        end_time = time.time()
        print(f"Total processing time: {end_time - start_time:.2f} seconds")
        return output_path

In [None]:
# Use a dedicated name for the pipeline object: rebinding `processor` here would
# replace the global TTS AutoProcessor that `synthesise` depends on.
wav_processor = MarathiWavProcessor()
output_file = wav_processor.process_wav("/kaggle/input/wav-exa/sample.wav")
print(f"Response saved to: {output_file}")

Loading models...


config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

Models loaded!
Processing /kaggle/input/wav-exa/sample.wav
Transcribing...
