In [None]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer
import os

import torch
if torch.cuda.is_available():
    device = 'cuda'
    print(f'Using GPU: {torch.cuda.get_device_name(0)}')
else:
    device = 'cpu'
    print('GPU not available, using CPU.')
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
   from dotenv import load_dotenv
   import os

   load_dotenv("HF.config")
   hf_token = os.getenv("HF_TOKEN")

In [3]:
#load feature extractor from pre-trained checkpoint
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Spanish", task="transcribe")


In [124]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="spanish", task="transcribe")
og_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("mevitts/whisper-small-es")

og_model = og_model.to(device)
model = model.to(device)


In [125]:
from transformers import GenerationConfig


# Set model configuration
og_model.generation_config = GenerationConfig.from_pretrained("openai/whisper-small")
og_model.generation_config.language = "spanish"
og_model.generation_config.task = "transcribe"
og_model.config.pad_token_id = og_model.config.eos_token_id

model.generation_config = GenerationConfig.from_pretrained("openai/whisper-small")
model.generation_config.language = "spanish"
model.generation_config.task = "transcribe"
model.config.pad_token_id = model.config.eos_token_id


In [121]:
import sounddevice as sd
import numpy as np
from scipy.signal import butter, filtfilt


def record_audio(duration=5, sample_rate=16000):
    """Record audio for set duration"""
    print("Say what you want")
    myrecording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()
    
    #convert to single array, then normalize audio
    audio = myrecording.flatten()
    
    audio = audio / np.max(np.abs(audio))
    
    # apply filter to reduce noise
    nyquist = sample_rate / 2
    cutoff = 100 #Hz
    b, a = butter(4, cutoff/nyquist, btype='high')
    audio = filtfilt(b, a, audio)
    
    audio = audio / np.max(np.abs(audio))
    return audio

In [127]:
def transcribe_audio(audio, sample_rate=16000, model=og_model):
    
    
    inputs = feature_extractor(
        audio,
        sampling_rate=sample_rate,
        return_tensors="pt"
    )
    input_features = inputs.input_features.to(device)
    
    predicted_ids = model.generate(
        input_features,
        max_length=448,  # Increased from default
        num_beams=5,     # Increased from default
        temperature=0.7  # Slightly reduced from default
    )
    transcription = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    return transcription
    

In [None]:
rec = record_audio()
msg = transcribe_audio(rec, model=og_model)
matt_msg = transcribe_audio(rec, model=model)
print(msg)
print(matt_msg)

In [157]:

from dotenv import load_dotenv
import os
import google.generativeai as gem

load_dotenv("HF.config")
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise ValueError("GOOGLE_API_KEY not found in environment variables")
#genai.configure(api_key=google_api_key)
gem.configure(api_key=google_api_key)


In [163]:
gemini = gem.GenerativeModel('gemini-2.0-flash')

In [166]:
def generate_response(text):
    response = gemini.generate_content(
            f"Eres un asistente conversacional en español. Adaptarse al contexto de la conversación. (Por ejemplo: si la persona te diga algo como si fuera tu amigo o un familiar, respondele con ese rol.) Mantenga la duración de tu respuesta corta. Responde de manera natural y conversacional a: {text}"
        )
    return response

In [None]:
generate_response("Cual es tu comida favorita?")

In [160]:
def test_full_pipeline():
    print("Recording audio...")
    audio = record_audio()
    
    print("\nTranscribing audio...")
    transcription = transcribe_audio(audio, model=og_model)
    print(f"Transcription: {transcription}")
    
    print("\nGenerating response...")
    response = generate_response(transcription)
    print(f"Response: {response}")
    
    return transcription, response

In [None]:
transcription, response = test_full_pipeline()

In [10]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer
import os
import sounddevice as sd
import numpy as np
from scipy.io import wavfile
from TTS.api import TTS

def text_to_speech(text, output_path="output.wav"):
    # Initialize TTS with Spanish xtts model
    tts = TTS(model_name="tts_models/es/css10/vits", progress_bar=False)
    
    # Generate speech
    tts.tts_to_file(text=text, file_path=output_path)
    
    # Play the audio
    sample_rate, audio = wavfile.read(output_path)
    sd.play(audio, sample_rate)
    sd.wait()
    
    return output_path

In [11]:
output_wav = text_to_speech("Mirense a estos pinches gringos!")

 > tts_models/es/css10/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers.
 > Text splitted to sentences.
['Mirense a estos pinches gringos!']
 > Processing time: 

In [15]:
from TTS.utils.manage import ModelManager
manager = ModelManager()
print("Available Spanish models:")
for model in manager.list_models():
    if "es" in model or "spanish" in model.lower():
        print(model)

Available Spanish models:

 Name format: type/language/dataset/model
 1: tts_models/multilingual/multi-dataset/xtts_v2
 2: tts_models/multilingual/multi-dataset/xtts_v1.1
 3: tts_models/multilingual/multi-dataset/your_tts
 4: tts_models/multilingual/multi-dataset/bark
 5: tts_models/bg/cv/vits
 6: tts_models/cs/cv/vits
 7: tts_models/da/cv/vits
 8: tts_models/et/cv/vits
 9: tts_models/ga/cv/vits
 10: tts_models/en/ek1/tacotron2
 11: tts_models/en/ljspeech/tacotron2-DDC
 12: tts_models/en/ljspeech/tacotron2-DDC_ph
 13: tts_models/en/ljspeech/glow-tts
 14: tts_models/en/ljspeech/speedy-speech
 15: tts_models/en/ljspeech/tacotron2-DCA
 16: tts_models/en/ljspeech/vits
 17: tts_models/en/ljspeech/vits--neon
 18: tts_models/en/ljspeech/fast_pitch
 19: tts_models/en/ljspeech/overflow
 20: tts_models/en/ljspeech/neural_hmm
 21: tts_models/en/vctk/vits
 22: tts_models/en/vctk/fast_pitch
 23: tts_models/en/sam/tacotron-DDC
 24: tts_models/en/blizzard2013/capacitron-t2-c50
 25: tts_models/en/bliz

In [13]:
!pip install time

ERROR: Could not find a version that satisfies the requirement time (from versions: none)
ERROR: No matching distribution found for time


In [14]:
import time

def run_conversation():
    print("Starting conversation")
    
    while True:
        try:
            
            
            #record user
            print("Recording now")
            try:
                audio = record_audio()
            except Exception as e:
                print(f"Error recording audio: {str(e)}")
                print("Please try again.")
                continue
            
            #transcribe audio
            try:
                transcription = transcribe_audio(audio, model=og_model)
                if not transcription.strip():
                    print('No speech detected. Please try again')
                    continue
                print(f'\nDid you say: {transcription}?')
            except Exception as e:
                print(f"Error recording audio: {str(e)}")
                print("Please try again.")
                continue
            
            #if exit conversation
            if transcription.lower() in ['adiós', 'adios', 'chao', 'bye', 'goodbye', 'done', 'stop']:
                print("\nEnding conversation. Chao!")
                break
            
            #generate response
            try:
                response = generate_response(transcription)
            except Exception as e:
                    print(f"Error generating response: {str(e)}")
                    print("Please try again.")
                    continue
                
            
            #convert response to speech output
            try:
                text_to_speech(response.text)
            except Exception as e:
                    print(f"Error converting response to speech: {str(e)}")
                    print(f"Continuing without speech output.\nResponse: {response.text}")
                    time.sleep(3)
                    continue
                
                
        except KeyboardInterrupt:
            print("\n\nConversation interrupted by user. ¡Hasta luego!")
            break
        except Exception as e:
            print(f"\nUnexpected error: {str(e)}")
            print("Please try again.")
            continue

In [None]:
run_conversation()