In [25]:
import functools
import io
import logging
import warnings

import librosa
import soundfile as sf
import speech_recognition as sr
import torch
from IPython.display import Audio
from llama_cpp import Llama
from transformers import AutoProcessor, AutoModel
from transformers import pipeline
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer

# Suppress every Python warning and restrict the "transformers" logger to
# errors only (presumably to keep the notebook output uncluttered).
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)


# Shared recognizer plus the pretrained speech-to-text and text-to-speech
# models, created once at module level so the functions below can reuse them.
WAV2VEC_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
VITS_CHECKPOINT = "facebook/mms-tts-eng"

r = sr.Recognizer()

# Speech-to-text: feature processor and CTC model from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained(WAV2VEC_CHECKPOINT, verbose=False)
model_wav2vec = Wav2Vec2ForCTC.from_pretrained(WAV2VEC_CHECKPOINT)

# Text-to-speech: tokenizer and VITS model from the same checkpoint.
tokenizer_vits = AutoTokenizer.from_pretrained(VITS_CHECKPOINT, verbose=False)
model_vits = VitsModel.from_pretrained(VITS_CHECKPOINT)


def recognize_speech_from_microphone():
    """Record one utterance from the microphone and transcribe it.

    The recording is captured with the module-level recognizer ``r`` and
    transcribed with the module-level ``processor`` / ``model_wav2vec`` pair.

    Returns:
        str: The decoded text for the recorded utterance.
    """
    # Rate requested from the microphone and reported to the processor.
    target_rate = 16000

    # Hold the audio device only while recording; none of the decoding
    # steps below need the microphone to stay open.
    with sr.Microphone(sample_rate=target_rate) as source:
        print("Please speak now...")
        audio_data = r.listen(source)
    print("Recognizing command...")

    # Round-trip through an in-memory WAV file to get the samples as an array.
    audio, samplerate = sf.read(io.BytesIO(audio_data.get_wav_data()))

    # Defensive: resample if the capture did not come back at the requested rate.
    if samplerate != target_rate:
        audio = librosa.resample(audio, orig_sr=samplerate, target_sr=target_rate)
        samplerate = target_rate

    input_values = processor(
        audio, sampling_rate=samplerate, return_tensors="pt"
    ).input_values
    with torch.no_grad():  # inference only, no gradients needed
        logits = model_wav2vec(input_values).logits

    # Take the highest-scoring token id per frame and let the processor
    # turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]

@functools.lru_cache(maxsize=1)
def _load_llama(model_path, chat_format):
    """Load the GGUF model once and reuse it for later calls with the same arguments.

    ``maxsize=1`` keeps at most one loaded model alive, so switching to a
    different path or chat format replaces the cached instance instead of
    accumulating models in memory.
    """
    return Llama(model_path=model_path, verbose=False, chat_format=chat_format)


def process_with_llama(
    transcription,
    model_path="./MODELS.GGUF/capybarahermes-2.5-mistral-7b.Q4_K_M.gguf",
    chat_format="llama-2",
    system_prompt="You are a story writer.",
):
    """Send the transcribed text to the local LLM and return its reply.

    Args:
        transcription: User text, sent as the "user" chat message.
        model_path: Path of the GGUF model file to load.
        chat_format: Chat template name passed to ``Llama``.
        system_prompt: Content of the "system" chat message.

    Returns:
        The ``content`` of the first choice in the chat completion result.
    """
    # NOTE(review): the model file name suggests a Hermes-family model, which
    # may expect a different chat template than "llama-2" -- confirm against
    # the model card before changing the default.
    llm = _load_llama(model_path, chat_format)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": transcription},
    ]
    result = llm.create_chat_completion(messages)
    return result['choices'][0]['message']['content']

def text_to_speech_fb_mms(text):
    """Synthesize ``text`` with the module-level VITS model.

    Args:
        text: The text to convert to speech.

    Returns:
        An ``Audio`` object wrapping the generated waveform, using the
        sampling rate from the model's configuration.
    """
    encoded = tokenizer_vits(text, return_tensors="pt", verbose=False)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        synthesis = model_vits(**encoded)
    waveform = synthesis.waveform

    playback_rate = model_vits.config.sampling_rate
    return Audio(waveform.numpy(), rate=playback_rate)

print("Starting ...")
# Main workflow: microphone -> transcription -> LLM reply -> synthesized speech.
transcription = recognize_speech_from_microphone()
print("Transcription: ", transcription)

# Announce the LLM step before it runs, so the status line appears while the
# model is working rather than after it has already finished.
print("Processing command...")
result = process_with_llama(transcription)
print(result)
audio_output = text_to_speech_fb_mms(result)

# Keep the Audio object as the cell's final expression so the notebook
# displays it.
audio_output


Starting ...
Please speak now...
Recognizing command...
Transcription:  tell me about cristiano ronaldo
Processing command...

Cristiano Ronaldo is a world-renowned Portuguese professional footballer who currently plays for the Italian club Juventus and captains the Portugal national team. He is widely considered one of the greatest footballers of all time, known for his exceptional skills, speed, and goal-scoring abilities.

Born on February 5, 1985, in Funchal, Madeira, Portugal, Ronaldo began his football career at a young age, joining local club Andorinha at the age of eight. He later moved to Sporting Lisbon's youth academy, where he honed his skills and attracted the attention of several European clubs. In 2003, at the age of 18, he signed with English club Manchester United, where he quickly became a star player under the guidance of coach Sir Alex Ferguson.

Ronaldo's time at Manchester United was highly successful, winning three Premier League titles, one FA Cup, two League Cu