<a href="https://colab.research.google.com/github/monteroserra/elderly-companion-ai/blob/main/notebooks/elderly_companion_ai_notebook_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## elderly-companion-ai notebook 2: text and voice models


In [1]:
!pip install transformers SpeechRecognition gtts pydub --quiet


In [2]:
pip install imageio[ffmpeg]





In [6]:
pip install pyaudio


Collecting pyaudioNote: you may need to restart the kernel to use updated packages.

  Downloading PyAudio-0.2.14-cp313-cp313-win_amd64.whl.metadata (2.7 kB)
Downloading PyAudio-0.2.14-cp313-cp313-win_amd64.whl (173 kB)
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.14


In [3]:
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play


In [5]:
# Record Audio Function
def record_audio(filename="recorded_audio.wav"):
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Recording... Speak now!")
        audio = recognizer.listen(source)
        print("Recording complete.")
    
    # Save the audio as a WAV file
    with open(filename, "wb") as f:
        f.write(audio.get_wav_data())
    print(f"Audio saved as {filename}")
    return filename

# Playback Audio Function
def play_audio(filename):
    try:
        # Load the audio file
        audio = AudioSegment.from_file(filename)
        print(f"Playing audio: {filename}")
        play(audio)
    except Exception as e:
        print(f"Error playing audio: {e}")

# Execute main functionality
if __name__ == "__main__":
    audio_file = record_audio()  # Record and save audio
    play_audio(audio_file)       # Play back the saved audio


Recording... Speak now!
Recording complete.
Audio saved as recorded_audio.wav
Playing audio: recorded_audio.wav


### Voice - text - voice basic framework

In [22]:
from dotenv import load_dotenv
import openai
import os

# Load environment variables from .env file
load_dotenv()

# Get the API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# Check if the key was loaded correctly
if not openai.api_key:
    raise ValueError("OPENAI_API_KEY is not set in the .env file.")

# Initialize OpenAI client
client = openai.OpenAI(
    api_key=openai.api_key  
)


In [7]:
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import openai
import os

# Record Audio Function
def record_audio(filename="recorded_audio.wav"):
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Recording... Speak now!")
        audio = recognizer.listen(source)
        print("Recording complete.")
    
    # Save the audio as a WAV file
    with open(filename, "wb") as f:
        f.write(audio.get_wav_data())
    print(f"Audio saved as {filename}")
    return filename

# Convert Speech to Text
def speech_to_text(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio)
            print(f"Recognized text: {text}")
            return text
        except sr.UnknownValueError:
            print("Could not understand the audio.")
            return None
        except sr.RequestError as e:
            print(f"Speech Recognition error: {e}")
            return None

# Analyze Text with LLM
def analyze_text_with_llm(text):
    try:
        print("Analyzing text with LLM...")
        response = openai.Completion.create(
            engine="text-davinci-003",  # Use your preferred model
            prompt=f"Analyze the following text and provide a meaningful response:\n\n{text}",
            max_tokens=100,
            temperature=0.7
        )
        response_text = response.choices[0].text.strip()
        print(f"LLM Response: {response_text}")
        return response_text
    except Exception as e:
        print(f"LLM analysis error: {e}")
        return None

# Convert Text to Speech
def text_to_speech(text, output_audio="response_audio.mp3"):
    try:
        print("Converting text to speech...")
        tts = gTTS(text=text, lang="en")
        tts.save(output_audio)
        print(f"Response audio saved as {output_audio}")
        return output_audio
    except Exception as e:
        print(f"Text-to-speech error: {e}")
        return None

# Playback Audio Function
def play_audio(filename):
    try:
        # Load the audio file
        audio = AudioSegment.from_file(filename)
        print(f"Playing audio: {filename}")
        play(audio)
    except Exception as e:
        print(f"Error playing audio: {e}")


Recording... Speak now!
Recording complete.
Audio saved as recorded_audio.wav
Recognized text: hello how are you
Analyzing text with LLM...
LLM analysis error: 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def analyze_text_with_llm(text):
    try:
        print("Analyzing text with LLaMA model...")
        
        # Load LLaMA model and tokenizer (use a pre-trained model available on Hugging Face)
        model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with the model you want
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # Tokenize input text
        inputs = tokenizer(text, return_tensors="pt")

        # Generate response
        outputs = model.generate(
            inputs.input_ids,
            max_length=100,
            num_return_sequences=1,
            temperature=0.7,
        )

        # Decode and return the response
        response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"LLM Response: {response_text}")
        return response_text
    except Exception as e:
        print(f"LLM analysis error: {e}")
        return None



In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


### Function with OpenAI

In [24]:
def analyze_text_with_llm(text):
    try:
        print("Analyzing text with OpenAI LLM...")
        
        # Create chat completion
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": text}
            ],
            model="gpt-3.5-turbo",  # Use the desired model, e.g., "gpt-4"
            max_tokens=100,
            temperature=0.7
        )
        
        # Extract and return the response content
        response_text = response["choices"][0]["message"]["content"]
        print(f"LLM Response: {response_text}")
        return response_text

    except Exception as e:
        print(f"LLM analysis error: {e}")
        return None


#### Function with Open Source Models 

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

def analyze_text_with_llm(text):
    try:
        print("Analyzing text with an open-source LLM...")

        # Load model and tokenizer
        model_name = "gpt2"  # Replace with a Hugging Face model like "EleutherAI/gpt-neo-1.3B"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # Encode input text
        inputs = tokenizer.encode(text, return_tensors="pt")

        # Generate a response
        outputs = model.generate(inputs, max_length=100, num_return_sequences=1, temperature=0.7)

        # Decode the response
        response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"LLM Response: {response_text}")
        return response_text
    except Exception as e:
        print(f"LLM analysis error: {e}")
        return None


In [41]:
# Main Functionality
if __name__ == "__main__":
    # Step 1: Record Audio
    audio_file = "recorded_audio.wav"  # Specify the path to the stored audio file
    #audio_file = record_audio()
    
    # Step 2: Convert Audio to Text
    recognized_text = speech_to_text(audio_file)
    
    if recognized_text:
        # Step 3: Analyze Text with LLM
        llm_response = analyze_text_with_huggingface(recognized_text)
        
        if llm_response:
            # Step 4: Convert LLM Response to Speech
            response_audio = text_to_speech(llm_response)
            
            if response_audio:
                # Step 5: Play the Response Audio
                play_audio(response_audio)

Recognized text: hello how are you
Analyzing text with Hugging Face API...




LLM Response: No response generated.
Converting text to speech...
Response audio saved as response_audio.mp3
Error playing audio: [WinError 2] The system cannot find the file specified




In [31]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement torch (from versions: none)
ERROR: No matching distribution found for torch


In [33]:
pip install huggingface-hub





In [37]:
### get HF token

# Load environment variables from .env file
load_dotenv()

# Get the Hugging Face token
HF_token = os.getenv("HF_Token")


In [40]:
from huggingface_hub import InferenceApi

def analyze_text_with_huggingface(text):
    try:
        print("Analyzing text with Hugging Face API...")

        # Set up the Hugging Face Inference API client
        api = InferenceApi(repo_id="EleutherAI/gpt-neo-1.3B", token=HF_token)

        # Send the input text to the model
        response = api({"inputs": text})
        response_text = response.get("generated_text", "No response generated.")
        print(f"LLM Response: {response_text}")
        return response_text
    except Exception as e:
        print(f"LLM analysis error: {e}")
        return None


## Record voice