# Talk with me ...

In [31]:
import os
from openai import OpenAI

# Shared OpenAI client used by gpt3() for every chat completion request.
# NOTE(review): the API key is passed as a literal here; consider loading it
# from the environment instead of keeping it in source.
client = OpenAI(api_key="...")
import pyaudio
import wave
from tkinter import *
import tkinter as tk
import tkinter.font as tkFont
import whisper

# Persona and model settings read by gpt3().
config = {
    "name": "Hilary",
    "model": "gpt-4-1106-preview",
    # System prompt: the adjacent string literals below are concatenated into
    # one string, so the parenthesised group must be closed before the dict is.
    "system": (
        "Always follow these instructions in all your responses: "
        "1. Do NOT be formal; "
        "2. NEVER ask questions; "
        "3. NEVER respond with a question; "
        "4. Be held back and shy.\n\n"
        "Imitate Hilary, who is a 8-year-old girl whose ..."
    ),
}

def gpt3(stext):
    """Send one user utterance to the configured chat model and return the reply.

    Parameters:
        stext: Text of the user's message; it is sent as the only "user" turn.

    Returns:
        The message content of the first choice in the completion response.
    """
    # Each call is stateless: the persona prompt plus this single user turn.
    conversation = [
        {"role": "system", "content": config["system"]},
        {"role": "user", "content": stext},
    ]

    # Sampling settings forwarded unchanged to the completions endpoint.
    sampling = {
        "temperature": 0.7,
        "max_tokens": 50,  # upper bound on the length of the generated reply
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }

    completion = client.chat.completions.create(
        model=config["model"],
        messages=conversation,
        **sampling,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


In [32]:
import boto3

# Initialize the Polly client once at import time; get_tts_data() reuses it
# for every synthesis request.
# NOTE(review): the AWS credentials are passed as literals here; consider
# relying on boto3's default credential chain instead — confirm against the
# deployment setup.
polly_client = boto3.Session(
    aws_access_key_id='...',
    aws_secret_access_key='...',
    region_name='us-east-1'
).client('polly')

def get_tts_data(text: str) -> bytes:
    """Synthesize *text* with Amazon Polly and return the raw PCM audio bytes.

    Parameters:
        text (str): The text to speak.

    Returns:
        bytes: The complete audio stream returned by Polly, requested as
        headerless PCM at 16000 Hz.
    """
    from contextlib import closing

    response = polly_client.synthesize_speech(
        Text=text,
        # Keep 'pcm': the caller interprets the bytes as raw 16-bit samples,
        # so a compressed format (mp3 / ogg_vorbis) would produce noise.
        OutputFormat='pcm',
        # Spell out the rate the downstream WAV writer assumes instead of
        # relying on the service default for PCM output.
        SampleRate='16000',
        VoiceId='Joanna',
    )

    # The audio body is a streaming object; close it once fully read so the
    # underlying HTTP connection is released.
    with closing(response['AudioStream']) as audio_stream:
        return audio_stream.read()

In [33]:
import numpy as np

def tts_to_wav(tts_byte: bytes, framerate: int = 16000) -> tuple[int, np.ndarray]:
    """
    Interpret raw PCM bytes as 16-bit samples ready to be written as WAV data.

    No resampling or channel conversion is performed: the bytes are viewed as
    little-endian signed 16-bit integers and ``framerate`` is returned
    unchanged, so it must already match the rate the audio was produced at.

    Parameters:
        tts_byte (bytes): Headerless PCM audio, 2 bytes per sample.
        framerate (int, optional): Sample rate of ``tts_byte`` in Hz. Defaults to 16000.

    Returns:
        tuple[int, np.ndarray]: ``framerate`` and a read-only 1-D int16 view
        over ``tts_byte``.

    Raises:
        ValueError: If the length of ``tts_byte`` is not a multiple of 2.
    """
    # Fix the byte order explicitly; a bare np.int16 follows the host's
    # endianness and would misread the samples on a big-endian machine.
    pcm_array = np.frombuffer(tts_byte, dtype=np.dtype("<i2"))

    return framerate, pcm_array

import os
from datetime import datetime
from scipy.io.wavfile import write
import time

# Output folder where speech_to_speech() writes the synthesized reply.
directory = "/home/host/pegah/RAD-NeRF_whisper/data"
# Load the pre-trained Whisper "base" model once at import time;
# speech_to_speech() reuses it for every transcription.
model = whisper.load_model("base")

def speech_to_speech():
    """Run one speech-to-text -> chat -> text-to-speech round trip.

    Transcribes "output.wav" with the module-level Whisper ``model``, sends
    the transcript to ``gpt3``, synthesizes the reply with ``get_tts_data``
    and writes it as "speech.wav" inside ``directory``. The wall-clock time
    of each stage is printed.

    The output file name is fixed, so every call overwrites the previous
    reply.
    """
    # --- Speech to text ---------------------------------------------------
    start_time = time.time()

    result = model.transcribe("output.wav")
    prompt = result["text"]

    print(f"Time taken STT: {time.time() - start_time:.2f} seconds")

    # --- Chat completion --------------------------------------------------
    start_time = time.time()

    print("User : ", prompt)
    resp = gpt3(prompt)
    print("Avatar : ", resp)

    print(f"Time taken gpt: {time.time() - start_time:.2f} seconds")

    # --- Text to speech ---------------------------------------------------
    start_time = time.time()

    # get_tts_data returns raw PCM; tts_to_wav always returns a
    # (sample_rate, samples) pair, so it can be unpacked directly.
    pcm_bytes = get_tts_data(resp)
    sample_rate, samples = tts_to_wav(pcm_bytes, framerate=16000)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, "speech.wav")

    with open(filepath, 'wb') as f:
        write(f, sample_rate, samples)

    print(f"Time taken TTS: {time.time() - start_time:.2f} seconds")

# NOTE: stray notebook cell output (appears to be the tail of a warning), not code: checkpoint = torch.load(fp, map_location=device)


In [None]:
import time




def audio_devices():
    """Return the index of the 'Realtek(R) Audio' output device, or None.

    Scans the devices of host API 0 and considers only those that report at
    least one output channel. If several device names contain
    'Realtek(R) Audio', the index of the last one is returned.

    Returns:
        The matching device index within host API 0, or None when no output
        device name contains 'Realtek(R) Audio'.
    """
    import pyaudio

    p = pyaudio.PyAudio()
    try:
        info = p.get_host_api_info_by_index(0)
        numdevices = info.get('deviceCount')

        speaker_index = None
        for i in range(numdevices):
            device_info = p.get_device_info_by_host_api_device_index(0, i)
            if device_info.get('maxOutputChannels') <= 0:
                continue  # input-only device
            if 'Realtek(R) Audio' in device_info.get('name'):
                speaker_index = i

        return speaker_index
    finally:
        # Release the PortAudio resources held by this temporary instance;
        # without this every call leaves one more instance open.
        p.terminate()

        
class SoundRecorder:
    """Small Tk window that records microphone audio and runs the speech pipeline.

    Constructing an instance builds the window and enters the Tk main loop,
    so the constructor only returns once the main loop ends. Each recording is
    written to ``self.filename`` and then passed on via ``speech_to_speech()``.
    """

    def __init__(self):
        # Recording state and capture settings.
        self.frames = []
        self.is_recording = False
        self.chunk = 1024                     # frames read per stream.read()
        self.sample_format = pyaudio.paInt16
        self.channels = 2
        self.fs = 44100                       # capture rate in Hz
        self.filename = "output.wav"

        # One PyAudio instance is shared by every recording made in this
        # window; it is released only after the main loop has ended.
        self.p = pyaudio.PyAudio()

        self.root = tk.Tk()
        self.root.title("Interactive Avatar")
        self.root.geometry("400x250")

        # Full-window label; it currently carries no image or text.
        my_label = tk.Label(self.root)
        my_label.place(x=0, y=0, relwidth=1, relheight=1)

        custom_font = tkFont.Font(family="Take Looks", size=18)

        self.start_button = tk.Button(self.root, text="Talk to me!", command=self.start_recording,
                                      font=custom_font, foreground="#00A6A2", background="#002060")
        self.stop_button = tk.Button(self.root, text="Stop", command=self.stop_recording, state=tk.DISABLED,
                                     font=custom_font, foreground="red", background="#002060")

        self.start_button.place(x=10, y=170)
        self.stop_button.place(x=174, y=170)

        try:
            self.root.mainloop()
        finally:
            self.p.terminate()

    def start_recording(self):
        """Capture audio until ``stop_recording`` is triggered, then save it.

        Runs inside the button callback: the loop alternates between reading
        one chunk and pumping Tk events so the Stop button stays responsive.
        """
        # NOTE(review): this is an *output* device index handed to an
        # input-only stream; it likely has no effect on capture — confirm
        # against the PyAudio documentation before relying on it.
        self.output_device_index = audio_devices()

        # Open the stream before touching the UI so a failure to open leaves
        # the buttons in their idle state.
        stream = self.p.open(format=self.sample_format,
                             channels=self.channels,
                             rate=self.fs,
                             frames_per_buffer=self.chunk,
                             output_device_index=self.output_device_index,
                             input=True)

        self.is_recording = True
        self.frames = []
        self.start_button.config(state=tk.DISABLED)
        self.stop_button.config(state=tk.NORMAL)

        try:
            while self.is_recording:
                self.frames.append(stream.read(self.chunk))
                self.root.update()
        finally:
            # Always release the stream, even if reading or the Tk update
            # raised (e.g. the window was closed mid-recording).
            stream.stop_stream()
            stream.close()

        # The PyAudio instance is deliberately NOT terminated here: the start
        # button is re-enabled after a recording, and opening a new stream on
        # a terminated instance would fail.
        self.save_recording()

    def stop_recording(self):
        """Signal the capture loop to finish and reset the buttons."""
        self.is_recording = False
        self.start_button.config(state=tk.NORMAL)
        self.stop_button.config(state=tk.DISABLED)

    def save_recording(self):
        """Write the captured frames to ``self.filename`` and run the pipeline."""
        start_time = time.time()

        # The context manager closes the file even if writing fails.
        with wave.open(self.filename, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(self.p.get_sample_size(self.sample_format))
            wf.setframerate(self.fs)
            wf.writeframes(b''.join(self.frames))

        print(f"Time taken Recording: {time.time() - start_time:.2f} seconds")

        speech_to_speech()
        

# Script entry point: constructing SoundRecorder builds the window and runs
# the Tk main loop from inside its constructor, so this call does not return
# until that loop ends.
if __name__ == '__main__':
    SoundRecorder()
    
