In [1]:
import time as tm
import numpy as np
import threading

from scipy.signal import find_peaks

import sounddevice as sd
import librosa

import numpy as np

In [2]:
c = 3

In [3]:
BC_RATE = 1.3*c #1.3

# Audio settings
SAMPLERATE = 44100
CHANNELS = 1
DURATION = 0.5  # Duration of each audio capture in seconds

speech_duration, silence_duration = 0, 0

In [4]:
def play_soft_beep():
    # Parameters for the beep
    frequency = 440  # Frequency of the beep (in Hz)
    duration = 0.2   # Duration of the beep (in seconds)
    amplitude = 0.1  # Amplitude of the beep (0.0 to 1.0 for soft to loud)

    # Generate the time points
    sample_rate = 44100  # Samples per second
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)

    # Generate the beep as a sine wave
    beep = amplitude * np.sin(2 * np.pi * frequency * t)

    print(" BC ", end="\n\n")
    # Play the beep
    sd.play(beep, sample_rate)
    sd.wait()  # Wait until the sound is done playing
play_soft_beep()

 BC 



In [5]:
def autocorrelate(signal):
    """
    Autocorrelates the provided signal.

    Args:
    signal (numpy array): The audio signal.

    Returns:
    numpy array: Autocorrelation of the signal.
    """
    if signal.size == 0:
        # Handle the empty signal appropriately
        return np.array([])

    # Compute the mean of the signal
    mean = np.mean(signal)

    # Perform autocorrelation only if the signal is not empty
    correlation = np.correlate(signal - mean, signal - mean, mode='full')

    return correlation[len(signal) - 1:]



def get_pitch(signal, sample_rate=44100):
    if check_silence(signal):
        return 0

    correlation_result = autocorrelate(signal)
    peaks, _ = find_peaks(correlation_result, height=0.1)

    if len(peaks) == 0:
        return 0

    period = peaks[0]
    return sample_rate / period


def check_silence(signal, threshold=0.1):
    """
    Check if the provided audio signal is silent.
    
    Args:
    signal (numpy array): The audio signal.
    threshold (float): Amplitude threshold for silence detection.

    Returns:
    bool: True if the signal is silent, False otherwise.
    """
    if signal.size == 0:
        return True  
    if np.max(np.abs(signal)) < threshold:
        return True
    return False

# Long Utterance Model

In [6]:
W_SPEAK, W_PAUSE = 1.5*c, 0.8*c #1.5, 0.8  

def long_utterance_model(speech_duration, 
                         silence_duration, 
                         last_bc_time, 
                         current_time):
    
    if speech_duration >= W_SPEAK and silence_duration >= W_PAUSE:
        
        if current_time - last_bc_time > BC_RATE:
        
            return True 
        
    return False


# Long Pause Model

In [7]:
# Long Pause Model
LP_SPEAK, LP_PAUSE = 1.0, 1.7

def long_pause_model(speech_duration, 
                     silence_duration, 
                     last_bc_time, 
                     current_time):
    
    if speech_duration >= LP_SPEAK and silence_duration >= LP_PAUSE:
        
        if current_time - last_bc_time > BC_RATE:
            
            return True 
        
    return False


# Pitch Model

In [8]:
P_AND_P_PAUSE = 0.4  
P_AND_P_SPEAK = 1.0  
P_AND_P_LENGTH = 0.3 
P_AND_P_SLOPE = 0.25  # 25% rise or drop

def analyze_pitch_change(audio_data, duration, slope_threshold):
    # Assuming get_pitch function calculates the pitch of the audio segment
    # Calculate pitch at the start and end of the duration
    start_pitch = get_pitch(audio_data[:int(duration/2)])
    end_pitch = get_pitch(audio_data[-int(duration/2):])

    # Calculate the percentage change in pitch
    if start_pitch == 0:  # Avoid division by zero
        return False
    pitch_change = (end_pitch - start_pitch) / start_pitch

    # Check if the change meets or exceeds the threshold
    return abs(pitch_change) >= slope_threshold



def pitch_model(speech_data, silence_duration, 
                last_bc_time, current_time):
    pitch = analyze_pitch_change(speech_data, P_AND_P_LENGTH, P_AND_P_SLOPE)
    if (silence_duration >= P_AND_P_PAUSE) and pitch:
        
        if current_time - last_bc_time > BC_RATE:
            return True   
    return False


# Energy Model

In [9]:
# Energy Model
E_PAUSE, E_SLOPE_LENGTH, E_SLOPE = 0.3, 0.5, 0.3

def analyze_energy_change(audio_data, duration, slope_threshold):
    
    # Calculate RMS energy at the start and end of the duration
    start_energy = np.sqrt(np.mean(np.square(audio_data[:int(duration/2)])))
    
    end_energy = np.sqrt(np.mean(np.square(audio_data[-int(duration/2):])))

    # Calculate the percentage change in energy
    if start_energy == 0:  # Avoid division by zero
        return False
    energy_change = (end_energy - start_energy) / start_energy

    # Check if the change meets or exceeds the threshold
    return abs(energy_change) >= slope_threshold

E_PAUSE = 0.3  # 300 ms
E_SLOPE_LENGTH = 0.5  # Energy analysis duration 500 ms
E_SLOPE = 0.3  # 30% rise or drop

def energy_model(speech_data, silence_duration, last_bc_time, current_time):
    
    energy_status = analyze_energy_change(speech_data, E_SLOPE_LENGTH, E_SLOPE)
    
    if (silence_duration >= E_PAUSE) and energy_status:
        
        if current_time - last_bc_time > BC_RATE:
            
            return True 
        
    return False


# Callback function for processing audio stream

In [10]:
def audio_callback(indata, frames, time, status):
    
    global speech_duration, silence_duration, last_bc_time
    
    audio_data = np.frombuffer(indata.copy(), dtype=np.float32)


    is_silent = check_silence(audio_data)  # Use the check_silence function

    # Update durations
    if is_silent:
        silence_duration += DURATION
        print("S", end="")
    else:
        speech_duration += DURATION
        print("I", end="")

    # Check models and play beep if conditions are met
    current_time = tm.perf_counter() # float value of time in seconds
    
    utterance = long_utterance_model(speech_duration, silence_duration, 
                                     last_bc_time, current_time)
    
    """
    pause = long_pause_model(speech_duration, silence_duration, 
                            last_bc_time, current_time)
    
    pitch = pitch_model(audio_data, silence_duration, 
                        last_bc_time, current_time)
    
    energy = energy_model(audio_data, silence_duration, 
                          last_bc_time, current_time)
    """
    if utterance: #and pause and pitch and energy:
        play_soft_beep()
        silence_duration = 0
        speech_duration = 0
        last_bc_time = current_time  # Update the time of the last beep


# Main

In [11]:
def main():
    global speech_duration, silence_duration, last_bc_time
    speech_duration = 0.0
    silence_duration = 0.0
    last_bc_time = 0.0

    # Open a stream for real-time audio processing
    with sd.InputStream(callback=audio_callback, 
                        channels=CHANNELS, 
                        samplerate=SAMPLERATE, 
                        blocksize=int(SAMPLERATE * DURATION)):
        
        print("Recording and analyzing speech. Press Ctrl+C to stop.")
        while True:
            
            tm.sleep(0.1)  # Keep the main thread alive
            


main()

Recording and analyzing speech. Press Ctrl+C to stop.
SSSSSSSSSSSSSSSIISSSSSIIIISSSSSSSIII BC 

SISSISSSSSSSSSSSSSSSSSSSSSSSIIIIIII BC 

SSSSSS

KeyboardInterrupt: 