In [1]:
import sounddevice as sd

# List all available devices
devices = sd.query_devices()

# Print the list of devices
print(devices)

   0 Microsoft Sound Mapper - Input, MME (2 in, 0 out)
>  1 麥克風排列 (Realtek High Definition , MME (2 in, 0 out)
   2 Microsoft Sound Mapper - Output, MME (0 in, 2 out)
<  3 喇叭 (Realtek High Definition Aud, MME (0 in, 6 out)
   4 Headphones (), Windows WDM-KS (0 in, 2 out)
   5 Microphone Array (Realtek HD Audio Mic Array input), Windows WDM-KS (2 in, 0 out)
   6 Headphones 1 (Realtek HD Audio 2nd output with SST), Windows WDM-KS (0 in, 2 out)
   7 Headphones 2 (Realtek HD Audio 2nd output with SST), Windows WDM-KS (0 in, 6 out)
   8 PC Speaker (Realtek HD Audio 2nd output with SST), Windows WDM-KS (2 in, 0 out)
   9 Headset Microphone (Realtek HD Audio Mic input), Windows WDM-KS (2 in, 0 out)
  10 Speakers 1 (Realtek HD Audio output with SST), Windows WDM-KS (0 in, 2 out)
  11 Speakers 2 (Realtek HD Audio output with SST), Windows WDM-KS (0 in, 6 out)
  12 PC Speaker (Realtek HD Audio output with SST), Windows WDM-KS (2 in, 0 out)
  13 Headset (@System32\drivers\bthhfenum.sys,#2;%1 Hand

In [2]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import joblib

# Paths to folders containing "yes" and "no" audio samples
yes_path = 'Yes'
no_path = 'No'

def is_silent(audio, threshold=0.02):
    """Check if the audio is mostly silent based on a threshold."""
    return np.max(np.abs(audio)) < threshold


# Function to extract MFCC features from an audio file
def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=16000)
    
    # Check if the audio is silent to avoid errors in chroma feature extraction
    if is_silent(audio):
        print(f"Warning: {file_path} is silent or too quiet. Skipping chroma extraction.")
        chroma_mean = np.zeros(12)  # Filler for chroma features if audio is silent
    else:
        # Extract Chroma features only if audio is not silent
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)

    # Extract MFCC and Spectral Contrast features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=5)
    mfccs_mean = np.mean(mfccs.T, axis=0)
    
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    spectral_contrast_mean = np.mean(spectral_contrast.T, axis=0)
    
    # Concatenate all features
    features = np.concatenate([mfccs_mean, spectral_contrast_mean, chroma_mean])
    return features

# Prepare data and labels
X = []  # Feature vectors
y = []  # Labels

# Process "yes" files
for file_name in os.listdir(yes_path):
    if file_name.endswith('.wav'):
        file_path = os.path.join(yes_path, file_name)
        features = extract_features(file_path)
        X.append(features)
        y.append(1)  # Label for "yes"

# Process "no" files
for file_name in os.listdir(no_path):
    if file_name.endswith('.wav'):
        file_path = os.path.join(no_path, file_name)
        features = extract_features(file_path)
        X.append(features)
        y.append(0)  # Label for "no"

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier with additional regularization
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    class_weight='balanced'
)
clf.fit(X_train, y_train)

# Save the trained model
joblib.dump(clf, 'yes_no_classifier.joblib')
print("Model training completed and saved as 'yes_no_classifier.joblib'")

# Evaluate with cross-validation
#cross_val_scores = cross_val_score(clf, X, y, cv=5)
#print(f"Cross-validation scores: {cross_val_scores}")
#print(f"Mean cross-validation score: {cross_val_scores.mean():.2f}")

  return pitch_tuning(


Model training completed and saved as 'yes_no_classifier.joblib'
Cross-validation scores: [0.9382716  0.94444444 0.94444444 0.94409938 0.88198758]
Mean cross-validation score: 0.93


In [2]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the trained model
clf = joblib.load('yes_no_classifier.joblib')

# Predict on the validation set
y_val_pred = clf.predict(X_val)

# Calculate and print validation accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))

# Print detailed validation report
print("Classification Report:\n", classification_report(y_val, y_val_pred, target_names=["No", "Yes"]))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# Calculate confusion matrix
cm = confusion_matrix(y_val, y_val_pred)


Validation Accuracy: 91.98%
Classification Report:
               precision    recall  f1-score   support

          No       0.91      0.92      0.92        77
         Yes       0.93      0.92      0.92        85

    accuracy                           0.92       162
   macro avg       0.92      0.92      0.92       162
weighted avg       0.92      0.92      0.92       162

Confusion Matrix:
 [[71  6]
 [ 7 78]]


In [3]:
#Start detecting program definition part

import sounddevice as sd
import numpy as np
import joblib
from scipy.io.wavfile import write
import os

def predict_yes_no_from_audio(model, duration=3, fs=44100, device_index=5):
    try:
        # Record audio
        print("Recording...")
        myrecording = sd.rec(int(duration * fs), samplerate=fs, channels=2, dtype=np.float32)
        sd.wait()  # Wait until recording is finished
        print("Recording finished.")
        
        # Save the recorded audio to a temporary file
        temp_file = 'temp_audio.wav'
        write(temp_file, fs, myrecording)
        
        # Extract features using the temp audio file path
        features = extract_features(temp_file)
        
        # Predict using the model
        prediction = model.predict([features])
        
        # Delete the temporary file if not needed anymore
        os.remove(temp_file)
        
        return "Yes" if prediction[0] == 1 else "No"

    except sd.PortAudioError as e:
        print(f"Error during recording: {e}")
        print("Please ensure an audio input device is connected and enabled.")
        return None



# Example usage of the function
#loaded_model = joblib.load('yes_no_classifier.joblib')

#result = predict_yes_no_from_audio(loaded_model)
#if result:
#    print(f"You said: {result}")


def adjust():
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Adjusting for ambient noise...")
        recognizer.adjust_for_ambient_noise(source, duration=1)  # Adjust for ambient noise

def speechrecognize():
    # Create a Recognizer instance
    recognizer = sr.Recognizer()
    # Capture audio input from the microphone
    with sr.Microphone() as source:
        print("Recording...")
        audio_data = recognizer.listen(source, phrase_time_limit=30)  # Listen to the audio from the microphone
        print("Finished recording.")  # This will print after the recording is done    
    # You can now process the audio_data, for example:
    try:
        # Recognize speech using Google's recognition
        text = recognizer.recognize_google(audio_data)
        print("You said:", text)
        return text
    except sr.UnknownValueError:
        print("Could not understand the audio")
    except sr.RequestError as e:
        print(f"Could not request results; {e}")

def switch(order):
    if order == 1:
        sentence = "Hello! Do you need any help?"
        engine.say(sentence)
        engine.runAndWait()
        loaded_model = joblib.load('yes_no_classifier.joblib')
        return predict_yes_no_from_audio(loaded_model)
        
    elif order == 2:
        sentence = "Okay, What do you need?"
        engine.say(sentence)
        engine.runAndWait()
        return speechrecognize()

In [13]:
import pyttsx3
import speech_recognition as sr
import csv

# Initialize the TTS engine
engine = pyttsx3.init()

# Set properties (optional)
engine.setProperty('rate', 260)  # Speed of speech
engine.setProperty('volume', 1)  # Volume level (0.0 to 1.0)

# Adjust for ambient noise before recording
adjust()
    
# Create a new CSV file to store the results
csv_file_path = 'results.csv'

# Open the CSV file in write mode
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # Get the result of switch 1
    text1 = switch(1)
    print (f"You said: {text1}")

    # Write the result of switch 1 to the first row
    writer.writerow(['Need help?', text1])

    # Check the response of switch 1 and proceed to switch 2 if needed
    if text1 and text1.lower() == 'yes':
        text2 = switch(2)
    else:
        text2 = 'Not Applicable'  # If the answer is "no", don't ask further questions

    # Write the result of switch 2 to the second row
    writer.writerow(['What the person needs.', text2])

print(f"Results saved to {csv_file_path}")

Adjusting for ambient noise...
Recording...
Recording finished.
You said: Yes
Recording...
Finished recording.
You said: I need water food and a helicopter
Results saved to results.csv


In [14]:
import pandas as pd

# Path to the CSV file
csv_file_path = 'results.csv'

# Load CSV into a DataFrame and print
df = pd.read_csv(csv_file_path)
print(df)

               Need help?                                 Yes
0  What the person needs.  I need water food and a helicopter


In [None]:
#Plot the accuracy and numbers of trees
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

n_estimators_range = range(1, 50, 1)  
train_accuracies = []
val_accuracies = []

for n_estimators in n_estimators_range:
    
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=2, min_samples_split=10, 
                                 min_samples_leaf=5, random_state=42, class_weight='balanced')
    clf.fit(X_train, y_train)
    
    
    train_accuracy = accuracy_score(y_train, clf.predict(X_train))
    val_accuracy = accuracy_score(y_val, clf.predict(X_val))
    
   
    train_accuracies.append(train_accuracy)
    val_accuracies.append(val_accuracy)


plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_accuracies, label="Training Accuracy")
plt.plot(n_estimators_range, val_accuracies, label="Validation Accuracy")
plt.xlabel("Number of Trees (n_estimators)")
plt.ylabel("Accuracy")
#plt.title("Accuracy vs. Number of Trees in Random Forest")
plt.legend()
plt.grid(True)
plt.show()