# Setting up the wake-word and background_sound folders

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models, layers
from sklearn.metrics import confusion_matrix, classification_report
import sounddevice as sd
from scipy.io.wavfile import write
import tensorflow as tf  # Import TensorFlow for TFLite conversion
from tensorflow.keras.models import load_model

In [None]:
# Function to record audio with a Wake Word, saving the recordings to the specified path
def record_audio_and_save(save_path, n_times=50):
    """Interactively record ``n_times`` two-second stereo wake-word clips.

    Each take is written as ``<save_path>/<index>.wav`` (index starting at 0).
    The user presses Enter before the first take and between takes; pressing
    Ctrl+C at a prompt aborts the session with ``KeyboardInterrupt``.

    Args:
        save_path: Folder that receives the recordings. It is created if it
            does not exist. A trailing slash is optional.
        n_times: Number of clips to record.
    """
    fs = 44100  # Sampling frequency (samples per second)
    seconds = 2  # Duration of each recording in seconds

    # Create the target folder up front so the first write cannot fail on a
    # missing directory (an empty save_path means the current directory).
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # Prompt user to start recording the Wake Word
    input("To start recording Wake Word press Enter: ")

    for i in range(n_times):
        # Record audio for the specified duration and channels (stereo)
        myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
        sd.wait()  # Wait until recording is finished

        # Join instead of concatenating: plain "+" turned a folder given
        # without a trailing slash into a filename prefix ("audio_data0.wav").
        write(os.path.join(save_path, f"{i}.wav"), fs, myrecording)

        # Only ask for the next take when there is one left to record
        if i + 1 < n_times:
            input(f"Press Enter to record next or stop with ctrl + C ({i + 1}/{n_times}): ")

# Function to record background sounds, saving them to the specified path
def record_background_sound(save_path, n_times=50):
    """Record ``n_times`` two-second stereo background clips back to back.

    After a single Enter press the clips are captured without further
    interaction and written as ``<save_path>/<index>.wav`` (index starting
    at 0). Progress is printed after every clip.

    Args:
        save_path: Folder that receives the recordings. It is created if it
            does not exist. A trailing slash is optional.
        n_times: Number of clips to record.
    """
    fs = 44100  # Sampling frequency (samples per second)
    seconds = 2  # Duration of each recording in seconds

    # Create the target folder up front so the first write cannot fail on a
    # missing directory (an empty save_path means the current directory).
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # Prompt user to start recording background sounds
    input("To start recording your background sounds press Enter: ")

    for i in range(n_times):
        # Record audio for the specified duration and channels (stereo)
        myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
        sd.wait()  # Wait until recording is finished

        # Join instead of concatenating so save_path is always treated as a
        # folder, with or without a trailing slash.
        write(os.path.join(save_path, f"{i}.wav"), fs, myrecording)

        # Provide feedback on the progress of recording
        print(f"Currently on {i + 1}/{n_times}")

# Step 1: Record yourself saying the Wake Word
print("Recording the Wake Word:\n")
# Save into audio_data/ (with trailing slash): this is the folder the
# visualization and training cells read from ("audio_data/0.wav",
# os.listdir("audio_data/")). The previous "WakeWordDetection/audio_data"
# target did not match those paths.
record_audio_and_save("audio_data/", n_times=100)  # 100 wake-word recordings

# Step 2: Record background sounds (Just let it run, it will automatically record)
print("Recording the Background sounds:\n")
record_background_sound("background_sound/", n_times=100)  # 100 background recordings


# Data Visualization

### LOADING THE VOICE DATA FOR VISUALIZATION 

In [None]:
# Clip used for the plots below: the first wake-word recording
walley_sample = "audio_data/0.wav"
# librosa.load hands back the samples and the sample rate as a pair
loaded_clip = librosa.load(walley_sample)
data = loaded_clip[0]
sample_rate = loaded_clip[1]

### VISUALIZING WAVE FORM

In [None]:
# Draw the amplitude-over-time curve of the loaded clip on the current axes
librosa.display.waveshow(data, sr=sample_rate)
# Label the same axes, then render the figure
plt.title("Wave Form")
plt.show()

### VISUALIZING MFCC

In [None]:
# Compute 40 Mel-Frequency Cepstral Coefficients (MFCCs) for the sample clip.
# MFCCs summarize the short-term power spectrum and are a common speech feature.
mfccs = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=40)
# Report the matrix dimensions as a quick sanity check
print("Shape of mfcc:", mfccs.shape)

# Render the coefficient matrix against time, then title and show the figure
librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time')
plt.title("MFCC")
plt.show()

# Model Training

In [None]:
# CONSTANTS
# Recording parameters: sample rate in Hz and clip length in seconds
fs, seconds = 44100, 2
# File name used for the prediction audio clip
filename = "prediction.wav"
# Human-readable label for each class index (0 = not detected, 1 = detected)
class_names = [
    "Wake Word NOT Detected",
    "Wake Word Detected",
]
# One output unit per class label
num_labels = len(class_names)

In [None]:
# Shape of a single sample fed to the Conv2D model, spelled out per axis
_height, _width, _channels = 32, 32, 1
input_shape = (_height, _width, _channels)  # (Height, Width, Channels)

In [None]:
# Load audio data and preprocess
all_data = []  # List to store the features and labels

# Map each class label to the recordings that belong to it:
#   0 -> background sounds, 1 -> wake-word clips.
# Only ".wav" entries are kept so stray files or folders (e.g. .DS_Store,
# checkpoint directories) are not handed to the audio loader. The listing is
# sorted because os.listdir returns entries in arbitrary order, which would
# make the seeded train/test split differ between machines.
data_path_dict = {
    class_label: [
        os.path.join(folder, file_name)
        for file_name in sorted(os.listdir(folder))
        if file_name.lower().endswith(".wav")
    ]
    for class_label, folder in ((0, "background_sound/"), (1, "audio_data/"))
}

In [None]:
# Walk through both classes (background sounds and wake-word clips) and turn
# every recording into one fixed-size feature vector.
for class_label in data_path_dict:
    list_of_files = data_path_dict[class_label]
    for single_file in list_of_files:
        # Read the clip; librosa returns the samples and their sample rate
        audio, sample_rate = librosa.load(single_file)

        # 40 MFCCs per frame for this clip
        mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # Average each coefficient over time and store it as a (40, 1) column
        mfcc_processed = np.mean(mfcc.T, axis=0).reshape(40, 1)

        # Keep the feature together with the class it belongs to
        all_data.append([mfcc_processed, class_label])

    # Progress message once a whole class has been processed
    print(f"Info: Successfully Preprocessed Class Label {class_label}")

# Collect everything in a two-column DataFrame: feature array and class label
df = pd.DataFrame(all_data, columns=["feature", "class_label"])

In [None]:
# Making our data training-ready
X = df["feature"].values  # Per-sample feature arrays

# Bring every feature to the (32, 32) grid expected by the Conv2D input.
# NOTE(review): the extraction cell reshapes features to (40, 1), so no rows
# are added here and the [:32] slice drops coefficient rows 32-39; the 31
# appended columns are all zeros. Confirm this is intended before changing it,
# since any inference code has to use the same layout.
padded_features = []
for feature in X:
    missing_rows = max(0, 32 - feature.shape[0])
    # Zero-pad rows up to 32 if short, add 31 zero columns, keep the first 32 rows
    feature = np.pad(feature, ((0, missing_rows), (0, 31)), 'constant')[:32]
    padded_features.append(feature)

# Stack the samples and add the trailing channel axis: (N, 32, 32, 1)
X = np.array(padded_features).reshape(-1, 32, 32, 1)

# One-hot encode the integer class labels
y = to_categorical(np.array(df["class_label"].tolist()))

In [None]:
# Train-test split
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of every split
for _split_name, _split_values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Number of elements in {_split_name}:", len(_split_values))

In [None]:
# Model Training #######
# Small CNN built with the Keras Sequential API: two convolutions, pooling,
# dropout, then a dense classifier head with one softmax unit per class.
model = models.Sequential([
    layers.Input(shape=input_shape),  # Input layer with shape (32, 32, 1)
    layers.Resizing(32, 32),  # Resize input to (32, 32) if needed
    layers.Conv2D(32, 3, activation='relu'),  # Convolutional layer with 32 filters, 3x3 kernel
    layers.Conv2D(64, 3, activation='relu'),  # Another convolutional layer with 64 filters
    layers.MaxPooling2D(),  # Max pooling to reduce spatial dimensions
    layers.Dropout(0.25),  # Dropout layer to prevent overfitting
    layers.Flatten(),  # Flatten the output for the fully connected layer
    layers.Dense(128, activation='relu'),  # Dense layer with 128 neurons
    layers.Dropout(0.5),  # Dropout layer for regularization
    layers.Dense(num_labels, activation='softmax')  # Output layer with softmax activation for classification
])

# summary() prints the architecture table itself and returns None, so wrapping
# it in print() only added a stray "None" line to the output.
model.summary()

# Compile the model
model.compile(
    loss="categorical_crossentropy",  # Matches the one-hot encoded labels
    optimizer='adam',  # Adam optimizer
    metrics=['accuracy']  # Accuracy as the evaluation metric
)

# Train for 150 epochs, holding out 20% of the training data for validation
history = model.fit(X_train, y_train, epochs=150, validation_split=0.2)

# Make sure the output folder exists so saving cannot fail on a fresh checkout
os.makedirs("saved_model", exist_ok=True)
model.save("saved_model/WWD.h5")  # Save the trained model

# Evaluate on the held-out test set and print the result under its header
score = model.evaluate(X_test, y_test)
print("Model Score: \n")
print(score)

In [None]:
# Convert Keras model to TFLite
# Convert the trained Keras model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the TFLite model to a file
tflite_model_path = "Untitled Folder/WWD.tflite"
# open() does not create missing parent folders, so ensure the target exists
os.makedirs(os.path.dirname(tflite_model_path), exist_ok=True)
with open(tflite_model_path, "wb") as f:
    f.write(tflite_model)

print("TFLite model conversion completed and saved at", tflite_model_path)


In [None]:
# Evaluate TFLite model
# Load the TFLite model and allocate tensors for inference
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Get input and output tensor details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Tensor indices do not change between invocations, so look them up once
input_index = input_details[0]['index']
output_index = output_details[0]['index']

# Integer class ids recovered from the one-hot test labels
y_true = np.argmax(y_test, axis=1)

# Run the TFLite model on the test data, one sample at a time
y_pred_tflite = []
for sample in X_test:
    # Add the batch axis and cast to float32 before feeding the interpreter
    input_data = np.expand_dims(sample, axis=0).astype(np.float32)
    interpreter.set_tensor(input_index, input_data)
    interpreter.invoke()
    output_data = interpreter.get_tensor(output_index)
    y_pred_tflite.append(np.argmax(output_data))  # Get the predicted class

# Display the classification report for the TFLite model
print("TFLite Model Classification Report: \n")
print(classification_report(y_true, y_pred_tflite))

# The confusion matrix used to be computed but never shown; print it as well
cm = confusion_matrix(y_true, y_pred_tflite)
print("TFLite Model Confusion Matrix: \n")
print(cm)