In [2]:
# Part 2 of mini project phase 2

In [None]:
import os
import pyaudio
import numpy as np
import tensorflow as tf
from pydub import AudioSegment
from pydub.silence import detect_silence
from scipy.signal import resample
from io import BytesIO

import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
import tensorflow as tf
import numpy as np
from scipy.signal import resample
import csv
from tensorflow.keras.utils import register_keras_serializable
from tensorflow.keras import layers

# Custom ResidualUnit layer, registered so Keras can rebuild it when the saved model is loaded
@register_keras_serializable()
class ResidualUnit(tf.keras.layers.Layer):
    """Residual block: two 3x3 convolutions plus a skip connection.

    Main path: Conv2D -> BatchNorm -> ReLU -> Conv2D -> BatchNorm. The result
    is added to the skip path and passed through ReLU once more. The skip
    path is either the unmodified input or a 1x1 Conv2D + BatchNorm
    projection, decided once in ``__init__``.

    The class is registered with ``register_keras_serializable`` and
    implements ``get_config`` so the layer can be re-created from its config.

    NOTE(review): presumably the saved model's weights are restored against
    the sub-layers created in ``__init__``; confirm before renaming or
    reordering them.
    """
    def __init__(self, filters, strides=1, **kwargs):
        """Create the sub-layers of the unit.

        Args:
            filters: Number of filters used by every convolution in the unit.
            strides: Stride of the first main-path convolution and of the
                skip projection (the second convolution always uses 1).
            **kwargs: Forwarded unchanged to ``tf.keras.layers.Layer``.
        """
        super().__init__(**kwargs)
        self.filters = filters
        self.strides = strides
        self.conv1 = layers.Conv2D(filters, kernel_size=3, strides=strides, padding="same", use_bias=False)
        self.bn1 = layers.BatchNormalization()
        self.activation = layers.Activation("relu")
        self.conv2 = layers.Conv2D(filters, kernel_size=3, strides=1, padding="same", use_bias=False)
        self.bn2 = layers.BatchNormalization()

        # The skip path needs a projection when the main path changes the
        # spatial size (strides > 1) or the channel count.
        # NOTE(review): the real input channel count is not known here. It is
        # taken from an 'input_shape' kwarg if one was passed, otherwise the
        # fallback shape is used, whose last entry is 1 - so without that
        # kwarg the projection is created for every filters != 1.
        if strides > 1 or filters != kwargs.get('input_shape', [None, 374, 129, 1])[-1]:
            self.skip_conv = layers.Conv2D(filters, kernel_size=1, strides=strides, padding="same", use_bias=False)
            self.skip_bn = layers.BatchNormalization()
        else:
            # Identity skip; note that self.skip_bn is not defined in this case.
            self.skip_conv = None

    def call(self, inputs, training=False):
        """Apply the residual unit.

        Args:
            inputs: Input tensor fed to both the main path and the skip path.
            training: Passed to every BatchNormalization layer.

        Returns:
            ReLU of (main path output + skip path output).
        """
        # Main path.
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = self.activation(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)

        # Skip path: projected input if a projection was built, else identity.
        if self.skip_conv is not None:
            skip = self.skip_conv(inputs)
            skip = self.skip_bn(skip, training=training)
        else:
            skip = inputs

        return self.activation(x + skip)

    def get_config(self):
        """Return the base layer config extended with ``filters`` and ``strides``."""
        config = super().get_config()
        config.update({
            "filters": self.filters,
            "strides": self.strides,
        })
        return config

# Trained classifier. The custom layer class is handed to Keras so it can be
# rebuilt while the saved model is deserialized.
model = tf.keras.models.load_model(
    'farsi_numbers_detectionjupyter.keras',
    custom_objects={'ResidualUnit': ResidualUnit},
)

# Class labels, looked up with the argmax index of the model's prediction.
commands = np.array([
    '8', '5', '4', '9', '1', '7',
    '6', '3', '2', '10', '0',
])

# Microphone capture parameters.
CHUNK = 1024              # frames requested from the stream per read
FORMAT = pyaudio.paInt16  # PyAudio sample format constant
CHANNELS = 1              # mono
RATE = 16000              # sample rate in Hz (16 kHz)

# Function to get MFCCs from audio
def get_mfccs(audio, sample_rate):
    """Compute MFCC features for a waveform.

    Pipeline: STFT magnitude -> 40-band mel filterbank covering
    0 Hz .. sample_rate / 2 -> natural log (with a 1e-6 offset) -> MFCCs.

    Args:
        audio: Waveform tensor accepted by ``tf.signal.stft``.
        sample_rate: Sampling rate of ``audio`` in Hz. It determines the
            analysis window (1/40 s), the hop (1/100 s) and the mel band edges.

    Returns:
        The MFCC tensor with one extra trailing axis of size 1 (channel axis).
    """
    n_mels = 40
    window = int(sample_rate / 40)  # 25 ms analysis window
    hop = int(sample_rate / 100)    # 10 ms between successive frames

    # Magnitude spectrogram; the FFT size equals the window length.
    magnitudes = tf.abs(
        tf.signal.stft(audio, frame_length=window, frame_step=hop, fft_length=window)
    )

    # Project the linear-frequency bins onto the mel scale.
    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=n_mels,
        num_spectrogram_bins=magnitudes.shape[-1],
        sample_rate=sample_rate,
        lower_edge_hertz=0,
        upper_edge_hertz=sample_rate / 2,
    )
    mel = tf.tensordot(magnitudes, mel_matrix, 1)
    mel.set_shape(magnitudes.shape[:-1].concatenate(mel_matrix.shape[-1:]))

    # The small offset keeps the log finite for empty bands.
    log_mel = tf.math.log(mel + 1e-6)
    return tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., tf.newaxis]

# Process and predict a single audio chunk
def process_and_predict_chunk(chunk, model):
    """Classify one audio chunk and return its label.

    The chunk is encoded to WAV in memory, decoded with TensorFlow, mixed
    down to mono, resampled to 16 kHz if needed, converted to MFCCs, resized
    to the model's input height/width and passed to the model.

    Args:
        chunk: Audio segment exposing ``export(file_like, format="wav")``
            (a pydub ``AudioSegment`` in this file).
        model: Keras model exposing ``input_shape`` and ``predict``.

    Returns:
        The entry of the module-level ``commands`` array selected by the
        argmax of the model's prediction.
    """
    # Encode to WAV in memory. A fixed-name temp file in the working
    # directory would be left behind whenever a later step raised, and would
    # collide between concurrent runs.
    wav_buffer = BytesIO()
    chunk.export(wav_buffer, format="wav")
    audio, sample_rate = tf.audio.decode_wav(wav_buffer.getvalue())

    # decode_wav always yields [samples, channels]; averaging over the
    # channel axis gives a 1-D mono signal for any channel count.
    audio = tf.reduce_mean(audio, axis=-1)

    # Resample to 16 kHz if the source rate differs.
    desired_sample_rate = 16000
    source_rate = int(sample_rate.numpy())
    if source_rate != desired_sample_rate:
        num_samples = int(desired_sample_rate / source_rate * len(audio))
        audio = resample(audio.numpy(), num_samples)
        audio = tf.convert_to_tensor(audio, dtype=tf.float32)

    mfccs = get_mfccs(audio, desired_sample_rate)

    # Force the feature map to the spatial size the model expects, then add
    # the batch dimension.
    input_shape = model.input_shape[1:]
    mfccs = tf.image.resize(mfccs, [input_shape[0], input_shape[1]])
    mfccs = tf.expand_dims(mfccs, axis=0)

    # verbose=0: this runs once per spoken chunk, so the per-call progress
    # bar would otherwise drown the printed labels.
    predictions = model.predict(mfccs, verbose=0)
    predicted_label_index = np.argmax(predictions, axis=1)[0]
    return commands[predicted_label_index]

# Real-time audio processing
def realtime_prediction(model):
    """Listen on the microphone and print a label for each detected chunk.

    Audio is read in ``CHUNK``-frame pieces and appended to a buffer. As soon
    as the buffer contains a silent stretch of at least 500 ms (below
    -50 dBFS), everything up to the end of that stretch is cut off the
    buffer. If the silence starts at position 0 the label '10' is reported
    directly; otherwise the cut-off audio is classified with ``model``.
    Runs until interrupted with Ctrl+C.

    Args:
        model: Model forwarded to ``process_and_predict_chunk``.
    """
    p = pyaudio.PyAudio()
    stream = None

    try:
        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
        sample_width = p.get_sample_size(FORMAT)  # constant, so look it up once

        print("Listening... (Press Ctrl+C to stop)")

        audio_buffer = AudioSegment.empty()

        while True:
            # Inference can take longer than one buffer period; tolerate the
            # resulting input overflow (dropped frames) instead of raising.
            data = stream.read(CHUNK, exception_on_overflow=False)
            audio_buffer += AudioSegment(data, sample_width=sample_width, frame_rate=RATE, channels=CHANNELS)

            # Look for a pause that marks the end of an utterance.
            silence_ranges = detect_silence(audio_buffer, min_silence_len=500, silence_thresh=-50)
            if not silence_ranges:
                continue

            # Cut everything up to the end of the first silent stretch.
            start, end = silence_ranges[0]
            chunk = audio_buffer[:end]
            audio_buffer = audio_buffer[end:]

            if start == 0:
                # Nothing but silence before the pause: report the unknown class.
                predicted_label = '10'
            else:
                predicted_label = process_and_predict_chunk(chunk, model)

            print(f"Predicted Label: {predicted_label}")

    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        # Release the audio device on every exit path, not only on Ctrl+C.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Start the microphone loop with the module-level model only when this code is
# executed directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    realtime_prediction(model)





Listening... (Press Ctrl+C to stop)
Predicted Label: 10
