# Stuttering Detection Using CNN and RNN

In [1]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, LSTM, Input
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Spectrogram geometry shared by the feature extraction and both model branches
num_features = 128    # mel bands per frame (passed as n_mels when building spectrograms)
max_time_steps = 431  # time frames per clip (originally 300)
# Network input is laid out as (time steps, mel bands, channels);
# the original layout was (128, 431, 1).
input_shape = (max_time_steps, num_features, 1)
train_time_series_data = 38400  # not referenced by the visible cells

In [3]:
# Load the audio files and extract one log-mel spectrogram per clip
audio_dir = "C:\\Users\\DeLL\\Downloads\\AllAudio"
# os.listdir returns entries in arbitrary order; sorting makes the feature
# order deterministic so it can be matched against the label rows.
# NOTE(review): assumes the label CSV rows follow sorted filename order - confirm.
audio_files = sorted(os.listdir(audio_dir))

sample_rate = 22050
clip_seconds = 10
clip_samples = sample_rate * clip_seconds

audio_features = []
for file in audio_files:
    audio_path = os.path.join(audio_dir, file)
    audio_data, sr = librosa.load(audio_path, sr=sample_rate, mono=True, duration=clip_seconds)
    # `duration` only truncates long clips. Pad short ones with trailing silence
    # (and trim defensively) so every spectrogram has the same number of frames.
    audio_data = audio_data[:clip_samples]
    if len(audio_data) < clip_samples:
        audio_data = np.pad(audio_data, (0, clip_samples - len(audio_data)))
    spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sr, n_mels=num_features)
    log_spectrogram = librosa.power_to_db(spectrogram)
    # Add a trailing channel axis so each clip is (mel bands, frames, 1)
    log_spectrogram = np.expand_dims(log_spectrogram, axis=-1)
    audio_features.append(log_spectrogram)

if not audio_features:
    raise FileNotFoundError("No audio files found in {}".format(audio_dir))

# np.stack raises on mismatched shapes instead of silently building a ragged array
audio_features = np.stack(audio_features)

In [4]:
# Define function to extract features from audio files
def extract_features(audio_path, sample_rate=22050):
    """Compute MFCCs plus their first and second deltas for one audio file.

    Parameters
    ----------
    audio_path : str or path-like
        Path of the audio file to load.
    sample_rate : int, optional
        Rate the audio is resampled to on load (default 22050).

    Returns
    -------
    numpy.ndarray
        The 40 MFCCs, their deltas and their second-order deltas concatenated
        along the first axis (3 * 40 rows, one column per frame).
    """
    # Load the file that was actually requested; the previous version ignored
    # `audio_path` and tried to load the audio directory itself.
    y, sr = librosa.load(audio_path, sr=sample_rate)
    # Use the loader's sample rate so the MFCC call always matches the signal
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40, n_fft=1024, hop_length=512)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    return np.concatenate([mfcc, mfcc_delta, mfcc_delta2])

In [5]:
# Read the stuttering labels from a CSV resolved against the working directory
labels_csv = "fyplabels.csv"
labels = pd.read_csv(labels_csv)

In [6]:
# Preview the labels as a float32 array; the result is only displayed,
# `labels` itself is left unchanged.
label_matrix = np.array(labels)
label_matrix.astype(np.float32)

array([[1. ],
       [1.5],
       [1. ],
       [1.5],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1.5],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1.5],
       [1.5],
       [1. ],
       [1. ],
       [2.5],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
       [1. ],
      

In [7]:
# Hold out 30% of the clips for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(
    audio_features,
    labels,
    test_size=0.3,
    random_state=0,
)
train_data, test_data, train_labels, test_labels = split_parts

In [8]:
# Swap axes 1 and 2 of every clip so the arrays match `input_shape`
# (time steps, mel bands, channel).
# NOTE: this rebinds the same names, so running the cell a second time
# swaps the axes back again.
axis_order = (0, 2, 1, 3)
train_data = train_data.transpose(axis_order)
test_data = test_data.transpose(axis_order)

In [9]:
# CNN branch: two Conv2D + MaxPooling2D stages, then flatten to a feature vector
cnn_layers = [
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
]
cnn_model = Sequential(cnn_layers)

In [10]:
# RNN branch: one LSTM over (time steps, features) sequences.
# Note that `rnn_model` holds the LSTM's output tensor (last step only),
# not a Model object.
rnn_input = Input(shape=(max_time_steps, num_features))
lstm_layer = LSTM(units=64, return_sequences=False)
rnn_model = lstm_layer(rnn_input)

In [11]:
# Join the flattened CNN features with the LSTM output and classify with one sigmoid unit
branch_outputs = [cnn_model.output, rnn_model]
combined_model = tf.keras.layers.concatenate(branch_outputs)
sigmoid_head = Dense(1, activation='sigmoid')
output_layer = sigmoid_head(combined_model)
# The resulting model takes two inputs, in this order: CNN input, then RNN input
model = Model(inputs=[cnn_model.input, rnn_input], outputs=output_layer)

In [12]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Train the model
# The LSTM branch takes (max_time_steps, num_features) sequences. Feed it the same
# spectrograms as the CNN branch with the trailing channel axis dropped, instead of
# random noise that carries no information about the clips.
rnn_train_data = np.squeeze(train_data, axis=-1)

# A sigmoid output trained with binary cross-entropy needs 0/1 targets, but the raw
# labels are scores such as 1, 1.5 and 2.5. Map them to 1 = stuttering, 0 = fluent
# using the same values this notebook's categorize_label treats as stuttering.
# NOTE(review): confirm that 1, 1.5 and 2 are the only "stuttering" scores.
raw_train_labels = np.asarray(train_labels, dtype=np.float32)
train_targets = (
    (raw_train_labels == 1) | (raw_train_labels == 1.5) | (raw_train_labels == 2)
).astype(np.float32)

model.fit([train_data, rnn_train_data], train_targets, epochs=2, batch_size=32, validation_split=0.3)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2734f9c10f0>

In [14]:
# Predict on the held-out clips. The LSTM branch receives the same spectrograms as
# the CNN branch (channel axis dropped) rather than random noise, so predictions
# are deterministic and depend on the audio.
rnn_test_data = np.squeeze(test_data, axis=-1)
prediction = model.predict([test_data, rnn_test_data])

# Print prediction: a sigmoid output above 0.5 is reported as stuttering
for i, clip_prediction in enumerate(prediction):
    if clip_prediction[0] > 0.5:
        print("The audio clip {} contains stuttering.".format(i+1))
    else:
        print("The audio clip {} is fluent.".format(i+1))

The audio clip 1 contains stuttering.
The audio clip 2 contains stuttering.
The audio clip 3 contains stuttering.
The audio clip 4 contains stuttering.
The audio clip 5 contains stuttering.
The audio clip 6 contains stuttering.
The audio clip 7 contains stuttering.
The audio clip 8 contains stuttering.
The audio clip 9 contains stuttering.
The audio clip 10 contains stuttering.
The audio clip 11 contains stuttering.
The audio clip 12 contains stuttering.
The audio clip 13 contains stuttering.
The audio clip 14 contains stuttering.
The audio clip 15 contains stuttering.
The audio clip 16 contains stuttering.
The audio clip 17 contains stuttering.
The audio clip 18 contains stuttering.
The audio clip 19 contains stuttering.
The audio clip 20 contains stuttering.
The audio clip 21 contains stuttering.
The audio clip 22 contains stuttering.
The audio clip 23 contains stuttering.
The audio clip 24 contains stuttering.
The audio clip 25 contains stuttering.
The audio clip 26 contains stutter

In [15]:
# Show which columns the label table provides
label_columns = labels.columns
print(label_columns)

Index(['Interruptions'], dtype='object')


In [16]:
# Re-read the stuttering labels from disk, replacing whatever `labels` held before
labels = pd.read_csv(filepath_or_buffer="fyplabels.csv")

# Map a numeric label value to its textual category
def categorize_label(label_value):
    """Return the category name for a label value.

    1 -> "full stuttering", 1.5 -> "medium stuttering", 2 -> "low stuttering";
    every other value is reported as "non-stuttering".
    """
    known_levels = (
        (1, "full stuttering"),
        (1.5, "medium stuttering"),
        (2, "low stuttering"),
    )
    # Checked in order with ==, first match wins
    for level, category_name in known_levels:
        if label_value == level:
            return category_name
    return "non-stuttering"

# Categorize every row of the 'Interruptions' column.
# Despite its name, this list covers all rows of `labels`, not only the test split.
test_label_categories = list(map(categorize_label, labels['Interruptions'].values))

# Report the category of each clip, numbering clips from 1
for clip_number, category in enumerate(test_label_categories, start=1):
    print("Audio clip {} has {}.".format(clip_number, category))

Audio clip 1 has full stuttering.
Audio clip 2 has medium stuttering.
Audio clip 3 has full stuttering.
Audio clip 4 has medium stuttering.
Audio clip 5 has full stuttering.
Audio clip 6 has full stuttering.
Audio clip 7 has full stuttering.
Audio clip 8 has full stuttering.
Audio clip 9 has full stuttering.
Audio clip 10 has full stuttering.
Audio clip 11 has full stuttering.
Audio clip 12 has full stuttering.
Audio clip 13 has full stuttering.
Audio clip 14 has full stuttering.
Audio clip 15 has full stuttering.
Audio clip 16 has full stuttering.
Audio clip 17 has full stuttering.
Audio clip 18 has full stuttering.
Audio clip 19 has full stuttering.
Audio clip 20 has full stuttering.
Audio clip 21 has full stuttering.
Audio clip 22 has full stuttering.
Audio clip 23 has full stuttering.
Audio clip 24 has full stuttering.
Audio clip 25 has full stuttering.
Audio clip 26 has full stuttering.
Audio clip 27 has full stuttering.
Audio clip 28 has full stuttering.
Audio clip 29 has full st

# API Creation