In [1]:
import os
import cv2
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from scipy.signal import medfilt
from scipy.ndimage import gaussian_filter1d
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [2]:
# Root of the dataset. load_dataset() reads .txt recordings from fluent/,
# stutter/prolong/ and stutter/repeat/ beneath this folder.
DATASET_PATH = './digits'  # Folder with fluent/ and stutter/
# Used both as the `sr` passed to the MFCC extractor and as the fixed number of
# samples every signal is trimmed/padded to in process_file().
SAMPLE_RATE = 16000
# Number of MFCC coefficients requested from librosa.feature.mfcc.
N_MFCC = 12
# Number of frames every MFCC matrix is zero-padded / truncated to before the
# matrices are stacked into the model input.
MAX_LEN = 160  # max frame length for MFCC

In [3]:
def read_txt_file(filepath):
    """Read a text file holding one value per line and return the numeric ones.

    Each line is stripped and parsed with ``float``; lines that do not parse
    (headers, blanks, other text) are skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A 1-D ``np.ndarray`` of floats, in file order.
    """
    samples = []
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            token = raw_line.strip()
            try:
                samples.append(float(token))
            except ValueError:
                # Not a number (header or stray text): ignore this line
                pass
    return np.array(samples)


In [4]:
def apply_median_filter(signal):
    """Return ``signal`` smoothed by a median filter with a 3-sample window.

    Args:
        signal: Input array handed to ``scipy.signal.medfilt``.

    Returns:
        The filtered array.
    """
    window = 3
    filtered = medfilt(signal, kernel_size=window)
    return filtered

In [5]:
def apply_gaussian_filter(y):
    """Return ``y`` smoothed by a 1-D Gaussian filter with ``sigma=1``.

    Args:
        y: Input array handed to ``scipy.ndimage.gaussian_filter1d``.

    Returns:
        The smoothed array.
    """
    smoothed = gaussian_filter1d(y, sigma=1)
    return smoothed

In [8]:
def extract_mfcc(y, sr):
    """Compute the MFCC matrix of a signal with ``N_MFCC`` coefficients.

    Args:
        y: Audio signal passed to ``librosa.feature.mfcc``.
        sr: Sampling rate of ``y``.

    Returns:
        Whatever ``librosa.feature.mfcc`` returns for these arguments.
    """
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)

In [29]:
def process_file(filepath):
    """Load one recording from a text file and return its MFCC matrix.

    The signal is read with ``read_txt_file``, forced to exactly
    ``SAMPLE_RATE`` samples, smoothed with the Gaussian filter and passed to
    ``extract_mfcc``.

    Args:
        filepath: Path of the ``.txt`` file holding the samples.

    Returns:
        The MFCC matrix produced by ``extract_mfcc``.
    """
    y = read_txt_file(filepath)
    y = librosa.util.fix_length(y, size=SAMPLE_RATE)  # trim or pad

    # Gaussian smoothing is the active denoiser. To try the median filter
    # instead, replace this call with apply_median_filter(y).
    y_clean = apply_gaussian_filter(y)
    return extract_mfcc(y_clean, SAMPLE_RATE)

In [30]:
def load_dataset():
    """Load every ``.txt`` recording under ``DATASET_PATH`` as MFCC features.

    Three folders are scanned, in this order: ``fluent`` (label 0),
    ``stutter/prolong`` (label 1) and ``stutter/repeat`` (label 2). Files
    that do not end in ``.txt`` are ignored.

    Returns:
        A tuple ``(X, y, filenames)`` of NumPy arrays: the MFCC matrix of each
        file, its integer label, and its file name (without directory).
    """
    labels = {'fluent': 0, 'prolong': 1, 'repeat': 2}
    # Class name -> folder components below DATASET_PATH
    class_dirs = [
        ('fluent', ('fluent',)),
        ('prolong', ('stutter', 'prolong')),
        ('repeat', ('stutter', 'repeat')),
    ]

    X = []
    y = []
    filenames = []
    for class_name, parts in class_dirs:
        folder = os.path.join(DATASET_PATH, *parts)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and hence any seeded split) reproducible.
        for file in sorted(os.listdir(folder)):
            if not file.endswith('.txt'):
                continue
            X.append(process_file(os.path.join(folder, file)))
            y.append(labels[class_name])
            filenames.append(file)

    return np.array(X), np.array(y), np.array(filenames)

In [31]:
# Load the whole dataset: X holds one MFCC matrix per file, y the integer
# class ids, filenames the matching file names.
X, y, filenames = load_dataset()

In [32]:
# Bring every MFCC matrix to exactly MAX_LEN frames: zero-pad short ones on the
# right, truncate long ones. The pad width is clamped at 0 because np.pad
# rejects negative widths, which would otherwise raise before the slice runs.
X_padded = np.array([
    np.pad(x, ((0, 0), (0, max(0, MAX_LEN - x.shape[1]))), mode='constant')[:, :MAX_LEN]
    for x in X
])
X_padded = X_padded[..., np.newaxis]  # add channel dimension
y_cat = to_categorical(y, num_classes=3)  # one-hot targets for the 3 classes
X_train, X_test, y_train, y_test, filenames_train, filenames_test = train_test_split(X_padded, y_cat, filenames, test_size=0.2, random_state=42)

In [33]:
# --- Build CNN Model ---
# Two Conv2D -> MaxPooling2D -> Dropout stages followed by a dense head that
# ends in a 3-way softmax.
model = Sequential()

# Convolutional stage 1 (input shape taken from the padded feature array)
model.add(Conv2D(32, (3,3), activation='relu', input_shape=X_padded.shape[1:]))
model.add(MaxPooling2D((2,2)))
model.add(Dropout(0.3))

# Convolutional stage 2
model.add(Conv2D(64, (3,3), activation='relu'))
model.add(MaxPooling2D((2,2)))
model.add(Dropout(0.3))

# Classifier head
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))

In [34]:
# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 10, 158, 32)       320       
                                                                 
 max_pooling2d_4 (MaxPoolin  (None, 5, 79, 32)         0         
 g2D)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 5, 79, 32)         0         
                                                                 
 conv2d_5 (Conv2D)           (None, 3, 77, 64)         18496     
                                                                 
 max_pooling2d_5 (MaxPoolin  (None, 1, 38, 64)         0         
 g2D)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 1, 38, 64)        

In [35]:
# --- Train ---
# NOTE(review): the held-out test split is also passed as validation data here.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [37]:
# Predict class probabilities, then reduce predictions and one-hot targets to class ids
y_pred = model.predict(X_test)
y_true_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(y_pred, axis=1)

# Class id -> display name
label_map = {0: 'fluent', 1: 'prolong', 2: 'repeations'}

# One row per test file: file name, true class, predicted class
print(f"{'Filename':<30} {'Actual':<12} {'Predicted':<12}")
print("="*60)
for fname, true_id, pred_id in zip(filenames_test, y_true_labels, y_pred_labels):
    actual = label_map[true_id]
    predicted = label_map[pred_id]
    print(f"{fname:<30} {actual:<12} {predicted:<12}")

Filename                       Actual       Predicted   
210101063_E_9_5.txt            fluent       fluent      
210101063_E_1_5.txt            fluent       fluent      
210101063_E_3_1.txt            fluent       fluent      
210101063_repeat_1_4.txt       repeations   repeations  
210101063_prolong_5_4.txt      prolong      prolong     
210101063_prolong_3_1.txt      prolong      prolong     
210101063_E_6_9.txt            fluent       fluent      
210101063_repeat_4_1.txt       repeations   repeations  
210101063_repeat_4_5.txt       repeations   repeations  
210101063_E_4_5.txt            fluent       fluent      
210101063_E_6_6.txt            fluent       fluent      
210101063_repeat_6_3.txt       repeations   repeations  
210101063_repeat_3_1.txt       repeations   repeations  
210101063_E_7_8.txt            fluent       fluent      
210101063_repeat_7_2.txt       repeations   repeations  
210101063_repeat_5_3.txt       repeations   repeations  
210101063_E_5_6.txt            

In [38]:
# Per-class precision / recall / F1 on the test split
label_names = ['fluent', 'prolong', 'repeations']
report = classification_report(y_true_labels, y_pred_labels, target_names=label_names)
print("\nCNN Classification Report:")
print(report)


CNN Classification Report:
              precision    recall  f1-score   support

      fluent       1.00      0.95      0.98        21
     prolong       1.00      1.00      1.00         9
  repeations       0.91      1.00      0.95        10

    accuracy                           0.97        40
   macro avg       0.97      0.98      0.98        40
weighted avg       0.98      0.97      0.98        40

