In [9]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import urllib.request
import zipfile
import warnings

# Optional: hide warnings of category UserWarning that are attributed to
# modules whose name starts with "librosa". Warnings of other categories
# (for example FutureWarning) or attributed to other modules are not
# affected by this filter.
warnings.filterwarnings("ignore", category=UserWarning, module='librosa')

# Download the Free Spoken Digit Dataset (FSDD)
def download_fsdd():
    """Make sure the FSDD recordings exist in a local ``FSDD`` directory.

    If ``FSDD`` already exists, only a notice is printed. Otherwise the
    repository's master-branch archive is saved as ``fsdd.zip``, unpacked
    into the current working directory, and the extracted ``recordings``
    folder is renamed to ``FSDD``.
    """
    # Guard clause: nothing to do when the dataset is already in place.
    if os.path.exists("FSDD"):
        print("FSDD dataset already downloaded.")
        return

    print("Downloading FSDD dataset...")
    archive_url = "https://github.com/Jakobovski/free-spoken-digit-dataset/archive/refs/heads/master.zip"
    urllib.request.urlretrieve(archive_url, "fsdd.zip")

    with zipfile.ZipFile("fsdd.zip", "r") as archive:
        archive.extractall()

    # Only the recordings folder is needed; expose it under the short name.
    os.rename("free-spoken-digit-dataset-master/recordings", "FSDD")
    print("Download complete.")

# Extract MFCC features safely
def extract_features(audio_path, default_n_fft=2048, n_mfcc=13, hop_length=512):
    """Return the time-averaged MFCC vector for one audio file.

    Args:
        audio_path: Path of the recording to load (native sample rate is
            kept because ``sr=None`` is passed to ``librosa.load``).
        default_n_fft: FFT window length in samples. Recordings shorter
            than this are zero-padded at the end up to this length.
        n_mfcc: Number of MFCC coefficients to compute.
        hop_length: Hop between successive analysis frames, in samples.

    Returns:
        The mean over axis 1 of the MFCC matrix, or ``None`` if the file
        could not be loaded or processed (the error is printed).
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)

        # An empty recording carries no information; padding it would
        # yield features of pure silence, so reject it instead.
        if len(y) == 0:
            raise ValueError("audio file contains no samples")

        # Pad short signals so a full analysis window always fits.
        if len(y) < default_n_fft:
            y = np.pad(y, (0, default_n_fft - len(y)))

        # After padding the signal is never shorter than default_n_fft, so
        # the window length is used as-is. Previously n_fft was taken from
        # the unpadded length, shrinking the window for short clips even
        # though the signal had just been padded to avoid exactly that.
        mfccs = librosa.feature.mfcc(
            y=y, sr=sr, n_mfcc=n_mfcc, n_fft=default_n_fft, hop_length=hop_length
        )
        return np.mean(mfccs, axis=1)
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort a
        # whole dataset scan, so report it and signal failure with None.
        print(f"Error processing {audio_path}: {e}")
        return None

# Load all data
def load_data(data_dir):
    """Build feature and label arrays from the ``.wav`` files in a directory.

    The label of each recording is the integer before the first ``_`` in
    its file name. Files that do not end in ``.wav`` are ignored; ``.wav``
    files without an integer label prefix, and files for which
    ``extract_features`` returns ``None``, are skipped.

    Args:
        data_dir: Directory containing the recordings.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays with one entry per
        successfully processed file, in sorted file-name order.
    """
    features = []
    labels = []
    print("Extracting features...")
    # Sorted so the sample order (and therefore any seeded train/test split
    # built from it) does not depend on the file system's listing order.
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".wav"):
            continue
        try:
            label = int(file_name.split("_")[0])  # Label is the first part of filename
        except ValueError:
            # A stray .wav with an unexpected name should not abort the load.
            print(f"Skipping {file_name}: filename does not start with an integer label")
            continue
        mfcc_features = extract_features(os.path.join(data_dir, file_name))
        if mfcc_features is not None:
            features.append(mfcc_features)
            labels.append(label)
    print("Feature extraction complete.")
    return np.array(features), np.array(labels)

# Full pipeline
def main():
    """Run the full pipeline: download, featurize, train, evaluate, predict.

    Downloads the dataset if needed, extracts features from ``FSDD``,
    trains an ``MLPClassifier`` on an 80/20 split, prints the test
    accuracy, and prints the predicted digit for one example recording.

    Raises:
        ValueError: If no recording in ``FSDD`` produced usable features.
    """
    download_fsdd()
    X, y = load_data("FSDD")

    # Fail with a clear message instead of an obscure error from the
    # splitter when feature extraction produced nothing.
    if len(X) == 0:
        raise ValueError("No usable .wav recordings found in 'FSDD'; cannot train.")

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # MLP Classifier
    clf = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1000, random_state=42)
    clf.fit(X_train, y_train)

    # Evaluation
    accuracy = clf.score(X_test, y_test)
    # ".2f" rather than " .2f": the leading space in the old spec is the
    # sign flag and printed an extra blank before the number.
    print(f"Model accuracy: {accuracy:.2f}")

    # Prediction on a single example recording
    test_path = "FSDD/2_george_45.wav"
    test_feat = extract_features(test_path)
    if test_feat is not None:
        # predict() expects a 2-D array, hence the reshape to one row.
        pred = clf.predict(test_feat.reshape(1, -1))[0]
        print(f"Predicted digit for {test_path}: {pred}")

# Run the pipeline only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


FSDD dataset already downloaded.
Extracting features...
Feature extraction complete.
Model accuracy:  0.87
Error processing FSDD_jackson_39.wav: [Errno 22] Invalid argument: 'FSDD\x04_jackson_39.wav'


  y, sr = librosa.load(audio_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
