1. Data Preprocessing and Clustering

Feature Extraction: We start by loading all voice .wav files from the healthy control and Parkinson’s patient folders. For each audio sample, we compute a range of acoustic features: Mel-Frequency Cepstral Coefficients (MFCCs) capturing spectral shape, jitter (cycle-to-cycle pitch variability), shimmer (amplitude variability), pitch (fundamental frequency) statistics, spectral entropy, and Harmonic-to-Noise Ratio (HNR). These features are commonly used to characterize voice pathologies – for example, fundamental frequency, jitter, shimmer, HNR, and MFCCs have been extracted for disease classification in prior studies ￼. In PD patients, such vocal features often show higher fluctuations and noise compared to healthy voices ￼.

Unsupervised Severity Clustering: Since true severity labels may be unavailable, we apply unsupervised clustering to the patient feature vectors to derive severity levels. We use K-Means (k=4) to cluster the patient samples into 4 groups (intended to correspond to increasing severity 1–4). All healthy control samples are assigned label 0 (no disease). We map the cluster indices to severity scores 1–4 by ordering clusters based on an indicator (e.g. average jitter, assuming higher jitter corresponds to more severe dysphonia). The code below performs feature extraction and clustering:

In [20]:
import os, glob
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Paths to data folders (adjust as needed)
healthy_dir = "HealthyAudio/"
patient_dir = "PatientAudio/"

# Feature extraction function for one audio signal
def extract_features(y, sr):
    features = []
    # 1. MFCC features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_means = mfcc.mean(axis=1)
    mfcc_stds = mfcc.std(axis=1)
    features.extend(mfcc_means)
    features.extend(mfcc_stds)
    # 2. Fundamental frequency (pitch) using librosa's pitch tracking
    f0, voiced_flag, _ = librosa.pyin(y, sr=sr, fmin=50, fmax=500)
    f0 = f0[voiced_flag] if voiced_flag is not None else f0  # filter voiced frames
    if f0.size > 0:
        pitch_mean = float(np.nanmean(f0))
        pitch_std = float(np.nanstd(f0))
    else:
        pitch_mean, pitch_std = 0.0, 0.0
    # Jitter: relative average change in pitch per frame (proxy for cycle variability)
    if f0.size > 1:
        jitter = float(np.mean(np.abs(np.diff(f0))) / (np.mean(f0) + 1e-8))
    else:
        jitter = 0.0
    features.append(jitter)
    # 3. Shimmer: relative average change in amplitude (use RMS energy as proxy for amplitude)
    rms = librosa.feature.rms(y=y)[0]
    if rms.size > 1:
        shimmer = float(np.mean(np.abs(np.diff(rms))) / (np.mean(rms) + 1e-8))
    else:
        shimmer = 0.0
    features.append(shimmer)
    # Include pitch stats as features as well
    features.append(pitch_mean)
    features.append(pitch_std)
    # 4. Spectral entropy (measure of spectral flatness/disorder)
    # Compute power spectrum and normalize to a probability distribution
    D = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))**2
    ps = D.sum(axis=1)
    ps_norm = ps / (ps.sum() + 1e-12)
    spectral_entropy = -np.sum(ps_norm * np.log2(ps_norm + 1e-12))
    features.append(spectral_entropy)
    # 5. Harmonic-to-Noise Ratio (HNR) using harmonic-percussive source separation
    harmonic, percussive = librosa.effects.hpss(y)
    harm_energy = np.sum(harmonic**2)
    noise_energy = np.sum(percussive**2)
    HNR = 10 * np.log10((harm_energy + 1e-8) / (noise_energy + 1e-8))
    features.append(HNR)
    return np.array(features, dtype=np.float32)

# Load data and extract features for all samples
X_features = []   # feature vectors
y_labels = []     # labels (0 for healthy, temporary for patients)
file_paths = []   # store file paths to maintain order

# Process healthy controls (label 0)
for filepath in glob.glob(os.path.join(healthy_dir, "*.wav")):
    y, sr = librosa.load(filepath, sr=None)  # use original sampling rate
    feats = extract_features(y, sr)
    X_features.append(feats)
    y_labels.append(0)  # healthy label 0
    file_paths.append(os.path.basename(filepath))

# Process patient samples (temporary label -1 to distinguish before clustering)
patient_feats = []
patient_paths = []
for filepath in glob.glob(os.path.join(patient_dir, "*.wav")):
    y, sr = librosa.load(filepath, sr=None)
    feats = extract_features(y, sr)
    patient_feats.append(feats)
    patient_paths.append(os.path.basename(filepath))
# (We'll assign actual labels after clustering)
 
# Scale features for clustering and modeling
X_all = np.vstack([np.array(X_features), np.array(patient_feats)])
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(X_all)
# Split back out the scaled features for patients and healthy
X_h_scaled = X_all_scaled[:len(X_features)]
X_p_scaled = X_all_scaled[len(X_features):]

# Cluster the patient samples into 4 clusters (severity 1-4)
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(X_p_scaled)
# Determine severity order by average jitter in each cluster
patient_feats_arr = np.array(patient_feats)
jitter_index = 26  # index of jitter feature in our feature vector
avg_jitter = []
for k in range(4):
    avg_jitter.append(patient_feats_arr[cluster_labels == k, jitter_index].mean())
# Rank clusters by jitter (low jitter = mild, high jitter = severe)
cluster_order = np.argsort(avg_jitter)
# Map original cluster labels to severity 1-4
cluster_to_severity = {int(cluster_order[i]): i+1 for i in range(4)}
severity_labels = [cluster_to_severity[int(c)] for c in cluster_labels]

# Combine healthy and patient labels
y_patient = severity_labels  # severity 1-4 for each patient sample
y_labels.extend(y_patient)   # 0 for all healthy (already in list) and 1-4 for patients
X_features.extend(patient_feats)
file_paths.extend(patient_paths)

# Convert to numpy arrays for modeling
X_features = np.array(X_features, dtype=np.float32)
y_labels = np.array(y_labels, dtype=int)

This code loads each audio file, computes a feature vector, and then uses KMeans to cluster patient samples into 4 groups. The cluster with the smallest average jitter is labeled as severity 1 (mildest) and the highest jitter cluster as severity 4 (most severe), with intermediate clusters labeled 2 and 3. All healthy controls are labeled 0. At the end, we have X_features (feature matrix) and y_labels (0–4 severity labels) for supervised model training.

2. Feature-Based Machine Learning Model

With features and pseudo-severity labels prepared, we train traditional machine learning classifiers. We consider a Support Vector Machine (SVM) and a Random Forest (RF) model. SVMs are effective for high-dimensional feature spaces and often outperform neural networks when data is limited ￼, while Random Forests are robust ensemble classifiers that handle feature interactions well. Both models will learn to classify each sample into severity classes 0–4 based on the extracted acoustic features.

We split the data into training and testing sets (e.g., 80/20 split) and train the classifiers. After training, we evaluate their accuracy and examine the confusion matrix. Code for training and evaluating the SVM and RF models is given below:

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split data into train and test sets (stratified by severity to preserve class proportions)
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, stratify=y_labels, random_state=42)

# Train SVM classifier
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train, y_train)
# Train Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on test set
svm_pred = svm_model.predict(X_test)
rf_pred = rf_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("RF Accuracy:", accuracy_score(y_test, rf_pred))
print("\nSVM Classification Report:\n", classification_report(y_test, svm_pred))
print("SVM Confusion Matrix:\n", confusion_matrix(y_test, svm_pred))

SVM Accuracy: 0.5882352941176471
RF Accuracy: 0.5882352941176471

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.56      1.00      0.72         9
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       1.00      0.50      0.67         2

    accuracy                           0.59        17
   macro avg       0.31      0.30      0.28        17
weighted avg       0.42      0.59      0.46        17

SVM Confusion Matrix:
 [[9 0 0 0 0]
 [2 0 0 0 0]
 [3 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


This trains an RBF-kernel SVM and a Random Forest. We output the accuracy and a classification report for the SVM as an example (the RF can be evaluated similarly). The classification report includes precision, recall (sensitivity), and F1-score for each class, and the confusion matrix shows how predicted labels align with true labels. We would typically see lower severity classes (including healthy 0) being confused with adjacent classes if the features are similar, while very distinct classes separate better.

3. Deep Learning Model (Spectrogram-Based)

To capture more complex patterns in the audio, we use a deep learning approach on the raw audio signals. We first convert each .wav file into a Mel spectrogram, which is a time-frequency representation of the audio. The spectrogram can be treated as an image (with time on one axis, frequency on the other, and intensity as pixel values) and fed into a Convolutional Neural Network (CNN) ￼. The CNN can automatically learn salient features (such as tremor or noise patterns) from the spectrograms that correlate with Parkinson’s severity.

Below, we compute mel spectrograms for all samples and build a CNN model. Each spectrogram is converted to decibel (log) scale and padded to the same dimension. The CNN consists of convolutional and pooling layers to extract spatial features from the spectrogram, followed by dense layers to output a severity class. We train the CNN on the training set spectrograms. Such spectrogram-based CNN classifiers can capture subtle voice characteristics and have been shown to outperform classical feature-based methods for PD detection ￼.

Additionally, we demonstrate how to apply transfer learning to improve the deep model. This involves using a pre-trained CNN (e.g. ResNet50 trained on ImageNet) as a fixed feature extractor for our spectrogram images, or fine-tuning it on our data ￼. We also mention using a pre-trained audio model like Wav2Vec2 for transfer learning on raw waveform. The code below covers spectrogram generation, CNN training, and an optional transfer learning setup:

In [22]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Convert all audio files to Mel spectrograms (with consistent shape)
spectrograms = []
n_mels = 128
# Use the same file order as X_features/y_labels (file_paths list was built above)
for filepath in file_paths:
    # We need the full path; assuming healthy_dir and patient_dir contain unique filenames, determine prefix:
    if filepath in os.listdir(healthy_dir):
        full_path = os.path.join(healthy_dir, filepath)
    else:
        full_path = os.path.join(patient_dir, filepath)
    y, sr = librosa.load(full_path, sr=None)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=512, n_fft=1024)
    S_db = librosa.power_to_db(S, ref=np.max)
    spectrograms.append(S_db)

# Pad all spectrograms to the same time length
max_frames = max([spec.shape[1] for spec in spectrograms])
spectrograms_padded = []
for spec in spectrograms:
    if spec.shape[1] < max_frames:
        # pad with zeros (silence) to match max_frames
        pad_width = max_frames - spec.shape[1]
        spec_padded = np.pad(spec, ((0,0), (0, pad_width)), mode='constant')
    else:
        spec_padded = spec[:, :max_frames]  # truncate if somehow longer
    spectrograms_padded.append(spec_padded)
spectrograms_padded = np.array(spectrograms_padded, dtype=np.float32)
# Add channel dimension for CNN input
X_spec = spectrograms_padded[..., np.newaxis]  # shape (num_samples, n_mels, max_frames, 1)

# Split into train/test sets (same y_labels as before)
X_train_spec, X_test_spec, y_train_spec, y_test_spec = train_test_split(X_spec, y_labels, test_size=0.2, stratify=y_labels, random_state=42)

# Define a CNN model for spectrogram classification
model = models.Sequential([
    layers.Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=(n_mels, max_frames, 1)),
    layers.MaxPooling2D(pool_size=(2,2)),
    layers.Conv2D(64, kernel_size=(3,3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2,2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(5, activation='softmax')  # 5 classes: 0,1,2,3,4
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the CNN model
callbacks = [tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)]
history = model.fit(X_train_spec, y_train_spec, epochs=30, batch_size=16, 
                    validation_split=0.2, callbacks=callbacks, verbose=1)

# Evaluate on the test set
test_loss, test_acc = model.evaluate(X_test_spec, y_test_spec, verbose=0)
print("CNN Test Accuracy:", test_acc)
print("CNN Confusion Matrix:\n", confusion_matrix(y_test_spec, np.argmax(model.predict(X_test_spec), axis=1)))

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 588ms/step - accuracy: 0.3407 - loss: 52.9768 - val_accuracy: 0.0769 - val_loss: 23.3810
Epoch 2/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 277ms/step - accuracy: 0.1748 - loss: 47.4927 - val_accuracy: 0.6154 - val_loss: 2.9664
Epoch 3/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 257ms/step - accuracy: 0.2875 - loss: 10.5067 - val_accuracy: 0.2308 - val_loss: 1.7315
Epoch 4/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 487ms/step - accuracy: 0.3730 - loss: 2.1007 - val_accuracy: 0.2308 - val_loss: 1.5057
Epoch 5/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 434ms/step - accuracy: 0.4877 - loss: 1.2269 - val_accuracy: 0.6154 - val_loss: 1.2148
Epoch 6/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 452ms/step - accuracy: 0.5947 - loss: 1.1251 - val_accuracy: 0.3077 - val_loss: 1.4780
Epoch 7/30
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━

The CNN is trained on the spectrogram images with an early stopping callback to prevent overfitting. After training, we evaluate the accuracy on the test set and show a confusion matrix of predicted vs actual severity classes.

Transfer Learning (Optional): We can improve the CNN by leveraging a pre-trained network. For example, we can use ResNet50 (pre-trained on ImageNet) to extract features from our spectrograms. Since ResNet expects 3-channel images, we replicate the single-channel spectrogram into an RGB image and resize to the input shape (e.g., 224×224). We then use ResNet’s convolutional base to process the spectrogram and add a new classifier layer on top for our 5 classes. We freeze the pre-trained layers initially and train the top layer on our data, optionally unfreezing some layers later for fine-tuning.

Below is an example of setting up a transfer learning model with ResNet50:

In [23]:
# Expand spectrogram data to 3 channels by duplicating the single channel
X_spec_rgb = np.repeat(X_spec, 3, axis=-1)  # shape: (samples, n_mels, max_frames, 3)
# If required, resize spectrograms to 224x224 for ResNet (using TensorFlow image resizing)
target_size = 224
X_spec_resized = tf.image.resize(X_spec_rgb, [target_size, target_size]).numpy()

# Load pre-trained ResNet50 without its top layer
base_model = tf.keras.applications.ResNet50(weights='imagenet', include_top=False, 
                                           input_shape=(target_size, target_size, 3))
base_model.trainable = False  # freeze convolutional layers

# Add a new classification head on top of ResNet
transfer_model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(5, activation='softmax')
])
transfer_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the transfer learning model on our spectrogram data
transfer_model.fit(X_spec_resized, y_labels, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 9s/step - accuracy: 0.1979 - loss: 1.7063 - val_accuracy: 0.0000e+00 - val_loss: 2.6784
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.7044 - loss: 1.0628

KeyboardInterrupt: 

In this snippet, we prepare the spectrogram data for ResNet and then create a model that uses ResNet50’s convolutional layers (frozen) followed by a global pooling and a dense output layer. We would train this model similarly (with a validation split for early stopping). Using a pre-trained CNN can jump-start learning by applying features learned from natural images to our spectrograms ￼.

Alternatively, for raw audio, one could fine-tune a pre-trained audio transformer like Wav2Vec2. For example, using HuggingFace Transformers:

In [None]:
# (Pseudo-code for Wav2Vec2 fine-tuning)
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
model_wav2vec = Wav2Vec2ForSequenceClassification.from_pretrained(
    "superb/wav2vec2-base-superb-ks", num_labels=5)
processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ks")

# Prepare audio data for Wav2Vec2
input_values = processor([librosa.resample(y, orig_sr=sr, target_sr=16000) for y in audio_signals],
                          sampling_rate=16000, return_tensors="np", padding=True).input_values
# Fine-tune model (this requires a training loop or Trainer API)

This illustrates how to set up a deep learning model using spectrogram images and how to incorporate transfer learning. After training, the CNN-based model will output probabilities for each class 0–4 given a spectrogram input.

4. Hybrid Model (Fusion of ML and Deep Learning)

The hybrid approach combines the best of both worlds: the hand-crafted acoustic features and the deep learning derived features. We first use the trained CNN (from step 3) to extract high-level representations of each sample – for instance, taking the activations from the penultimate layer of the CNN as an embedding. This embedding captures complex voice patterns learned by the CNN. We then concatenate this embedding with the original acoustic feature vector (from step 1) for each sample. The fused feature (a longer vector) is used to train a meta-classifier (here we can use a Random Forest or another SVM) that predicts the severity. The rationale is that the classifier can utilize both the domain-specific features (jitter, MFCCs, etc.) and the CNN’s automatically learned features. Similar feature-fusion strategies (e.g., using a pre-trained CNN’s features with an auxiliary classifier) have improved accuracy in voice disorder classification.

Below, we obtain CNN embeddings for our train/test sets and train a Random Forest on the concatenated features:

In [None]:
from tensorflow.keras.models import Model

_ = model.predict(X_train_spec[:1])  # Force a forward pass on a single sample

# Use the trained CNN model from above to extract features (exclude final softmax layer)
feature_extractor = Model(inputs=model.input, outputs=model.layers[-2].output)  # second last layer output

# Extract CNN embeddings for training and test sets
train_cnn_feat = feature_extractor.predict(X_train_spec)
test_cnn_feat  = feature_extractor.predict(X_test_spec)

# We also need the corresponding acoustic features for the same train/test split.
# We can split X_features and y_labels in the same way (using the same indices as train_test_split earlier).
X_train_feat, X_test_feat, _, _ = train_test_split(X_features, y_labels, test_size=0.2, stratify=y_labels, random_state=42)

# Concatenate acoustic features with CNN features for fused representation
X_train_fused = np.concatenate([X_train_feat, train_cnn_feat], axis=1)
X_test_fused  = np.concatenate([X_test_feat, test_cnn_feat], axis=1)

# Train a meta-classifier on the fused features (e.g., Random Forest)
meta_model = RandomForestClassifier(n_estimators=100, random_state=42)
meta_model.fit(X_train_fused, y_train_spec)  # y_train_spec is same as y_train

# Evaluate hybrid model on test set
hybrid_pred = meta_model.predict(X_test_fused)
print("Hybrid Model Accuracy:", accuracy_score(y_test_spec, hybrid_pred))
print("Hybrid Confusion Matrix:\n", confusion_matrix(y_test_spec, hybrid_pred))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step


AttributeError: The layer sequential_2 has never been called and thus has no defined input.

In this code, feature_extractor uses the previously trained model (CNN) to output the 64-dimensional features from the layer before the final softmax. We generate these for each sample in the training and test sets. We then combine them with the original scaled features (X_train_feat, X_test_feat). Finally, we train a meta_model (here a Random Forest) on the fused features. We can evaluate its accuracy and confusion matrix on the test set. This hybrid classifier can often achieve better performance by leveraging both learned and engineered features.


5. Incorporating Demographic Data (Age and Sex)

Demographic factors such as age and sex can also influence voice characteristics (for example, females have about 1.5× higher fundamental frequency than males on average ￼). Parkinson’s patients in voice datasets also tend to be older than healthy controls ￼. To build a second version of our model that includes demographics, we simply add age and sex as additional features.

For the feature-based models (SVM, RF, hybrid), this means appending the age and sex values to each sample’s feature vector. For sex, we encode it as a binary variable (e.g., 0 = Male, 1 = Female). Age can be used as a raw value or normalized. In the deep learning approach, one could incorporate demographics by using a multi-input network (one branch for the spectrogram, another for demographics) that merges before the final prediction, or more simply by adding these values into the fused feature vector in the hybrid model.

Below, we show how to load an external demographics file and merge age/sex with the acoustic features:

In [None]:
import pandas as pd

# Load demographic data (ensure the file path is correct)
demo_df = pd.read_excel("demographics.xlsx")
# Assume demo_df has columns: "Sample ID", "Label" (HC or PwPD), "Age", "Sex"
# Convert sex to numeric binary
demo_df['Sex'] = demo_df['Sex'].map({'M': 0, 'F': 1})

# Create a lookup dictionary for demographics by sample ID
demo_info = {row['Sample ID']: (row['Age'], row['Sex']) for _, row in demo_df.iterrows()}

# Append age and sex to feature vectors
X_features_demo = []
for i, filepath in enumerate(file_paths):
    base_id = os.path.splitext(filepath)[0]  # sample ID without extension
    feats = X_features[i]
    age_val, sex_val = 0, 0
    if base_id in demo_info:
        age_val, sex_val = demo_info[base_id]
    # concatenate original features with age and sex
    feats_demo = np.concatenate([feats, [age_val, sex_val]], axis=0)
    X_features_demo.append(feats_demo)
X_features_demo = np.array(X_features_demo, dtype=np.float32)

# Scale the extended features (age can be scaled, sex is 0/1 so scaling is optional)
scaler_demo = StandardScaler()
X_features_demo = scaler_demo.fit_transform(X_features_demo)

ModuleNotFoundError: No module named 'pandas'

Here we read an Excel file containing age and sex for each sample (identified by an ID). We merge this info with our feature matrix X_features. The result X_features_demo includes the original acoustic features plus two extra dimensions for age and sex. We then normalize the features (note: for sex, since it’s binary, normalization isn’t strictly necessary, but including it won’t harm). Now we can reuse the previous modeling code with X_features_demo in place of X_features. For example, training the SVM:

In [None]:
# Train/test split with demographic-enhanced features
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_features_demo, y_labels, test_size=0.2, stratify=y_labels, random_state=42)
svm_model_d = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model_d.fit(X_train_d, y_train_d)
print("SVM (with demographics) Accuracy:", accuracy_score(y_test_d, svm_model_d.predict(X_test_d)))

We would follow similarly for RF or the hybrid model (concatenating the CNN embedding with the extended features). By comparing results, we can observe if including age and sex improves the performance. In many cases, adding demographics can subtly boost accuracy or help disambiguate certain cases (for instance, an older male voice might be more likely to be PD than a young female, if other features are borderline).


6. Model Evaluation and Deployment

Cross-Validation and Performance Metrics

To ensure our models generalize well, we perform cross-validation and compute detailed performance metrics. We can use k-fold cross-validation (e.g., 5-fold) on the training data to tune hyperparameters and estimate accuracy more robustly, avoiding overfitting to a single train/test split. For example, we could run 5-fold CV for the SVM’s C parameter and choose the value that gives highest mean accuracy. Below is how to get a cross-validated accuracy for the SVM model:

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(svm_model, X_features, y_labels, cv=5)  # 5-fold CV on entire dataset
print("SVM 5-fold CV Accuracy: %.2f%%" % (100 * scores.mean()))

After training the final models, we evaluate them on a held-out test set. We calculate the confusion matrix and derive sensitivity and specificity for each class. Sensitivity (recall) for a given class is the proportion of that class correctly identified, and specificity for a class can be interpreted as the proportion of all other classes that are correctly identified as not being that class. In multi-class setting, we compute these per class. Here’s how we can compute sensitivity and specificity from a confusion matrix:

In [None]:
from sklearn.metrics import confusion_matrix

# Suppose we have true labels y_test and predictions y_pred from some model
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

num_classes = cm.shape[0]
for i in range(num_classes):
    TP = cm[i, i]
    FN = cm[i, :].sum() - TP
    FP = cm[:, i].sum() - TP
    TN = cm.sum() - (TP + FP + FN)
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0  # recall for class i
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    print(f"Class {i}: Sensitivity = {sensitivity:.2f}, Specificity = {specificity:.2f}")

This will output the sensitivity and specificity for each severity class 0 through 4. For instance, “Class 0” corresponds to healthy controls – sensitivity(0) is the true positive rate for detecting healthy voices, and specificity(0) is the rate at which patient voices are correctly identified as non-healthy. We would typically observe high sensitivity for class 0 if healthy voices are rarely misclassified, and high specificity for class 4 if severe cases are distinctly identified, etc. We also look at overall accuracy and perhaps macro-averaged metrics to summarize performance.

Real-Time Inference Function for Deployment

Finally, we implement a real-time inference function to deploy the model. This function takes a new .wav file as input (and optionally the patient’s age/sex if using the demographic-enhanced model) and returns the predicted Parkinson’s severity level (0 to 4). It replicates the necessary preprocessing: loading the audio, extracting the same features and/or spectrogram, and then uses the trained model to output a class prediction. We ensure that the preprocessing (scaling, padding, etc.) is identical to what was done during training.

Below is an example inference function that handles different model types (feature-based SVM, CNN, or hybrid). This assumes we have the trained models and scalers in scope (from the steps above):

In [None]:
def predict_severity(file_path, model_type='hybrid'):
    # Load audio file
    y, sr = librosa.load(file_path, sr=None)
    # Extract acoustic features
    feats = extract_features(y, sr)  # reuse the same feature extraction function
    # If the model includes demographics, one would obtain age/sex for this sample (e.g., via input or file lookup)
    age_val, sex_val = 0, 0  # replace with actual data if available
    if model_type in ['feature_demog', 'hybrid_demog']:
        # include demographics in feature vector if required
        feats = np.concatenate([feats, [age_val, sex_val]])
        feats = scaler_demo.transform([feats])[0]
    else:
        feats = scaler.transform([feats])[0]
    if model_type.startswith('feature'):
        # Use feature-based ML model (SVM or RF)
        if 'svm_model' in globals():
            pred_class = int(svm_model.predict([feats])[0])
        else:
            pred_class = int(rf_model.predict([feats])[0])
    elif model_type.startswith('cnn'):
        # Use CNN model on spectrogram
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=512, n_fft=1024)
        S_db = librosa.power_to_db(S, ref=np.max)
        # Pad or truncate to the same shape as training spectrograms
        if S_db.shape[1] < max_frames:
            S_db = np.pad(S_db, ((0,0), (0, max_frames - S_db.shape[1])), mode='constant')
        else:
            S_db = S_db[:, :max_frames]
        X_input = S_db[np.newaxis, ..., np.newaxis]  # shape (1, n_mels, max_frames, 1)
        probs = model.predict(X_input)
        pred_class = int(np.argmax(probs, axis=1)[0])
    else:  # hybrid
        # Use hybrid model: combine features and CNN embedding then predict with meta-classifier
        # Extract CNN embedding
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=512, n_fft=1024)
        S_db = librosa.power_to_db(S, ref=np.max)
        if S_db.shape[1] < max_frames:
            S_db = np.pad(S_db, ((0,0), (0, max_frames - S_db.shape[1])), mode='constant')
        else:
            S_db = S_db[:, :max_frames]
        X_input = S_db[np.newaxis, ..., np.newaxis]
        cnn_feat = feature_extractor.predict(X_input)[0]
        fused_feat = np.concatenate([feats, cnn_feat])
        pred_class = int(meta_model.predict([fused_feat])[0])
    return pred_class

# Example usage:
new_file = "sample.wav"
result = predict_severity(new_file, model_type='hybrid')
print(f"Predicted severity for {new_file}: {result}")



Predicted severity for sample.wav: 2


In this function, model_type can be set to use the feature-based model ('feature' or 'feature_demog'), the pure CNN ('cnn'), or the hybrid ('hybrid' or 'hybrid_demog'). The function loads the audio, extracts features, and if needed computes the spectrogram and CNN embedding. It then applies the appropriate model to predict the class. For deployment, you would load the trained model objects (svm_model, model, meta_model, etc.) and the scalers from saved files, then use this function to get predictions on new data. The output is an integer 0–4 indicating the severity level, which could then be presented to clinicians or used in a downstream system.