In [None]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from pydub import AudioSegment
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Make sure an audio file is available in WAV format
def convert_to_wav(file_path):
    """Return the path of a WAV version of ``file_path``.

    A path whose extension is already ``.wav`` (compared case-insensitively)
    is returned unchanged. Any other file is decoded with pydub and exported
    next to the source file, under the same stem with a ``.wav`` extension.

    Args:
        file_path: Path of the audio file to normalise.

    Returns:
        The path of the WAV file, or ``None`` if anything fails (the error is
        printed rather than raised).
    """
    try:
        stem, extension = os.path.splitext(file_path)
        if extension.lower() != '.wav':
            wav_path = stem + '.wav'
            AudioSegment.from_file(file_path).export(wav_path, format='wav')
            print(f"Converted {file_path} to {wav_path}")
            return wav_path
        # Already a WAV file: nothing to convert.
        return file_path
    except Exception as e:
        print(f"Error converting {file_path} to WAV: {str(e)}")
        return None

In [None]:
# Summarise one audio file as a vector of time-averaged MFCCs
def extract_features(file_path):
    """Compute the mean MFCC vector of an audio file.

    The file is first passed through ``convert_to_wav``. The resulting WAV is
    loaded with librosa (``res_type='kaiser_fast'``), 40 MFCCs are computed,
    and the transposed MFCC matrix is averaged along axis 0, giving one mean
    per coefficient.

    Args:
        file_path: Path of the audio file to process.

    Returns:
        The array of per-coefficient means, or ``None`` if the conversion
        failed or any later step raised (the error is printed).
    """
    try:
        wav_file = convert_to_wav(file_path)
        if wav_file is not None:
            signal, sr = librosa.load(wav_file, res_type='kaiser_fast')
            coefficients = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
            return np.mean(coefficients.T, axis=0)
        # Conversion already reported its own error.
        return None
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

In [None]:
# Prepare the dataset
def prepare_dataset(folder_names, base_path="."):
    """Build the feature matrix and label vector from per-class audio folders.

    Each entry of ``folder_names`` is a sub-folder of ``base_path`` holding the
    recordings of one class; the class label is the entry's index in
    ``folder_names`` (a missing folder is skipped but keeps its index).

    Files are visited in sorted order so that the sample order, and therefore
    any seeded train/test split made from it, is reproducible: ``os.listdir``
    returns names in arbitrary order. A non-WAV file is skipped when the folder
    already contains the ``<stem>.wav`` that ``convert_to_wav`` would write for
    it; otherwise re-running after an earlier conversion would load the same
    recording twice.

    Args:
        folder_names: Sequence of class folder names.
        base_path: Directory containing the class folders.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays, holding one feature vector per
        successfully processed file and the matching integer labels.
    """
    audio_extensions = ('.wav', '.mp3', '.m4a', '.ogg')
    X = []
    y = []
    for label, folder in enumerate(folder_names):
        folder_path = os.path.join(base_path, folder)
        print(f"\nProcessing folder: {folder_path}")
        # isdir (not exists) so a stray regular file with the class name is
        # skipped instead of crashing os.listdir.
        if not os.path.isdir(folder_path):
            print(f"Warning: Folder {folder_path} does not exist")
            continue
        candidates = sorted(
            f for f in os.listdir(folder_path) if f.lower().endswith(audio_extensions)
        )
        listed = set(candidates)
        # Keep every WAV; keep a non-WAV only if its converted twin is absent.
        files = [
            f for f in candidates
            if os.path.splitext(f)[1].lower() == '.wav'
            or os.path.splitext(f)[0] + '.wav' not in listed
        ]
        print(f"Found {len(files)} audio files in {folder}")
        skipped = len(candidates) - len(files)
        if skipped:
            print(f"Skipped {skipped} file(s) that already have a converted .wav copy")
        for file in files:
            file_path = os.path.join(folder_path, file)
            features = extract_features(file_path)
            if features is not None:
                X.append(features)
                y.append(label)
            else:
                print(f"Failed to process {file}")
    print(f"\nTotal samples processed: {len(X)}")
    return np.array(X), np.array(y)

In [None]:
# Random Forest (trained on the unscaled features)
def train_random_forest(X_train, X_test, y_train, y_test):
    """Fit a 100-tree random forest, print its test accuracy, and return it.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("\nTraining Random Forest classifier...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"Random Forest Accuracy: {test_accuracy * 100:.2f}%")
    return model

In [None]:
# XGBoost (trained on the unscaled features)
def train_xgboost(X_train, X_test, y_train, y_test):
    """Fit an XGBoost classifier, print its test accuracy, and return it.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        The fitted ``XGBClassifier``.
    """
    print("\nTraining XGBoost classifier...")
    settings = {
        'n_estimators': 100,
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss',
    }
    model = XGBClassifier(**settings)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    print(f"XGBoost Accuracy: {test_accuracy * 100:.2f}%")
    return model

In [None]:
# Support Vector Machine with an RBF kernel (on standardised features)
def train_svm_rbf(X_train, X_test, y_train, y_test):
    """Fit an RBF-kernel SVM on standardised features and print its test accuracy.

    The ``StandardScaler`` is fitted on the training features only and then
    applied to the test features.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        tuple: ``(classifier, scaler)``; the scaler must be applied to any
        features passed to the classifier later.
    """
    print("\nTraining SVM (RBF Kernel) classifier...")
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    model = SVC(kernel='rbf', random_state=42)
    model.fit(train_scaled, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(test_scaled))
    print(f"SVM (RBF) Accuracy: {test_accuracy * 100:.2f}%")
    return model, scaler

In [None]:
# Logistic Regression (on standardised features)
def train_logistic_regression(X_train, X_test, y_train, y_test):
    """Fit logistic regression on standardised features and print its test accuracy.

    The ``StandardScaler`` is fitted on the training features only and then
    applied to the test features. The solver is allowed up to 1000 iterations.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        tuple: ``(classifier, scaler)``.
    """
    print("\nTraining Logistic Regression classifier...")
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)
    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(scaled_train, y_train)
    predicted = model.predict(scaled_test)
    score = accuracy_score(y_test, predicted)
    print(f"Logistic Regression Accuracy: {score * 100:.2f}%")
    return model, scaler

In [None]:
# k-Nearest Neighbours with k = 5 (on standardised features)
def train_knn(X_train, X_test, y_train, y_test):
    """Fit a 5-nearest-neighbours classifier on standardised features.

    The ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the test accuracy is printed.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        tuple: ``(classifier, scaler)``.
    """
    print("\nTraining k-Nearest Neighbors classifier...")
    scaler = StandardScaler()
    neighbours_train = scaler.fit_transform(X_train)
    neighbours_test = scaler.transform(X_test)
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(neighbours_train, y_train)
    hit_rate = accuracy_score(y_test, model.predict(neighbours_test))
    print(f"k-NN Accuracy: {hit_rate * 100:.2f}%")
    return model, scaler

In [None]:
# scikit-learn Gradient Boosting (trained on the unscaled features)
def train_gradient_boosting(X_train, X_test, y_train, y_test):
    """Fit a 100-stage gradient boosting classifier and print its test accuracy.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    print("\nTraining Gradient Boosting classifier...")
    booster = GradientBoostingClassifier(n_estimators=100, random_state=42)
    booster.fit(X_train, y_train)
    predicted = booster.predict(X_test)
    score = accuracy_score(y_test, predicted)
    print(f"Gradient Boosting Accuracy: {score * 100:.2f}%")
    return booster

In [None]:
# Gaussian Naive Bayes (trained on the unscaled features)
def train_naive_bayes(X_train, X_test, y_train, y_test):
    """Fit a Gaussian naive Bayes classifier and print its test accuracy.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        The fitted ``GaussianNB``.
    """
    print("\nTraining Gaussian Naive Bayes classifier...")
    model = GaussianNB()
    model.fit(X_train, y_train)
    score = accuracy_score(y_test, model.predict(X_test))
    print(f"Naive Bayes Accuracy: {score * 100:.2f}%")
    return model

In [None]:
# Multi-Layer Perceptron with one hidden layer (on standardised features)
def train_mlp(X_train, X_test, y_train, y_test):
    """Fit a single-hidden-layer MLP (100 units) on standardised features.

    The ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the test accuracy is printed. Training is
    capped at 500 iterations.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the printed accuracy.

    Returns:
        tuple: ``(classifier, scaler)``.
    """
    print("\nTraining Multi-Layer Perceptron classifier...")
    scaler = StandardScaler()
    inputs_train = scaler.fit_transform(X_train)
    inputs_test = scaler.transform(X_test)
    network = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
    network.fit(inputs_train, y_train)
    predicted = network.predict(inputs_test)
    score = accuracy_score(y_test, predicted)
    print(f"MLP Accuracy: {score * 100:.2f}%")
    return network, scaler

In [None]:
# Main execution
def main():
    """Build the dataset, train and rank every model, and return them.

    Returns:
        tuple: ``(models, predict_audio)``.

        ``models`` maps a short model key (``'rf'``, ``'xgb'``, ``'svm'``,
        ``'lr'``, ``'knn'``, ``'gb'``, ``'nb'``, ``'mlp'``) to a
        ``(classifier, scaler)`` pair; ``scaler`` is ``None`` for models
        trained on unscaled features.

        ``predict_audio(file_path, classifier, scaler=None)`` returns the
        predicted folder name for one audio file, or ``None`` if its features
        could not be extracted.

    Raises:
        ValueError: If no audio sample could be loaded.
    """
    folder_names = ['belly_pain', 'burping', 'discomfort', 'hungry', 'tired']

    print("Starting dataset preparation...")
    X, y = prepare_dataset(folder_names)

    # Fail early with an actionable message instead of the opaque error
    # train_test_split raises on an empty dataset.
    if len(X) == 0:
        raise ValueError(
            "No audio samples could be loaded; check that the class folders "
            f"{folder_names} exist under the current directory and contain audio files."
        )

    # Split the dataset into training and testing sets.
    # NOTE(review): the split is not stratified; if the classes are imbalanced,
    # consider passing stratify=y so every class appears in both subsets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"\nTraining set size: {len(X_train)}")
    print(f"Testing set size: {len(X_test)}")

    # Model key -> training function, in the order they are trained and returned.
    # Trainers that scale their inputs return (classifier, scaler); the others
    # return the classifier alone.
    trainers = [
        ('rf', train_random_forest),
        ('xgb', train_xgboost),
        ('svm', train_svm_rbf),
        ('lr', train_logistic_regression),
        ('knn', train_knn),
        ('gb', train_gradient_boosting),
        ('nb', train_naive_bayes),
        ('mlp', train_mlp),
    ]

    models = {}
    accuracies = {}
    for key, trainer in trainers:
        result = trainer(X_train, X_test, y_train, y_test)
        classifier, scaler = result if isinstance(result, tuple) else (result, None)
        # Evaluate with the same preprocessing the model was trained with.
        test_inputs = X_test if scaler is None else scaler.transform(X_test)
        accuracies[key] = accuracy_score(y_test, classifier.predict(test_inputs))
        models[key] = (classifier, scaler)

    # Print accuracies in descending order
    print("\nModel Accuracies (High to Low):")
    for model_name, accuracy in sorted(accuracies.items(), key=lambda x: x[1], reverse=True):
        print(f"{model_name.upper()} -> {accuracy * 100:.2f}%")

    def predict_audio(file_path, classifier, scaler=None):
        """Predict the class folder name of one audio file.

        Args:
            file_path: Path of the audio file to classify.
            classifier: A fitted classifier from the returned ``models`` dict.
            scaler: The scaler paired with ``classifier``, or ``None`` if the
                model was trained on unscaled features.

        Returns:
            The predicted entry of ``folder_names``, or ``None`` if feature
            extraction failed.
        """
        features = extract_features(file_path)
        if features is not None:
            if scaler is not None:  # Scale features for models that need it
                features = scaler.transform([features])
            else:
                features = [features]
            prediction = classifier.predict(features)[0]
            return folder_names[prediction]
        return None

    # Hand back every trained model (with its scaler) and the prediction helper.
    return models, predict_audio

In [None]:
# Run the full pipeline once: main() builds the dataset, trains and ranks every
# model, and returns the trained (classifier, scaler) pairs keyed by model name
# together with the single-file prediction helper.
classifiers, predict_function = main()

In [None]:
# Test prediction with all models
# Raw string literal: the Windows path is full of backslashes, and sequences
# such as "\d" or "\m" in a normal string are invalid escapes (a SyntaxWarning
# on recent Python versions). The resulting path value is unchanged.
test_file = r"D:\defence\model\donateacry_corpus_cleaned_and_updated_data\discomfort\1309B82C-F146-46F0-A723-45345AFA6EA8-1430703937-1.0-f-48-dc.wav"
print("\nPredictions for test file:")
for model_name, (classifier, scaler) in classifiers.items():
    # predict_function returns None when the file could not be processed.
    prediction = predict_function(test_file, classifier, scaler)
    print(f"{model_name.upper()} Prediction: {prediction}")