In [None]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Function to extract MFCC features from audio files
def extract_features(file_path):
    """Summarise one audio file as a fixed-length MFCC vector.

    The file is loaded with ``librosa.load`` (``res_type='kaiser_fast'``),
    40 MFCCs are computed, and the transposed MFCC matrix is averaged over
    axis 0 so that one value per coefficient remains.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.

    Returns
    -------
    numpy.ndarray or None
        The averaged MFCC vector, or ``None`` if any step raised. The error is
        printed instead of propagated so that callers can skip bad files.
    """
    try:
        signal, sr = librosa.load(file_path, res_type='kaiser_fast')
        per_frame = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40).T
        return np.mean(per_frame, axis=0)
    except Exception as exc:
        # Best-effort: report the failing file and let the caller decide.
        print(f"Error processing {file_path}: {exc!s}")
        return None

In [None]:
# Prepare the dataset with debugging output
def prepare_dataset(folder_names, base_path="."):
    """Build a feature matrix and label vector from per-class audio folders.

    Every entry of ``folder_names`` names a sub-folder of ``base_path`` that
    holds the ``.wav`` recordings of one class. The class label of a sample is
    the index of its folder in ``folder_names``, so labels stay aligned with
    ``folder_names`` even when a folder is skipped. Progress and failures are
    reported with ``print``.

    Parameters
    ----------
    folder_names : sequence of str
        Class folder names, in label order.
    base_path : str, optional
        Directory containing the class folders. Defaults to the current
        working directory.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(X, y)`` where ``X`` stacks the feature vectors returned by
        ``extract_features`` and ``y`` holds the matching integer labels.
        Both arrays are empty when no file could be processed.
    """
    X = []
    y = []

    for label, folder in enumerate(folder_names):
        folder_path = os.path.join(base_path, folder)
        print(f"\nProcessing folder: {folder_path}")

        # isdir rather than exists: a regular file that happens to carry the
        # folder's name would otherwise make os.listdir raise.
        if not os.path.isdir(folder_path):
            print(f"Warning: Folder {folder_path} does not exist")
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split done by the caller)
        # reproducible. The extension test is case-insensitive so that
        # ".WAV" files are not silently ignored.
        files = sorted(
            f for f in os.listdir(folder_path) if f.lower().endswith('.wav')
        )
        print(f"Found {len(files)} .wav files in {folder}")

        for file in files:
            file_path = os.path.join(folder_path, file)
            features = extract_features(file_path)
            if features is not None:
                X.append(features)
                y.append(label)
            else:
                print(f"Failed to process {file}")

    print(f"\nTotal samples processed: {len(X)}")
    return np.array(X), np.array(y)


In [None]:
# Main execution
def main(folder_names=None, base_path="."):
    """Train and evaluate a Random Forest on MFCC features of labelled audio.

    Loads the dataset with ``prepare_dataset``, holds out 20% of the samples
    for testing (``random_state=42``), fits a 100-tree
    ``RandomForestClassifier``, and prints the test accuracy together with the
    five most important MFCC coefficients.

    Parameters
    ----------
    folder_names : sequence of str, optional
        Class folder names in label order. Defaults to the five classes
        ``belly_pain``, ``burping``, ``discomfort``, ``hungry`` and ``tired``.
    base_path : str, optional
        Directory that contains the class folders. Defaults to the current
        working directory.

    Returns
    -------
    tuple
        ``(rf_classifier, predict_audio)``: the fitted classifier and a
        function that maps an audio file path to a class name (or ``None``
        when features cannot be extracted from the file).

    Raises
    ------
    ValueError
        If no sample could be loaded, so there is nothing to train on.
    """
    if folder_names is None:
        folder_names = ['belly_pain', 'burping', 'discomfort', 'hungry', 'tired']
    else:
        # Freeze a private copy: predict_audio indexes into it later.
        folder_names = list(folder_names)

    print("Starting dataset preparation...")
    X, y = prepare_dataset(folder_names, base_path)

    # Fail early with an actionable message instead of letting the split
    # below raise an opaque error about an empty training set.
    if len(X) == 0:
        raise ValueError(
            f"No audio samples could be loaded from {base_path!r}; "
            f"expected .wav files in sub-folders {folder_names}"
        )

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"\nTraining set size: {len(X_train)}")
    print(f"Testing set size: {len(X_test)}")

    # Create and train Random Forest classifier
    print("\nTraining Random Forest classifier...")
    rf_classifier = RandomForestClassifier(
        n_estimators=100,
        random_state=42
    )
    rf_classifier.fit(X_train, y_train)

    # Make predictions
    y_pred = rf_classifier.predict(X_test)

    # Calculate and display accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

    # Feature importance, listed from most to least important
    feature_importance = rf_classifier.feature_importances_
    print("\nTop 5 most important MFCC features:")
    for i in np.argsort(feature_importance)[::-1][:5]:
        print(f"MFCC coefficient {i}: {feature_importance[i]:.4f}")

    def predict_audio(file_path):
        """Return the predicted class name for one audio file, or None."""
        features = extract_features(file_path)
        if features is not None:
            # predict expects a 2-D input, hence the single-row list.
            prediction = rf_classifier.predict([features])[0]
            return folder_names[prediction]
        return None

    return rf_classifier, predict_audio

In [None]:
# Run the whole pipeline once and keep the fitted model and the prediction
# helper as notebook-level names so that later cells can use them.
classifier, predict_function = main()
    

In [None]:
# Classify a single recording with the prediction helper returned by main().
# The Windows path is a raw string: in a normal literal, sequences such as
# "\d" and "\m" are invalid escapes that newer Python versions warn about.
test_file = r"D:\defence\model\donateacry_corpus_cleaned_and_updated_data\discomfort\1309B82C-F146-46F0-A723-45345AFA6EA8-1430703937-1.0-f-48-dc.wav"
prediction = predict_function(test_file)
print(f"Predicted class: {prediction}")