In [17]:
import pandas as pd
import librosa
import numpy as np
import os
import time

In [18]:
# Paths
data = "environmental-sound-classification-50\\versions\\15\\audio\\audio\\16000\\"
csv_path = "environmental-sound-classification-50\\versions\\15\\esc50.csv"

In [19]:
# Load dataset CSV
df = pd.read_csv(csv_path)

In [20]:
df.sample(5)

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
1468,4-188878-A-30.wav,4,30,door_wood_knock,False,188878,A
1064,3-159445-A-45.wav,3,45,train,False,159445,A
1134,3-197408-C-8.wav,3,8,sheep,False,197408,C
510,2-120218-A-30.wav,2,30,door_wood_knock,False,120218,A
242,1-50625-A-17.wav,1,17,pouring_water,False,50625,A


In [21]:
# Map detailed classes to broader categories
main_category_map = {
    'dog': 'Animals', 'rooster': 'Animals', 'pig': 'Animals', 'cow': 'Animals', 'frog': 'Animals',
    'cat': 'Animals', 'hen': 'Animals', 'insects': 'Animals', 'sheep': 'Animals', 'crow': 'Animals',
    'rain': 'Natural soundscapes & water sounds', 'sea_waves': 'Natural soundscapes & water sounds',
    'crackling_fire': 'Natural soundscapes & water sounds', 'crickets': 'Natural soundscapes & water sounds',
    'chirping_birds': 'Natural soundscapes & water sounds', 'water_drops': 'Natural soundscapes & water sounds',
    'wind': 'Natural soundscapes & water sounds', 'pouring_water': 'Natural soundscapes & water sounds',
    'toilet_flush': 'Natural soundscapes & water sounds', 'thunderstorm': 'Natural soundscapes & water sounds',
    'crying_baby': 'Human, non-speech sounds', 'sneezing': 'Human, non-speech sounds',
    'clapping': 'Human, non-speech sounds', 'breathing': 'Human, non-speech sounds',
    'coughing': 'Human, non-speech sounds', 'footsteps': 'Human, non-speech sounds',
    'laughing': 'Human, non-speech sounds', 'brushing_teeth': 'Human, non-speech sounds',
    'snoring': 'Human, non-speech sounds', 'drinking_sipping': 'Human, non-speech sounds',
    'door_wood_knock': 'Interior/domestic sounds', 'mouse_click': 'Interior/domestic sounds',
    'keyboard_typing': 'Interior/domestic sounds', 'door_wood_creaks': 'Interior/domestic sounds',
    'can_opening': 'Interior/domestic sounds', 'washing_machine': 'Interior/domestic sounds',
    'vacuum_cleaner': 'Interior/domestic sounds', 'clock_alarm': 'Interior/domestic sounds',
    'clock_tick': 'Interior/domestic sounds', 'glass_breaking': 'Interior/domestic sounds',
    'helicopter': 'Exterior/urban noises', 'chainsaw': 'Exterior/urban noises',
    'siren': 'Exterior/urban noises', 'car_horn': 'Exterior/urban noises', 'engine': 'Exterior/urban noises',
    'train': 'Exterior/urban noises', 'church_bells': 'Exterior/urban noises',
    'airplane': 'Exterior/urban noises', 'fireworks': 'Exterior/urban noises', 'hand_saw': 'Exterior/urban noises'
}

In [22]:
# Add main category to dataframe
df['main_category'] = df['category'].map(main_category_map)

In [23]:
df['main_category'].sample(10)

607     Natural soundscapes & water sounds
1561              Interior/domestic sounds
889               Interior/domestic sounds
1178                               Animals
496                                Animals
875               Interior/domestic sounds
1826    Natural soundscapes & water sounds
872                  Exterior/urban noises
315                  Exterior/urban noises
463     Natural soundscapes & water sounds
Name: main_category, dtype: object

In [24]:
df['main_category'].unique()

array(['Animals', 'Natural soundscapes & water sounds',
       'Interior/domestic sounds', 'Human, non-speech sounds',
       'Exterior/urban noises'], dtype=object)

In [25]:
features = []
labels = []

for idx, row in df.iterrows():
    file_path = os.path.join(data, row['filename'])
    y, sr = librosa.load(file_path, sr=16000, mono=True)
    
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    mfcc_mean = np.mean(mfcc.T, axis=0)
    
    features.append(mfcc_mean)
    labels.append(row['main_category'])
    
X = np.array(features)
y = np.array(labels)

In [26]:
from sklearn.preprocessing import OneHotEncoder

# Reshape y to 2D array as required by OneHotEncoder
y_reshaped = y.reshape(-1, 1)

# Updated parameter name for scikit-learn >= 1.2
encoder = OneHotEncoder(sparse_output=False)
y_onehot = encoder.fit_transform(y_reshaped)

# To get back the labels (optional)
label_names = encoder.categories_[0]


In [27]:
# Create DataFrame for features (MFCCs)
features_df = pd.DataFrame(X, columns=[f'mfcc_{i+1}' for i in range(X.shape[1])])

# Create DataFrame for one-hot encoded labels
labels_df = pd.DataFrame(y_onehot, columns=label_names)

# Concatenate features and labels
final_df = pd.concat([features_df, labels_df], axis=1)

In [28]:
final_df

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,...,mfcc_36,mfcc_37,mfcc_38,mfcc_39,mfcc_40,Animals,Exterior/urban noises,"Human, non-speech sounds",Interior/domestic sounds,Natural soundscapes & water sounds
0,-611.100403,3.343292,-10.551255,-2.647891,-1.820711,-1.461677,0.579047,1.206124,0.444120,-0.308052,...,0.040707,0.149828,0.118979,-0.294478,-0.571947,1.0,0.0,0.0,0.0,0.0
1,-178.176926,-23.769896,-42.744907,-12.746276,-50.138439,-31.489588,-18.378567,-28.189640,-29.059763,-11.410609,...,-1.221400,1.394207,-0.658031,-0.205951,0.612961,0.0,0.0,0.0,0.0,1.0
2,28.579021,47.497429,-3.984118,8.676783,-9.227191,16.239357,-2.928177,14.405091,-10.961942,3.435755,...,5.468563,-3.833577,-1.117157,1.452625,-3.953355,0.0,0.0,0.0,1.0,0.0
3,28.901741,47.402294,-3.753149,9.146835,-5.993060,16.941429,-2.413928,12.345023,-9.952536,5.214138,...,6.493138,-3.978284,-0.061077,1.392027,-2.367900,0.0,0.0,0.0,1.0,0.0
4,-404.593964,119.941704,42.771217,23.748320,-0.906739,0.470205,-9.534047,8.258159,-15.376285,-7.105755,...,-5.540597,-3.010366,3.179933,-7.124720,-2.658294,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-179.207199,100.529213,-57.235874,8.718024,-9.163035,14.837163,2.769281,-1.397354,0.044582,9.310634,...,-8.154682,-0.274747,-3.191775,-1.862323,-2.330522,1.0,0.0,0.0,0.0,0.0
1996,-44.983044,18.894360,-4.314282,-14.483033,-23.834936,-26.665272,-26.291445,-11.474735,-21.367085,-7.105660,...,0.895783,0.105380,-3.548870,0.369805,-8.284348,0.0,0.0,0.0,1.0,0.0
1997,-272.711823,77.392746,10.226200,18.360622,6.086541,6.449529,1.605461,6.465501,2.494278,4.671847,...,-0.128015,-2.550002,-1.540261,-1.918842,-1.150266,0.0,0.0,1.0,0.0,0.0
1998,-128.933243,79.908081,-38.742485,22.791780,-2.220027,1.829074,-14.245359,6.024097,5.907116,1.920074,...,2.550519,0.661556,2.082479,-1.226844,0.203859,1.0,0.0,0.0,0.0,0.0


In [29]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [30]:
# Step 1: Convert one-hot y back to labels (required for classification targets)
y_labels = np.array([label_names[i] for i in np.argmax(y_onehot, axis=1)])

# Step 2: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.2, random_state=42, stratify=y_labels)

In [31]:
# -------------------------------
# RANDOM FOREST with Hyperparameter Tuning
# -------------------------------
rf_params = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [10, 20, 30, None],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Minimum samples required at each leaf node
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Best parameters and cross-validated accuracy score
print("🎯 Best Random Forest Parameters:", rf_grid_search.best_params_)
print("🎯 Best Random Forest Cross-Validation Accuracy:", rf_grid_search.best_score_)

# Best model
rf_best_model = rf_grid_search.best_estimator_
rf_best_model.fit(X_train, y_train)
rf_preds = rf_best_model.predict(X_test)
print("\n🎯 Random Forest Test Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))

🎯 Best Random Forest Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
🎯 Best Random Forest Cross-Validation Accuracy: 0.588125

🎯 Random Forest Test Accuracy: 0.5925
                                    precision    recall  f1-score   support

                           Animals       0.54      0.56      0.55        80
             Exterior/urban noises       0.60      0.66      0.63        80
          Human, non-speech sounds       0.57      0.57      0.57        80
          Interior/domestic sounds       0.63      0.57      0.60        80
Natural soundscapes & water sounds       0.63      0.59      0.61        80

                          accuracy                           0.59       400
                         macro avg       0.59      0.59      0.59       400
                      weighted avg       0.59      0.59      0.59       400



In [32]:
# -------------------------------
# SVM with Hyperparameter Tuning
# -------------------------------
svm_params = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1],
}

# Timing the grid search
start = time.time()

svm_grid_search = GridSearchCV(SVC(), svm_params, cv=5, scoring='accuracy', n_jobs=-1)
svm_grid_search.fit(X_train, y_train)

print("⏱️ Grid Search Time: {:.2f} seconds".format(time.time() - start))

# Best parameters and cross-validated accuracy score
print("🎯 Best SVM Parameters:", svm_grid_search.best_params_)
print("🎯 Best SVM Cross-Validation Accuracy:", svm_grid_search.best_score_)

# Best model
svm_best_model = svm_grid_search.best_estimator_
svm_best_model.fit(X_train, y_train)
svm_preds = svm_best_model.predict(X_test)

# Final test results
print("\n🎯 SVM Test Accuracy:", accuracy_score(y_test, svm_preds))
print(classification_report(y_test, svm_preds))

KeyboardInterrupt: 

In [None]:
# -------------------------------
# KNN with Hyperparameter Tuning
# -------------------------------
knn_params = {
    'n_neighbors': [3, 5, 7, 10],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute nearest neighbors
}

knn_grid_search = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, scoring='accuracy', n_jobs=-1)
knn_grid_search.fit(X_train, y_train)

# Best parameters and cross-validated accuracy score
print("🎯 Best KNN Parameters:", knn_grid_search.best_params_)
print("🎯 Best KNN Cross-Validation Accuracy:", knn_grid_search.best_score_)

# Best model
knn_best_model = knn_grid_search.best_estimator_
knn_best_model.fit(X_train, y_train)
knn_preds = knn_best_model.predict(X_test)
print("\n🎯 KNN Test Accuracy:", accuracy_score(y_test, knn_preds))
print(classification_report(y_test, knn_preds))


🎯 KNN Results:
Accuracy: 0.48
                                    precision    recall  f1-score   support

                           Animals       0.48      0.55      0.51        80
             Exterior/urban noises       0.50      0.47      0.49        80
          Human, non-speech sounds       0.40      0.56      0.47        80
          Interior/domestic sounds       0.56      0.44      0.49        80
Natural soundscapes & water sounds       0.53      0.38      0.44        80

                          accuracy                           0.48       400
                         macro avg       0.49      0.48      0.48       400
                      weighted avg       0.49      0.48      0.48       400

