In [19]:
import os
import time
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier

# import StackNetClassifier
from pystacknet.pystacknet import StackNetClassifier

import warnings
warnings.filterwarnings('ignore')

ImportError: cannot import name 'StackNetClassifier' from 'pystacknet.pystacknet' (d:\Study\Code\Python\Sound_Classification_Bee_Qeen_Queenless\pystacknet\pystacknet\__init__.py)

In [2]:
path = os.getcwd() + "/dataset/"
path

'd:\\Study\\Code\\Python\\Sound_Classification_Bee_Qeen_Queenless/dataset/'

In [3]:
TRAINING_DIR = path+"/train"
VALIDATION_DIR = path + "/val"
TEST_DIR = path + "/test"

print(TRAINING_DIR)
print(VALIDATION_DIR)
print(TEST_DIR)

d:\Study\Code\Python\Sound_Classification_Bee_Qeen_Queenless/dataset//train
d:\Study\Code\Python\Sound_Classification_Bee_Qeen_Queenless/dataset//val
d:\Study\Code\Python\Sound_Classification_Bee_Qeen_Queenless/dataset//test


In [4]:
def data_loader(path):
    X = []
    Y = []
    for folder in os.listdir(path):
        for file in os.listdir(os.path.join(path, folder)):
            feature = np.load(os.path.join(path, folder, file))
            label = folder
            X.append(feature)
            Y.append(label)
    return X, Y

In [5]:
X_train, Y_train = data_loader(TRAINING_DIR)
X_val, Y_val = data_loader(VALIDATION_DIR)
X_test, Y_test = data_loader(TEST_DIR)

In [6]:
print("There are {} training samples and {} training labels".format(len(X_train), len(Y_train)))
print("There are {} validation samples and {} validation labels".format(len(X_val), len(Y_val)))
print("There are {} testing samples and {} testing labels".format(len(X_test), len(Y_test)))

There are 14000 training samples and 14000 training labels
There are 2000 validation samples and 2000 validation labels
There are 4000 testing samples and 4000 testing labels


In [7]:
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_val = np.array(Y_val)
Y_test = np.array(Y_test)

print("Shape of X_train: {}".format(X_train.shape))
print("Shape of X_val: {}".format(X_val.shape))
print("Shape of X_test: {}".format(X_test.shape))

Shape of X_train: (14000, 64575)
Shape of X_val: (2000, 64575)
Shape of X_test: (4000, 64575)


In [8]:
# shuffle the dataset
shuffle_index = np.random.permutation(len(X_train))
X_train, Y_train = X_train[shuffle_index], Y_train[shuffle_index]
shuffle_index = np.random.permutation(len(X_val))
X_val, Y_val = X_val[shuffle_index], Y_val[shuffle_index]
shuffle_index = np.random.permutation(len(X_test))
X_test, Y_test = X_test[shuffle_index], Y_test[shuffle_index]

In [9]:
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train)
Y_val = label_encoder.fit_transform(Y_val)
Y_test = label_encoder.fit_transform(Y_test)

In [18]:
def model_training(model, X_train, Y_train, X_val, Y_val, X_test, Y_test):
    start_time = time.time()
    model.fit(X_train, Y_train)
    time_taken = time.time() - start_time
    Y_val_pred = model.predict(X_val)
    Y_test_pred = model.predict(X_test)
    val_acc = accuracy_score(Y_val, Y_val_pred)
    test_acc = accuracy_score(Y_test, Y_test_pred)
    recall_val = recall_score(Y_val, Y_val_pred, average='macro')
    recall_test = recall_score(Y_test, Y_test_pred, average='macro')
    precision_val = precision_score(Y_val, Y_val_pred, average='macro')
    precision_test = precision_score(Y_test, Y_test_pred, average='macro')
    f1_val = f1_score(Y_val, Y_val_pred, average='macro')
    f1_test = f1_score(Y_test, Y_test_pred, average='macro')
    conf_matrix_val = confusion_matrix(Y_val, Y_val_pred)
    conf_matrix_test = confusion_matrix(Y_test, Y_test_pred)
    print('We have used the following model: {}'.format(model))
    print('Test Accuracy: {}'.format(test_acc))
    print('F1 Score: {}'.format(f1_test))
    print('Confusion Matrix: {}'.format(conf_matrix_test))
    print('Time taken (seconds): {}'.format(round(time_taken, 2)))
    return val_acc, test_acc, recall_val, recall_test, precision_val, precision_test, f1_val, f1_test, conf_matrix_val, conf_matrix_test, time_taken

In [None]:
models = [
    RandomForestClassifier(max_depth=10, n_estimators=100, max_features=16, criterion='entropy', mean_samples_split=5),
    RandomForestClassifier(max_depth=10, n_estimators=50, max_features=254, criterion='gini, mean_samples_split=2'),
    KNeighborsClassifier(n_neighbors=10, weights='distance', algorithm='auto'),
    KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto'),
    SVC(C=1.0, kernel='rbf', gamma='scale'),
    SVC(C=10, kernel='rbf', gamma='scale'),
    ExtraTreesClassifier(max_depth=10, n_estimators=100, max_features=16, criterion='entropy', mean_samples_split=5),
    XGBClassifier(max_depth=10, n_estimators=500, learning_rate=0.01, subsample=0.8, colsample_bytree=0.8, gamma = 0.1),
    GradientBoostingClassifier(max_depth=10, n_estimators=100, learning_rate=0.01, subsample=0.5, min_impurity_decrease = 0.1),
    AdaBoostClassifier(n_estimators=500, learning_rate=0.1, bootstrap=True, criterion='entropy'),
    MLPClassifier(hidden_layer_sizes=(100, 100, 100), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, max_iter=500),
    MLPClassifier(hidden_layer_sizes=(100, 100, 100), activation='sigmoid', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, max_iter=1000),
]

In [None]:
model_names = ['Random Forest', 'Random Forest', 'KNN', 'KNN', 'SVM', 'SVM', 'Extra Trees', 'XGBoost', 'Gradient Boosting', 'AdaBoost']
result = []

In [None]:
for model, model_name in zip(models, model_names):
    val_acc, test_acc, recall_val, recall_test, precision_val, precision_test, f1_val, f1_test, conf_matrix_val, conf_matrix_test, time_taken = model_training(model, X_train, Y_train, X_val, Y_val, X_test, Y_test)
    resul.append({'Model': model_name, 'Validation Accuracy': val_acc, 'Test Accuracy': test_acc, 'Validation Recall': recall_val, 'Test Recall': recall_test, 'Validation Precision': precision_val, 'Test Precision': precision_test, 'Validation F1': f1_val, 'Test F1': f1_test, 'Validation Confusion Matrix': conf_matrix_val, 'Test Confusion Matrix': conf_matrix_test, 'Time Taken': time_taken})

In [None]:
result_df = pd.DataFrame(resul)
result_df.to_csv('result_1D_STFT.csv', index=False)
result_df.head(12)

In [None]:
    # StackNetClassifier(classifiers=[RandomForestClassifier(max_depth=10, n_estimators=100, max_features=16, criterion='entropy', mean_samples_split=5),
    #                                 RandomForestClassifier(max_depth=10, n_estimators=50, max_features=254, criterion='gini, mean_samples_split=2'),
    #                                 KNeighborsClassifier(n_neighbors=10, weights='distance', algorithm='auto'),
    #                                 KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto')])

In [None]:
# model = StackNetClassifier(classifiers=[RandomForestClassifier(max_depth=10, n_estimators=100, max_features=16, criterion='entropy', mean_samples_split=5),
#                                     RandomForestClassifier(max_depth=10, n_estimators=50, max_features=254, criterion='gini, mean_samples_split=2'),
#                                     KNeighborsClassifier(n_neighbors=10, weights='distance', algorithm='auto'),
#                                     KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto'),
#                                     MLPClassifier(hidden_layer_sizes=(100, 100, 100), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, max_iter=500),
#                                     MLPClassifier(hidden_layer_sizes=(100, 100, 100), activation='sigmoid', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, max_iter=1000)
#                                        ], metric = 'accuracy', folds = 4, restacking = False, use_retraining = False, use_proba = True, random_state = 2021, n_jobs = -1, verbose = 1)                                    

In [None]:
def plot_curve(y_true_train, y_pred_train, y_true_val, y_pred_val, model_name):
    """
    Plots the ROC Curve given predictions and labels
    """
    fpr_train, tpr_train, _ = roc_curve(y_true_train, y_pred_train, pos_label=1)
    fpr_val, tpr_val, _ = roc_curve(y_true_val, y_pred_val, pos_label=1)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr_train, tpr_train, color='black',
             lw=2, label=f"ROC train curve (AUC = {round(roc_auc_score(y_true_train, y_pred_train), 4)})")
    plt.plot(fpr_val, tpr_val, color='darkorange',
             lw=2, label=f"ROC validation curve (AUC = {round(roc_auc_score(y_true_val, y_pred_val), 4)})")
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(f'ROC Plot for {model_name}', weight="bold", fontsize=20)
    plt.legend(loc="lower right", fontsize=16)

In [None]:
plot_curve(Y_train, Y_train_pred, Y_val, Y_val_pred, 'Random Forest')