## Scikit-Learn Random Forest

**Summary:** 
This notebook was used to run the Scikit-learn random forest model experiments, in addition to the experiments run using Edge Impulse.


In [None]:
pip install visualkeras

### Dependencies

In [None]:
import os
from tqdm import tqdm
import sys
import pickle

import pandas as pd
import numpy as np
from scipy.io import wavfile

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

import warnings
from scipy.io.wavfile import WavFileWarning
from sklearn.metrics import accuracy_score
import visualkeras

# dependencies for compute_multiclass_metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, classification_report, precision_recall_fscore_support, roc_auc_score

from sklearn.preprocessing import label_binarize
from sklearn.metrics import balanced_accuracy_score

from everywhereml.sklearn.ensemble import RandomForestClassifier

## Path Variables

In [None]:
# Experiment configuration. Both values are interpolated into `data_path`
# below to select which pre-computed data split directory is loaded.
random_state = 42  # appears in the directory name `data_split_random_state_<random_state>`
n_mel = 58  # appears in the directory name `number_mel_filters_<n_mel>` and in saved model file names

In [None]:
# Single place to change when the project lives somewhere else; every other
# path in this cell is derived from it.
project_root = '/home/bukowskin/CSC_7901_ML_Capstone'

# Directory holding the data split selected by `random_state` and `n_mel`.
data_path = f'{project_root}/data/data_split_random_state_{random_state}/number_mel_filters_{n_mel}'

# Sub-directories containing the pickled MFE and MFCC train/test splits.
mfe_data_path = f'{data_path}/mfe_data_split'
mfcc_data_path = f'{data_path}/mfcc_data_split'

# Directory for exported models.
save_model_path = f'{project_root}/models'

## Data

### Load MFE Data

In [None]:
# Load the pickled MFE train and test splits from `mfe_data_path`.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# files must come from a trusted source.
with open(f'{mfe_data_path}/train_data.pkl', 'rb') as f:
    mfe_train = pickle.load(f)
    
with open(f'{mfe_data_path}/test_data.pkl', 'rb') as f:
    mfe_test = pickle.load(f)

In [None]:
# Unpack the loaded MFE dictionaries into features and labels per split.
train_mfe_feat, y_train = mfe_train['features'], mfe_train['labels']
test_mfe_feat, y_test = mfe_test['features'], mfe_test['labels']

# Height/width metadata stored alongside the training split.
mfe_height, mfe_width = mfe_train['mfe_height'], mfe_train['mfe_width']

### Load MFCC  Data

In [None]:
# Load the pickled MFCC train and test splits from `mfcc_data_path`.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# files must come from a trusted source.
with open(f'{mfcc_data_path}/train_data.pkl', 'rb') as f:
    mfcc_train = pickle.load(f)
    
with open(f'{mfcc_data_path}/test_data.pkl', 'rb') as f:
    mfcc_test = pickle.load(f)

In [None]:
# Unpack the MFCC feature collections. Labels are not read here; the labels
# taken from the MFE dictionaries are reused for the MFCC experiments.
# train 
train_mfcc_feat = mfcc_train['features']

# test 
test_mfcc_feat = mfcc_test['features']

# height and width
# NOTE(review): these read the 'mfe_height' / 'mfe_width' keys from the MFCC
# dictionary. Presumably the MFCC pickle was written with the same key names;
# confirm against the code that produced it.
mfcc_height = mfcc_train['mfe_height']
mfcc_width = mfcc_train['mfe_width']

### Prepare Data

Taking the computed height and width and reshaping features for model input

### MFE Features

In [None]:
# Convert both MFE feature collections to NumPy arrays, then report the
# resulting shapes (train first, test second).
train_mfe_feat = np.array(train_mfe_feat)
test_mfe_feat = np.array(test_mfe_feat)

print(f'Training MFE Features Shape: {train_mfe_feat.shape}')
print(f'Testing MFE Features Shape: {test_mfe_feat.shape}')

### MFCC Features

In [None]:
# Convert the MFCC feature collections to NumPy arrays and report their shapes.
# train
train_mfcc_feat = np.array(train_mfcc_feat)
print(f'Training MFCC Features Shape: {train_mfcc_feat.shape}')

# test
test_mfcc_feat = np.array(test_mfcc_feat)
# The label says "Testing" so the two output lines can be told apart, in line
# with the MFE cell's 'Training ...' / 'Testing ...' pair.
print(f'Testing MFCC Features Shape: {test_mfcc_feat.shape}')

### Labels

In [None]:
# Encode the string labels as integers.
#
# One encoder is fit on the training labels and reused for the test labels so
# that a given class name maps to the same integer in both splits. Fitting a
# second encoder on the test labels would silently produce different codes
# whenever the set of classes in the two splits differs.
#
# The raw label lists are read from the loaded dictionaries (the same objects
# `y_train` / `y_test` were taken from), so re-running this cell always encodes
# the original labels rather than already-encoded integers.

# train
train_label_encoder = LabelEncoder()
y_train = train_label_encoder.fit_transform(mfe_train['labels'])

# test
# `test_label_encoder` is kept as a name for backward compatibility; it is the
# fitted training encoder. `transform` raises ValueError if the test split
# contains a label that never appeared in training.
test_label_encoder = train_label_encoder
y_test = test_label_encoder.transform(mfe_test['labels'])

# integer code -> original class name
label_mapping = dict(enumerate(train_label_encoder.classes_))

print(f'Label mapping: {label_mapping}')

## Random Forest

## MFE

In [None]:
# splitting data into training and validation sets like in edge impulse
#
# The labels are re-derived from the raw MFE training labels with the fitted
# encoder instead of being read from `y_train`: this cell rebinds `y_train` to
# the training half, so feeding `y_train` back in would fail with a length
# mismatch as soon as the cell is run a second time.
X_train, X_val, y_train, y_val = train_test_split(
    train_mfe_feat,
    train_label_encoder.transform(mfe_train['labels']),
    test_size=0.5,
    random_state=42,
)

In [None]:
# Experiment settings for the MFE random forest.
NUM_TRIALS = 3
n_trees = 100

# One entry per trial for every reported metric.
accuracies, auc_scores = [], []
precisions, recalls, f1_scores = [], [], []

for trial in range(NUM_TRIALS):
    print(f"Trial {trial + 1}/{NUM_TRIALS}")

    # Fit a fresh forest on the MFE training split.
    rf_mfe = RandomForestClassifier(n_estimators=n_trees)
    rf_mfe.fit(X_train, y_train)

    # Class probabilities for the test features; the predicted label is the
    # index of the most probable column.
    y_pred_probs = rf_mfe.predict_proba(test_mfe_feat)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Accuracy.
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # Support-weighted precision / recall / F1 (the support value is discarded).
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted'
    )
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1_score)

    # Binarize the true labels so a micro-averaged AUC can be computed
    # against the predicted probabilities.
    y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
    auc_score = roc_auc_score(y_test_bin, y_pred_probs, average='micro')
    auc_scores.append(auc_score)

# Mean and standard deviation (NumPy default, ddof=0) across trials.
avg_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)
avg_precision, std_precision = np.mean(precisions), np.std(precisions)
avg_recall, std_recall = np.mean(recalls), np.std(recalls)
avg_f1_score, std_f1_score = np.mean(f1_scores), np.std(f1_scores)
avg_auc, std_auc = np.mean(auc_scores), np.std(auc_scores)

In [None]:
# Report the MFE results. The trial count in the header is taken from
# NUM_TRIALS so the text stays correct if the number of trials is changed.
# Accuracy is printed as a percentage; the other metrics as fractions.
print(f"\nResults (averaged over {NUM_TRIALS} trials):")
print(f'Number Estimators: {n_trees}')
print(f'Number of filters: {n_mel}')
print(f"Accuracy: {avg_accuracy*100:.2f} ± {std_accuracy*100:.2f}")
print(f"Precision: {avg_precision:.2f} ± {std_precision:.2f}")
print(f"Recall: {avg_recall:.2f} ± {std_recall:.2f}")
print(f"F1-Score: {avg_f1_score:.2f} ± {std_f1_score:.2f}")
print(f"AUC: {avg_auc:.2f} ± {std_auc:.2f}")

## Save Model

In [None]:
# Export the MFE forest (the model fitted in the last trial) as a header file.
# The output directory comes from `save_model_path` instead of repeating the
# absolute path; the file name records the filter count, tree count and
# feature type.
with open(f'{save_model_path}/RandomForestClassifier_{n_mel}_{n_trees}_mfe.h', 'w') as file:
    file.write(rf_mfe.to_arduino(instance_name='RandomForestClassifier'))

## MFCC 

In [None]:
# splitting data into training and validation sets like in edge impulse
#
# The MFE section rebinds `y_train` to its 50% training subset, so passing
# `y_train` here would give train_test_split a label vector shorter than
# `train_mfcc_feat` and raise a ValueError. The full encoded label vector is
# therefore rebuilt from the raw training labels with the fitted encoder.
# Assumes the MFCC samples are in the same order as the MFE samples, as the
# shared use of `y_train` for both feature types already implied.
X_train, X_val, y_train, y_val = train_test_split(
    train_mfcc_feat,
    train_label_encoder.transform(mfe_train['labels']),
    test_size=0.5,
    random_state=42,
)

In [None]:
# Experiment settings for the MFCC random forest.
NUM_TRIALS = 3
n_trees = 50

# One entry per trial for every reported metric.
accuracies, auc_scores = [], []
precisions, recalls, f1_scores = [], [], []

for trial in range(NUM_TRIALS):
    print(f"Trial {trial + 1}/{NUM_TRIALS}")

    # Fit a fresh forest on the MFCC training split.
    rf_mfcc = RandomForestClassifier(n_estimators=n_trees)
    rf_mfcc.fit(X_train, y_train)

    # Class probabilities for the test features; the predicted label is the
    # index of the most probable column.
    y_pred_probs = rf_mfcc.predict_proba(test_mfcc_feat)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Accuracy.
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # Support-weighted precision / recall / F1 (the support value is discarded).
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted'
    )
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1_score)

    # Binarize the true labels so a micro-averaged AUC can be computed
    # against the predicted probabilities.
    y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
    auc_score = roc_auc_score(y_test_bin, y_pred_probs, average='micro')
    auc_scores.append(auc_score)

# Mean and standard deviation (NumPy default, ddof=0) across trials.
avg_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)
avg_precision, std_precision = np.mean(precisions), np.std(precisions)
avg_recall, std_recall = np.mean(recalls), np.std(recalls)
avg_f1_score, std_f1_score = np.mean(f1_scores), np.std(f1_scores)
avg_auc, std_auc = np.mean(auc_scores), np.std(auc_scores)

In [None]:
# Report the MFCC results. The trial count in the header is taken from
# NUM_TRIALS so the text stays correct if the number of trials is changed.
# Accuracy is printed as a percentage; the other metrics as fractions.
print(f"\nResults (averaged over {NUM_TRIALS} trials):")
print(f'Number Estimators: {n_trees}')
print(f'Number of filters: {n_mel}')
print(f"Accuracy: {avg_accuracy*100:.2f} ± {std_accuracy*100:.2f}")
print(f"Precision: {avg_precision:.2f} ± {std_precision:.2f}")
print(f"Recall: {avg_recall:.2f} ± {std_recall:.2f}")
print(f"F1-Score: {avg_f1_score:.2f} ± {std_f1_score:.2f}")
print(f"AUC: {avg_auc:.2f} ± {std_auc:.2f}")

### Save Model

Using everywhereml library to generate corresponding C code for Random Forest [1].

In [None]:
# Export the MFCC forest (the model fitted in the last trial) as a header file.
# The output directory comes from `save_model_path` instead of repeating the
# absolute path; the file name records the filter count, tree count and
# feature type.
with open(f'{save_model_path}/RandomForestClassifier_{n_mel}_{n_trees}_mfcc.h', 'w') as file:
    file.write(rf_mfcc.to_arduino(instance_name='RandomForestClassifier'))

## References

[1] S. Salerno, W. Flinn and V. , "everywhereml," 2021. [Online]. Available: https://github.com/eloquentarduino/everywhereml.