# Random Forest Training
We are going to use the data on the 30-second clips to train our baseline Random Forest model.

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras_preprocessing.image import load_img, img_to_array
import tensorflow as tf
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

# Load in the data set and encode the labels
Encoding the labels allows us to make predictions over the 10 genre labels. The predictions made are integers, not strings.

In [2]:
# Read the per-clip feature table and report how many rows it holds.
data = pd.read_csv("./Data/features_30_sec.csv")
print(data.shape[0])

# Every 100th label, shown before encoding (genre names as strings).
print(data["label"].iloc[::100])

# Replace the genre strings with integer class ids. The fitted encoder is
# kept so the ids can be mapped back to genre names later.
encoder = LabelEncoder()
data = data.assign(label=encoder.fit_transform(data["label"]))

# The same rows after encoding, followed by the table dimensions.
print(data["label"].iloc[::100])
print(data.shape)

1000
0          blues
100    classical
200      country
300        disco
400       hiphop
500         jazz
600        metal
700          pop
800       reggae
900         rock
Name: label, dtype: object
0      0
100    1
200    2
300    3
400    4
500    5
600    6
700    7
800    8
900    9
Name: label, dtype: int64
(1000, 60)


# Remove incompatible data and split data
We are removing the label and the filename from the X data. The label is the target we want to predict, so it must not be used as a feature, and the filename is a string that contains the genre, so keeping it could leak the answer and skew the model.

In [3]:
# The label column is the prediction target; every column other than the
# label and the filename is used as a feature.
clean_y = data['label']
clean_X = data.drop(columns=['label', 'filename'])

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split identical from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    clean_X,
    clean_y,
    test_size=0.2,
    random_state=101,
)
print(X_train.shape, y_train.shape, sep="\n")

(800, 58)
(800,)


# Train and assess the Random Forest 

In [4]:
number_tests = 50
baseline_accuracy = np.zeros(number_tests)

# Train one forest per run and record its test accuracy. Each run is seeded
# with its own index: the runs still differ from one another (which is why we
# average over them), but the whole experiment produces the same numbers
# after Restart Kernel -> Run All.
for i in range(number_tests):
    model = RandomForestClassifier(n_estimators=100, random_state=i)
    model.fit(X=X_train, y=y_train)

    y_pred = model.predict(X_test)
    # Accuracy is the fraction of test rows whose predicted label id matches.
    baseline_accuracy[i] = np.mean(y_pred == y_test)
print(baseline_accuracy)
print(f"Random Forest average accuracy: {np.mean(baseline_accuracy)*100}%")

# Sanity check on the LAST run only: which label ids did the model predict?
# Converting to plain Python ints keeps the printed set readable regardless
# of how the installed NumPy version formats its scalar types.
pred_genres = set(y_pred.tolist())
print(f"Predicted Genres: {pred_genres}")

[0.775 0.775 0.785 0.78  0.755 0.755 0.755 0.755 0.795 0.785 0.79  0.765
 0.785 0.775 0.8   0.75  0.79  0.77  0.775 0.78  0.78  0.74  0.81  0.785
 0.79  0.765 0.77  0.775 0.74  0.805 0.76  0.77  0.78  0.78  0.75  0.78
 0.765 0.8   0.755 0.775 0.8   0.785 0.79  0.805 0.775 0.78  0.765 0.785
 0.755 0.78 ]
Random Forest average accuracy: 77.58000000000001%
Predicted Genres: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


Running the model 50 times, we get an average prediction accuracy of about 78% for a Random Forest with 100 estimators. This is our baseline prediction accuracy that we hope to beat with the Convolutional Neural Network.