# Random Forests

In [1]:
from preprocessing import preprocessing_data
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn import tree
from sklearn.metrics import classification_report
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

df_raw = pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")
df = preprocessing_data(df_raw)
genres = df['track_genre'].unique()

X_train, X_test, y_train, y_test = train_test_split(df.drop(['track_genre'], axis=1),
                                                    df.track_genre, test_size=0.3)

In [2]:
music_rf = RandomForestClassifier(random_state=123)
music_rf.fit(X_train, y_train)

music_rf_pred = music_rf.predict_proba(X_test)

music_rf_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': music_rf.feature_importances_
}).sort_values('importance', ascending=False).reset_index().drop('index', axis=1)
music_rf_importance

Unnamed: 0,feature,importance
0,popularity,0.142322
1,acousticness,0.102606
2,danceability,0.09348
3,speechiness,0.088642
4,valence,0.083577
5,loudness,0.080462
6,energy,0.079441
7,instrumentalness,0.079116
8,duration_ms,0.075707
9,tempo,0.06536


The Random Forest feature importance is represented in the bar plot shown below.

In [3]:


px.bar(music_rf_importance, y='feature', x='importance',
       orientation='h', title='Impurity Importance for Random Forest', height=1000)

In [4]:
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4],
    'max_features': ['sqrt', 'log2'],
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

y_pred = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.0s[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.0s

[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_e

In [5]:
cv_score_rf = cross_val_score(rf, X_test, y_test, cv=5, scoring='accuracy')
cv_score_rf

array([0.67752715, 0.69397993, 0.69648829, 0.67892977, 0.68812709])

In [6]:
np.mean(cv_score_rf)

0.6870104469646804