In [36]:
# reference: https://www.codementor.io/agarrahul01/multiclass-classification-using-random-forest-on-scikit-learn-library-hkk4lwawu

# Importing Libraries

In [37]:
#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

# Fetching datasets

In [38]:
bonobo_df = pd.read_csv('bonobo/bonobo_tracks_data.csv')
crocoloko_df = pd.read_csv('crocoloko/crocoloko_tracks_data.csv')
dekel_df = pd.read_csv('dekel/dekel_tracks_data.csv')
mantismash_df = pd.read_csv('mantismash/mantismash_tracks_data.csv')
shwamp_df = pd.read_csv('shwamp/shwamp_tracks_data.csv')

#create df array
df_arr = [bonobo_df,crocoloko_df,dekel_df,mantismash_df,shwamp_df]

dataset = pd.concat(df_arr)

dataset

Unnamed: 0,id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,popularity,track_type
0,3rCtueI7qBN2kZBZnXuk5K,0.713,0.725,-9.951,0.0385,0.539,0.886,0.111,0.24,122.041,369462,65,chill
1,2lJ4d8MCT6ZlDRHKJ1br14,0.64,0.844,-8.412,0.0374,0.395,0.933,0.0827,0.364,119.042,352247,61,chill
2,2pYvd6cHcAIMAM6xMD6nok,0.648,0.458,-15.169,0.0311,0.864,0.0944,0.366,0.0848,119.96,274693,60,chill
3,16ikmfoT1249gG3pzPvnYt,0.378,0.178,-15.348,0.0303,0.958,0.802,0.11,0.0591,77.263,223720,57,chill
4,5DAjrJqXqYtgr67pVhmUeR,0.529,0.739,-9.578,0.0718,0.0237,0.607,0.171,0.138,124.75,237547,61,chill
5,7Cg3F9ZsZ2TYUnlza49NYh,0.542,0.749,-8.333,0.0277,0.173,0.942,0.315,0.674,94.031,237827,58,chill
6,5EGJ7e7frJjYja6H4afzoT,0.723,0.571,-10.22,0.0572,0.21,0.334,0.133,0.185,115.518,302428,58,chill
7,0E7S1k9M1KshLISVC2EP1M,0.738,0.512,-12.924,0.0668,0.257,0.879,0.0824,0.0867,123.01,350055,59,chill
8,7sqii6BhIDpJChYpU3WjwS,0.595,0.788,-10.128,0.363,0.195,0.202,0.153,0.414,97.012,230307,54,chill
9,6wLDbkpdZEElEdKo9Wt010,0.728,0.71,-9.903,0.0535,0.0524,0.905,0.036,0.158,118.002,366989,56,chill


In [39]:
dataset.shape

(46, 13)

# Creating the dependent variable class

In [40]:
#Creating the dependent variable class
factor = pd.factorize(dataset['track_type'])
dataset.track_type = factor[0]
definitions = factor[1]

In [41]:
factor

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3], dtype=int64),
 Index(['chill', 'psy', 'deep psy', 'chill glitch'], dtype='object'))

In [42]:
dataset.track_type.head()

0    0
1    0
2    0
3    0
4    0
Name: track_type, dtype: int64

In [43]:
definitions

Index(['chill', 'psy', 'deep psy', 'chill glitch'], dtype='object')

# Extracting Features and Output

In [44]:
#Splitting the data into independent and dependent variables
X = dataset.iloc[:,1:12].values
y = dataset.iloc[:,12].values
print('The independent features set: ')
print(X[:5,:])
print('The dependent variable: ')
print(y[:5])

The independent features set: 
[[ 7.13000e-01  7.25000e-01 -9.95100e+00  3.85000e-02  5.39000e-01
   8.86000e-01  1.11000e-01  2.40000e-01  1.22041e+02  3.69462e+05
   6.50000e+01]
 [ 6.40000e-01  8.44000e-01 -8.41200e+00  3.74000e-02  3.95000e-01
   9.33000e-01  8.27000e-02  3.64000e-01  1.19042e+02  3.52247e+05
   6.10000e+01]
 [ 6.48000e-01  4.58000e-01 -1.51690e+01  3.11000e-02  8.64000e-01
   9.44000e-02  3.66000e-01  8.48000e-02  1.19960e+02  2.74693e+05
   6.00000e+01]
 [ 3.78000e-01  1.78000e-01 -1.53480e+01  3.03000e-02  9.58000e-01
   8.02000e-01  1.10000e-01  5.91000e-02  7.72630e+01  2.23720e+05
   5.70000e+01]
 [ 5.29000e-01  7.39000e-01 -9.57800e+00  7.18000e-02  2.37000e-02
   6.07000e-01  1.71000e-01  1.38000e-01  1.24750e+02  2.37547e+05
   6.10000e+01]]
The dependent variable: 
[0 0 0 0 0]


# Train-Test Data Splitting

In [45]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

# Feature Scaling

In [46]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training the model

In [47]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

# Evaluating the performance

In [48]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize
definitions = ['chill','psy','deep psy','chill dlitch']
reversefactor = dict(zip(range(4),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Species'], colnames=['Predicted Species']))

Predicted Species  chill  chill dlitch  deep psy  psy
Actual Species                                       
chill                  2             0         0    0
chill dlitch           0             3         0    0
deep psy               1             0         1    0
psy                    0             2         0    3


# Storing the trained model

In [49]:
print(list(zip(dataset.columns[1:12], classifier.feature_importances_)))
joblib.dump(classifier, 'randomforestmodel.pkl') 

[('danceability', 0.042561102842292954), ('energy', 0.07442667580962599), ('loudness', 0.0113552878972069), ('speechiness', 0.06678429287059486), ('acousticness', 0.06157300122807633), ('instrumentalness', 0.03588854002698187), ('liveness', 0.06622735403928315), ('valence', 0.04439504637918941), ('tempo', 0.24086822862594698), ('duration_ms', 0.07637957948257698), ('popularity', 0.2795408907982246)]


['randomforestmodel.pkl']