**Initial Setup**

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# data imported from https://www.kaggle.com/datasets/purumalgi/music-genre-classification?select=submission.csv
classes = pd.read_csv("submission.csv") 
data = pd.read_csv("train.csv")

#clean the data and convert to numbers:
data = data.dropna()
data = data.reset_index()
data["Artist Name"] = data["Artist Name"].astype(str)
data["Artist Name"] = preprocessing.LabelEncoder().fit_transform(data["Artist Name"])

print("size:", data.shape)
data.describe()

size: (11813, 18)


Unnamed: 0,index,Artist Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
count,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0,11813.0
mean,8996.940574,3357.609244,43.100652,0.524944,0.670305,5.972573,-8.215807,0.623974,0.071107,0.239968,0.178991,0.195096,0.470115,122.928641,212944.6,3.913824,6.622619
std,5187.980453,1940.301548,16.5685,0.166013,0.242324,3.167348,4.239682,0.484407,0.067783,0.319142,0.303809,0.159743,0.244635,29.430883,115856.1,0.381625,3.246656
min,1.0,0.0,1.0,0.0644,2e-05,1.0,-36.214,0.0,0.0225,0.0,1e-06,0.0119,0.0183,30.557,0.50165,1.0,0.0
25%,4510.0,1684.0,32.0,0.413,0.513,3.0,-9.994,0.0,0.0346,0.00189,9.6e-05,0.0969,0.271,100.001,175533.0,4.0,5.0
50%,8971.0,3356.0,42.0,0.527,0.716,6.0,-7.314,1.0,0.0463,0.0561,0.00429,0.127,0.462,120.046,217883.0,4.0,8.0
75%,13507.0,5079.0,54.0,0.638,0.875,9.0,-5.341,1.0,0.0765,0.421,0.209,0.256,0.66,141.877,263587.0,4.0,10.0
max,17995.0,6702.0,98.0,0.989,1.0,11.0,1.355,1.0,0.935,0.996,0.996,0.992,0.98,217.416,1477187.0,5.0,10.0


**Example of a song's features**

In [2]:
song_example = pd.read_csv("train.csv", nrows=1)
song_example

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5


**Example of class assignment**

In [3]:
classes_example = pd.read_csv("submission.csv", nrows=1)
classes_example

Unnamed: 0,Acoustic/Folk_0,Alt_Music_1,Blues_2,Bollywood_3,Country_4,HipHop_5,Indie Alt_6,Instrumental_7,Metal_8,Pop_9,Rock_10
0,1,0,0,0,0,0,0,0,0,0,0


**Populate X, y, and testing variables**

In [10]:
X = data.drop(columns=["index", "Track Name", "Popularity", "Artist Name", "Class"])
y = data["Class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#feature scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature
0,0.382,0.814,3.0,-7.230,1,0.0406,0.001100,0.004010,0.1010,0.5690,116.454,251733.0,4
1,0.434,0.614,6.0,-8.334,1,0.0525,0.486000,0.000196,0.3940,0.7870,147.681,109667.0,4
2,0.167,0.975,2.0,-4.279,1,0.2160,0.000169,0.016100,0.1720,0.0918,199.060,229960.0,4
3,0.235,0.977,6.0,0.878,1,0.1070,0.003530,0.006040,0.1720,0.2410,152.952,208133.0,4
4,0.674,0.658,5.0,-9.647,0,0.1040,0.404000,0.000001,0.0981,0.6770,143.292,329387.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11808,0.166,0.109,7.0,-17.100,0,0.0413,0.993000,0.824000,0.0984,0.1770,171.587,193450.0,3
11809,0.638,0.223,11.0,-10.174,0,0.0329,0.858000,0.000016,0.0705,0.3350,73.016,257067.0,4
11810,0.558,0.981,4.0,-4.683,0,0.0712,0.000030,0.000136,0.6660,0.2620,105.000,216222.0,4
11811,0.215,0.805,6.0,-12.757,0,0.1340,0.001290,0.916000,0.2560,0.3550,131.363,219693.0,4


**Train and Test Model**

In [11]:
model = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
model.fit(X_train, y_train)
predictions = model.predict(X_test)

**Evaluate Performance**

In [12]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[ 59,   0,   0,   4,   5,   0,   0,   0,   0,   1,   0],
       [  0,   2,   4,   0,   0,   6, 105,   0,   4,   9,  86],
       [  0,   2,  55,   0,   0,   0,  35,   0,   2,  12, 110],
       [ 10,   0,   0,  45,   0,   0,   0,   0,   0,   0,   0],
       [ 12,   0,   0,   0,   9,   0,   0,   0,   0,   1,   3],
       [  0,   0,   5,   0,   0,  37,  12,   0,   0,  21,  18],
       [  0,  43,   9,   0,   0,   4, 142,   1,  13,  27, 148],
       [  5,   0,   0,   0,   0,   0,   0,  92,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,  12,   0, 158,   3, 134],
       [  6,   5,   3,   0,   4,  27,  45,   0,   0,  71,  52],
       [  7,  30,  26,   0,   1,   2,  76,   0,  74,  42, 427]],
      dtype=int64)

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.60      0.86      0.70        69
           1       0.02      0.01      0.01       216
           2       0.54      0.25      0.35       216
           3       0.92      0.82      0.87        55
           4       0.47      0.36      0.41        25
           5       0.49      0.40      0.44        93
           6       0.33      0.37      0.35       387
           7       0.99      0.95      0.97        97
           8       0.63      0.51      0.57       307
           9       0.38      0.33      0.35       213
          10       0.44      0.62      0.51       685

    accuracy                           0.46      2363
   macro avg       0.53      0.50      0.50      2363
weighted avg       0.45      0.46      0.45      2363



**Save Model**

In [14]:
import joblib
joblib.dump(model,  "genre-finder")

['genre-finder']