In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


df = pd.read_csv('songs_normalize.csv')


df.describe()

Unnamed: 0,duration_ms,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,228748.1245,2009.494,59.8725,0.667438,0.720366,5.378,-5.512434,0.5535,0.103568,0.128955,0.015226,0.181216,0.55169,120.122558
std,39136.569008,5.85996,21.335577,0.140416,0.152745,3.615059,1.933482,0.497254,0.096159,0.173346,0.087771,0.140669,0.220864,26.967112
min,113000.0,1998.0,0.0,0.129,0.0549,0.0,-20.514,0.0,0.0232,1.9e-05,0.0,0.0215,0.0381,60.019
25%,203580.0,2004.0,56.0,0.581,0.622,2.0,-6.49025,0.0,0.0396,0.014,0.0,0.0881,0.38675,98.98575
50%,223279.5,2010.0,65.5,0.676,0.736,6.0,-5.285,1.0,0.05985,0.0557,0.0,0.124,0.5575,120.0215
75%,248133.0,2015.0,73.0,0.764,0.839,8.0,-4.16775,1.0,0.129,0.17625,6.8e-05,0.241,0.73,134.2655
max,484146.0,2020.0,89.0,0.975,0.999,11.0,-0.276,1.0,0.576,0.976,0.985,0.853,0.973,210.851


In [3]:
# CLEANSING DATA

df = df.dropna()
df = df.drop_duplicates()
df = df[df['genre'].isin(['rock', 'pop','Dance/Electronic', 'country', 'hip hop', 'R&B', 'metal'])]

Z = df.copy()
X = df.drop(columns=['genre', 'artist', 'song', 'mode', 'key', 'year'])
# X = df[['tempo', 'valence', 'instrumentalness']]
y = df['genre']

X.head(10)
y.head(10)



0                  pop
4                  pop
6              hip hop
9     Dance/Electronic
10                 pop
11                 pop
12                 pop
16                 pop
17                 pop
18                 pop
Name: genre, dtype: object

In [4]:
# GET TRAINING AND TESTING DATA

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.1)


In [5]:
# DECISION TREE MODEL - accuracy around 70%

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

treePredictions = model.predict(X_test)

score = accuracy_score(y_test, treePredictions)

score

0.6716417910447762

In [10]:
# RANDOM FOREST CLASSIFIER

model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

forestPredictions = model.predict(X_test)
score = accuracy_score(y_test, forestPredictions)


print(score)

0.7313432835820896


In [7]:
# FEATURE IMPORTANCE CHECK 

importance = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importance})
print(feature_importance_df.sort_values(by='Importance', ascending=False))

             Feature  Importance
1           explicit    0.153429
6        speechiness    0.133101
3       danceability    0.082379
2         popularity    0.080919
8   instrumentalness    0.080601
0        duration_ms    0.079442
7       acousticness    0.077029
11             tempo    0.067779
4             energy    0.063673
9           liveness    0.061961
5           loudness    0.060567
10           valence    0.059120


In [12]:
#VIEWING PREDICTIONS 

results_df = X_test.copy()
results_df['Prediction'] = forestPredictions
results_df['True Labels'] = y_test

results_df['Artist'] = X_test.index.map(Z['artist'])
results_df['Song'] = X_test.index.map(Z['song'])

results_df.head(50)

Unnamed: 0,duration_ms,explicit,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Prediction,True Labels,Artist,Song
1298,257720,False,74,0.576,0.835,-6.826,0.0486,0.337,0.0,0.082,0.476,150.017,pop,pop,Bruno Mars,It Will Rain
1882,217288,False,74,0.704,0.859,-4.877,0.0996,0.0185,0.0,0.0215,0.926,105.115,pop,pop,Bruno Mars,Finesse - Remix; feat. Cardi B
1348,233786,True,76,0.853,0.693,-6.87,0.275,0.0239,0.0,0.11,0.662,95.967,hip hop,hip hop,A$AP Rocky,"F**kin' Problems (feat. Drake, 2 Chainz & Kend..."
761,182826,False,76,0.459,0.895,-3.126,0.0805,0.00725,0.0,0.206,0.572,181.04,rock,rock,All Time Low,"Dear Maria, Count Me In"
535,250760,True,77,0.637,0.678,-3.798,0.266,0.209,0.0,0.156,0.254,84.039,hip hop,hip hop,Eminem,Mockingbird
1951,187436,False,7,0.719,0.704,-4.724,0.0476,0.0691,0.0,0.166,0.628,133.002,pop,pop,Ava Max,Sweet but Psycho
1630,212106,False,78,0.468,0.627,-5.085,0.0476,0.0281,8e-06,0.11,0.159,179.642,pop,Dance/Electronic,Alan Walker,Faded
1866,180493,False,67,0.856,0.632,-3.692,0.074,0.193,0.0,0.0688,0.697,112.009,pop,pop,Charlie Puth,Done for Me (feat. Kehlani)
570,217706,False,58,0.49,0.956,-4.556,0.0407,0.00466,6e-06,0.242,0.577,160.028,pop,rock,Kaiser Chiefs,Everyday I Love You Less And Less
177,185013,False,66,0.747,0.706,-4.653,0.0413,0.0844,0.00355,0.174,0.567,94.019,pop,pop,Atomic Kitten,Whole Again
