In [114]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


df = pd.read_csv('songs_normalize.csv')


df.head(10)

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop
5,Sisqo,Thong Song,253733,True,1999,69,0.706,0.888,2,-6.959,1,0.0654,0.119,9.6e-05,0.07,0.714,121.549,"hip hop, pop, R&B"
6,Eminem,The Real Slim Shady,284200,True,2000,86,0.949,0.661,5,-4.244,0,0.0572,0.0302,0.0,0.0454,0.76,104.504,hip hop
7,Robbie Williams,Rock DJ,258560,False,2000,68,0.708,0.772,7,-4.264,1,0.0322,0.0267,0.0,0.467,0.861,103.035,"pop, rock"
8,Destiny's Child,Say My Name,271333,False,1999,75,0.713,0.678,5,-3.525,0,0.102,0.273,0.0,0.149,0.734,138.009,"pop, R&B"
9,Modjo,Lady - Hear Me Tonight,307153,False,2001,77,0.72,0.808,6,-5.627,1,0.0379,0.00793,0.0293,0.0634,0.869,126.041,Dance/Electronic


In [124]:
# CLEANSING DATA

df = df.dropna()
df = df.drop_duplicates()
df = df[df['genre'].isin(['rock', 'pop','Dance/Electronic', 'country', 'hip hop', 'R&B', 'metal'])]

Z = df.copy()
X = df.drop(columns=['genre', 'artist', 'song', 'mode', 'key', 'year'])
# X = df[['tempo', 'valence', 'instrumentalness']]
y = df['genre']

X.head(10)
y.head(10)



0                  pop
4                  pop
6              hip hop
9     Dance/Electronic
10                 pop
11                 pop
12                 pop
16                 pop
17                 pop
18                 pop
Name: genre, dtype: object

In [116]:
# GET TRAINING AND TESTING DATA

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.1)


In [131]:
# DECISION TREE MODEL - accuracy around 70%

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

treePredictions = model.predict(X_test)

score = accuracy_score(y_test, treePredictions)

score

0.6268656716417911

In [132]:
# RANDOM FOREST CLASSIFIER

model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

forestPredictions = model.predict(X_test)
score = accuracy_score(y_test, forestPredictions)


print(score)

0.746268656716418


In [119]:
# FEATURE IMPORTANCE CHECK 

importance = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importance})
print(feature_importance_df.sort_values(by='Importance', ascending=False))

             Feature  Importance
1           explicit    0.149903
6        speechiness    0.121829
3       danceability    0.089837
8   instrumentalness    0.089549
2         popularity    0.086393
7       acousticness    0.076300
0        duration_ms    0.075590
11             tempo    0.067605
4             energy    0.065071
5           loudness    0.062802
10           valence    0.058241
9           liveness    0.056881


In [135]:
#VIEWING PREDICTIONS 

results_df = X_test.copy()
results_df['Predictions'] = predictions
results_df['True Labels'] = y_test

results_df['Artist'] = X_test.index.map(Z['artist'])
results_df['Song'] = X_test.index.map(Z['song'])

results_df.head(50)

Unnamed: 0,duration_ms,explicit,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,Predictions,True Labels,Artist,Song
484,223506,True,80,0.657,0.734,-4.832,0.484,0.149,0.0,0.139,0.434,91.03,pop,hip hop,Kanye West,All Falls Down
76,241373,False,64,0.634,0.886,-5.424,0.0434,0.154,0.0,0.118,0.577,93.04,pop,pop,Céline Dion,That's the Way It Is
1782,189466,False,0,0.744,0.739,-5.35,0.0387,0.00459,0.0,0.306,0.649,104.99,pop,pop,Justin Bieber,Friends (with BloodPop®)
719,311866,True,54,0.617,0.717,-7.858,0.153,0.00564,0.0,0.408,0.49,103.992,pop,hip hop,Kanye West,Stronger
556,215000,False,61,0.412,0.944,-2.896,0.0448,0.0724,0.000493,0.801,0.79,123.091,pop,rock,Franz Ferdinand,Do You Want To
1910,182160,False,84,0.501,0.405,-5.679,0.0319,0.751,0.0,0.105,0.446,109.891,pop,pop,Lewis Capaldi,Someone You Loved
560,231533,False,74,0.179,0.912,-3.881,0.0791,0.0014,0.000294,0.582,0.289,182.99,pop,rock,Thirty Seconds To Mars,The Kill
679,185586,False,70,0.566,0.815,-4.481,0.14,0.0737,0.0,0.12,0.672,169.961,pop,rock,Panic! At The Disco,I Write Sins Not Tragedies
479,242413,True,68,0.414,0.936,-2.407,0.0758,0.00136,0.0,0.369,0.74,170.229,pop,pop,Avril Lavigne,My Happy Ending
1474,226600,False,83,0.672,0.52,-7.747,0.0353,0.859,0.0,0.115,0.37,120.001,pop,pop,One Direction,Night Changes
