In [71]:
import pandas as pd
import numpy as np
from interpret.glassbox import ExplainableBoostingClassifier, ClassificationTree
from interpret.blackbox import LimeTabular
from interpret import show
from interpret.perf import ROC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [72]:
data = pd.read_csv('./data/data.csv')
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,9/25/28,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [85]:
data2020 = data[data.year == 2000]

In [86]:
n = len(data)
feats_num = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 
             'loudness', 'speechiness', 'tempo', 'valence', 'year']
X = data2020[feats_num]
# X['offset'] = np.ones((len(X),))
y = data2020['popularity'] > 50

In [87]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3)

In [88]:
ebm = ExplainableBoostingClassifier()
ebm.fit(Xtrain, ytrain)

ExplainableBoostingClassifier(binning='quantile', early_stopping_rounds=50,
                              early_stopping_tolerance=0.0001,
                              feature_names=['acousticness', 'danceability',
                                             'energy', 'instrumentalness',
                                             'key', 'liveness', 'loudness',
                                             'speechiness', 'tempo', 'valence',
                                             'year'],
                              feature_types=['continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'categorical'],
                              inner_bags=0, interactions=0, learning_rate

In [89]:
classification_tree = ClassificationTree()
classification_tree.fit(Xtrain, ytrain)

<interpret.glassbox.decisiontree.ClassificationTree at 0x12c69c050>

In [90]:
ebm_global = ebm.explain_global()
show(ebm_global)

In [32]:
classification_tree_perf = ROC(classification_tree.predict_proba).explain_perf(Xtest, ytest, name='Classification Tree')
show(classification_tree_perf)


In [34]:
print("Accuracy on training set: {:.3f}".format(ebm.score(Xtrain, ytrain)))
print("Accuracy on validation set: {:.3f}".format(ebm.score(Xtest, ytest)))

Accuracy on training set: 0.905
Accuracy on validation set: 0.892


In [41]:
pca = PCA()
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
blackbox_model = Pipeline([('pca', pca), ('rf', rf)])
blackbox_model.fit(Xtrain, ytrain)

blackbox_perf = ROC(blackbox_model.predict_proba).explain_perf(Xtest, ytest, name='Blackbox')
show(blackbox_perf)

In [42]:
print("Accuracy on training set: {:.3f}".format(blackbox_model.score(Xtrain, ytrain)))
print("Accuracy on validation set: {:.3f}".format(blackbox_model.score(Xtest, ytest)))

Accuracy on training set: 0.999
Accuracy on validation set: 0.901


In [47]:
len(Xtrain)

1229

In [48]:
lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=Xtrain, random_state=1)
lime_local = lime.explain_local(Xtest[:20], ytest[:20], name='LIME')
show(lime_local)