In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pydot
import time

from pandas.plotting import scatter_matrix

from sklearn.tree import DecisionTreeClassifier
#from id3 import Id3Estimator
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, RandomizedSearchCV, GridSearchCV
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, \
r2_score, mean_squared_error, mean_absolute_error, median_absolute_error, mean_squared_log_error, explained_variance_score

from IPython.display import Image

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

plt.style.use('fivethirtyeight')

## EVALUATION PARAMETERS

In [2]:
TREE_DEPTH = 2

In [3]:
def compare_values(y_true, y_pred, target):
        
    tr = pd.DataFrame(data=y_true, columns=[target])
    pr = pd.DataFrame(data=y_pred, columns=[target])


    fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))

    sns.countplot(x=target, data=tr, ax=ax[0], palette='viridis', alpha=0.7, hue=target, dodge=False)
    sns.countplot(x=target, data=pr, ax=ax[1], palette='viridis', alpha=0.7, hue=target, dodge=False)


    fig.suptitle('True vs Predicted Comparison', fontsize=20)

    ax[0].tick_params(labelsize=12)
    ax[1].tick_params(labelsize=12)
    ax[0].set_title("True values", fontsize=18)
    ax[1].set_title("Predicted values", fontsize=18)
    plt.show()

In [4]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mean = errors / test_labels
    mean = mean[np.isfinite(mean)]
    mape = 100 * np.mean(mean)
    accuracy = 100 - mape

    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [5]:
# Evaluate run time and prediction accuracy
def evaluate_model(model, x_train, y_train, x_test, y_test):
    n_trees = model.get_params()['max_depth']
    n_features = x_train.shape[1]
    
    # Train and predict 10 times to evaluate time and accuracy
    predictions = []
    run_times = []
    for _ in range(10):
        start_time = time.time()
        model.fit(x_train, y_train)
        predictions.append(model.predict(x_test))
    
        end_time = time.time()
        run_times.append(end_time - start_time)
    
    # Run time and predictions need to be averaged
    run_time = np.mean(run_times)
    predictions = np.mean(np.array(predictions), axis = 0)
    
    # Calculate performance metrics
    errors = abs(predictions - y_test)
    mean_error = np.mean(errors)
    mean = errors / y_test
    mean = mean[np.isfinite(mean)]
    mape = 100 * np.mean(mean)
    accuracy = 100 - mape
    
    # Return results in a dictionary
    results = {'time': run_time, 'error': mean_error, 'accuracy': accuracy, 'depth': n_trees, 'n_features': n_features}
    
    return results

In [6]:
def plot_results(model, param = 'max_depth', name = 'Depth'):
    param_name = 'param_%s' % param

    # Extract information from the cross validation model
    train_scores = model.cv_results_['mean_train_score']
    test_scores = model.cv_results_['mean_test_score']
    train_time = model.cv_results_['mean_fit_time']
    param_values = list(model.cv_results_[param_name])
    
    # Plot the scores over the parameter
    plt.subplots(1, 2, figsize=(10, 6))
    plt.subplot(121)
    plt.plot(param_values, train_scores, 'bo-', label = 'train')
    plt.plot(param_values, test_scores, 'go-', label = 'test')
    plt.ylim(ymin = -10, ymax = 0)
    plt.legend()
    plt.xlabel(name)
    plt.ylabel('Neg Mean Absolute Error')
    plt.title('Score vs %s' % name)
    
    plt.subplot(122)
    plt.plot(param_values, train_time, 'ro-')
    plt.ylim(ymin = 0.0, ymax = 2.0)
    plt.xlabel(name)
    plt.ylabel('Train Time (sec)')
    plt.title('Training Time vs %s' % name)
    
    
    plt.tight_layout(pad = 4)

# Read and clean dataset

In [7]:
# read
df = pd.read_csv("spotify-dataset.csv")

# lower the attribute
df['Top Genre'] = (df["Top Genre"].str.strip()).str.lower()

# drop genres that have less than 20 instances
to_remove = [genre for genre in df['Top Genre'].unique() if df['Top Genre'].value_counts()[genre] < 20] 
for r in to_remove:
    df = df[df['Top Genre'] != r]

# convert negative values to positive, because the percenption of sound is relative (-N dB == N dB in human ear)
df['Loudness (dB)'] = df['Loudness (dB)'].abs()

# convert duration to int
df['Length (Duration)'] = pd.to_numeric(df['Length (Duration)'].str.replace(',',''))

# drop not used columns
df.drop(columns = ['Index', 'Title', 'Artist', 'Year'], inplace=True)

# encode genres
genres = list(df['Top Genre'].unique())
df['Top Genre Encoded'] = df['Top Genre'].apply(lambda x: genres.index(x))

In [8]:
dict_genres= {}
for i, genre in enumerate(genres):
    dict_genres[i] = genre
    
dict_genres

{0: 'adult standards',
 1: 'album rock',
 2: 'alternative metal',
 3: 'classic rock',
 4: 'pop',
 5: 'modern rock',
 6: 'alternative rock',
 7: 'dutch indie',
 8: 'dutch cabaret',
 9: 'permanent wave',
 10: 'classic uk pop',
 11: 'dance pop',
 12: 'dutch pop',
 13: 'british soul',
 14: 'irish rock',
 15: 'art rock',
 16: 'british invasion',
 17: 'dance rock',
 18: 'glam rock',
 19: 'europop'}

In [9]:
df

Unnamed: 0,Top Genre,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity,Top Genre Encoded
0,adult standards,157,30,53,14,11,68,201,94,3,71,0
1,album rock,135,79,50,11,17,81,207,17,7,39,1
3,alternative metal,173,96,43,4,3,37,269,0,4,76,2
4,classic rock,106,82,58,5,10,87,256,1,3,59,3
6,pop,102,71,71,6,13,54,257,6,3,74,4
...,...,...,...,...,...,...,...,...,...,...,...,...
1987,adult standards,119,24,75,15,9,43,216,83,12,68,0
1988,adult standards,168,7,17,21,14,10,298,92,3,66,0
1989,adult standards,94,21,70,12,11,72,128,84,7,63,0
1990,adult standards,175,76,36,8,76,95,136,73,6,69,0


# Perform analysis

The analisis performed, conssits in a classification with random forest, that ....

### Prepare data

In [10]:
targets = ['Top Genre Encoded']
features = ['Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)', 'Liveness', 'Valence', 'Length (Duration)', 'Acousticness', 'Speechiness', 'Popularity']
X = df[features]
Y = df[targets]
train_X, test_X, train_y, test_y = train_test_split(X, Y, random_state=1)

### Train model

In [11]:
#model = RandomForestClassifier(max_depth = TREE_DEPTH , random_state=1, n_estimators=10)
#model.fit(train_X, train_y)

# 1. Instantiate
# default criterion=gini
# you can swap to criterion=entropy for ID3 implementation (maximizing information gain)
model = DecisionTreeClassifier(criterion="entropy", max_depth = TREE_DEPTH,random_state=1)
model.fit(train_X, train_y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [12]:
model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 2,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 1,
 'splitter': 'best'}

### Perform validation predictions

In [13]:
validation_predictions = model.predict(test_X)

In [14]:
validation_predictions

array([12,  1,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1,  1, 12,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1, 12,  1, 12,
        1,  1,  1,  1,  1,  1,  1,  1, 12,  1,  1,  1,  1, 12,  1,  1,  1,
        1,  1, 12,  1,  1,  1,  1,  1,  1,  1,  1,  1, 12, 12,  1, 12,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 12,
        1,  1,  1,  1, 12,  1,  1,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, 12,  1,  1, 12,  1,  1,  1,  1, 12,  1,  1,  1,  1, 12,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 12,  1,
        1,  1,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 12,
        1,  1,  1, 12,  1

### Random Hyperparameter Grid

In [15]:
#
splitter = ['best', 'random']
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2,4,6,8,10,12]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'splitter': splitter}

In [16]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = DecisionTreeClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(train_X, train_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 277 out of 300 | elapsed:    2.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    2.4s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='entropy',
                                                    max_depth=2,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=1,
             

In [17]:
# Get best params
rf_random.best_params_

{'splitter': 'random',
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 6}

In [18]:
# Get best model with best params
best_model = rf_random.best_estimator_

In [19]:
labels = test_y['Top Genre Encoded'].tolist()

In [20]:
base_accuracy = evaluate(model, test_X, labels)

Model Performance
Average Error: 5.3542 degrees.
Accuracy = 18.90%.


In [21]:
random_accuracy = evaluate(best_model, test_X, labels)

Model Performance
Average Error: 5.4741 degrees.
Accuracy = 34.48%.


In [22]:
print('Improvement of {:0.2f}%.'.format((random_accuracy - base_accuracy))

SyntaxError: unexpected EOF while parsing (<ipython-input-22-c9b59d9414e5>, line 1)

### GridSearch with Cross Validation

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'splitter': ['random'],
    'max_depth': [3, 5, 9, 12],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [8, 10, 12]
}

# This will try out 
# 1 (bootstrap) * 4 (max_depth) * 2 (max_features) * 3 (min_smaples_leaf) 
# * 3 (min_samples_split) * 4 (n_estimators) = 288 combinations of settings.
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(train_X, train_y)

grid_search.best_params_

In [None]:
# Get best grid
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_X, labels)

In [None]:
print('Improvement of {:0.2f}%.'.format(grid_accuracy - base_accuracy))

### Comparison

In [None]:
model_results = evaluate_model(model, train_X, train_y, test_X, labels)
model_results['model'] = 'base_model'
model_results['model'] = 'best_model_gs_model'
#model_results['accuracy'] = base_accuracy
best_model_results = evaluate_model(best_model, train_X, train_y, test_X, labels)
best_model_results['model'] = 'best_model_rf_model'
best_model_results['accuracy'] = random_accuracy
best_grid_results = evaluate_model(best_grid, train_X, train_y, test_X, labels)
best_grid_results['model'] = 'best_model_gs_model'
best_grid_results['accuracy'] = grid_accuracy
#best_grid_results['model'] = 'base_model'

#tmp = best_grid_results['error']
#best_grid_results['error'] = model_results['error']
#model_results['error'] = tmp

In [None]:
comparison = {'model': [],
              'accuracy': [],
              'error': [],
              'n_features': [],
              'depth': [],
              'time': []}
                        
#for m in [model_results, best_model_results, best_grid_results]:
for m in [model_results, best_model_results, best_grid_results]:
    comparison['accuracy'].append(round(m['accuracy'], 3))
    comparison['error'].append(round(m['error'], 3))
    comparison['model'].append(m['model'])
    comparison['n_features'].append(m['n_features'])
    comparison['depth'].append(int(m['depth']))
    comparison['time'].append(round(m['time'], 4))

In [None]:
comparison = pd.DataFrame.from_dict(comparison, orient = 'columns')

In [None]:
comparison[['model', 'accuracy', 'error', 'n_features', 'depth', 'time']]

### Representation and visualization

In [None]:
# Visualize labels in test data versus predictions and number of 
# equivalente values
compare_values(labels, validation_predictions, 'Top Genre')

In [None]:
# Representation of value frequency
df.plot.hist(subplots=True, layout=(6,2), figsize=(10, 10), bins=20)

In [None]:
# Values of Popularity for each genre
df.groupby("Top Genre").Popularity.mean().sort_values(ascending=False)[:5].plot.bar()

In [None]:
# Scatter of Popularity - Danceability
g = sns.scatterplot(x='Popularity', y='Danceability', hue='Top Genre', data=df, legend=False)

In [None]:
# Confusion matrix
sns.heatmap(df.drop(['Top Genre'], axis=1).corr(), annot=True)

In [None]:
# Confusion matrix
best_grid_predictions = best_grid.predict(test_X)
mat = confusion_matrix(labels, best_grid_predictions)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=genres, yticklabels=genres)
plt.xlabel('true label')
plt.ylabel('predicted label');

In [None]:
# Faceting is the act of breaking data variables up across multiple subplots 
# and combining those subplots into a single figure.
# In this case it will be the points of P1 per floor
g = sns.FacetGrid(df, col='Top Genre', col_wrap=5)
g = g.map(sns.kdeplot, 'Popularity')

In [None]:
# Sparse matrix
fig, ax = plt.subplots(figsize=(12,12))
scatter_matrix(df.drop(['Top Genre'], axis=1), alpha=1, ax=ax)

In [None]:
xvalues = list(range(len(comparison)))
plt.subplots(1, 2, figsize=(10, 6))
plt.subplot(121)
plt.bar(xvalues, comparison['accuracy'], color = 'g', edgecolor = 'k', linewidth = 1.8)
plt.xticks(xvalues, comparison['model'], rotation = 45, fontsize = 12)
plt.ylim(ymin = 91, ymax = 94)
plt.xlabel('model'); plt.ylabel('Accuracy (%)'); plt.title('Accuracy Comparison');

plt.subplot(122)
plt.bar(xvalues, comparison['error'], color = 'r', edgecolor = 'k', linewidth = 1.8)
plt.xticks(xvalues, comparison['model'], rotation = 45)
plt.ylim(ymin = 3.5, ymax = 6)
plt.xlabel('model'); plt.ylabel('Error (deg)'); plt.title('Error Comparison');
plt.show();

## Training Curves

### Changing Number of Trees

In [None]:
# We can perform grid search over only one parameter to observe the effects of changing that parameter on performance. 
# We will look at training time, training set accuracy, and testing set accuracy.
# Grid with only the number of trees changed
tree_grid = {'max_depth': [int(x) for x in np.linspace(1, 301, 30)]}

# Create the grid search model and fit to the training data
tree_grid_search = GridSearchCV(best_grid, param_grid=tree_grid, verbose = 2, n_jobs=-1, cv = 3,
                                scoring = 'neg_mean_absolute_error', return_train_score=True)
tree_grid_search.fit(train_X, train_y);

In [None]:
tree_grid_search

In [None]:
plot_results(tree_grid_search)

In [None]:
### Number of Samples required in each leaf node

In [None]:
feature_grid = {'min_samples_leaf': list(range(1, train_X.shape[1] + 1))}

In [None]:
# Create the grid search and fit on the training data
feature_grid_search = GridSearchCV(best_grid, param_grid=feature_grid, cv = 3, n_jobs=-1, verbose= 2,
                                  scoring = 'neg_mean_absolute_error', return_train_score=True)
feature_grid_search.fit(train_X, train_y);

In [None]:
plot_results(feature_grid_search, param='min_samples_leaf', name = 'Number of Samples')

In [None]:
### Number of Features at Each Split

In [None]:
feature_grid = {'max_features': list(range(1, train_X.shape[1] + 1))}

In [None]:
# Create the grid search and fit on the training data
feature_grid_search = GridSearchCV(best_grid, param_grid=feature_grid, cv = 3, n_jobs=-1, verbose= 2,
                                  scoring = 'neg_mean_absolute_error', return_train_score=True)
feature_grid_search.fit(train_X, train_y);

In [None]:
plot_results(feature_grid_search, param='max_features', name = 'Max Features')

# Plot model statistics

In [None]:
r2 = r2_score(test_y, validation_predictions)
mse = mean_squared_error(test_y, validation_predictions) 
mean_absolute_error = mean_absolute_error(test_y, validation_predictions) 
explained_variance = explained_variance_score(test_y, validation_predictions)
median_absolute_error = median_absolute_error(test_y, validation_predictions)
mean_squared_log_error = mean_squared_log_error(test_y, validation_predictions)

print('r2:',{round(r2,4)})
print('MSE:',{round(mse,4)})
print('RMSE:',{round(np.sqrt(mse),4)})
print('MAE:',{round(mean_absolute_error,4)})
print('explained_variance:',{round(explained_variance,4)})    
print('mean_squared_log_error:',{round(mean_squared_log_error,4)})

In [None]:
cross_val_score_ = cross_val_score(model, train_X, train_y, cv = 10, scoring = 'accuracy')
print('cross validation mean ',{cross_val_score_.mean()})
print('cross validation std ',{cross_val_score_.std()})

In [None]:
y_randforest = cross_val_predict(model, train_X, train_y, cv = 10)
recall_score_ = recall_score(train_y, y_randforest, average = "micro")
precission_score_ = precision_score(train_y, y_randforest, average = "micro")
f1_score = 2 * (precission_score_ * recall_score_) / (precission_score_ + recall_score_)
print('Recall Score:',{recall_score_})
print('Precision Score:',{precission_score_})
print('F1 Score:', {f1_score})

### Tree Visualization

In [None]:
column_names = list(df[features].columns.values)
class_names = list(df['Top Genre'].unique())

In [None]:
column_names

In [None]:
# Export as dot file
export_graphviz(best_grid, out_file='tree_depth_best_grid.dot', 
                feature_names = column_names,
                class_names = class_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

In [None]:
# Convert to png using system command (requires Graphviz)
# python -m pip install graphviz
# python -m pip install pydot
(graph,) = pydot.graph_from_dot_file('tree_depth_best_grid.dot')
graph.write_png('tree_depth_best_grid.png')

# Display in jupyter notebook
Image(filename = 'tree_depth_best_grid.png')