### Data Preparation

In [2]:
import os
import json
import pandas as pd




In [None]:
# result.json contains flattened json (so list of dicts)
f = open(os.path.join(os.getcwd(), "..", "data_collection", "json", "flat_json_hiphop_rock_jazz.json"))
json_data = json.load(f)

# Convert list of dicts to dataframe
df = pd.DataFrame(json_data) 
df

In [24]:
# Drop unneeded columns
columns_to_drop = [
    "category.name",
    "category.playlist.id",
    "category.playlist.name",
    "category.playlist.track.id",
    "category.playlist.track.name",
    "category.playlist.track.album.id",
    "category.playlist.track.album.name",
    "category.playlist.track.artist"
    ]
df = df.drop(columns=columns_to_drop)

df = df.rename(columns={
    "category.id": "category",
    "category.playlist.track.feature.danceability": "feature_danceability",
    "category.playlist.track.feature.energy": "feature_energy",
    "category.playlist.track.feature.key": "feature_key",
    "category.playlist.track.feature.loudness": "feature_loudness",
    "category.playlist.track.feature.mode": "feature_mode",
    "category.playlist.track.feature.speechiness": "feature_speechiness",
    "category.playlist.track.feature.acousticness": "feature_acousticness",
    "category.playlist.track.feature.instrumentalness": "feature_instrumentalness",
    "category.playlist.track.feature.liveness": "feature_liveness",
    "category.playlist.track.feature.valence": "feature_valence",
    "category.playlist.track.feature.tempo": "feature_tempo",
    "category.playlist.track.feature.duration_ms": "feature_duration_ms",
    "category.playlist.track.feature.time_signature": "feature_time_signature"
})
df

Unnamed: 0,category,feature_danceability,feature_energy,feature_key,feature_loudness,feature_mode,feature_speechiness,feature_acousticness,feature_instrumentalness,feature_liveness,feature_valence,feature_tempo,feature_duration_ms,feature_time_signature
0,hiphop,0.849,0.424,5,-9.579,0,0.324,0.0635,0,0.0834,0.153,145.887,242966,4
1,hiphop,0.681,0.63,1,-5.585,1,0.0385,0.00383,0,0.139,0.183,151.951,161053,4
2,hiphop,0.711,0.611,1,-5.453,1,0.329,0.00575,0,0.231,0.144,134.14,252070,4
3,hiphop,0.723,0.516,11,-10.707,0,0.485,0.00311,1.26e-06,0.115,0.223,155.967,123077,4
4,hiphop,0.91,0.585,11,-7.572,0,0.257,0.0536,0,0.127,0.599,129.011,165067,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18723,jazz,0.421,0.0952,6,-12.561,1,0.0479,0.931,0.000201,0.126,0.0773,109.698,177922,4
18724,jazz,0.503,0.491,0,-12.02,1,0.0295,0.0412,0.922,0.0965,0.489,166.105,263447,4
18725,jazz,0.644,0.594,5,-9.965,1,0.117,0.751,0.224,0.107,0.632,90.564,494467,4
18726,jazz,0.462,0.211,0,-13.396,1,0.0586,0.665,0.946,0.114,0.426,179.658,77190,3


In [25]:
df.dtypes

category                    object
feature_danceability        object
feature_energy              object
feature_key                 object
feature_loudness            object
feature_mode                object
feature_speechiness         object
feature_acousticness        object
feature_instrumentalness    object
feature_liveness            object
feature_valence             object
feature_tempo               object
feature_duration_ms         object
feature_time_signature      object
dtype: object

In [26]:
# sklearn takes the features and labels as seperate lists
# df needs to be split
def encode_target(df, target_column):

    df_mod = df.copy()
    map_to_int = {name: n for n, name in enumerate(df_mod["category"].unique())}
    df_mod["target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod)

df_target = encode_target(df, "category")

# Target to category mapping
df_target[["target", "category"]].head()

Unnamed: 0,target,category
0,0,hiphop
1,0,hiphop
2,0,hiphop
3,0,hiphop
4,0,hiphop


In [27]:
# Possible values for category
targets = df_target["category"].unique()
targets

array(['hiphop', 'rock', 'jazz'], dtype=object)

### Train model

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Split test and training data

train, test = train_test_split(df_target, test_size=0.2, random_state=45, shuffle=True)
print("Number of rows in train: ", train.shape[0])
print("Number of rows in test: ", test.shape[0])

Number of rows in train:  14982
Number of rows in test:  3746


In [29]:
# List of feature names

features = list(train.columns[1:14])
features

['feature_danceability',
 'feature_energy',
 'feature_key',
 'feature_loudness',
 'feature_mode',
 'feature_speechiness',
 'feature_acousticness',
 'feature_instrumentalness',
 'feature_liveness',
 'feature_valence',
 'feature_tempo',
 'feature_duration_ms',
 'feature_time_signature']

In [30]:
# Y is list containing all target values
Y = train["target"]
# X is dataframe containing all feature columns
X = train[features]

# Create decision tree classifier object
dt = DecisionTreeClassifier(min_samples_split=20, random_state=45)

# Fit model
dt.fit(X, Y)

DecisionTreeClassifier(min_samples_split=20, random_state=45)

In [31]:
# Calculate accuracy score using the test features and target

dt.score(test[features], test["target"])

0.8569140416444208

In [36]:
# save graphical decision tree
import graphviz
from sklearn import tree

dot_data = tree.export_graphviz(dt, 
 
                  filled=True, rounded=True,  
                  special_characters=True,
                   out_file=None,
                           )

graph = graphviz.Source(dot_data)
graph.save

<bound method Source.save of <graphviz.sources.Source object at 0x28097a970>>