In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [0]:
# Read the raw dataset, then work on an independent copy so the loaded frame stays untouched.
music_genre = pd.read_csv("music_genre.csv")
df = music_genre.copy()  # copy() is deep by default

In [0]:
# Clean the raw frame: drop identifier columns and null rows, then impute tempo and duration.
# `df` is rebuilt from `music_genre` instead of being mutated in place, so this cell can be
# re-run without a KeyError from columns that were already dropped.
badInfo = ["instance_id", "obtained_date", "artist_name", "track_name"]
df = (
    music_genre
    .drop(columns=badInfo)
    .dropna()
    .reset_index(drop=True)  # renumber rows after the null rows are removed
)


def _fill_with_genre_mean(frame, column):
    """Return `column` with NaNs replaced by the mean of the row's music_genre group, rounded to 2 decimals."""
    genre_means = frame.groupby("music_genre")[column].transform("mean")  # NaNs are skipped by mean
    return frame[column].fillna(genre_means).round(2)


# NOTE(review): both imputations group by `music_genre`, which later becomes the prediction
# target, and they run before the train/test split — the filled values carry label information
# into the features. Consider imputing from training rows only.

# tempo: "?" marks a missing value and keeps the column from being numeric
df["tempo"] = df["tempo"].replace("?", np.nan).astype("float")
df["tempo"] = _fill_with_genre_mean(df, "tempo")

# duration_ms: -1.0 marks a missing value
df["duration_ms"] = df["duration_ms"].replace(-1.0, np.nan)
df["duration_ms"] = _fill_with_genre_mean(df, "duration_ms")

In [0]:
# Preview the first six cleaned rows.
df.head(n=6)

In [0]:
# Sanity check: shape of the rows still carrying the -1.0 duration placeholder
# (should be zero rows if the cleaning step replaced them all).
df.loc[df["duration_ms"].eq(-1.0)].shape

In [0]:
# One-hot encode `mode`: one indicator column per distinct value in the column.
dummies = pd.get_dummies(df.loc[:, "mode"])
dummies.head(5)

In [0]:
# Overwrite the textual mode with its "Major" indicator column.
df["mode"] = dummies.loc[:, "Major"]
df.head(5)

In [0]:
# Encode each genre label as an integer class code for the classifier.
GENRE_CODES = {
    'Electronic': 0,
    'Anime': 1,
    'Jazz': 2,
    'Alternative': 3,
    'Country': 4,
    'Rap': 5,
    'Blues': 6,
    'Rock': 7,
    'Classical': 8,
    'Hip-Hop': 9,
}
df['music_genre_codes'] = df['music_genre'].map(GENRE_CODES)

# A label missing from GENRE_CODES maps to NaN; fail here and name the offending labels
# instead of letting NaN reach the target column.
unmapped = df.loc[df['music_genre_codes'].isna(), 'music_genre'].unique()
assert len(unmapped) == 0, f"genres missing from GENRE_CODES: {list(unmapped)}"

In [0]:
# The text label is no longer needed once the integer codes exist.
df.drop(columns=['music_genre'], inplace=True)

In [0]:
# Remove the `key` column, then preview the remaining columns.
df.drop(columns=['key'], inplace=True)
df.head(5)

In [0]:
# List the columns that remain after the drops above.
df.columns

In [0]:
#feature_cols = [df.columns]

In [0]:
# Prediction target: the integer genre code.
target = df.loc[:, "music_genre_codes"]

In [0]:
# Features: every column except the target code.
input_columns = df.drop(columns="music_genre_codes")

In [0]:
# Hold out 20% of the rows for testing; the split is seeded with random_state=1.
X_train, X_test, y_train, y_test = train_test_split(
    input_columns,
    target,
    test_size=0.2,
    random_state=1,
)

In [0]:
# Fit an unconstrained decision tree on the training split.
# random_state is pinned (same seed as the depth-limited tree fitted later in the notebook):
# scikit-learn permutes features at each split, so an unseeded tree can pick different
# splits among ties and report a different accuracy on every run.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

In [0]:
# Predict genre codes for the held-out rows with the tree fitted above.
y_pred = clf.predict(X_test)

In [0]:
# Fraction of held-out rows whose predicted code matches the true code.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

In [0]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import matplotlib.pyplot as plt
from sklearn import tree

# An unconstrained tree can grow to thousands of nodes, which is slow to draw and unreadable
# at this figure size. Only the top levels are rendered; the fitted model itself is unchanged.
PLOT_DEPTH = 3

fig, axe = plt.subplots(figsize=(30,30))
tree.plot_tree(
    clf,
    max_depth=PLOT_DEPTH,                 # limits the drawing only, not the model
    feature_names=list(X_train.columns),  # show column names instead of x[i]
    filled=True,
    ax=axe,
    fontsize=15,
)
axe.set_title(f"Decision tree (top {PLOT_DEPTH} levels)", fontsize=20);

In [0]:
# Refit `clf` as a seeded depth-2 tree, score it on the held-out split, and draw it.
# Note that `clf` and `y_pred` are rebound here, so later cells see the shallow model.
clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

fig, axe = plt.subplots(figsize=(30, 30))
tree.plot_tree(clf, fontsize=15, ax=axe)

In [0]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for the current `y_pred`. Rows are the true genre codes and columns the
# predicted ones; the class codes are passed explicitly so the tick labels are the codes
# themselves rather than positional indices, and the axes are labelled.
class_codes = sorted(set(y_test) | set(y_pred))
ax = sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=class_codes),
    annot=True,
    fmt='g',
    xticklabels=class_codes,
    yticklabels=class_codes,
)
ax.set(
    xlabel="Predicted genre code",
    ylabel="True genre code",
    title="Confusion matrix (test split)",
);

In [0]:
# Draw the current `clf` again, this time without an explicit figure, axes or font size.
tree.plot_tree(clf)