# Music Genre Prediction: Molten Cores

<img src="https://www.warcrafttavern.com/wp-content/uploads/2020/10/WoW-Classic-Molten-Core-Guide-1024x729.jpg" alt="img" width="800"/>

# ____________________________________________________________________________________________________



## Notes:
- ### instrumentalness: vocals in a track

- ### speechiness: detects the pressents of vocal words in a track

- ### music genre: 10 different types
	0. Electronic
    1. Classical
    2. Jazz
    3. anime
    4. Rock
    5. country
    6. Rap
    7. Blues
    8. Hip-Hop
    9.Alternative

# problems with table:

- [x] negative duration
- [x] missing tempos
- [x] rows 10000-10005 are nan values for every column
- [ ] negative loudness

##### [Markdown Guide cheat Sheet](https://www.markdownguide.org/cheat-sheet/)



# ____________________________________________________________________________________________________



## All Libaries & Imports 

In [214]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn import utils
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.datasets import load_iris
from IPython.display import Image
from subprocess import call
from sklearn import tree
import copy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

#for Naïve Bayes
import seaborn as sns
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer, f1_score, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# ____________________________________________________________________________________________________



## Cleaning data frame

In [215]:
#functions
def stringToInt(dataFrame,col):
    test = {}
    for i in dict(enumerate(dataFrame[col].unique())).items():#is a dictionary of the keys and corespodening number
        #makes it so the keys and values of the dictionary switch
        test[i[1]]=i[0]
    print(test,'\n')
    return dataFrame[col].map(test)

In [222]:
music_genre = pd.read_csv("music_genre.csv")
df = music_genre.copy(deep=True)

df = copy.deepcopy(music_genre)

In [223]:
#dropping columns
badInfo = ["instance_id","obtained_date","artist_name","track_name"]
df.drop(columns=badInfo,axis=1,inplace=True)
#dropping null rows
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True) #Very good practice to reset how your rows are counted when you drop rows.

#fixing tempo
df["tempo"]=df["tempo"].replace("?",np.nan)
df["tempo"] = df["tempo"].astype("float")
df["tempo"]=df.groupby("music_genre")["tempo"].transform(lambda x: x.fillna(x.mean(skipna=True)))
df['tempo'] = np.around(df['tempo'],2)

#fixing duration
df["duration_ms"]=df["duration_ms"].replace(-1.0,np.nan)
df["duration_ms"]=df.groupby("music_genre")["duration_ms"].transform(lambda x: x.fillna(x.mean(skipna=True)))
df['duration_ms'] = np.around(df['duration_ms'],2)

#change the values from string to int
df['key'] = stringToInt(df,'key')
df['mode'] = stringToInt(df,'mode')
df['music_genre_name'] = df['music_genre']
df['music_genre'] = stringToInt(df,'music_genre')

pio.templates.default = "plotly_dark"

{'A#': 0, 'D': 1, 'G#': 2, 'C#': 3, 'F#': 4, 'B': 5, 'G': 6, 'F': 7, 'A': 8, 'C': 9, 'E': 10, 'D#': 11} 

{'Minor': 0, 'Major': 1} 

{'Electronic': 0, 'Anime': 1, 'Jazz': 2, 'Alternative': 3, 'Country': 4, 'Rap': 5, 'Blues': 6, 'Rock': 7, 'Classical': 8, 'Hip-Hop': 9} 



In [0]:
df.head()

In [0]:
df.tail()

# ____________________________________________________________________________________________________



## Plot



In [0]:
px.bar(df.sort_values(by=["music_genre_name"]), x='popularity', y='acousticness', color='music_genre_name')

# non danceable songs are of the highest acoustecness
# danceability and popularity are not correlated
# for each popularity, the least danceable songs tend to have the highest acousticness (at least until about 45 popularity)***

In [0]:
df = df.sort_values(by=["key"])
px.bar(df, x='music_genre_name', y='tempo',color='key')

# tempo is highest for classical
# tempo is lowest for electronic
# clear pattern in tempo per genre

# ____________________________________________________________________________________________________



## Spiting and scaling data

In [224]:
X = df.loc[:,df.columns[:-2]]#input_columns
y= df['music_genre']#what we want

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

#for heat map
labels = ['Electronic', 'Anime', 'Jazz', 'Alternative', 'Country', 'Rap', 'Blues', 'Rock', 'Classical', 'Hip-Hop']

# ____________________________________________________________________________________________________



## Logistic Regression  

In [0]:

# # #prepocessing dataset
# #X = df.loc[:, df.columns != 'music_genre']
# #y = df.music_genre
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5, stratify=y)

# # print(X_train.shape)
# # print(X_test.shape)
# # print(y_train.shape)
# # print(y_test.shape)

# scaler = preprocessing.StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

#model building
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

X_train_acc = model.score(X_train_scaled, y_train)
print("The Accuracy for Training Set is {}".format(X_train_acc*100))
test_acc = model.score(X_test_scaled, y_test)
print("The Accuracy for Test Set is {}".format(test_acc*100))
print(classification_report(y_test, y_pred))

#Confusion matrixa
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='g', xticklabels=labels, yticklabels=labels)


# ____________________________________________________________________________________________________



## Naive Bayes 
About  Naive Bayes :
Native Bayes is a machine learning model for classification that uses the Bayes Theorem(	<img src="https://s0.wp.com/latex.php?latex=%5Ctextrm%7BP%28H+%5Ctextbar+E%29+%3D+%7D+%C2%A0%5Cfrac%7B%5Ctextrm%7B+P%28E+%5Ctextbar+H%29+%2A+P%28H%29%7D%7D+%7B%5Ctextrm%7BP%28E%29%7D%7D&bg=ffffff&fg=000&s=0&c=20201002" alt="img" width="200"/>	)


  -  P(H) is the probability of hypothesis H being true. This is known as the prior probability.
  -  P(E) is the probability of the evidence(regardless of the hypothesis).
  -  P(E|H) is the probability of the evidence given that hypothesis is true.
  -  P(H|E) is the probability of the hypothesis given that the evidence is there.


Good at  predicting:
- Classical
- Anime

Bad at predicting:
- everything else

In [0]:
gnb = GaussianNB()
y_hat = gnb.fit(X_train, y_train).predict(X_test)

sns.heatmap(confusion_matrix(y_test, y_hat), annot=True, fmt='g', xticklabels=labels, yticklabels=labels)

## Accuracy

In [0]:
print((np.sum((y_test - y_hat)**2))/len(y_test) )#mean squared error
print(f1_score(y_test, y_hat, average=None))
print(classification_report(y_test, y_hat,target_names=[i+":" for i in labels]))

# ____________________________________________________________________________________________________



## Classification Trees



**About Classification Trees:** 

Decision Trees \(DTs\) are a non\-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

**Pros**

- Simple to understand and to interpret. Trees can be visualized.
- Requires little data preparation.
- Able to handle both numerical and categorical data.
- Possible to validate a model using statistical tests.

**Cons**

- Decision\-tree learners can create over\-complex trees that do not generalize the data well.
- Decision trees can be unstable because small variations in the data might result in a completely different tree being generated
- Predictions of decision trees are neither smooth nor continuous, but piecewise constant approximations as seen in the above figure. 
- Decision tree learners create biased trees if some classes dominate.



In [0]:
dummies = pd.get_dummies(df["mode"])
dummies.head()

In [0]:
# df["mode"] = dummies["Major"]
# df.head()

In [0]:
#df['music_genre_codes'] = df['music_genre'].map({'Electronic': 0, 'Anime': 1, 'Jazz': 2, 'Alternative': 3, 'Country': 4, 'Rap': 5, 'Blues': 6, 'Rock': 7, 'Classical': 8, 'Hip-Hop': 9})

In [0]:
#df.drop(['music_genre'], axis=1, inplace=True)

In [0]:
#df.drop(['key'], axis=1, inplace=True)
#df.head()

In [0]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 20% test

In [228]:
y_pred = clf.predict(X_test)


X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names



In [0]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [227]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)

In [0]:
fig, axe = plt.subplots(figsize=(30,30))
tree.plot_tree(clf, ax = axe, fontsize=15)

# ____________________________________________________________________________________________________



## Random Forest 

In [0]:
#df[df['key']=="B"]

In [0]:
target = df["music_genre"]                                                                      # The target is defined.
x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=1) # Splits our data using the train_test_split() function on Sklearn.

In [0]:
print(x_train.shape)    # Prints out the shape of the variable.
print(x_test.shape)     # Prints out the shape of the variable.
print(y_train.shape)    # Prints out the shape of the variable.
print(y_test.shape)     # Prints out the shape of the variable.

In [0]:
tst = RandomForestClassifier(max_depth=2, random_state=0)   # RandomForestRegressor is added.
tst.fit(x_train, y_train)                                   # Fits the model.

y_pred1 = tst.predict(x_test)                                  # Predicts.

print("Accuracy:", metrics.accuracy_score(y_test, y_pred1))

# ____________________________________________________________________________________________________



In [0]:
iris = load_iris()

# Model (can also use single decision tree)
model = RandomForestClassifier(n_estimators=10)

# Train
model.fit(iris.data, iris.target)

# Extract single tree
estimator = model.estimators_[5]

# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = iris.feature_names,
                class_names = iris.target_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)


# Convert to png using system command (requires Graphviz)
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
Image(filename = 'tree.png')

In [0]:
target = df["music_genre"]                                                                      # The target is defined.
x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=1)  # Splits our data using the train_test_split() function on Sklearn.

# Model (can also use single decision tree)
model = RandomForestClassifier(n_estimators=10)

# Train
model.fit(x_train, y_train)

# Extract single tree
df1 = model.estimators_[5]

# Export as dot file
export_graphviz(df1, out_file='tree.dot', rounded = True, proportion = False, precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
Image(filename = 'tree.png')

In [0]:
# Classification Tree ( Test )

#target = df["music_genre"]                                                                     # The target is defined.
#x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=1) # Splits our data using the train_test_split() function on Sklearn.

#tst1 = DecisionTreeClassifier(max_depth = 2, random_state = 0)
#tst1.fit(x_train, y_train)

#tst1.predict(x_test)

#fig, axe = plt.subplots(figsize=(20,10))
#tree.plot_tree(tst1, ax = axe, fontsize=15)

In [0]:
#label_encoder = preprocessing.LabelEncoder()
#df['popularity']= label_encoder.fit_transform(df['popularity'])
#df['popularity'].unique()

In [0]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
tree.plot_tree(clf)
plt.show()

# ____________________________________________________________________________________________________

## Xgboost

In [0]:
dfxg = music_genre.copy(deep=True)

#dropping irrelevent columns
badInfo = ["instance_id","obtained_date","artist_name","track_name"]
dfxg.drop(columns=badInfo,axis=1,inplace=True)

#dropping null rows
dfxg.dropna(inplace=True)
dfxg.reset_index(drop=True, inplace=True) #Very good practice to reset how your rows are counted when you drop rows.

#fixing tempo
dfxg["tempo"]=df["tempo"].replace("?",np.nan)
dfxg["tempo"] = df["tempo"].astype("float")
dfxg["tempo"]=df.groupby("music_genre")["tempo"].transform(lambda x: x.fillna(x.mean(skipna=True)))

#fixing duration
dfxg["duration_ms"]=df["duration_ms"].replace(-1.0,np.nan)
dfxg["duration_ms"]=df.groupby("music_genre")["duration_ms"].transform(lambda x: x.fillna(x.mean(skipna=True)))
dfxg['duration_ms'] = np.around(df['duration_ms'],2)

#creating dummy variables for categorical variables
dummies = pd.get_dummies(dfxg["key"])
dfxg["A"] = dummies["A"]
dfxg["B"] = dummies["B"]
dfxg["C"] = dummies["C"]
dfxg["D"] = dummies["D"]
dfxg["E"] = dummies["E"]
dfxg["F"] = dummies["F"]
dfxg["G"] = dummies["G"]
dfxg["A#"] = dummies["A#"]
dfxg["C#"] = dummies["C#"]
dfxg["D#"] = dummies["D#"]
dfxg["F#"] = dummies["F#"]
dfxg["G#"] = dummies["G#"]
dfxg.drop("key", axis = 1, inplace = True)

dummies = pd.get_dummies(dfxg["mode"])
dfxg["Major"] = dummies["Major"]
dfxg["Minor"] = dummies["Minor"]
dfxg.drop("mode", axis = 1, inplace = True)

dummies = pd.get_dummies(dfxg["music_genre"])
dfxg["Electronic"] = dummies["Electronic"]
dfxg["Anime"] = dummies["Anime"]
dfxg["Jazz"] = dummies["Jazz"]
dfxg["Alternative"] = dummies["Alternative"]
dfxg["Country"] = dummies["Country"]
dfxg["Rap"] = dummies["Rap"]
dfxg["Blues"] = dummies["Blues"]
dfxg["Rock"] = dummies["Rock"]
dfxg["Classical"] = dummies["Classical"]
dfxg["Hip-Hop"] = dummies["Hip-Hop"]
dfxg.drop("music_genre", axis = 1, inplace = True)

#XGBoost model
genre_list = ['Electronic', 'Anime', 'Jazz', 'Alternative', 'Country', 'Rap', 'Blues', 'Rock', 'Classical', 'Hip-Hop']
target = pd.get_dummies(dummies)
input_columns = dfxg.iloc[:,:-10]
x_train, x_test, y_train, y_test = train_test_split(input_columns, target, train_size=0.8)
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy: ", accuracy)

total_squared_error = (np.sum((y_test - y_pred)**2))
mean_squared_error = total_squared_error/len(y_test)
print(mean_squared_error)

#confusion matrix
labels = ['Electronic', 'Anime', 'Jazz', 'Alternative', 'Country', 'Rap', 'Blues', 'Rock', 'Classical', 'Hip-Hop']
sns.heatmap(confusion_matrix(y_test.values.argmax(axis=1), y_pred.argmax(axis=1)), annot=True, fmt='g', xticklabels=labels, yticklabels=labels)

# ____________________________________________________________________________________________________

## Neural Networks

# ____________________________________________________________________________________________________