# Projekt VIII: Music Genre Classification

As simple as it gets: predict the music genre of a track from the data given.

            key features: EDA, Supervised Learning, Logistic Regression, KNN, Decision Tree, GradientBoostingClassifier

#### Objectives:

   * Music Genre Prediction
   
Target Feature: Class

##### Algorithms Used

- LogisticRegression
- KNeighborsClassifier
- DecisionTreeClassifier
- GradientBoostingClassifier

#### Phases:
   * Data Preparation
   * Data Analysis
   * Supervised Learning

In [1]:
#lib imports
import pandas as pd
import seaborn as sns
import numpy as np
import cufflinks as cf
import matplotlib.pyplot as plt

#plotly
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

#Scikit and Yellowbrick
from sklearn.metrics import classification_report, confusion_matrix, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier , plot_tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.utils import resample
from sklearn.cluster import KMeans, AgglomerativeClustering
from yellowbrick.cluster import KElbowVisualizer #elbow method

#pycaret
#from pycaret.classification import *
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
%matplotlib inline

UsageError: Line magic function `%%javascript` not found.


In [None]:
# Load train.csv, submission.csv and test.csv from the data/ folder
# (read in that order) into df, genre and test.
df, genre, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/submission.csv', 'data/test.csv')
)

In [None]:
# Preview the first rows of the frame loaded from data/submission.csv.
genre.head()

Normal procedure: head, info, describe, look for NaNs and Nulls, etc.

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
df.describe()

In [None]:
# NaN values visualization: missing-value maps before and after imputation
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8), dpi=75)
sns.heatmap(data=df.isnull(), cmap="PuBu_r", ax=ax1).set_title('Before')

# Loads of NaNs: fill each affected column with its own mean.
# The result is assigned back to df instead of calling
# `df[col].fillna(..., inplace=True)`: that form modifies a selected column
# in place, which pandas warns about and which does not update df when
# copy-on-write is enabled.
cols = ['Popularity', 'key', 'instrumentalness']
df[cols] = df[cols].fillna(df[cols].mean())

# NaN values after mean fill
sns.heatmap(data=df.isnull(), cmap="PuBu_r", ax=ax2).set_title('After')

# Change column name (same rename in the train and the test frame)
df.rename(columns={'duration_in min/ms': 'duration'}, inplace=True)
test.rename(columns={'duration_in min/ms': 'duration'}, inplace=True)
# Remove song titles
df.drop('Track Name', axis=1, inplace=True)

In [None]:
# Let's deal with categorical variables

# Backup data. `.copy()` is required: a plain `backup = df` only creates a
# second name for the same frame, so every later change to df would also
# change the "backup".
backup = df.copy()

# Get list of categorical (object-dtype) columns
is_object = df.dtypes == 'object'
object_cols = list(is_object[is_object].index)
print(object_cols)

# Transform artist name into a numerical value.
# One encoder is fitted on the artists of BOTH frames so that the same artist
# receives the same integer code in train and test. Fitting the encoder
# separately on each frame assigns unrelated codes to the same artist.
LE = LabelEncoder()
LE.fit(pd.concat([df['Artist Name'], test['Artist Name']]))
df['Artist Name'] = LE.transform(df['Artist Name'])
test['Artist Name'] = LE.transform(test['Artist Name'])

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
# on a dedicated 20x20 figure.
fig, ax = plt.subplots(figsize=(20, 20))
df_corr = df.corr()
sns.heatmap(df_corr, annot=True, center=0, cmap='twilight_shifted', ax=ax)
ax.set_title('Correlation Matrix', fontsize=24)

In [None]:
# Correlation of every column with the target 'Class', largest value first.
class_corr = df.corr()['Class']
class_corr.sort_values(ascending=False)

No really high correlation with the target can be seen.

In [None]:
# Class distribution: number of rows per class, indexed by class label.
# Built with value_counts().sort_index() because the column names produced by
# value_counts().reset_index() differ between pandas versions, which breaks
# any code that sorts or renames by 'index'.
class_sizes = df['Class'].value_counts().sort_index()
# Same table as a frame with columns 'Class' and 'Count' (highest class first).
df_count = (class_sizes.rename_axis('Class')
            .reset_index(name='Count')
            .sort_values(by='Class', ascending=False))

# plot
fig, ax = plt.subplots(figsize=(10, 6), dpi=75)
sns.countplot(data=df, x='Class', palette='twilight_shifted_r', ax=ax)

# title & axis
_, ymax = ax.get_ylim()
ax.set_title('Class Distribution',
             fontdict={'fontsize': 20},
             loc='left')
ax.set_ylabel('')
ax.grid(axis='y', linestyle='dotted')
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
# Leave 10% headroom so the count labels fit above the tallest bar.
ax.set_ylim(0, ymax * 1.1)

# Write each count above its bar. The bars sit at x = 0, 1, 2, ... in
# ascending class order, which is the order of class_sizes, so enumerate()
# gives the bar position without hard-coding the number of classes.
for position, count in enumerate(class_sizes):
    ax.text(x=position, y=count,
            s=str(count),
            horizontalalignment='center', verticalalignment='bottom',
            fontsize=16)

The classes are imbalanced, so it will be necessary to balance the samples.

In [None]:
# One 50-bin histogram per column of df, all on a single 20x15 figure.
hist_options = dict(bins=50, figsize=(20, 15), color='green', alpha=0.6)
df.hist(**hist_options)
plt.show()

In [None]:
# Verify outliers: one box plot per feature, grouped by class, on a 4x3 grid.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(16, 30))
fig.suptitle('Verify Outliers', fontsize=24, y=0.9)

# Features in grid order (left to right, top to bottom). They fill the first
# eleven panels; the twelfth panel shows the class counts.
boxplot_features = [
    'Popularity', 'danceability', 'energy',
    'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration',
]
panels = axes.flatten()
for panel, feature in zip(panels, boxplot_features):
    sns.boxplot(data=df, x='Class', y=feature, palette='twilight_shifted', ax=panel)

sns.countplot(data=df, x='Class', palette='twilight_shifted_r', ax=panels[-1])

plt.show()

A good portion of the attributes have a really high number of outliers.

In [None]:
# Skewness of every numeric column. `numeric_only=None` is deprecated in
# pandas reductions, so request the numeric columns explicitly.
df.skew(numeric_only=True)

#### Data conclusion

* Huge number of outliers
* Some attributes are not normalized
* No attribute has a strong correlation with the target (Class)
* Highly skewed data
* The number of samples differs considerably between Class values
* Normalize columns: [energy; loudness; speechiness; acousticness; instrumentalness; liveness; duration]

#### Normalize

Since we have 0 values, we can't use a plain log(x) transformation. A good way to fix this is to add a constant, i.e. log(x + 1). Since we also have negative values, the absolute value of the most negative value must be added as well. We'll use log(x + |min| + 1).

In [None]:
# Minimum value of 'loudness': shows how far the column has to be shifted
# before a logarithm can be applied to it.
df['loudness'].min()

In [None]:
# Make normalized df. Work on a real copy: `df_normalized = df` is only a
# second name for df, so the transform would overwrite the raw columns and
# re-running this cell would apply the logarithm twice.
df_normalized = df.copy()


def log_transform(col):
    """Return log(x + shift + 1) for every value of a numeric column.

    ``shift`` is the absolute value of the column's most negative entry, or 0
    when the column has no negative entries, so the argument of the logarithm
    is never below 1.

    The earlier version read only ``col[0]`` and always added the constant
    39.952; it now transforms the whole column with that column's own shift.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to transform.

    Returns
    -------
    pandas.Series
        Transformed values, same index as ``col``.
    """
    shift = max(-col.min(), 0)
    return np.log(col + shift + 1)


# Columns selected for normalization in the data conclusions.
skewed_cols = ['energy', 'loudness', 'speechiness', 'acousticness',
               'instrumentalness', 'liveness', 'duration']

# Each column gets its own shift. Previously the constant 39.952 was added to
# every column, not only to 'loudness'.
for name in skewed_cols:
    df_normalized[name] = log_transform(df[name])

In [None]:
# Histograms of every column of df_normalized (50 bins each, 20x15 figure).
df_normalized.hist(
    alpha=0.6,
    color='green',
    figsize=(20, 15),
    bins=50,
)
plt.show()

#### Oversample and Downsample

In [None]:
# Average number of rows per class.
df['Class'].value_counts().mean()

In [None]:
# Balance the classes: bring every class to the same number of rows.
# 1636 is the per-class target used for all classes.
# TODO confirm it equals the mean class size computed in the previous cell.
SAMPLES_PER_CLASS = 1636

# Classes with fewer rows than the target are oversampled (drawn with
# replacement); classes with at least that many rows are downsampled (drawn
# without replacement). Looping over groupby('Class') replaces the
# hand-written per-class variables, where classes 9 and 10 were assigned to
# `df_8` by mistake. A fixed random_state makes the downsampling reproducible
# as well (it used to be drawn with an unseeded `.sample`).
balanced_parts = [
    resample(
        class_rows,
        replace=len(class_rows) < SAMPLES_PER_CLASS,
        n_samples=SAMPLES_PER_CLASS,
        random_state=12,
    )
    for _, class_rows in df_normalized.groupby('Class')
]

# Combine the resampled classes into one frame
df_balance = pd.concat(balanced_parts).reset_index(drop=True)

# Display new class counts
df_balance.Class.value_counts()

### Supervised Learning


Although separate train and test files are provided, the models are evaluated on a hold-out split of the balanced training data. We therefore separate the target feature and then apply a train/test split.

In [None]:
# train_test_split: separate the target from the features, then hold out part
# of the rows for evaluation.
X = df_balance.drop('Class', axis=1)
y = df_balance['Class']
# Fixed random_state so the split, and therefore every score below, is
# reproducible between runs.
# NOTE(review): df_balance contains rows duplicated by oversampling with
# replacement, so copies of the same row can end up in both parts of the
# split and inflate the scores. Consider splitting before balancing.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

In [None]:
# Scaling: the scaler learns its statistics from the training part only and
# the same fitted scaler is then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

#### Logistic Regression

In [None]:
# Fit a logistic regression and report accuracy, per-class metrics and the
# confusion matrix on the hold-out rows.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
acc = round(accuracy_score(y_test, lr_pred) * 100, 2)
print("Accurracy Score: " + str(acc) + "%")
print('-----------------------------------')
# zero_division=1 is the documented spelling of the former `True`.
print('Classification Report: \n\n' + classification_report(y_test, lr_pred, zero_division=1))
print('----------------------------------- \n' + 'Confusion Matrix: \n')
# Draw the heatmap directly: wrapping it in print() only printed the Axes
# repr. fmt='d' shows the integer counts in full instead of the default
# two-significant-digit format.
sns.heatmap(confusion_matrix(y_test, lr_pred), annot=True, fmt='d')
plt.show()

#### KNN

In [None]:
# Compare the hold-out accuracy of distance-weighted KNN for several
# neighbourhood sizes.
for k in (3, 5, 10, 15, 20, 30):
    candidate = KNeighborsClassifier(k, weights='distance').fit(X_train, y_train)
    holdout_accuracy = candidate.score(X_test, y_test)
    print("For n_neighbors = ", k, ", score: ", round(holdout_accuracy * 100, 2), '%')

In [None]:
# Fit KNN with 15 neighbours and report accuracy, per-class metrics and the
# confusion matrix on the hold-out rows.
# NOTE(review): the n_neighbors sweep uses weights='distance' while this model
# uses the default uniform weights. Confirm which setting is intended.
knn = KNeighborsClassifier(n_neighbors=15)
knn_fit = knn.fit(X_train, y_train)
knn_pred = knn_fit.predict(X_test)
acc = round(accuracy_score(y_test, knn_pred) * 100, 2)
print("Accurracy Score: " + str(acc) + "%")
print('-----------------------------------')
# zero_division=1 is the documented spelling of the former `True`.
print('Classification Report: \n\n' + classification_report(y_test, knn_pred, zero_division=1))
print('----------------------------------- \n' + 'Confusion Matrix: \n')
# Draw the heatmap directly: wrapping it in print() only printed the Axes
# repr. fmt='d' shows the integer counts in full instead of the default
# two-significant-digit format.
sns.heatmap(confusion_matrix(y_test, knn_pred), annot=True, fmt='d')
plt.show()

#### Decision Tree

In [None]:
# Fit a decision tree and report accuracy, per-class metrics and the
# confusion matrix on the hold-out rows.
# random_state is fixed so the fitted tree is the same on every run.
dtc = DecisionTreeClassifier(random_state=12)
dtc.fit(X_train, y_train)
dtc_pred = dtc.predict(X_test)
acc = round(accuracy_score(y_test, dtc_pred) * 100, 2)
print("Accurracy Score: " + str(acc) + "%")
print('-----------------------------------')
# zero_division=1 is the documented spelling of the former `True`.
print('Classification Report: \n\n' + classification_report(y_test, dtc_pred, zero_division=1))
print('----------------------------------- \n' + 'Confusion Matrix: \n')
# Draw the heatmap directly: wrapping it in print() only printed the Axes
# repr. fmt='d' shows the integer counts in full instead of the default
# two-significant-digit format.
sns.heatmap(confusion_matrix(y_test, dtc_pred), annot=True, fmt='d')
plt.show()

#### GradientBoostingClassifier

In [None]:
# Fit a gradient boosting classifier and report accuracy, per-class metrics
# and the confusion matrix on the hold-out rows.
# random_state is fixed so the fitted ensemble is the same on every run.
gbc = GradientBoostingClassifier(random_state=12)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)
acc = round(accuracy_score(y_test, gbc_pred) * 100, 2)
print("Accurracy Score: " + str(acc) + "%")
print('-----------------------------------')
# zero_division=1 is the documented spelling of the former `True`.
print('Classification Report: \n\n' + classification_report(y_test, gbc_pred, zero_division=1))
print('----------------------------------- \n' + 'Confusion Matrix: \n')
# Draw the heatmap directly: wrapping it in print() only printed the Axes
# repr. fmt='d' shows the integer counts in full instead of the default
# two-significant-digit format.
sns.heatmap(confusion_matrix(y_test, gbc_pred), annot=True, fmt='d')
plt.show()

### Conclusions

* Highly skewed data makes it really difficult to get better results
* Balanced sampling really boosted the algorithms' results
* Most results are between 50% and 60%; the Gradient Boosting Classifier is the best, with over 68%