*@Author Mehmet Halis Çiçek / 180315037*

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [None]:
# Load the music-genre CSV; skipinitialspace drops blanks that follow a comma.
data_set = pd.read_csv(
    "music_genre.csv",
    sep=",",
    skipinitialspace=True,
)
data_set.head()

In [None]:
# Summary statistics, transposed so every described column becomes a row.
data_set.describe().transpose()

In [None]:
# List the column (feature) names of the data set.
data_set.columns

In [None]:
# Inspect the dtype pandas assigned to each column.
data_set.dtypes

In [None]:
# Exploratory plots of the features that describe the data.
# Horizontal bar chart: number of rows per music genre.
bar_music_genre = data_set["music_genre"].value_counts().plot(kind="barh")
bar_music_genre.set_title("Music Genre")

In [None]:
# Horizontal bar chart: number of rows per mode value.
bar_mode = data_set["mode"].value_counts().plot(kind="barh")
bar_mode.set_title("Mode")

In [None]:
import seaborn as sns

# Popularity histogram with a kernel-density estimate drawn on top.
hist_popularity = sns.histplot(data=data_set, x="popularity", kde=True)
hist_popularity.set_title("Histogram of Popularity")

In [None]:
# sns.distplot is deprecated in seaborn; histplot with stat="density" and
# kde=True is the documented replacement (density-normalised histogram plus a
# KDE curve).  cut=3 keeps the curve extending past the data range the way
# distplot drew it.  histplot is also what the popularity plot already uses.
dist_energy = sns.histplot(
    data_set.energy,
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)
dist_energy.set_title('Distribution of Energy')

In [None]:
# Point plot of popularity against energy, coloured by music genre.
sns.catplot(
    data=data_set,
    x="energy",
    y="popularity",
    hue="music_genre",
    kind="point",
)

In [None]:
# Count the missing values in every column.
data_set.isna().sum()

In [None]:
# Discard every row that has at least one missing value (dropna's defaults are
# axis=0, how="any"), then re-count to confirm none remain.
data_set = data_set.dropna()
data_set.isna().sum()

In [None]:
# Remove the track_name column from the data set.
data_set = data_set.drop(columns=["track_name"])

In [None]:
# Data imputation and interpolation for the tempo column.
# The column holds '?' placeholders, so it is read as strings.
# pd.to_numeric(errors="coerce") turns '?' (and any other unparseable token)
# into NaN without depending on str.replace's regex default, which differs
# between pandas versions.
data_set["tempo"] = pd.to_numeric(data_set["tempo"], errors="coerce")
# Fill the gaps by linear interpolation.  limit_direction="both" also fills
# NaNs at the very start of the column, which the default forward-only pass
# would leave in place.
data_set["tempo"] = data_set["tempo"].interpolate(limit_direction="both")

In [None]:
# Label-encode the date column and the target genre as integer codes.
from sklearn import preprocessing

converter = preprocessing.LabelEncoder()

# The encoder is re-fitted per column, so afterwards it holds the music_genre
# classes (the last column processed).
for column in ("obtained_date", "music_genre"):
    data_set[column] = converter.fit_transform(data_set[column])

In [None]:
# One-hot encode the categorical columns: each source column is replaced by
# its indicator columns, appended at the end of the frame in this order.
for column in ("key", "mode", "artist_name"):
    indicator_columns = pd.get_dummies(data_set[column], prefix=column)
    data_set = pd.concat(
        [data_set.drop(columns=[column]), indicator_columns],
        axis=1,
    )

In [None]:
# Review the data set after encoding: print the frame's info() summary.
data_set.info()

In [None]:
# Preview the first rows of the encoded data set.
data_set.head()

In [None]:
# Show the last rows and the encoded target column.  tail() is printed
# explicitly: only the final expression of a cell is echoed, so a bare
# data_set.tail() placed before another statement is silently discarded.
print(data_set.tail())
print(data_set["music_genre"])

In [None]:
# Split into features/target and hold out 30% of the rows for testing
# (random_state fixed so the split is repeatable).
from sklearn.model_selection import train_test_split

Y = data_set["music_genre"].values
X = data_set.drop(columns=["music_genre"]).values

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Standardize features (zero mean, unit variance).  The scaler is fitted on
# the training split only and then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### ***--> Support Vector Machine (SVM) Model***

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Linear-kernel SVM, fitted on the first 5000 training rows and scored on the
# first 5000 test rows.
clf = SVC(kernel="linear").fit(X_train[:5000], Y_train[:5000])
clf_predict = clf.predict(X_test[:5000])
print('Accuracy : ', accuracy_score(Y_test[:5000], clf_predict))

In [None]:
"""import pandas as pd
import numpy as np
from sklearn import preprocessing

from sklearn import svm, datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

print("Training the model...")

rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1).fit(X_train, Y_train)
poly = svm.SVC(kernel='poly', degree=3, C=1).fit(X_train, Y_train)

poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)

poly_accuracy = accuracy_score(y_test, poly_pred)
poly_f1 = f1_score(y_test, poly_pred, average='weighted')
print('Accuracy (Polynomial Kernel): ', "%.2f" % (poly_accuracy*100))
print('F1 (Polynomial Kernel): ', "%.2f" % (poly_f1*100))

rbf_accuracy = accuracy_score(Y_test, rbf_pred)
rbf_f1 = f1_score(Y_test, rbf_pred, average='weighted')
print('Accuracy (RBF Kernel): ', "%.2f" % (rbf_accuracy*100))
print('F1 (RBF Kernel): ', "%.2f" % (rbf_f1*100))""""

### ***--> Logistic Regression Model***

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a logistic-regression classifier on the scaled training split.
print('Training...')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, Y_train)

# Score the held-out split: confusion matrix plus overall accuracy.
Y_pred = classifier.predict(X_test)
confusion_m = confusion_matrix(Y_test, Y_pred)

print("Confusion Matrix : \n", confusion_m)
print("Accuracy : ", accuracy_score(Y_test, Y_pred))

 ### ***--> K-Nearest Neighbors Model***

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-NN with k=5.  The status line is printed before fitting (as in
# the logistic-regression cell) so it announces the work instead of appearing
# after it has already finished.
knn = KNeighborsClassifier(n_neighbors=5)

print('Training...')
knn.fit(X_train, Y_train)
print(knn.predict(X_test))

# Sweep k = 1..10, recording accuracy on the training and test splits.
neighbors = np.arange(1, 11)
train_scores, test_scores = [], []

for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    train_scores.append(knn.score(X_train, Y_train))
    test_scores.append(knn.score(X_test, Y_test))

train_accuracy = np.array(train_scores)
test_accuracy = np.array(test_scores)

# Accuracy as a function of k for both splits.
plt.plot(neighbors, test_accuracy, label='Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label='Training dataset Accuracy')

plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

from sklearn.metrics import accuracy_score, confusion_matrix

# Final k-NN model: 10 neighbours, Minkowski metric with p=2.
classifier = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2)
classifier = classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

# Confusion matrix of the test-set predictions.
cm = confusion_matrix(Y_test, Y_pred)

In [None]:
# Report the accuracy of the latest test-set predictions as a percentage
# (label spelling corrected from "Accurancy").
print('Accuracy: ', accuracy_score(Y_test, Y_pred)*100, 'percent')