## K-Nearest Neighbor Model

In [32]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 1000)

# Import data to Pandas DataFrame
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_final = pd.read_csv('data/test_final.csv')

# Split X & y for train, test, and test_final
X_train = train.drop('target', axis=1)
y_train = train['target']

X_test = test.drop('target', axis=1)
y_test = test['target']

X_test_final = test_final.drop('target', axis=1)
y_test_final = test_final['target']

# Preivew data
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13988 entries, 0 to 13987
Data columns (total 28 columns):
acousticness          13988 non-null float64
danceability          13988 non-null float64
duration_ms           13988 non-null float64
energy                13988 non-null float64
instrumentalness      13988 non-null float64
liveness              13988 non-null float64
loudness              13988 non-null float64
mode_feat             13988 non-null float64
speechiness           13988 non-null float64
tempo                 13988 non-null float64
valence               13988 non-null float64
time_signature_1.0    13988 non-null int64
time_signature_3.0    13988 non-null int64
time_signature_4.0    13988 non-null int64
time_signature_5.0    13988 non-null int64
key_0.0               13988 non-null int64
key_1.0               13988 non-null int64
key_2.0               13988 non-null int64
key_3.0               13988 non-null int64
key_4.0               13988 non-null int64
key_5.0  

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode_feat,speechiness,tempo,valence,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0,key_0.0,key_1.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0,key_10.0,key_11.0,target
0,0.62,0.465,146494.0,0.469,0.0,0.118,-4.256,1.0,0.0319,69.22,0.518,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,7
1,0.00598,0.489,211885.0,0.641,0.0201,0.122,-7.011,0.0,0.032,108.901,0.587,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,0.249,0.594,283733.0,0.635,0.0,0.454,-4.259,0.0,0.47,75.035,0.552,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,2
3,0.000667,0.674,191250.0,0.869,0.604,0.0651,-4.758,0.0,0.0417,127.802,0.071,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,7
4,0.0288,0.675,231133.0,0.76,2.2e-05,0.0585,-4.435,1.0,0.0513,109.619,0.611,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,4


### Scale Train & Test Data

In [33]:
# Standardize with MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train_mmscaled = min_max_scaler.fit_transform(X_train)
X_test_mmscaled = min_max_scaler.transform(X_test)

### Fit KNN Model

In [34]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn1 = KNeighborsClassifier()
knn1.fit(X_train_mmscaled, y_train)
knn1_test_preds = knn1.predict(X_test_mmscaled)

### Print Metrics

In [35]:
# Print metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds, average='weighted')))
    print("Recall Score: {}".format(recall_score(labels, preds, average='weighted')))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average='weighted')))
    
print_metrics(y_test, knn1_test_preds)

Precision Score: 0.4148363847934521
Recall Score: 0.4143551615670575
Accuracy Score: 0.4143551615670575
F1 Score: 0.40752956509695043


### Find Best K by Different Metrics

#### Best K by F1-Score

In [37]:
# Find best K to improve model performance (by F1-Score)
def find_best_k_f1(X_train, y_train, X_test, y_test, min_k=1, max_k=50):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        f1 = f1_score(y_test, preds, average='weighted')
        if f1 > best_score:
            best_k = k
            best_score = f1
    
    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))

find_best_k_f1(X_train_mmscaled, y_train, X_test_mmscaled, y_test)

Best Value for k: 17
F1-Score: 0.4257982312290444


#### Best K by Accuracy

In [41]:
# Find best K to improve model performance (by Accuracy)
def find_best_k_acc(X_train, y_train, X_test, y_test, min_k=1, max_k=50):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        acc = accuracy_score(y_test, preds)
        if acc > best_score:
            best_k = k
            best_score = acc
    
    print("Best Value for k: {}".format(best_k))
    print("Accuracy: {}".format(best_score))

find_best_k_acc(X_train_mmscaled, y_train, X_test_mmscaled, y_test)

Best Value for k: 27
Accuracy: 0.44895624821275376


#### Best K by Precision

In [39]:
# Find best K to improve model performance
def find_best_k_precision(X_train, y_train, X_test, y_test, min_k=1, max_k=20):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        precision = precision_score(y_test, preds, average='weighted')
        if precision > best_score:
            best_k = k
            best_score = precision
    
    print("Best Value for k: {}".format(best_k))
    print("Precision Score: {}".format(best_score))

find_best_k_precision(X_train_mmscaled, y_train, X_test_mmscaled, y_test)

Best Value for k: 17
Precision Score: 0.4326102064402426


#### Best K by Recall

In [40]:
# Find best K to improve model performance
def find_best_k_recall(X_train, y_train, X_test, y_test, min_k=1, max_k=20):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        recall = recall_score(y_test, preds, average='weighted')
        if recall > best_score:
            best_k = k
            best_score = recall
    
    print("Best Value for k: {}".format(best_k))
    print("Recall Score: {}".format(best_score))

find_best_k_recall(X_train_mmscaled, y_train, X_test_mmscaled, y_test)

Best Value for k: 20
Recall Score: 0.4455247354875608


### Fit KNN Model w/ Optimized K=3

In [42]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn1 = KNeighborsClassifier(n_neighbors=27)
knn1.fit(X_train_mmscaled, y_train)
knn1_test_preds = knn1.predict(X_test_mmscaled)

### Print Metrics

In [43]:
# Print metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds, average='weighted')))
    print("Recall Score: {}".format(recall_score(labels, preds, average='weighted')))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average='weighted')))
    
print_metrics(y_test, knn1_test_preds)

Precision Score: 0.434818208193056
Recall Score: 0.44895624821275376
Accuracy Score: 0.44895624821275376
F1 Score: 0.4248069948093477
