## K-Nearest Neighbor Model

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Widen pandas' display limits so the wide feature table previews without truncation
for _option, _limit in (
    ('display.max_rows', 50),
    ('display.max_columns', 300),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

# Import data to Pandas DataFrame
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_final = pd.read_csv('data/test_final.csv')

# Split X & y for train, test, and test_final
X_train = train.drop('target', axis=1)
y_train = train['target']

X_test = test.drop('target', axis=1)
y_test = test['target']

X_test_final = test_final.drop('target', axis=1)
y_test_final = test_final['target']

# Preview data
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that appended a stray "None" line).
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106988 entries, 0 to 106987
Data columns (total 28 columns):
acousticness          106988 non-null float64
danceability          106988 non-null float64
duration_ms           106988 non-null float64
energy                106988 non-null float64
instrumentalness      106988 non-null float64
liveness              106988 non-null float64
loudness              106988 non-null float64
mode_feat             106988 non-null float64
speechiness           106988 non-null float64
tempo                 106988 non-null float64
valence               106988 non-null float64
time_signature_1.0    106988 non-null int64
time_signature_3.0    106988 non-null int64
time_signature_4.0    106988 non-null int64
time_signature_5.0    106988 non-null int64
key_0.0               106988 non-null int64
key_1.0               106988 non-null int64
key_2.0               106988 non-null int64
key_3.0               106988 non-null int64
key_4.0               106988 no

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode_feat,speechiness,tempo,valence,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0,key_0.0,key_1.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0,key_10.0,key_11.0,target
0,0.254,0.69,197333.0,0.707,0.0,0.106,-6.875,1.0,0.0863,90.092,0.586,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0.396,0.632,162215.0,0.936,0.0,0.348,-3.639,1.0,0.208,80.196,0.782,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0.0657,0.953,512080.0,0.39,0.503,0.154,-10.123,0.0,0.0568,108.345,0.743,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0.387,0.839,277882.0,0.49,8.8e-05,0.0896,-6.973,1.0,0.0601,117.058,0.963,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,4e-05,0.487,271179.0,0.908,0.252,0.153,-5.628,0.0,0.0468,150.062,0.511,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


### Scale Train & Test Data

In [2]:
# Normalize with MinMax Scaler: the scaler learns its parameters from the
# training split only and is then applied unchanged to both splits
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler().fit(X_train)
X_train_mmscaled = min_max_scaler.transform(X_train)
X_test_mmscaled = min_max_scaler.transform(X_test)

# Standardize with Standard Scaler, again fitted on the training split only
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler().fit(X_train)
X_train_sscaled = standard_scaler.transform(X_train)
X_test_sscaled = standard_scaler.transform(X_test)

### Fit KNN Model

In [3]:
# Baseline KNN: library-default settings, trained on the MinMax-scaled features
from sklearn.neighbors import KNeighborsClassifier

# fit() hands back the fitted estimator, so construction and training chain
knn1 = KNeighborsClassifier().fit(X_train_mmscaled, y_train)
knn1_test_preds = knn1.predict(X_test_mmscaled)

### Print Metrics

In [4]:
# Print metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``;
    accuracy takes no averaging argument. Each score is computed and printed
    in turn, one per line. Nothing is returned.

    labels -- ground-truth target values
    preds  -- predicted target values, aligned with ``labels``
    """
    weighted = {'average': 'weighted'}
    report = (
        ("Precision Score: {}", precision_score, weighted),
        ("Recall Score: {}", recall_score, weighted),
        ("Accuracy Score: {}", accuracy_score, {}),
        ("F1 Score: {}", f1_score, weighted),
    )
    for template, metric, options in report:
        print(template.format(metric(labels, preds, **options)))
    
# Score the baseline model's predictions against the test labels
print_metrics(labels=y_test, preds=knn1_test_preds)

Precision Score: 0.783349006613525
Recall Score: 0.8646627785254972
Accuracy Score: 0.8646627785254972
F1 Score: 0.812467044914369


### Find Best K by Different Metrics

In [5]:
# Find best K to improve model performance (by F1-Score)
def find_best_k_f1(X_train, y_train, X_test, y_test, min_k=1, max_k=25, step=5):
    """Search n_neighbors for the KNN classifier with the best weighted F1-score.

    One KNeighborsClassifier is fitted per candidate k on (X_train, y_train);
    its predictions for X_test are scored against y_test with
    ``f1_score(..., average='weighted')``. The winner is printed and returned.

    Parameters
    ----------
    X_train, y_train : training features and labels handed to ``fit``.
    X_test, y_test : evaluation features and labels used for scoring.
    min_k, max_k : inclusive bounds of the candidate range.
    step : stride between candidates. The default of 5 keeps the original
        search (k = 1, 6, 11, 16, 21 for the default bounds); pass 1 to try
        every k in the range.

    Returns
    -------
    (best_k, best_score) : the winning k and its weighted F1-score. On ties
        the smallest k wins, because a later candidate must be strictly better.

    Raises
    ------
    ValueError
        If the bounds and step yield no candidate k at all.
    """
    candidates = range(min_k, max_k + 1, step)
    if len(candidates) == 0:
        # Previously this case silently reported the invalid value k=0.
        raise ValueError(
            "No candidate k values for min_k={}, max_k={}, step={}".format(min_k, max_k, step)
        )

    best_k = None
    best_score = None
    for k in candidates:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        f1 = f1_score(y_test, preds, average='weighted')
        # The None check lets the first candidate win even when its score is 0.0.
        if best_score is None or f1 > best_score:
            best_k = k
            best_score = f1

    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    return best_k, best_score

# NOTE(review): k is chosen by scoring on the test split, so the reported
# score is optimistic -- consider a validation split or cross-validation.
find_best_k_f1(
    X_train=X_train_mmscaled,
    y_train=y_train,
    X_test=X_test_mmscaled,
    y_test=y_test,
)

Best Value for k: 6
F1-Score: 0.8122130722033843


In [6]:
# Find best K to improve model performance (by Accuracy)
def find_best_k_acc(X_train, y_train, X_test, y_test, min_k=1, max_k=25, step=5):
    """Search n_neighbors for the KNN classifier with the best accuracy.

    One KNeighborsClassifier is fitted per candidate k on (X_train, y_train);
    its predictions for X_test are scored against y_test with
    ``accuracy_score``. The winner is printed and returned.

    Parameters
    ----------
    X_train, y_train : training features and labels handed to ``fit``.
    X_test, y_test : evaluation features and labels used for scoring.
    min_k, max_k : inclusive bounds of the candidate range.
    step : stride between candidates. The default of 5 keeps the original
        search (k = 1, 6, 11, 16, 21 for the default bounds); pass 1 to try
        every k in the range.

    Returns
    -------
    (best_k, best_score) : the winning k and its accuracy. On ties the
        smallest k wins, because a later candidate must be strictly better.

    Raises
    ------
    ValueError
        If the bounds and step yield no candidate k at all.
    """
    candidates = range(min_k, max_k + 1, step)
    if len(candidates) == 0:
        # Previously this case silently reported the invalid value k=0.
        raise ValueError(
            "No candidate k values for min_k={}, max_k={}, step={}".format(min_k, max_k, step)
        )

    best_k = None
    best_score = None
    for k in candidates:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        acc = accuracy_score(y_test, preds)
        # The None check lets the first candidate win even when its score is 0.0.
        if best_score is None or acc > best_score:
            best_k = k
            best_score = acc

    print("Best Value for k: {}".format(best_k))
    print("Accuracy: {}".format(best_score))
    return best_k, best_score

# NOTE(review): k is chosen by scoring on the test split, so the reported
# score is optimistic -- consider a validation split or cross-validation.
find_best_k_acc(
    X_train=X_train_mmscaled,
    y_train=y_train,
    X_test=X_test_mmscaled,
    y_test=y_test,
)

Best Value for k: 21
Accuracy: 0.8691490952594586


In [7]:
# Find best K to improve model performance (by Precision)
def find_best_k_precision(X_train, y_train, X_test, y_test, min_k=1, max_k=25, step=2):
    """Search n_neighbors for the KNN classifier with the best weighted precision.

    One KNeighborsClassifier is fitted per candidate k on (X_train, y_train);
    its predictions for X_test are scored against y_test with
    ``precision_score(..., average='weighted')``. The winner is printed and
    returned.

    Parameters
    ----------
    X_train, y_train : training features and labels handed to ``fit``.
    X_test, y_test : evaluation features and labels used for scoring.
    min_k, max_k : inclusive bounds of the candidate range.
    step : stride between candidates. The default of 2 keeps the original
        search (odd k from 1 to 25 for the default bounds); pass 1 to try
        every k in the range.

    Returns
    -------
    (best_k, best_score) : the winning k and its weighted precision. On ties
        the smallest k wins, because a later candidate must be strictly better.

    Raises
    ------
    ValueError
        If the bounds and step yield no candidate k at all.
    """
    candidates = range(min_k, max_k + 1, step)
    if len(candidates) == 0:
        # Previously this case silently reported the invalid value k=0.
        raise ValueError(
            "No candidate k values for min_k={}, max_k={}, step={}".format(min_k, max_k, step)
        )

    best_k = None
    best_score = None
    for k in candidates:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        precision = precision_score(y_test, preds, average='weighted')
        # The None check lets the first candidate win even when its score is 0.0.
        if best_score is None or precision > best_score:
            best_k = k
            best_score = precision

    print("Best Value for k: {}".format(best_k))
    print("Precision Score: {}".format(best_score))
    return best_k, best_score

# NOTE(review): k is chosen by scoring on the test split, so the reported
# score is optimistic -- consider a validation split or cross-validation.
find_best_k_precision(
    X_train=X_train_mmscaled,
    y_train=y_train,
    X_test=X_test_mmscaled,
    y_test=y_test,
)

Best Value for k: 1
Precision Score: 0.7841263806815183


In [None]:
# Find best K to improve model performance (by Recall)
def find_best_k_recall(X_train, y_train, X_test, y_test, min_k=1, max_k=25, step=2):
    """Search n_neighbors for the KNN classifier with the best weighted recall.

    One KNeighborsClassifier is fitted per candidate k on (X_train, y_train);
    its predictions for X_test are scored against y_test with
    ``recall_score(..., average='weighted')``. The winner is printed and
    returned.

    Parameters
    ----------
    X_train, y_train : training features and labels handed to ``fit``.
    X_test, y_test : evaluation features and labels used for scoring.
    min_k, max_k : inclusive bounds of the candidate range.
    step : stride between candidates. The default of 2 keeps the original
        search (odd k from 1 to 25 for the default bounds); pass 1 to try
        every k in the range.

    Returns
    -------
    (best_k, best_score) : the winning k and its weighted recall. On ties
        the smallest k wins, because a later candidate must be strictly better.

    Raises
    ------
    ValueError
        If the bounds and step yield no candidate k at all.
    """
    candidates = range(min_k, max_k + 1, step)
    if len(candidates) == 0:
        # Previously this case silently reported the invalid value k=0.
        raise ValueError(
            "No candidate k values for min_k={}, max_k={}, step={}".format(min_k, max_k, step)
        )

    best_k = None
    best_score = None
    for k in candidates:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        recall = recall_score(y_test, preds, average='weighted')
        # The None check lets the first candidate win even when its score is 0.0.
        if best_score is None or recall > best_score:
            best_k = k
            best_score = recall

    print("Best Value for k: {}".format(best_k))
    print("Recall Score: {}".format(best_score))
    return best_k, best_score

# NOTE(review): k is chosen by scoring on the test split, so the reported
# score is optimistic -- consider a validation split or cross-validation.
find_best_k_recall(
    X_train=X_train_mmscaled,
    y_train=y_train,
    X_test=X_test_mmscaled,
    y_test=y_test,
)