## K-Nearest Neighbor

### Import Data

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 1000)

# Import data to Pandas DataFrame
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_final = pd.read_csv('data/test_final.csv')

# Split X & y for train, test, and test_final
X_train = train.drop('target', axis=1)
y_train = train['target']

X_test = test.drop('target', axis=1)
y_test = test['target']

X_test_final = test_final.drop('target', axis=1)
y_test_final = test_final['target']

# Preivew data
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13988 entries, 0 to 13987
Data columns (total 28 columns):
acousticness          13988 non-null float64
danceability          13988 non-null float64
duration_ms           13988 non-null float64
energy                13988 non-null float64
instrumentalness      13988 non-null float64
liveness              13988 non-null float64
loudness              13988 non-null float64
mode_feat             13988 non-null float64
speechiness           13988 non-null float64
tempo                 13988 non-null float64
valence               13988 non-null float64
time_signature_1.0    13988 non-null int64
time_signature_3.0    13988 non-null int64
time_signature_4.0    13988 non-null int64
time_signature_5.0    13988 non-null int64
key_0.0               13988 non-null int64
key_1.0               13988 non-null int64
key_2.0               13988 non-null int64
key_3.0               13988 non-null int64
key_4.0               13988 non-null int64
key_5.0  

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode_feat,speechiness,tempo,valence,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0,key_0.0,key_1.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0,key_10.0,key_11.0,target
0,0.62,0.465,146494.0,0.469,0.0,0.118,-4.256,1.0,0.0319,69.22,0.518,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,7
1,0.00598,0.489,211885.0,0.641,0.0201,0.122,-7.011,0.0,0.032,108.901,0.587,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,0.249,0.594,283733.0,0.635,0.0,0.454,-4.259,0.0,0.47,75.035,0.552,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,2
3,0.000667,0.674,191250.0,0.869,0.604,0.0651,-4.758,0.0,0.0417,127.802,0.071,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,7
4,0.0288,0.675,231133.0,0.76,2.2e-05,0.0585,-4.435,1.0,0.0513,109.619,0.611,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,4


### Scale Train & Test Data

In [2]:
# Standardize with MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train_mmscaled = min_max_scaler.fit_transform(X_train)
X_test_mmscaled = min_max_scaler.transform(X_test)

### Fit KNN Model

In [3]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn1 = KNeighborsClassifier()
knn1.fit(X_train_mmscaled, y_train)
knn1_train_preds = knn1.predict(X_train_mmscaled)
knn1_test_preds = knn1.predict(X_test_mmscaled)

# Print metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y_train, knn1_train_preds)
print_metrics(y_test, knn1_test_preds)

Accuracy Score: 0.5868601658564484
Accuracy Score: 0.4143551615670575


### Find Best K by Accuracy

In [4]:
# Find best K to improve model performance (by Accuracy)
def find_best_k_acc(X_train, y_train, X_test, y_test, min_k=1, max_k=50):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        acc = accuracy_score(y_test, preds)
        if acc > best_score:
            best_k = k
            best_score = acc
    
    print("Best Value for k: {}".format(best_k))
    print("Accuracy: {}".format(best_score))

find_best_k_acc(X_train_mmscaled, y_train, X_test_mmscaled, y_test)

Best Value for k: 27
Accuracy: 0.44895624821275376


### Fit KNN Model w/ Optimized K=27

In [5]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn2 = KNeighborsClassifier(n_neighbors=27)
knn2.fit(X_train_mmscaled, y_train)
knn2_train_preds = knn2.predict(X_train_mmscaled)
knn2_test_preds = knn2.predict(X_test_mmscaled)

# Print Accuracy
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    
print_metrics(y_train, knn2_train_preds)
print_metrics(y_test, knn2_test_preds)

Accuracy Score: 0.4845581927366314
Accuracy Score: 0.44895624821275376


### Omit Key and Time Signature Features from X

In [3]:
# Split X & y for train, test, and test_final
# Omit columns related to time signature and 
X2_train = train.drop(['target', 'mode_feat', 'liveness', 'time_signature_1.0', 'time_signature_3.0', 'time_signature_4.0', 'time_signature_5.0', 'key_0.0', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0', 'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0', 'key_11.0'], axis=1)
y2_train = train['target']

X2_test = test.drop(['target', 'mode_feat', 'liveness', 'time_signature_1.0', 'time_signature_3.0', 'time_signature_4.0', 'time_signature_5.0', 'key_0.0', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0', 'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0', 'key_11.0'], axis=1)
y2_test = test['target']

X2_test_final = test_final.drop(['target', 'mode_feat', 'liveness', 'time_signature_1.0', 'time_signature_3.0', 'time_signature_4.0', 'time_signature_5.0', 'key_0.0', 'key_1.0', 'key_2.0', 'key_3.0', 'key_4.0', 'key_5.0', 'key_6.0', 'key_7.0', 'key_8.0', 'key_9.0', 'key_10.0', 'key_11.0'], axis=1)
y2_test_final = test_final['target']

# Preivew data
print(X2_train.info())
X2_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13988 entries, 0 to 13987
Data columns (total 9 columns):
acousticness        13988 non-null float64
danceability        13988 non-null float64
duration_ms         13988 non-null float64
energy              13988 non-null float64
instrumentalness    13988 non-null float64
loudness            13988 non-null float64
speechiness         13988 non-null float64
tempo               13988 non-null float64
valence             13988 non-null float64
dtypes: float64(9)
memory usage: 983.6 KB
None


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,loudness,speechiness,tempo,valence
0,0.62,0.465,146494.0,0.469,0.0,-4.256,0.0319,69.22,0.518
1,0.00598,0.489,211885.0,0.641,0.0201,-7.011,0.032,108.901,0.587
2,0.249,0.594,283733.0,0.635,0.0,-4.259,0.47,75.035,0.552
3,0.000667,0.674,191250.0,0.869,0.604,-4.758,0.0417,127.802,0.071
4,0.0288,0.675,231133.0,0.76,2.2e-05,-4.435,0.0513,109.619,0.611


### Scale Train & Test Data

In [4]:
# Standardize with MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X2_train_mmscaled = min_max_scaler.fit_transform(X2_train)
X2_test_mmscaled = min_max_scaler.transform(X2_test)

### Fit KNN Model

In [5]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn3 = KNeighborsClassifier()
knn3.fit(X2_train_mmscaled, y2_train)
knn3_train_preds = knn3.predict(X2_train_mmscaled)
knn3_test_preds = knn3.predict(X2_test_mmscaled)

# Print metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y2_train, knn3_train_preds)
print_metrics(y2_test, knn3_test_preds)

Accuracy Score: 0.614097798112668
Accuracy Score: 0.4566771518444381


### Find Best K by Accuracy

In [6]:
# Find best K to improve model performance (by Accuracy)
def find_best_k_acc(X2_train, y2_train, X2_test, y2_test, min_k=1, max_k=50):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X2_train, y2_train)
        preds = knn.predict(X2_test)
        acc = accuracy_score(y2_test, preds)
        if acc > best_score:
            best_k = k
            best_score = acc
    
    print("Best Value for k: {}".format(best_k))
    print("Accuracy: {}".format(best_score))

find_best_k_acc(X2_train_mmscaled, y2_train, X2_test_mmscaled, y2_test)

Best Value for k: 26
Accuracy: 0.5038604518158422


### Fit KNN Model w/ Optimized K=26

In [7]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn4 = KNeighborsClassifier(n_neighbors=26)
knn4.fit(X2_train_mmscaled, y2_train)
knn4_train_preds = knn4.predict(X2_train_mmscaled)
knn4_test_preds = knn4.predict(X2_test_mmscaled)

# Print Accur
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y2_train, knn4_train_preds)
print_metrics(y2_test, knn4_test_preds)

Accuracy Score: 0.5288104089219331
Accuracy Score: 0.5038604518158422


### Integrating PCA

In [55]:
from sklearn.decomposition import PCA
def find_PCA_components(X,n_start=2,target=0.8,skip=1):
    curr_target = 0
    n = n_start
    while curr_target < target:
        pca = PCA(n_components=n)
        transformed = pca.fit_transform(X)
        curr_target = np.sum(pca.explained_variance_ratio_)
        n += skip
    print(f"n_component={n}, variance ={curr_target}")
find_PCA_components(X_train_mmscaled,n_start=2,target=0.9,skip=1)

n_component=16, variance =0.9121079282126102


In [63]:
pca = PCA(n_components=16)
X_train_transformed = pca.fit_transform(X_train_mmscaled)
X_test_transformed = pca.fit_transform(X_test_mmscaled)

In [64]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn5 = KNeighborsClassifier(n_neighbors=26)
knn5.fit(X_train_transformed, y_train)
knn5_train_preds = knn5.predict(X_train_transformed)
knn5_test_preds = knn5.predict(X_test_transformed)

# Print Accur
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y_train, knn5_train_preds)
print_metrics(y_test, knn5_test_preds)

Accuracy Score: 0.4275092936802974
Accuracy Score: 0.3068344295110094


### PCA -features

In [58]:
from sklearn.decomposition import PCA
def find_PCA_components(X,n_start=2,target=0.8,skip=1):
    curr_target = 0
    n = n_start
    while curr_target < target:
        pca = PCA(n_components=n)
        transformed = pca.fit_transform(X)
        curr_target = np.sum(pca.explained_variance_ratio_)
        n += skip
    print(f"n_component={n}, variance ={curr_target}")
find_PCA_components(X2_train_mmscaled,n_start=2,target=0.8,skip=1)

n_component=5, variance =0.8310453663005559


In [65]:
pca = PCA(n_components=5)
X2_train_transformed = pca.fit_transform(X2_train_mmscaled)
X2_test_transformed = pca.fit_transform(X2_test_mmscaled)

In [66]:
# Fit KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn6 = KNeighborsClassifier(n_neighbors=26)
knn6.fit(X2_train_transformed, y2_train)
knn6_train_preds = knn6.predict(X2_train_transformed)
knn6_test_preds = knn6.predict(X2_test_transformed)

# Print Accur
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def print_metrics(labels, preds):
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))

print_metrics(y2_train, knn6_train_preds)
print_metrics(y2_test, knn6_test_preds)

Accuracy Score: 0.4623248498713183
Accuracy Score: 0.27766657134686873
