In [115]:
# IMPORTS
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
import time

In [116]:
# Build KNN algorithm:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1-x2)**2))

class KNN:
    def __init__(self, k=3):
        self.k = k
        
    # Fit the data:
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    # Predict the target:
    def predict(self, X):
        predict_labels = [self.distances(x) for x in X]
        return np.array(predict_labels).astype(float)

    def distances(self, x: float):
        # Compute the ditances:
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Get k nearest samples, labels:
        indices = np.argsort(distances)[:self.k]
        nearest_labels = [self.y_train[i] for i in indices]
        # Get the most common class label:
        most_common = Counter(nearest_labels).most_common(1)
        return most_common[0][0]

In [121]:
# Upload the data:
path = '../app/api/spotify.csv'
df = pd.read_csv(path)
df = df.drop(columns=['Unnamed: 0', 'song_title', 'artist'])
df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,1
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,1
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,1
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,1


In [123]:
# test the knn class:
from sklearn import datasets
from sklearn.model_selection import train_test_split


train, test= train_test_split(df, test_size=0.2, random_state=42)

X_train = train.drop(columns='target')
y_train = train['target']

X_test = test.drop(columns='target')
y_test = test['target']

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1613, 13) (1613,) (404, 13) (404,)


In [124]:
# convert the train and test to numpy array:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [125]:
# Test my_knn algorithm on Spotify dataset:
start_time = time.time()
my_knn = KNN(k=5)
my_knn.fit(X_train, y_train)
predictions = my_knn.predict(X_test)
acc = np.sum(predictions == y_test)/len(y_test)

# Print the score and the run time to compare with sklearn:
print(acc)
print('-------{}--------'.format(time.time()-start_time))

0.5618811881188119
-------5.72216796875--------


In [126]:
# Test the Sklearn knn:
start_time = time.time()
skl_knn = KNeighborsClassifier()
skl_knn.fit(X_train, y_train)
skl_pred = skl_knn.predict(X_test)
skl_acc = skl_knn.score(X_test, y_test)

# Print the score and the run time to compare with my_knn algorithm:
print(skl_acc)
print('-------{}--------'.format(time.time()-start_time))

0.5618811881188119
-------0.02453017234802246--------
