In [None]:
def euclidian_distance(v1, v2):
    return np.sqrt(sum([(d1 - d2) ** 2 for d1, d2 in zip(v1, v2)]))

In [12]:
import numpy as np
import scipy

class KNearestNeighbors():
    
    def __init__(self, k=10, dist_func=euclidian_distance):
        self.k = k
        self.dist_func = dist_func
    
    def fit(self, X, y):
        if type(X) == scipy.sparse.csr.csr_matrix:
            X = X.todense()
        if type(X) != np.ndarray:
            X = np.array(X)
        if type(y) != np.ndarray:
            y = np.array(y)
        if X.shape[0] != y.shape[0]:
            raise Exception('number of rows in X isn\'t match with number of rows in y')
        self.X = X
        self.y = y
        
    def predict_single(self, x):
        if type(x) != np.ndarray:
            x = np.array(x)
        if x.shape[1] != self.X.shape[1]:
            raise Exception('number of features in x isn\'t match with number of features in training data')
        
        k_nearest = [(self.dist_func(x, self.X[i]), self.y[i]) for i in range(self.X.shape[0])]
        print(k_nearest)
        k_nearest = sorted(k_nearest)
        k_nearest = k_nearest[:self.k]
        
        counts = {}
        for _, cls in k_nearest:
            if cls not in counts:
                counts[cls] = 0
            counts[cls] += 1
        
        current_count_max = 0
        current_class = 0
        for cls, cnt in counts:
            if cnt > current_count_max:
                current_class = cls
                current_count_max = cnt
        
        return current_class
    
    def predict(self, X):
        return [self.predict_single(x) for x in X]
    

In [13]:
import pandas as pd

df = pd.read_csv('clean_dataset_with_stemming.csv')
X = df['Teks']
y = df['label']

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from utils import fair_train_test_split

X_train, X_test, y_train, y_test = fair_train_test_split(X, y, test_size=0.0001)

count_vect = CountVectorizer()
df_count_vect = count_vect.fit_transform(X_train).todense()

In [15]:
from sklearn.metrics import accuracy_score

clf = KNearestNeighbors()
clf.fit(df_count_vect, y_train)
y_pred = clf.predict(count_vect.transform(X_test).todense())
print('Accuracy of KNN : ', accuracy_score(y_pred, y_test))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [5]:
arr = [(4, 5.01), (5.03, 2), (1.01, 5)]
arr = sorted(arr)
print(arr)

[(1.01, 5), (4, 5.01), (5.03, 2)]
