In [1]:
from time import perf_counter, time

import numpy as np
# Seed NumPy's global RNG so the random arrays drawn in the cells below are reproducible.
np.random.seed(2)

In [2]:
# Broadcasting demo: adding a (5, 1) column to a (1, 5) row yields a (5, 5) array
# whose entry [i, j] equals vector[i, 0] + matrix[0, j].
# Note that, despite its name, `matrix` is a single 1 x 5 row.
# np.random.randint(1, 6, ...) draws integers from 1 to 5 (the upper bound is exclusive).

matrix = np.random.randint(1, 6, size = (1,5))
vector = np.random.randint(1, 6, size = (5,1))

# Print the broadcast sum first, then the two operands it was built from.
print(vector + matrix)
print(vector)
print(matrix)


[[2 2 5 4 5]
 [4 4 7 6 7]
 [3 3 6 5 6]
 [5 5 8 7 8]
 [4 4 7 6 7]]
[[1]
 [3]
 [2]
 [4]
 [3]]
[[1 1 4 3 4]]


In [15]:
# Problem size: N points, each with d coordinates.
d, N = 1000, 10000
# X stacks the N points as rows, shape (N, d); z is a single point of shape (d,).
# Both are filled with standard-normal samples by np.random.randn.
X = np.random.randn(N, d)
z = np.random.randn(d)


#### Khoảng cách từ một điểm đến từng điểm trong một tập hợp


In [4]:
def dist_pp(z, x):
    """Return the squared Euclidean distance between the points ``z`` and ``x``.

    ``x`` is reshaped to ``z.shape`` before the subtraction, so the two inputs
    may be laid out differently as long as they hold the same number of
    elements. The result is the sum of the squared coordinate differences
    (no square root is taken).
    """
    delta = z - x.reshape(z.shape)
    return np.sum(np.square(delta))

def dist_ps_naive(z, X):
    """Squared distances from the point ``z`` to every row of ``X``, row by row.

    Walks over the rows of ``X`` in a plain Python loop and delegates each
    pair to ``dist_pp``. Returns a float array of shape ``(1, N)`` where
    ``N = X.shape[0]``; entry ``[0, i]`` is the squared distance to ``X[i]``.
    """
    n_points = X.shape[0]
    distances = np.zeros((1, n_points))
    for row_idx, row in enumerate(X):
        distances[0, row_idx] = dist_pp(z, row)
    return distances

def dist_ps_fast(z, X):
    """Vectorised squared distances from the point ``z`` to every row of ``X``.

    Uses the expansion ``||x - z||^2 = ||x||^2 + ||z||^2 - 2 * x.z`` so the
    whole computation reduces to one matrix-vector product plus row-wise
    sums, with no Python-level loop. For a 1-D ``z`` of length ``d`` and an
    ``(N, d)`` array ``X`` the result has shape ``(N,)``.
    """
    row_sq_norms = (X * X).sum(axis = 1)   # ||x_i||^2 for each row of X
    z_sq_norm = (z * z).sum()              # ||z||^2, a scalar
    cross_terms = np.matmul(X, z)          # x_i . z for each row of X
    return row_sq_norms + z_sq_norm - 2 * cross_terms

# Benchmark both point-to-set implementations on the same (z, X) and check that they agree.
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time() follows the system wall clock, which can be adjusted mid-run.
begin = perf_counter()
D1 = dist_ps_naive(z, X)
print('naive point2set, running time:', perf_counter() - begin, 's')

begin = perf_counter()
D2 = dist_ps_fast(z, X)
print('fast point2set, running time:', perf_counter() - begin, 's')

# D1 has shape (1, N) and D2 has shape (N,); the subtraction broadcasts to (1, N),
# so the norm measures the overall element-wise disagreement between the two results.
print('Result difference : ',np.linalg.norm(D1 - D2))
    

naive point2set, running time: 0.4497668743133545 s
fast point2set, running time: 0.12954950332641602 s
Result difference :  2.5816676365570415e-11


#### Khoảng cách giữa từng cặp điểm trong hai tập hợp

In [5]:
# Second point set: M points of the same dimension d, stacked as the rows of Z
# and filled with standard-normal samples.
M = 100
Z = np.random.randn(M, d)

def dist_ss_naive(Z, X):
    """Pairwise squared distances between the rows of ``Z`` and the rows of ``X``.

    Loops over the rows of ``Z`` and fills one output row per iteration with
    ``dist_ps_fast``. Returns a float array of shape ``(M, N)`` with
    ``M = Z.shape[0]`` and ``N = X.shape[0]``; entry ``[i, j]`` is the squared
    distance between ``Z[i]`` and ``X[j]``.
    """
    n_rows, n_cols = Z.shape[0], X.shape[0]
    pairwise = np.zeros((n_rows, n_cols))
    for row_idx, z_point in enumerate(Z):
        pairwise[row_idx] = dist_ps_fast(z_point, X)
    return pairwise

def dist_ss_fast(Z, X):
    """Fully vectorised pairwise squared distances between rows of ``Z`` and ``X``.

    Applies ``||z - x||^2 = ||z||^2 + ||x||^2 - 2 * z.x`` to all pairs at once:
    the squared norms of ``Z`` form a column, those of ``X`` form a row, and
    broadcasting them against the ``Z @ X.T`` product gives the ``(M, N)``
    result in a single expression.
    """
    z_sq_col = (Z * Z).sum(axis = 1).reshape(-1, 1)   # column of ||z_i||^2
    x_sq_row = (X * X).sum(axis = 1).reshape(1, -1)   # row of ||x_j||^2
    cross_terms = np.matmul(Z, X.T)                   # entry [i, j] is z_i . x_j
    return z_sq_col + x_sq_row - 2 * cross_terms

# Benchmark both set-to-set implementations on the same (Z, X) and check that they agree.
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time() follows the system wall clock, which can be adjusted mid-run.
begin = perf_counter()
D3 = dist_ss_naive(Z, X)
print('naive set2set, running time:', perf_counter() - begin, 's')

begin = perf_counter()
D4 = dist_ss_fast(Z, X)
print('fast set2set, running time:', perf_counter() - begin, 's')

# Frobenius norm of the difference between the two distance matrices;
# a value near zero means the implementations agree up to rounding error.
print('Result difference : ',np.linalg.norm(D3 - D4))

naive set2set, running time: 10.85176396369934 s
fast set2set, running time: 0.22307085990905762 s
Result difference :  9.551967834751661e-11


#### Iris flower dataset

In [6]:
from sklearn import datasets, neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
# Re-seed NumPy's global RNG right before the split.
# NOTE(review): train_test_split receives no random_state here, so the split presumably
# depends on this global seed — confirm, or pass random_state explicitly.
np.random.seed(7)
iris_data = datasets.load_iris()
iris_X = iris_data.data      # feature matrix
iris_y = iris_data.target    # integer class labels
print('Labels :',np.unique(iris_y))

# split train and test
# test_size = 130 is an integer, i.e. an absolute number of test samples rather than
# a fraction; the remaining samples form the (deliberately small) training set.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size = 130)
print(f'Train size : {X_train.shape[0]}, test_size : {X_test.shape[0]}')

Labels : [0 1 2]
Train size : 20, test_size : 130


In [8]:
# Baseline: 1-nearest-neighbour classifier. p = 2 selects the Euclidean (L2) norm
# for scikit-learn's default Minkowski metric.
model = neighbors.KNeighborsClassifier(n_neighbors = 1, p = 2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Report accuracy on the held-out test split as a percentage.
print('Accuracy of 1NN : {:.2f}%'.format(100 * accuracy_score(y_test, y_pred)))

Accuracy of 1NN : 92.31%


In [9]:
# 7 nearest neighbours with no `weights` argument, so the classifier's default
# voting scheme is used (each neighbour's vote counts equally).
# Rebinds `model` and `y_pred` from the previous cell.
model = neighbors.KNeighborsClassifier(n_neighbors = 7, p = 2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Report accuracy on the held-out test split as a percentage.
print('Accuracy of 7NN with major voting : {:.2f}%'.format(100 * accuracy_score(y_test, y_pred)))

Accuracy of 7NN with major voting : 93.85%


In [10]:
# 7 nearest neighbours with weights = 'distance': each neighbour's vote is weighted
# by the inverse of its distance, so closer neighbours have more influence.
# Rebinds `model` and `y_pred` from the previous cell.
model = neighbors.KNeighborsClassifier(n_neighbors = 7, p = 2, weights = 'distance')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Report accuracy on the held-out test split as a percentage.
print('Accuracy of 7NN with (1/distance weights) : {:.2f}%'.format(100 * accuracy_score(y_test, y_pred)))

Accuracy of 7NN with (1/distance weights) : 94.62%


In [11]:
def customized_weight(distance, sigma = .4):
    """Gaussian-style neighbour weight ``exp(-distance**2 / sigma)``.

    Intended as a callable for the ``weights`` argument of
    ``KNeighborsClassifier``, which invokes it with the distances only; in
    that case ``sigma`` keeps its default and the result is unchanged from
    the previous hard-coded version.

    Parameters
    ----------
    distance : numpy.ndarray or scalar
        Non-negative neighbour distances.
    sigma : float, default 0.4
        Decay scale in the exponent. Larger values make the weight fall off
        more slowly with distance. Must be non-zero.

    Returns
    -------
    numpy.ndarray or scalar
        Weights with the same shape as ``distance``: 1 at distance 0 and
        decreasing towards 0 as the distance grows (for positive ``sigma``).
    """
    return np.exp(-distance**2/sigma)

# 7 nearest neighbours where the vote weights come from a user-supplied callable:
# customized_weight maps the neighbour distances to weights.
# Rebinds `model` and `y_pred` from the previous cell.
model = neighbors.KNeighborsClassifier(n_neighbors = 7, p = 2, weights = customized_weight)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Report accuracy on the held-out test split as a percentage.
print('Accuracy of 7NN with customized_weight : {:.2f}%'.format(100 * accuracy_score(y_test, y_pred)))

Accuracy of 7NN with customized_weight : 95.38%
