In [1]:
import numpy as np
from sklearn.datasets import load_iris

In [2]:
# Load the Iris dataset object returned by scikit-learn's load_iris();
# the cells below read its DESCR, data, feature_names, target and
# target_names attributes.
data = load_iris()

In [None]:
# Parenthesised single-argument form behaves the same as a Python 2 print
# statement and as a Python 3 function call.
print(data.DESCR)

sepal - чашелистник<br>
petal - лепесток

In [None]:
# Shape of the data matrix (later cells treat rows as samples and columns
# as features).
data.data.shape

In [None]:
# Names attached to the feature columns of data.data.
data.feature_names

In [None]:
# Target vector of the dataset.
# NOTE(review): presumably one integer class label per row of data.data — confirm.
data.target

In [None]:
# Class names; testClassifier prints them as the header of the confusion matrix.
data.target_names

In [3]:
# normalization: divide every feature column by that column's maximum, so the
# largest value in each column becomes 1. The result is written back into the
# existing array.
column_max = data.data.max(axis=0)
data.data[...] = data.data / column_max

In [4]:
# One row-block per species, taken as consecutive slices of the data matrix.
# NOTE(review): assumes rows are ordered by class, 50 rows per class — confirm
# against data.target.
setosa_data, versicolor_data, virginica_data = (
    data.data[start:stop] for start, stop in ((0, 50), (50, 100), (100, None))
)

In [5]:
# Seed the global NumPy generator so the shuffle, and therefore the
# train/test split and every metric printed below, is identical on each
# Restart & Run All.
np.random.seed(42)

# Shuffle the rows of each class block in place (row order only; each block
# still contains exactly the same samples).
np.random.shuffle(setosa_data)
np.random.shuffle(versicolor_data)
np.random.shuffle(virginica_data)

In [6]:
# Per class: the first 40 shuffled rows form the training part, the remaining
# rows form the test part.
setosa_data_train, setosa_data_test = setosa_data[:40], setosa_data[40:]

versicolor_data_train, versicolor_data_test = versicolor_data[:40], versicolor_data[40:]

virginica_data_train, virginica_data_test = virginica_data[:40], virginica_data[40:]

$$\rho(u, x_i) = \sqrt{\sum_{j=1}^{n} \left| u^j - x_i^j \right|^2}$$

In [7]:
def distance(u, x):
    """Return the Euclidean distance between ``u`` and ``x``.

    Both arguments must support ``u - x`` (e.g. 1-D NumPy arrays of equal
    length). The result is the square root of the dot product of the
    difference with itself.
    """
    delta = u - x
    squared_norm = np.dot(np.transpose(delta), delta)
    return squared_norm ** 0.5

Написать функции всех метрических алгоритмов, которые были на слайдах (ближайшего соседа, k ближайших соседей, k взвешенных ближайших соседей, метод окна Парзена, метод потенциальных функций). Применить (обучить) все эти алгоритмы на train выборке. Построить матрицу неточностей, для каждого класса вывести точность и полноту (все это сделать на тестовой выборке). Не пользоваться стандартными функциями.

In [8]:
class Classifier(object):
    """Base class for the metric classifiers in this notebook.

    Holds the distance function and the memorised training sample;
    subclasses supply the actual ``predict`` logic.
    """

    def __init__(self, distance):
        """Store ``distance``, a callable ``distance(u, x)`` comparing two points."""
        self.distance = distance

    def fit(self, X, y, n_classes):
        """Memorise the training sample; no computation happens here.

        X         -- sequence of training points
        y         -- class label of each training point, in the same order
        n_classes -- number of distinct classes
        """
        self.X_train = X
        self.y_train = y
        self.n_classes = n_classes

    def countDistances(self, u):
        """Return ``[distance, train_index]`` pairs from ``u`` to every training point.

        The pairs are listed in training order (not sorted by distance).
        """
        # Read the sample stored by fit(); the previous code used the
        # notebook-level X_train global, so the classifier silently ignored
        # whatever had been passed to fit().
        return [[self.distance(u, x), i] for i, x in enumerate(self.X_train)]

In [11]:
class WeightKnn(Classifier):
    """k nearest neighbours with a pluggable neighbour-weight function."""

    def predictElement(self, u, distances):
        """Classify one point from its precomputed distance list.

        u         -- the point being classified (forwarded to the weight function)
        distances -- ``[distance, train_index]`` pairs as built by countDistances

        The ``self.k_neighbors`` closest training points each add
        ``self.weight(rank, train_index, u)`` to the score of their class,
        where ``rank`` is 0 for the closest neighbour. Returns the class index
        with the largest score; ties go to the lowest class index.
        """
        scores = [0 for _ in range(self.n_classes)]
        nearest = sorted(distances)[:self.k_neighbors]
        for rank, (_, train_index) in enumerate(nearest):
            # Labels come from the sample stored by fit(); the previous code
            # read the notebook-level y_train global instead.
            scores[self.y_train[train_index]] += self.weight(rank, train_index, u)
        # max() returns the first maximal item, i.e. the lowest class index on ties.
        return max(range(len(scores)), key=lambda c: scores[c])

    def predict(self, X, k_neighbors, weight):
        """Predict a class index for every point of ``X``.

        k_neighbors -- how many nearest training points vote
        weight      -- callable ``weight(rank, train_index, u)`` giving one vote's weight
        """
        self.k_neighbors = k_neighbors
        self.weight = weight
        return [self.predictElement(x, self.countDistances(x)) for x in X]

In [10]:
class Knn(WeightKnn):
    """Plain k nearest neighbours: every neighbour's vote counts as 1."""

    def predict(self, X, k_neighbors):
        """Predict a class for each point of ``X`` using ``k_neighbors`` equally weighted votes."""
        def unit_weight(i, j, y):
            # Same weight whatever the rank, training index or query point.
            return 1

        return super(Knn, self).predict(X, k_neighbors, unit_weight)
    

In [12]:
class Parzen(WeightKnn):
    """Parzen-window classifier built on the weighted k-NN voting of WeightKnn.

    A neighbour's vote is ``kernel(distance / h)``. The width ``h`` is either
    fixed (passed to ``predict``) or, when omitted, taken per query point as
    the distance to its k-th nearest training point.
    """

    def predictParzenElement(self, u, kernel):
        """Classify ``u`` with a window width equal to the distance to its k-th nearest neighbour."""
        distances = self.countDistances(u)
        h = 1.0 * sorted(distances)[self.k_neighbors - 1][0]

        def weight(i, j, point):
            # h == 0 means the k nearest training points all coincide with the
            # query; dividing would give 0/0 (nan votes or ZeroDivisionError),
            # so evaluate the kernel at zero distance instead.
            if h == 0:
                return kernel(0.0)
            return kernel(self.distance(self.X_train[j], point) / h)

        self.weight = weight
        return self.predictElement(u, distances)

    def predict(self, X, k_neighbors, kernel, *h):
        """Predict a class index for every point of ``X``.

        k_neighbors -- number of nearest training points that vote
        kernel      -- callable mapping the scaled distance to a vote weight
        h           -- optional fixed window width (only the first extra
                       argument is used); when omitted the width is chosen
                       per query point from its k-th nearest neighbour

        In fixed-width mode only the ``k_neighbors`` nearest points still vote.
        """
        self.k_neighbors = k_neighbors
        if len(h) > 0:
            width = h[0]
            return super(Parzen, self).predict(
                X, k_neighbors,
                lambda i, j, x: kernel(self.distance(self.X_train[j], x) / width))
        self.kernel = kernel
        return [self.predictParzenElement(x, kernel) for x in X]

In [13]:
class Potential(WeightKnn):
    """Potential-function classifier on top of the weighted k-NN voting.

    Every training point ``j`` carries a non-negative potential; its vote for
    a query point is ``potentials[j] * kernel(distance / h)``, with ``h`` the
    distance from the query to its k-th nearest training point. The potentials
    are learned by ``potentialSetup``.
    """

    def predictPotentialElement(self, u):
        """Classify ``u`` using the current potentials and ``self.kernel``."""
        distances = self.countDistances(u)
        h = 1.0 * sorted(distances)[self.k_neighbors - 1][0]

        def weight(i, j, point):
            # Use the kernel handed to predict(); the previous code looked up
            # a notebook-level name `kernel` instead.
            if h == 0:
                # All k nearest points coincide with the query: avoid 0/0.
                closeness = self.kernel(0.0)
            else:
                closeness = self.kernel(self.distance(self.X_train[j], point) / h)
            return self.potentials[j] * closeness

        self.weight = weight
        return self.predictElement(u, distances)

    def predict(self, X, k_neighbors, kernel):
        """Learn the potentials on the training sample, then classify every point of ``X``.

        k_neighbors -- number of nearest training points that vote
        kernel      -- callable mapping the scaled distance to a closeness value
        """
        self.kernel = kernel
        self.k_neighbors = k_neighbors
        # Stop training once at most 5% of the training points are misclassified.
        self.potentialSetup(0.05)
        return [self.predictPotentialElement(x) for x in X]

    def potentialSetup(self, eps, max_passes=100):
        """Learn the per-point potentials.

        Repeatedly sweeps the training sample; each misclassified point gets
        its potential increased by 1. Sweeping stops when the share of
        misclassified points in a sweep is at most ``eps`` or after
        ``max_passes`` sweeps (guard against a sample that never converges).
        """
        n_train = len(self.X_train)
        self.potentials = [0 for _ in range(n_train)]
        if n_train == 0:
            # Nothing to learn, and the error rate below would divide by zero.
            return
        n_errors = n_train
        passes = 0
        while 1.0 * n_errors / n_train > eps and passes < max_passes:
            # Count errors per sweep: reset once before the sweep, not per point.
            n_errors = 0
            for i, x in enumerate(self.X_train):
                # Compare with this point's own label, not the whole label list.
                if self.predictPotentialElement(x) != self.y_train[i]:
                    n_errors += 1
                    self.potentials[i] += 1
            passes += 1

In [14]:
# Training sample: per-class training rows concatenated in the order
# setosa, versicolor, virginica; labels 0/1/2 follow the same order,
# 40 rows per class.
X_train = list(setosa_data_train) + list(versicolor_data_train) + list(virginica_data_train)

y_train = [0] * 40 + [1] * 40 + [2] * 40

# Test sample: same layout, 10 rows per class.
X_test = list(setosa_data_test) + list(versicolor_data_test) + list(virginica_data_test)

y_test = [0] * 10 + [1] * 10 + [2] * 10

n_classes = 3

In [15]:
def getConfusionMatrix(y_expected, y_predicted, k):
    """Build a ``k`` x ``k`` table counting label pairs.

    Entry ``[a][b]`` is the number of positions at which ``y_expected`` holds
    ``a`` and ``y_predicted`` holds ``b``. Labels must be integers in
    ``range(k)``.
    """
    table = [[0] * k for _ in range(k)]
    for position, row_label in enumerate(y_expected):
        table[row_label][y_predicted[position]] += 1
    return table

In [16]:
def getPrecision(confusion_matrix, c):
    """Return the diagonal share of row ``c``: ``matrix[c][c] / sum(matrix[c])``.

    This is the precision of class ``c`` when rows are indexed by predicted
    class, which is the orientation testClassifier builds (it passes the
    predictions as the first argument of getConfusionMatrix).

    Returns None when row ``c`` sums to zero.
    """
    row_total = sum(confusion_matrix[c])
    # Compare by value: `is not 0` tests object identity, which is
    # implementation-dependent for ints and always true for a float zero.
    if row_total != 0:
        return 1.0 * confusion_matrix[c][c] / row_total
    return None

In [17]:
def getRecall(confusion_matrix, c):
    """Return the diagonal share of column ``c``: ``matrix[c][c] / sum of column c``.

    This is the recall of class ``c`` when columns are indexed by true class,
    which is the orientation testClassifier builds (it passes the true labels
    as the second argument of getConfusionMatrix).

    Returns None when column ``c`` sums to zero.
    """
    column_total = 0.0
    for row in confusion_matrix:
        column_total += row[c]
    # Compare by value: the float accumulator is never the int object 0, so
    # the old `is not 0` check was always true and an empty column raised
    # ZeroDivisionError.
    if column_total != 0:
        return confusion_matrix[c][c] / column_total
    return None

In [18]:
def print2dMatrix(matrix):
    """Print each row of ``matrix`` on its own line."""
    for row in matrix:
        print(row)

In [19]:
def testClassifier(classifier, *args):
    """Fit ``classifier`` on the training split, predict the test split and print a report.

    Extra positional ``args`` are forwarded to ``classifier.predict`` after
    ``X_test``. Reads the notebook-level names X_train, y_train, X_test,
    y_test, n_classes and data. Prints the predictions, the confusion matrix
    and the per-class precision and recall; returns nothing.
    """
    classifier.fit(X_train, y_train, n_classes)
    y_predicted = classifier.predict(X_test, *args)
    print("#predicted")
    print(y_predicted)
    print("#confusion matrix")
    # Predictions are passed first, so rows are indexed by predicted class and
    # columns by true class. getPrecision divides by a row sum and getRecall
    # by a column sum, which matches this orientation; do not swap the
    # arguments without swapping those two as well.
    matrix = getConfusionMatrix(y_predicted, y_test, n_classes)
    print(data.target_names)
    print2dMatrix(matrix)
    precisions = [getPrecision(matrix, c) for c in range(n_classes)]
    recalls = [getRecall(matrix, c) for c in range(n_classes)]
    print("#Precisions")
    print(precisions)
    print("#Recalls")
    print(recalls)

In [20]:
# Unweighted k-NN classifier that compares points with distance().
knn = Knn(distance)

In [21]:
print("# KNN, N=1")
testClassifier(knn, 1)
# Blank separator line. print("") works on Python 2 and 3, whereas a bare
# `print` does nothing on Python 3.
print("")
# The label states the k that is actually passed (it said N=5 while the call
# used 2). With an even k a tie goes to the lower class index.
print("# KNN, N=2")
testClassifier(knn, 2)

# KNN, N=1
#predicted
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
#confusion matrix
['setosa' 'versicolor' 'virginica']
[10, 0, 0]
[0, 10, 0]
[0, 0, 10]
#Precisions
[1.0, 1.0, 1.0]
#Recalls
[1.0, 1.0, 1.0]

# KNN, N=5
#predicted
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2]
#confusion matrix
['setosa' 'versicolor' 'virginica']
[10, 0, 0]
[0, 10, 2]
[0, 0, 8]
#Precisions
[1.0, 0.8333333333333334, 1.0]
#Recalls
[1.0, 1.0, 0.8]


In [22]:
# Weighted k-NN classifier; the weight function is supplied at predict time.
weightKnn = WeightKnn(distance)

In [23]:
# Weighted k-NN run: the neighbour of rank i (0 = closest) votes with weight q**i.
N = 2
q = 0.5
print("# Weighted KNN, N=2, q = 0.5")
testClassifier(weightKnn, N, lambda rank, train_index, point: q ** rank)

# Weighted KNN, N=2, q = 0.5
#predicted
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
#confusion matrix
['setosa' 'versicolor' 'virginica']
[10, 0, 0]
[0, 10, 0]
[0, 0, 10]
#Precisions
[1.0, 1.0, 1.0]
#Recalls
[1.0, 1.0, 1.0]


In [24]:
# Parzen-window classifier instance, reused for the fixed-width and
# variable-width runs below.
ParzenClassifier = Parzen(distance)

In [25]:
def kernel1(r):
    """Quadratic kernel: ``(1 - r) ** 2`` for ``r`` in [0, 1], and 0 when r < 0 or r > 1."""
    outside_support = r < 0 or r > 1
    return 0 if outside_support else 1 * ((1 - r) ** 2)

In [26]:
print("# Fixed Parzen, N=10  h = 0.5")
# Plain float window width. The previous `h = 0.5,` (trailing comma) built a
# 1-tuple, so every scaled distance and every vote became a one-element array.
h = 0.5
N = 10
kernel = kernel1
testClassifier(ParzenClassifier, N, kernel, h)

# Fixed Parzen, N=10  h = 0.5
#predicted
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
#confusion matrix
['setosa' 'versicolor' 'virginica']
[10, 0, 0]
[0, 10, 0]
[0, 0, 10]
#Precisions
[1.0, 1.0, 1.0]
#Recalls
[1.0, 1.0, 1.0]


In [27]:
# Variable-width Parzen run: no h is passed, so the classifier derives the
# window width for each query point from its N-th nearest training point.
N = 10
kernel = kernel1
print("# Parzen, N=10")
testClassifier(ParzenClassifier, N, kernel)

# Parzen, N=10
#predicted
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
#confusion matrix
['setosa' 'versicolor' 'virginica']
[10, 0, 0]
[0, 10, 0]
[0, 0, 10]
#Precisions
[1.0, 1.0, 1.0]
#Recalls
[1.0, 1.0, 1.0]


In [28]:
# Potential-function classifier instance; its potentials are (re)learned on
# every predict() call.
potential = Potential(distance)

In [29]:
def kernelPotential(r):
    """Kernel ``1 / (r + 1)``: equals 1 at ``r = 0`` and decreases as ``r`` grows."""
    denominator = r + 1.0
    return 1.0 / denominator

In [30]:
# Potential-function run with N = 10 voting neighbours and the 1 / (r + 1) kernel.
N = 10
kernel = kernelPotential
print("# Potential, N = 10")
testClassifier(potential, N, kernel)

# Potential, N = 10
#predicted
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
#confusion matrix
['setosa' 'versicolor' 'virginica']
[10, 0, 0]
[0, 10, 0]
[0, 0, 10]
#Precisions
[1.0, 1.0, 1.0]
#Recalls
[1.0, 1.0, 1.0]
