Marcos André Monteiro de Barros

<h2>Implementação da classe KNN</h2>

In [None]:
import numpy as np
from collections import Counter

def euclidian_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape, or plain numbers).
    """
    delta = x1 - x2
    squared_sum = np.sum(delta * delta)
    return np.sqrt(squared_sum)

class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Each query sample receives the label that occurs most often among its
    ``k`` closest training samples. When several labels share the highest
    count, the one met first while walking the neighbours from nearest to
    farthest wins.
    """

    def __init__(self, k=3):
        """Create the classifier.

        Args:
            k: Number of neighbours that vote; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training data.

        Args:
            X: Array-like with one training sample per entry along the first
                axis. Each sample is flattened to a 1-D feature vector.
            y: Array-like with one label per training sample.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Convert once so lists are accepted, and cast to float so
        # unsigned-integer features cannot wrap around when subtracted.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("X must contain at least one training sample")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X.reshape(len(X), -1)
        self.y_train = y

    def predict(self, X):
        """Return a list with the predicted label of every sample in ``X``."""
        X = np.asarray(X, dtype=float)
        return [self._predict(x) for x in X]

    def _predict(self, x):
        """Predict the label of the single sample ``x``."""
        sample = np.asarray(x, dtype=float).ravel()

        # Distance to every training sample in one vectorised pass instead of
        # one Python-level call per training row.
        distances = np.sqrt(np.sum((self.X_train - sample) ** 2, axis=1))

        # The slice simply yields fewer indices when k exceeds the number of
        # training samples, so every available sample votes in that case.
        k_indices = np.argsort(distances)[:self.k]
        votes = Counter(self.y_train[i] for i in k_indices)

        return votes.most_common(1)[0][0]


<h2>Treino da classe KNN</h2>

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from random import randint

# Evaluate KNN (k=5) on Iris: mean test accuracy over repeated 80/20 splits.
iris = datasets.load_iris()
X, y = iris.data, iris.target
n_runs = 5
acc_cont = 0

for i in range(n_runs):
    # Seed each repetition with its own index: the split differs from run to
    # run (so the average is meaningful) while staying reproducible. A single
    # fixed seed would just repeat the same split n_runs times.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    clf = KNN(k=5)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # predict() returns a list; convert explicitly before the element-wise
    # comparison with the label array.
    acc_cont += np.mean(np.asarray(predictions) == y_test)


acc = acc_cont / n_runs

# Scale to percent before rounding; rounding first and multiplying afterwards
# can print float artifacts such as 92.97999999999999.
print(f"A média da acuracia foi de {round(acc * 100, 2)}%")

A média da acuracia foi de 100.0%


 <h2>Classe WKNN</h2>

In [None]:
import numpy as np
from collections import Counter

def euclidian_distance(x1, x2):
    """Compute the Euclidean distance between two points.

    ``x1`` and ``x2`` must support element-wise subtraction. The raw distance
    is returned.
    """
    # NOTE: an inverse-distance weight, 1 / (1 + distance), was sketched here
    # at some point but is not applied.
    return np.sqrt(np.square(x1 - x2).sum())

class WKNN:
    """Distance-weighted k-nearest-neighbours classifier (Euclidean distance).

    The ``k`` nearest training samples vote with a weight that falls linearly
    from 1 for the nearest neighbour to 0 for the farthest of the selected
    neighbours::

        weight_i = (d_far - d_i) / (d_far - d_near)

    When the nearest and farthest selected neighbours are equally distant,
    every neighbour gets weight 1. The label with the largest total weight
    wins; on a tie the label met first (nearest first) is returned.
    """

    def __init__(self, k=3):
        """Create the classifier.

        Args:
            k: Number of neighbours that vote; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training data.

        Args:
            X: Array-like with one training sample per entry along the first
                axis. Each sample is flattened to a 1-D feature vector.
            y: Array-like with one label per training sample.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Convert once so lists are accepted, and cast to float so
        # unsigned-integer features cannot wrap around when subtracted.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("X must contain at least one training sample")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X.reshape(len(X), -1)
        self.y_train = y

    def predict(self, X):
        """Return a list with the predicted label of every sample in ``X``."""
        X = np.asarray(X, dtype=float)
        return [self._predict(x) for x in X]

    def _predict(self, x):
        """Predict the label of the single sample ``x``."""
        sample = np.asarray(x, dtype=float).ravel()

        # Distance to every training sample, computed once and reused below
        # instead of being recomputed for every neighbour.
        distances = np.sqrt(np.sum((self.X_train - sample) ** 2, axis=1))

        # Slicing yields fewer than k indices when the training set is
        # smaller than k; iterating over the indices themselves (not over
        # range(k)) keeps that case from indexing out of bounds.
        k_indices = np.argsort(distances)[:self.k]
        nearest = distances[k_indices]
        d_near, d_far = nearest[0], nearest[-1]

        if d_far == d_near:
            # All selected neighbours are equally distant: plain majority vote.
            weights = np.ones(len(k_indices))
        else:
            weights = (d_far - nearest) / (d_far - d_near)

        weighted_labels = {}
        for index, weight in zip(k_indices, weights):
            label = self.y_train[index]
            weighted_labels[label] = weighted_labels.get(label, 0) + weight

        return max(weighted_labels, key=weighted_labels.get)

<h2>Treino WKNN</h2>

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from random import randint

# Evaluate WKNN (k=5) on Iris: mean test accuracy over repeated 80/20 splits.
iris = datasets.load_iris()
X, y = iris.data, iris.target
n_runs = 5
acc_cont = 0


for i in range(n_runs):
    # Seed each repetition with its own index: the split differs from run to
    # run (so the average is meaningful) while staying reproducible. A single
    # fixed seed would just repeat the same split n_runs times.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    clf = WKNN(k=5)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # predict() returns a list; convert explicitly before the element-wise
    # comparison with the label array.
    acc_cont += np.mean(np.asarray(predictions) == y_test)

acc = acc_cont / n_runs
# Round the percentage so float noise is not printed.
print(f"A média da acuracia foi de {round(acc * 100, 2)}%")

A média da acuracia foi de 100.0%


<h2>Treino com múltiplos datasets KNN</h2>

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from random import randint

# Evaluate KNN (k=5) on several scikit-learn datasets, reporting the mean
# test accuracy over repeated 60/40 splits of each one.
# NOTE(review): the WKNN multi-dataset cell uses test_size=0.2 while this one
# uses 0.4 - confirm which is intended so the two results are comparable.
datasets_list = [datasets.load_iris(), datasets.load_digits(), datasets.load_wine(), datasets.load_breast_cancer()]
dataset_names = ["Iris", "Digits", "Wine", "Breast Cancer"]
n_runs = 5

for i, dataset in enumerate(datasets_list):
    X, y = dataset.data, dataset.target
    acc_count_KNN = 0

    for j in range(n_runs):
        # Seed each repetition with its own index: the split differs from run
        # to run while staying reproducible. A single fixed seed would just
        # repeat the same split n_runs times.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=j)

        clf = KNN(k=5)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        # predict() returns a list; convert explicitly before the
        # element-wise comparison with the label array.
        correct_predictions = np.sum(np.asarray(predictions) == y_test)
        acc_count_KNN += correct_predictions


    # Every split of a dataset has the same test size, so the total number of
    # evaluated samples is n_runs * len(y_test).
    KNN_accuracy = acc_count_KNN / (n_runs * len(y_test))
    # Scale to percent before rounding; rounding first and multiplying
    # afterwards can print float artifacts such as 92.97999999999999.
    print(f"A média da acurácia do algoritmo {dataset_names[i]} foi de {round(KNN_accuracy * 100, 2)}%")

A média da acurácia do algoritmo Iris foi de 98.33%
A média da acurácia do algoritmo Digits foi de 98.89%
A média da acurácia do algoritmo Wine foi de 69.44%
A média da acurácia do algoritmo Breast Cancer foi de 95.61%


<h2>Treino com múltiplos datasets WKNN</h2>


In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from random import randint

# Evaluate WKNN (k=5) on several scikit-learn datasets, reporting the mean
# test accuracy over repeated 80/20 splits of each one.
# NOTE(review): the KNN multi-dataset cell uses test_size=0.4 while this one
# uses 0.2 - confirm which is intended so the two results are comparable.
datasets_list = [datasets.load_iris(), datasets.load_digits(), datasets.load_wine(), datasets.load_breast_cancer()]
dataset_names = ["Iris", "Digits", "Wine", "Breast Cancer"]
n_runs = 5

for i, dataset in enumerate(datasets_list):
    X, y = dataset.data, dataset.target
    acc_count_WKNN = 0

    for j in range(n_runs):
        # Seed each repetition with its own index: the split differs from run
        # to run while staying reproducible. A single fixed seed would just
        # repeat the same split n_runs times.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=j)

        clf = WKNN(k=5)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        # predict() returns a list; convert explicitly before the
        # element-wise comparison with the label array.
        correct_predictions = np.sum(np.asarray(predictions) == y_test)
        acc_count_WKNN += correct_predictions


    # Every split of a dataset has the same test size, so the total number of
    # evaluated samples is n_runs * len(y_test).
    WKNN_accuracy = acc_count_WKNN / (n_runs * len(y_test))
    # Scale to percent before rounding; rounding first and multiplying
    # afterwards can print float artifacts such as 92.97999999999999.
    print(f"A média da acurácia do algoritmo {dataset_names[i]} foi de {round(WKNN_accuracy * 100, 2)}%")


A média da acurácia do algoritmo Iris foi de 100.0%
A média da acurácia do algoritmo Digits foi de 98.33%
A média da acurácia do algoritmo Wine foi de 77.78%
A média da acurácia do algoritmo Breast Cancer foi de 92.97999999999999%
