In [117]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from collections import Counter

In [118]:
file = "Exo1.xlsx"
dataset = pd.read_excel(file)


In [119]:
def visualize_data (dataset) :
    print(dataset.head())
    print ("Nombre de lignes du dataset :", dataset.shape[0])
    print("Nombre de colonnes du dataset : ", dataset.shape[1])
    print("Noms des colonnes : ", dataset.columns.tolist())

visualize_data(dataset)

   petal length  petal width  sepal length  sepal width        Class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
Nombre de lignes du dataset : 150
Nombre de colonnes du dataset :  5
Noms des colonnes :  ['petal length', 'petal width', 'sepal length', 'sepal width', 'Class']


In [120]:
def manhattan_distance(point1, point2):
    if len(point1) != len(point2):
        raise ValueError("Les points doivent être de même dimension")
    return sum([abs(x - y) for x, y in zip(point1, point2)])

In [121]:
dist = manhattan_distance(dataset.iloc[0, [0,1,2,3]].values, dataset.iloc[1, [0,1,2,3]].values)
dist

0.6999999999999993

In [122]:
def euclidean_distance( point1, point2):
    if len(point1) != len(point2):
        raise ValueError("Les points doivent être de même dimension")
    return math.sqrt(sum([(x - y) ** 2 for x, y in zip(point1, point2)]))

In [123]:
dist = euclidean_distance(dataset.iloc[0, [0,1,2,3]].values, dataset.iloc[1, [0,1,2,3]].values)
dist

0.5385164807134502

In [124]:
def minkowski_distance(dataset, attribut1, attribut2, p):
    if len(dataset[attribut1]) != len(dataset[attribut2]):
        raise ValueError("Les points doivent être de même dimension")
    return sum(abs(x - y) ** p for x, y in zip(dataset[attribut1], dataset[attribut2])) ** (1 / p)

In [125]:
# Avec p = 2 ça devient équivalent a la distance euclidienne
dist = minkowski_distance(dataset,'petal length', 'petal width', 2)
dist

36.1792758357599

In [126]:
dist = minkowski_distance(dataset,'petal length', 'petal width', 1)
dist

418.40000000000003

In [127]:
def sort_dataset_by_distance(dataset, reference_point, distance_function, columns):
    distances = dataset[columns].apply(lambda row: distance_function( row.values, reference_point), axis=1)
    dataset['Distance'] = distances  # Ajout d'une colonne "Distance" pour trier le dataset
    sorted_dataset = dataset.sort_values(by='Distance')
    return sorted_dataset

In [128]:
# Point de référence
point_reference = [0, 0, 0, 0]

# Utilisation de la fonction pour trier le dataset selon la distance euclidienne
sorted_df = sort_dataset_by_distance(dataset, point_reference, euclidean_distance, columns=['petal length', 'petal width', 'sepal length', 'sepal width'])
print(sorted_df)

     petal length  petal width  sepal length  sepal width           Class  \
41            4.5          2.3           1.3          0.3     Iris-setosa   
13            4.3          3.0           1.1          0.1     Iris-setosa   
8             4.4          2.9           1.4          0.2     Iris-setosa   
38            4.4          3.0           1.3          0.2     Iris-setosa   
42            4.4          3.2           1.3          0.2     Iris-setosa   
..            ...          ...           ...          ...             ...   
105           7.6          3.0           6.6          2.1  Iris-virginica   
122           7.7          2.8           6.7          2.0  Iris-virginica   
118           7.7          2.6           6.9          2.3  Iris-virginica   
131           7.9          3.8           6.4          2.0  Iris-virginica   
117           7.7          3.8           6.7          2.2  Iris-virginica   

      Distance  
41    5.226854  
13    5.358171  
8     5.456189  
38    5

In [129]:
def majority_class(classes):
    counter = Counter(classes)
    return counter.most_common(1)[0][0]

In [130]:
def KNN(dataset, query_point, k, distance_function):
    sorted_dataset = sort_dataset_by_distance(dataset, query_point, distance_function, columns=['petal length', 'petal width', 'sepal length', 'sepal width'])
    k_nearest_neighbors = sorted_dataset[:k]
    classes = [str(instance) for instance in k_nearest_neighbors.iloc[:,[4]].values]  
    return majority_class(classes)

In [131]:
instance = [5.2, 3.5, 1.41, 0.25]

# Avec K = 3
result_K3 = KNN(dataset, instance, k=3, distance_function=euclidean_distance)
print(f"Classe prédite avec K = 3 : {result_K3}")

# Avec K = 5
result_K5 = KNN(dataset, instance, k=5, distance_function=euclidean_distance)
print(f"Classe prédite avec K = 5 : {result_K5}")

Classe prédite avec K = 3 : ['Iris-setosa']
Classe prédite avec K = 5 : ['Iris-setosa']


In [132]:
# Avec K = 5
result_K5 = KNN(dataset, instance, k=5, distance_function=manhattan_distance)
print(f"Classe prédite avec K = 5 : {result_K5}")

Classe prédite avec K = 5 : ['Iris-setosa']
