In [38]:
import pandas as pd

In [39]:
#function that creates a dataframe from a csv file, using the column that contains strings as the index
def create_indexed_dataframe(csv_filename):
    """Read a CSV file and use its non-numeric (text) column as the index.

    Parameters
    ----------
    csv_filename : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the text column moved into the index. When the
        file has several text columns the last one is used.

    Raises
    ------
    ValueError
        If every column is numeric, so there is nothing to index on.
    """
    initial_data = pd.read_csv(csv_filename)
    #a "not numeric" test also catches pandas' dedicated string dtypes, not only object
    text_columns = [
        col for col in initial_data.columns
        if not pd.api.types.is_numeric_dtype(initial_data[col].dtype)
    ]
    if not text_columns:
        raise ValueError(
            "no non-numeric column found to use as the index; columns are: "
            f"{list(initial_data.columns)}"
        )
    #index once, on the last text column
    return initial_data.set_index(text_columns[-1])

In [40]:
"""function that creates a Distance column that will contain the Manhattan distances 
between the attributes and the species"""
def manhattan_distance_KNN(dataframe, *attributes):
    """Add a 'Distance' column with each row's Manhattan distance to a query point.

    The k values in ``attributes`` are matched positionally against the first k
    columns of ``dataframe``. A 'Distance' column left over from an earlier call
    is never used as one of those columns, so calling the function again on the
    same frame gives the same result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose leading columns are numeric. It is modified in place.
    *attributes : numbers
        Coordinates of the query point, one per leading column.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now carrying the 'Distance' column.

    Raises
    ------
    IndexError
        If more attributes are given than there are columns to compare with.
    """
    #leave out any previous Distance column so it cannot be mistaken for a feature
    features = dataframe.drop(columns='Distance', errors='ignore')
    #accumulate one column at a time on plain arrays (no per-cell loop, no index alignment)
    distances = 0
    for position, attribute in enumerate(attributes):
        distances = distances + abs(attribute - features.iloc[:, position].to_numpy())
    #create (or overwrite) the column on the caller's frame
    dataframe['Distance'] = distances
    return dataframe

In [41]:
"""function that creates a Distance column that will contain the Euclidean distances 
between the attributes and the species"""
def euclidean_distance_KNN(dataframe, *attributes):
    """Add a 'Distance' column with each row's Euclidean distance to a query point.

    The k values in ``attributes`` are matched positionally against the first k
    columns of ``dataframe``. A 'Distance' column left over from an earlier call
    is never used as one of those columns, so calling the function again on the
    same frame gives the same result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose leading columns are numeric. It is modified in place.
    *attributes : numbers
        Coordinates of the query point, one per leading column.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now carrying the 'Distance' column.

    Raises
    ------
    IndexError
        If more attributes are given than there are columns to compare with.
    """
    #leave out any previous Distance column so it cannot be mistaken for a feature
    features = dataframe.drop(columns='Distance', errors='ignore')
    #sum the squared differences one column at a time on plain arrays
    squared_sum = 0
    for position, attribute in enumerate(attributes):
        squared_sum = squared_sum + (attribute - features.iloc[:, position].to_numpy()) ** 2
    #square root of the sum, in accordance with the Euclidean formula
    dataframe['Distance'] = squared_sum ** (1 / 2)
    return dataframe

In [42]:
"""function that creates a Distance column that will contain the Minkowski distances 
between the attributes and the species"""
def minkowski_distance_KNN(dataframe, p, *attributes):
    """Add a 'Distance' column with each row's Minkowski distance to a query point.

    The k values in ``attributes`` are matched positionally against the first k
    columns of ``dataframe``. A 'Distance' column left over from an earlier call
    is never used as one of those columns, so calling the function again on the
    same frame gives the same result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose leading columns are numeric. It is modified in place.
    p : number
        Order of the distance; must be non-zero because the result is raised
        to the power ``1 / p``.
    *attributes : numbers
        Coordinates of the query point, one per leading column.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now carrying the 'Distance' column.

    Raises
    ------
    IndexError
        If more attributes are given than there are columns to compare with.
    ZeroDivisionError
        If ``p`` is zero.
    """
    #leave out any previous Distance column so it cannot be mistaken for a feature
    features = dataframe.drop(columns='Distance', errors='ignore')
    #sum |difference|**p one column at a time on plain arrays
    powered_sum = 0
    for position, attribute in enumerate(attributes):
        powered_sum = powered_sum + abs(attribute - features.iloc[:, position].to_numpy()) ** p
    #pth root of the sum, in accordance with the Minkowski formula
    dataframe['Distance'] = powered_sum ** (1 / p)
    return dataframe

In [43]:
#function that returns a dataframe containing the nearest neighbours
def nearest_neighbours(dataframe, N):
    """Return the N rows with the smallest 'Distance', keeping only that column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that already has a 'Distance' column. It is not modified.
    N : int
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with the single column 'Distance', ordered from smallest
        to largest, still carrying the original index labels.
    """
    #select the column instead of dropping the others: drop()'s axis argument is
    #keyword-only in recent pandas and inplace edits on a derived frame are fragile
    return dataframe.nsmallest(N, 'Distance')[['Distance']]

In [44]:
#function that returns a list of the best species
def find_species(dictionary):
    """Return every key of ``dictionary`` whose value equals the largest value.

    Parameters
    ----------
    dictionary : dict
        Maps a label to a comparable value (the callers in this notebook pass
        occurrence counts).

    Returns
    -------
    list
        The keys tied for the highest value, in the dictionary's iteration
        order. An empty dictionary gives an empty list.
    """
    #max() would raise on an empty dictionary, so answer that case directly
    if not dictionary:
        return []
    highest_value = max(dictionary.values())
    #keep every key that reaches the maximum so ties are all reported
    return [key for key, value in dictionary.items() if value == highest_value]

In [45]:
"""function will return the N nearest neighbours to a given set of attributes from a CSV file 
using Manhattan distance"""
def KNN_manhattan(csv_filename, N, *attributes):
    """Show the N Manhattan-nearest rows of a CSV file and report the most common label.

    Loads the file with ``create_indexed_dataframe``, displays the data, the
    data with its 'Distance' column and the N nearest rows, then returns a
    message naming the index label(s) that occur most often among those rows.

    Parameters
    ----------
    csv_filename : passed straight to ``create_indexed_dataframe``.
    N : number of nearest rows to keep.
    *attributes : query-point values, forwarded to ``manhattan_distance_KNN``.

    Returns
    -------
    str
        "The best species is: [...]" or, on a tie, "The best species are: [...]".
    """
    species_frame = create_indexed_dataframe(csv_filename)
    print("The original data:")
    display(species_frame)
    print("")
    print("The Manhattan distances between the species and the given attributes")
    #the distance helper returns the very frame it was given, with Distance added
    with_distances = manhattan_distance_KNN(species_frame, *attributes)
    display(with_distances)
    print("")
    print(f"{N} Nearest Neighbours:")
    closest = nearest_neighbours(with_distances, N)
    display(closest)
    #move the index back into the first column and count how often each label occurs
    label_counts = closest.reset_index().iloc[:, 0].value_counts().to_dict()
    winners = find_species(label_counts)
    #singular wording unless several labels are tied
    if len(winners) <= 1:
        return f"The best species is: {winners}"
    return f"The best species are: {winners}"

In [46]:
#find the 5 nearest neighbours of the query values 6.75, 3, 2 in KNN.csv using Manhattan distance
KNN_manhattan('KNN.csv', 5, 6.75, 3, 2)

The original data:


Unnamed: 0_level_0,Length,Width,Height
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Setosa,2.25,5.5,7.2
Setosa,3.25,8.25,8.1
Versciolor,2.75,7.5,3.3
Setosa,3.5,5.25,2.7
Setosa,3.0,3.25,6.9
Virginica,2.0,2.0,7.0
Virginica,5.75,8.75,3.1
Virginica,4.75,6.25,5.4
Virginica,5.5,6.75,7.8
Versciolor,5.25,9.5,8.6



The Manhattan distances between the species and the given attributes


Unnamed: 0_level_0,Length,Width,Height,Distance
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Setosa,2.25,5.5,7.2,12.2
Setosa,3.25,8.25,8.1,14.85
Versciolor,2.75,7.5,3.3,9.8
Setosa,3.5,5.25,2.7,6.2
Setosa,3.0,3.25,6.9,8.9
Virginica,2.0,2.0,7.0,10.75
Virginica,5.75,8.75,3.1,7.85
Virginica,4.75,6.25,5.4,8.65
Virginica,5.5,6.75,7.8,10.8
Versciolor,5.25,9.5,8.6,14.6



5 Nearest Neighbours:


Unnamed: 0_level_0,Distance
Species,Unnamed: 1_level_1
Setosa,6.2
Virginica,7.85
Virginica,8.65
Versciolor,8.87
Setosa,8.9


"The best species are: ['Virginica', 'Setosa']"

In [47]:
"""function to return the N nearest neighbours to a given set of attributes from a CSV file
using Euclidean distance"""
def KNN_euclidean(csv_filename, N, *attributes):
    """Show the N Euclidean-nearest rows of a CSV file and report the most common label.

    Loads the file with ``create_indexed_dataframe``, displays the data, the
    data with its 'Distance' column and the N nearest rows, then returns a
    message naming the index label(s) that occur most often among those rows.

    Parameters
    ----------
    csv_filename : passed straight to ``create_indexed_dataframe``.
    N : number of nearest rows to keep.
    *attributes : query-point values, forwarded to ``euclidean_distance_KNN``.

    Returns
    -------
    str
        "The best species is: [...]" or, on a tie, "The best species are: [...]".
    """
    species_frame = create_indexed_dataframe(csv_filename)
    print("The original data:")
    display(species_frame)
    print("")
    print("The Euclidean distances between the species and the given attributes")
    #the distance helper returns the very frame it was given, with Distance added
    with_distances = euclidean_distance_KNN(species_frame, *attributes)
    display(with_distances)
    print("")
    print(f"{N} Nearest Neighbours:")
    closest = nearest_neighbours(with_distances, N)
    display(closest)
    #move the index back into the first column and count how often each label occurs
    label_counts = closest.reset_index().iloc[:, 0].value_counts().to_dict()
    winners = find_species(label_counts)
    #singular wording unless several labels are tied
    if len(winners) <= 1:
        return f"The best species is: {winners}"
    return f"The best species are: {winners}"

In [48]:
#find the 4 nearest neighbours of the query values 6.75, 3, 2 in KNN.csv using Euclidean distance
KNN_euclidean('KNN.csv', 4, 6.75, 3, 2)

The original data:


Unnamed: 0_level_0,Length,Width,Height
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Setosa,2.25,5.5,7.2
Setosa,3.25,8.25,8.1
Versciolor,2.75,7.5,3.3
Setosa,3.5,5.25,2.7
Setosa,3.0,3.25,6.9
Virginica,2.0,2.0,7.0
Virginica,5.75,8.75,3.1
Virginica,4.75,6.25,5.4
Virginica,5.5,6.75,7.8
Versciolor,5.25,9.5,8.6



The Euclidean distances between the species and the given attributes


Unnamed: 0_level_0,Length,Width,Height,Distance
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Setosa,2.25,5.5,7.2,7.317103
Setosa,3.25,8.25,8.1,8.776246
Versciolor,2.75,7.5,3.3,6.159545
Setosa,3.5,5.25,2.7,4.014349
Setosa,3.0,3.25,6.9,6.175354
Virginica,2.0,2.0,7.0,6.96868
Virginica,5.75,8.75,3.1,5.939066
Virginica,4.75,6.25,5.4,5.111018
Virginica,5.5,6.75,7.8,7.018903
Versciolor,5.25,9.5,8.6,9.384029



4 Nearest Neighbours:


Unnamed: 0_level_0,Distance
Species,Unnamed: 1_level_1
Setosa,4.014349
Virginica,5.111018
Versciolor,5.933203
Virginica,5.939066


"The best species is: ['Virginica']"

In [49]:
"""function will return the N nearest neighbours to a given set of attributes from a CSV file
using Minkowski distance"""
def KNN_minkowski(csv_filename, N, p, *attributes):
    """Show the N Minkowski-nearest rows of a CSV file and report the most common label.

    Loads the file with ``create_indexed_dataframe``, displays the data, the
    data with its 'Distance' column and the N nearest rows, then returns a
    message naming the index label(s) that occur most often among those rows.

    Parameters
    ----------
    csv_filename : passed straight to ``create_indexed_dataframe``.
    N : number of nearest rows to keep.
    p : order of the Minkowski distance, forwarded to ``minkowski_distance_KNN``.
    *attributes : query-point values, forwarded to ``minkowski_distance_KNN``.

    Returns
    -------
    str
        "The best species is: [...]" or, on a tie, "The best species are: [...]".
    """
    species_frame = create_indexed_dataframe(csv_filename)
    print("The original data:")
    display(species_frame)
    print("")
    print("The Minkowski distances between the species and the given attributes")
    #the distance helper returns the very frame it was given, with Distance added
    with_distances = minkowski_distance_KNN(species_frame, p, *attributes)
    display(with_distances)
    print("")
    print(f"{N} Nearest Neighbours:")
    closest = nearest_neighbours(with_distances, N)
    display(closest)
    #move the index back into the first column and count how often each label occurs
    label_counts = closest.reset_index().iloc[:, 0].value_counts().to_dict()
    winners = find_species(label_counts)
    #singular wording unless several labels are tied
    if len(winners) <= 1:
        return f"The best species is: {winners}"
    return f"The best species are: {winners}"

In [50]:
#find the 5 nearest neighbours of the query values 6.75, 3, 2 in KNN.csv using Minkowski distance with p=3
KNN_minkowski('KNN.csv', 5, 3, 6.75, 3, 2)

The original data:


Unnamed: 0_level_0,Length,Width,Height
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Setosa,2.25,5.5,7.2
Setosa,3.25,8.25,8.1
Versciolor,2.75,7.5,3.3
Setosa,3.5,5.25,2.7
Setosa,3.0,3.25,6.9
Virginica,2.0,2.0,7.0
Virginica,5.75,8.75,3.1
Virginica,4.75,6.25,5.4
Virginica,5.5,6.75,7.8
Versciolor,5.25,9.5,8.6



The Minkowski distances between the species and the given attributes


Unnamed: 0_level_0,Length,Width,Height,Distance
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Setosa,2.25,5.5,7.2,6.277335
Setosa,3.25,8.25,8.1,7.456394
Versciolor,2.75,7.5,3.3,5.398376
Setosa,3.5,5.25,2.7,3.58465
Setosa,3.0,3.25,6.9,5.543989
Virginica,2.0,2.0,7.0,6.154962
Virginica,5.75,8.75,3.1,5.773406
Virginica,4.75,6.25,5.4,4.337975
Virginica,5.5,6.75,7.8,6.297921
Versciolor,5.25,9.5,8.6,8.269448



5 Nearest Neighbours:


Unnamed: 0_level_0,Distance
Species,Unnamed: 1_level_1
Setosa,3.58465
Virginica,4.337975
Versciolor,5.374543
Versciolor,5.398376
Setosa,5.543989


"The best species are: ['Setosa', 'Versciolor']"