In [1]:
# %pip install scikit-learn

### Necessary Imports
- NumPy
- Pandas
- Math
- Copy
- Train Test Split

In [2]:
# Importing libraries
import copy
import math
from collections import Counter
from queue import PriorityQueue

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Load the Dataset

In [3]:
# Read the seeds CSV (path is relative to the notebook's working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='seeds_dataset.csv')
# Keep the DataFrame as the cell's last expression so the notebook renders it
data

Unnamed: 0,Area,Perimeter.,Compactness,Length of kernel,Width of kernel,Asymmetry coefficient,Length of kernel groove,class
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


### Perform Some Preprocessing

In [4]:
# Feature matrix 'X': every column of the dataframe except 'class', as a numpy array
X = data.drop(columns='class').to_numpy()
# Label vector 'Y': the 'class' column on its own, as a numpy array
Y = data['class'].to_numpy()

In [5]:
# Split the dataset into training (80%) and testing (20%) subsets.
# A fixed random_state pins the shuffle, so the split - and every accuracy
# figure computed from it - is reproducible after a kernel restart.
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [6]:
# Report the shapes of the feature arrays and label vectors produced by the split
print(f"Training Data Set Dimensions {x_train.shape}, Training True Class lables Dimension {y_train.shape}")
print(f"Test Data Set Dimensions {x_test.shape}, Test True Class lables Dimension {y_test.shape}")

Training Data Set Dimensions (168, 7), Training True Class lables Dimension (168,)
Test Data Set Dimensions (42, 7), Test True Class lables Dimension (42,)


### Define the Following Functions

In [7]:
def euclidean_distance(predicted, target):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Parameters
    ----------
    predicted, target : sequence of numbers or numpy array
        The two points to compare. They must have the same shape.

    Returns
    -------
    float
        Square root of the sum of squared coordinate differences
        (0.0 for two empty vectors).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. Comparing vectors of
        different lengths is rejected instead of silently ignoring the
        extra coordinates.
    """
    first = np.asarray(predicted, dtype=float)
    second = np.asarray(target, dtype=float)
    # Explicit check: numpy broadcasting would otherwise accept some
    # mismatched shapes (e.g. length 1 vs length 3) without complaint.
    if first.shape != second.shape:
        raise ValueError(
            "euclidean_distance expects inputs of the same shape, got "
            + str(first.shape) + " and " + str(second.shape)
        )
    # Squaring already removes the sign, so no abs() is needed.
    return float(np.sqrt(np.sum((second - first) ** 2)))

In [8]:
def get_k_nearest(distances, k):
    """Return the ``k`` entries of ``distances`` with the smallest first item.

    Each entry must be indexable; entries are ordered by ``entry[0]`` in
    ascending order and the leading ``k`` of them are returned as a new list.
    The input collection itself is not modified.
    """
    # Copy first so the caller's collection keeps its original order
    ranked = list(distances)
    # list.sort is stable, so entries with equal keys keep their input order
    ranked.sort(key=lambda entry: entry[0])
    # Slice off the leading k entries
    return ranked[0:k]

In [9]:
def get_max_class(classes):
    """Return the most frequent label in ``classes`` (majority vote).

    Parameters
    ----------
    classes : iterable of hashable labels
        The labels to vote over, e.g. a list or a 1-D numpy array.

    Returns
    -------
    The label with the highest count, or ``None`` when ``classes`` is empty.
    When several labels share the highest count, the one that appears first
    in ``classes`` wins.
    """
    # Single pass to count every label (avoids calling list.count per element)
    tally = Counter(classes)
    if not tally:
        return None
    # Counter iterates in first-seen order and max() keeps the first maximal
    # item, so ties resolve to the label encountered earliest.
    return max(tally, key=tally.get)

In [10]:
def get_classes_only(k_nearest):
    """Collect the item at index 1 (the class label) of every entry in ``k_nearest``.

    Returns a new list with one label per entry, in the same order.
    """
    labels = []
    for neighbour in k_nearest:
        # Position 1 of each entry holds its class label
        labels.append(neighbour[1])
    return labels

In [11]:
def calculate_accuracy(prediction, target):
    """Print and return the percentage of predictions that equal their targets.

    Parameters
    ----------
    prediction : sequence
        Predicted labels.
    target : sequence
        True labels, aligned position by position with ``prediction``.

    Returns
    -------
    float
        Accuracy as a percentage between 0 and 100. The same value is also
        printed, prefixed with "Calculated Accuracy:  ".

    Raises
    ------
    ValueError
        If the two sequences differ in length, or if they are empty (the
        accuracy of zero predictions is undefined).
    """
    total = len(prediction)
    if total != len(target):
        raise ValueError(
            "prediction and target must have the same length, got "
            + str(total) + " and " + str(len(target))
        )
    if total == 0:
        raise ValueError("cannot compute the accuracy of an empty prediction set")

    # Count the positions where the predicted label matches the true label
    hits = sum(1 for guess, truth in zip(prediction, target) if guess == truth)
    accuracy = (hits / total) * 100
    print("Calculated Accuracy:  " + str(accuracy))
    return accuracy

### Define KNN Running Script Function

In [16]:
# Using the KNN Supervised Learning Algorithm for the model
# It will help us classify the quality of wheat seeds

# Defining the KNN function
def KNN(x_train,x_test,y_train, k):
    """Classify every row of ``x_test`` by majority vote among its nearest training rows.

    For each test row, the Euclidean distance to every training row is
    computed, the ``k`` closest training rows are selected, and the most
    frequent label among them becomes the prediction.

    Parameters
    ----------
    x_train : sequence of feature vectors
        Training instances.
    x_test : sequence of feature vectors
        Instances to classify.
    y_train : sequence
        Labels of the training instances, aligned with ``x_train``.
    k : int
        Number of neighbours that vote. If ``k`` exceeds the number of
        training instances, all of them vote.

    Returns
    -------
    list
        One predicted label per row of ``x_test``, in the same order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if ``x_train`` and ``y_train`` differ
        in length.
    """
    # k = 0 would leave no neighbours to vote; a negative k would slice from the wrong end
    if k < 1:
        raise ValueError("k must be at least 1, got " + str(k))
    if len(x_train) != len(y_train):
        raise ValueError(
            "x_train and y_train must have the same length, got "
            + str(len(x_train)) + " and " + str(len(y_train))
        )

    predicted_labels = []
    for test_point in x_test:
        # (distance, label) pair for every training instance w.r.t. this test instance
        distances = [
            (euclidean_distance(train_point, test_point), label)
            for train_point, label in zip(x_train, y_train)
        ]
        # Keep only the k closest training instances
        nearest_k = get_k_nearest(distances, k)
        # Majority vote over the labels of those neighbours
        k_labels = get_classes_only(nearest_k)
        predicted_labels.append(get_max_class(k_labels))
    return predicted_labels

### Perform Prediction and Show Accuracy

In [17]:
# Predict the test-set labels from the 3 nearest neighbours, then report the accuracy.
# The trailing semicolon keeps the cell output limited to what the call prints.
preds = KNN(x_train, x_test, y_train, k=3)
calculate_accuracy(prediction=preds, target=y_test);

Calculated Accuracy:  92.85714285714286


<hr>

### Test Cases (If Required in Logic Building)

In [14]:
# Each check prints the result (for the reader) and asserts it (so a
# regression fails loudly under Restart & Run All instead of just printing
# a different value).

# Testing euclidean_distance function
point1 = [1, 2, 3]
point2 = [4, 5, 6]
distance = euclidean_distance(point1, point2)
print(distance)
# Every coordinate differs by 3, so the distance is sqrt(3**2 * 3) = sqrt(27)
assert math.isclose(distance, math.sqrt(27))

# Testing get_classes_only function
k_nearest = [(np.array([1, 2]), 'A'),(np.array([3, 4]), 'B'),(np.array([5, 6]), 'A'),(np.array([7, 8]), 'A'),(np.array([9, 10]), 'B')]
classes = get_classes_only(k_nearest)
print(classes)
assert classes == ['A', 'B', 'A', 'A', 'B']

# Testing get_max_class function
classes = ['apple', 'banana', 'banana', 'orange', 'apple', 'banana', 'banana']
max_class = get_max_class(classes)
print(max_class)
# 'banana' appears 4 times, more than any other label
assert max_class == 'banana'

# Testing get_k_nearest function
distances = [(0.3, 'class1'), (0.5, 'class2'), (0.2, 'class1'), (0.4, 'class2'), (0.1, 'class1')]
k_nearest = get_k_nearest(distances, 3)
print(k_nearest)
assert k_nearest == [(0.1, 'class1'), (0.2, 'class1'), (0.3, 'class1')]

# Testing calculate_accuracy function
# The two lists are identical, so the printed accuracy should be 100.0.
# The trailing semicolon keeps the cell output limited to what the call prints.
prediction = [1, 1, 1, 0, 0, 1, 1, 0]
target = [1, 1, 1, 0, 0, 1, 1, 0]
calculate_accuracy(prediction, target);


5.196152422706632
['A', 'B', 'A', 'A', 'B']
banana
[(0.1, 'class1'), (0.2, 'class1'), (0.3, 'class1')]
Calculated Accuracy:  100.0
