<a href="https://colab.research.google.com/github/menna03/Decision-Trees-and-K-nn-/blob/main/K_Nearest_Neighbors_(KNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

20217011_20217005_20216129_20216091_20218003

# Problem 2: K-Nearest Neighbors (KNN)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter

In [None]:
# Read the diabetes dataset from the CSV file into a DataFrame
diabetes_data = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# Split the data into training and testing sets: 30% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(
    diabetes_data,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Normalizing the features using the min-max scaling approach.
#
# The scaling statistics (min / max) are taken from the TRAINING split only and
# are then applied to both splits. Scaling the test split with its own min/max
# would put train and test features on different scales and would leak
# information about the test set into the preprocessing step.

# Numeric columns of the dataset (the last one, "Outcome", is the class label)
cols = ["Pregnancies","Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age","Outcome"]
# Only the predictors are rescaled; the class label keeps its original values
feature_cols = [col for col in cols if col != "Outcome"]

# Work on explicit copies so the assignments below never write into a view of
# the original frame (avoids pandas' SettingWithCopyWarning).
train_data = train_data.copy()
test_data = test_data.copy()

# Max / min values present in the training set: these drive the scaling
max_values = train_data[cols].max()
min_values = train_data[cols].min()
# Max / min values present in the testing set: kept for inspection only,
# they are deliberately NOT used for scaling
max_values_test = test_data[cols].max()
min_values_test = test_data[cols].min()

feature_min = min_values[feature_cols]
# A constant column has a zero range; dividing by 1 instead maps it to 0
# rather than producing NaN / inf.
feature_range = (max_values[feature_cols] - feature_min).replace(0, 1)

# Training features end up in [0, 1]; test features may fall slightly outside
# that range when a test value lies beyond the training min/max.
train_data[feature_cols] = (train_data[feature_cols] - feature_min) / feature_range
test_data[feature_cols] = (test_data[feature_cols] - feature_min) / feature_range

In [None]:
# Euclidean (straight-line) distance between two points (objects)
def euclidean_distance(x1, x2):
    """Return the Euclidean distance between two equally sized points.

    Both arguments are converted to NumPy arrays, so lists, tuples and
    pandas Series are all accepted.
    """
    delta = np.asarray(x1) - np.asarray(x2)
    squared_total = np.square(delta).sum()
    return np.sqrt(squared_total)

In [None]:
# Find the nearest neighbours of a given testing instance within the training
# data using the Euclidean distance.
def get_neighbors(training_data, test_instance, k):
    """Return the ``k`` training rows closest to ``test_instance``.

    The last column of ``training_data`` and the last element of
    ``test_instance`` are treated as the class label and are excluded from the
    distance computation.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training rows; every column except the last one is a numeric feature.
    test_instance : pandas.Series or sequence
        The instance to classify, laid out like one row of ``training_data``.
    k : int
        Number of neighbours to return.

    Returns
    -------
    list of (pandas.Series, float)
        ``(training_row, distance)`` pairs ordered by ascending distance.
        Rows at the same distance keep their original order in
        ``training_data``. Fewer than ``k`` pairs are returned when the
        training set has fewer than ``k`` rows.
    """
    # Feature matrix: every training column except the trailing label column.
    train_features = training_data.iloc[:, :-1].to_numpy(dtype=float)

    # Positional slicing is made explicit for pandas objects so the result does
    # not depend on how the installed pandas version interprets integer keys.
    if hasattr(test_instance, "iloc"):
        query = test_instance.iloc[:-1]
    else:
        query = test_instance[:-1]
    query_features = np.asarray(query, dtype=float)

    # Euclidean distance from the query to every training row in one
    # vectorized operation instead of a Python-level loop over the rows.
    distances = np.sqrt(np.sum((train_features - query_features) ** 2, axis=1))

    # A stable sort keeps equally distant rows in their original order.
    nearest_positions = np.argsort(distances, kind="stable")[:k]
    return [(training_data.iloc[pos], distances[pos]) for pos in nearest_positions]

In [None]:
# Give a higher weight to the nearest neighbours, which should have the biggest
# influence on the classification decision.
def distance_weighted_voting(neighbors, min_distance=1e-4):
    """Pick a class label by inverse-distance weighted voting.

    Parameters
    ----------
    neighbors : iterable of (neighbor, distance)
        ``neighbor`` is a pandas Series or a plain sequence whose last element
        is the class label; ``distance`` is its distance to the query point.
    min_distance : float, optional
        Positive lower bound applied to every distance before inverting it.
        The default of ``1e-4`` gives an exact match (distance 0) a weight of
        ``1e4`` and guarantees that no neighbour that is merely very close can
        outweigh an exact match.

    Returns
    -------
    The label with the largest accumulated weight. On an exact tie the label
    that was encountered first wins.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty.
    """
    # Accumulated vote weight per class label
    class_votes = {}

    for neighbor, distance in neighbors:
        # Read the label positionally: ``.iloc`` for pandas objects (plain
        # integer keys on a label-indexed Series are deprecated), ordinary
        # indexing for tuples / lists / arrays.
        if hasattr(neighbor, "iloc"):
            label = neighbor.iloc[-1]
        else:
            label = neighbor[-1]

        # Inverse-distance weight; clamping avoids division by zero and keeps
        # the weight monotonic (closer never weighs less than farther).
        weight = 1 / max(distance, min_distance)

        # Start from zero for a label seen for the first time
        class_votes[label] = class_votes.get(label, 0) + weight

    if not class_votes:
        raise ValueError("distance_weighted_voting() requires at least one neighbor")

    # Label with the maximum accumulated weight
    return max(class_votes, key=class_votes.get)


In [None]:
# Run the k-nearest-neighbours classifier over a whole test set and score it
def KNN(k, training_data, test_data):
    """Classify every row of ``test_data`` with k-NN and report the accuracy.

    For each test row the ``k`` nearest training rows are fetched with
    ``get_neighbors``. The predicted class is the majority class among them;
    when the two most frequent classes are tied, the decision is delegated to
    ``distance_weighted_voting``. The last column of both frames is treated as
    the class label.

    Parameters
    ----------
    k : int
        Number of neighbours to consult; must be at least 1.
    training_data : pandas.DataFrame
        Labelled training rows.
    test_data : pandas.DataFrame
        Labelled rows to classify.

    Returns
    -------
    (correct, total, accuracy) : (int, int, float)
        Number of correctly classified test rows, number of test rows, and
        their ratio. ``accuracy`` is 0.0 when ``test_data`` is empty.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if ``training_data`` is empty while
        there are test rows to classify.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    total = len(test_data)
    if total and len(training_data) == 0:
        raise ValueError("training_data must contain at least one row")

    correct = 0
    for _, row in test_data.iterrows():
        # get_neighbors already returns at most k neighbours ordered by
        # ascending distance, so no further sorting or slicing is needed.
        neighbors = get_neighbors(training_data, row, k)

        # Class label of each neighbour (last element, read positionally)
        top_classes = [neighbor.iloc[-1] for neighbor, _ in neighbors]

        # The two most frequent classes are enough to detect a tie for first place
        ranked = Counter(top_classes).most_common(2)
        is_tie = len(ranked) > 1 and ranked[0][1] == ranked[1][1]

        if is_tie:
            # Break the tie in favour of the closer neighbours
            predicted_class = distance_weighted_voting(neighbors)
        else:
            predicted_class = ranked[0][0]

        # Compare against the true label stored in the last column
        if predicted_class == row.iloc[-1]:
            correct += 1

    # Share of correct predictions; defined as 0.0 for an empty test set
    accuracy = correct / total if total else 0.0
    return correct, total, accuracy

In [None]:
# Candidate values of k to evaluate
k_values = [2, 3, 4, 5, 8, 7]
# Accuracy obtained for each k, in the order the values are evaluated
accuracies = []

# Run KNN once per k and report the result of every run
for k in k_values:
    correct, total, accuracy = KNN(k, train_data, test_data)
    accuracies.append(accuracy)

    report_lines = (
        f"K value: {k}",
        f"Number of correctly classified instances: {correct}",
        f"Total number of instances in the test set: {total}",
        f"Accuracy: {accuracy * 100:.2f}%",
        "",
    )
    for line in report_lines:
        print(line)

# Average the per-k accuracies and display the final result
num_iterations = len(accuracies)
avg_accuracy = sum(accuracies) / num_iterations
print(f"Average accuracy across all iterations: {avg_accuracy * 100:.2f}%")

K value: 2
Number of correctly classified instances: 157
Total number of instances in the test set: 231
Accuracy: 67.97%

K value: 3
Number of correctly classified instances: 159
Total number of instances in the test set: 231
Accuracy: 68.83%

K value: 4
Number of correctly classified instances: 163
Total number of instances in the test set: 231
Accuracy: 70.56%

K value: 5
Number of correctly classified instances: 166
Total number of instances in the test set: 231
Accuracy: 71.86%

K value: 8
Number of correctly classified instances: 172
Total number of instances in the test set: 231
Accuracy: 74.46%

K value: 7
Number of correctly classified instances: 163
Total number of instances in the test set: 231
Accuracy: 70.56%

Average accuracy across all iterations: 70.71%


K = 8 is the optimal value for this dataset among the tested values of K.

# 20217011
#20217005
#20216129
#20216091
#20218003