In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset: features as a labelled DataFrame, targets as a Series
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

# Stratified hold-out split with 20% of the rows reserved for testing; the fixed
# random_state keeps the split reproducible for a fair comparison
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize features: the scaler is fitted on the training split only and
# then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Switch the labels from pandas Series to NumPy arrays so the functions below
# can index them by position
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

print("Data loaded and preprocessed successfully.")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"y_train shape: {y_train.shape}")

Data loaded and preprocessed successfully.
X_train_scaled shape: (120, 4)
y_train shape: (120,)


In [2]:
def calculate_distance(point1, point2):
    """
    Calculates the Euclidean distance between two points.

    Parameters
    ----------
    point1, point2 : array-like
        Coordinates of the two points (NumPy arrays, lists or tuples of
        numbers). Their shapes must match or be broadcastable.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Convert to float arrays first: this accepts plain lists/tuples and keeps
    # unsigned-integer inputs from wrapping around during the subtraction.
    diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

# --- Example Usage ---
# Sanity check: measure the distance between the first two training points.
point_a, point_b = X_train_scaled[0], X_train_scaled[1]
distance = calculate_distance(point_a, point_b)

print(
    f"Sample Point A: {point_a}",
    f"Sample Point B: {point_b}",
    f"Euclidean Distance between A and B: {distance:.4f}",
    sep="\n",
)

Sample Point A: [-1.72156775 -0.33210111 -1.34572231 -1.32327558]
Sample Point B: [-1.12449223 -1.22765467  0.41450518  0.6517626 ]
Euclidean Distance between A and B: 2.8562


In [5]:
def get_neighbors(X_train, y_train, test_point, k):
    """
    Finds the labels of the k nearest neighbors for a given test point.

    Parameters
    ----------
    X_train : iterable of points
        Training points; each one is passed to `calculate_distance`.
    y_train : sequence
        Labels aligned by position with `X_train`. Objects exposing
        `to_numpy()` (such as a pandas Series) are converted first so the
        lookup is always positional.
    test_point : point
        The point whose neighbors are wanted.
    k : int
        Number of neighbors to return.

    Returns
    -------
    list
        Labels of the k closest training points, nearest first. Points at
        equal distance keep their training-set order (the sort is stable).

    Raises
    ------
    IndexError
        If `k` is larger than the number of training points.
    """
    # `y_train[index]` on a pandas Series is a label-based lookup, which picks
    # the wrong row (or raises KeyError) once the index has been shuffled.
    labels = y_train.to_numpy() if hasattr(y_train, "to_numpy") else y_train

    # Pair every training point's distance with its label
    distances = [
        (calculate_distance(test_point, train_point), labels[index])
        for index, train_point in enumerate(X_train)
    ]

    # Fail with a clear message instead of a bare "list index out of range"
    if k > len(distances):
        raise IndexError(
            f"k={k} exceeds the number of training points ({len(distances)})"
        )

    # Sort by distance only, in ascending order
    distances.sort(key=lambda pair: pair[0])

    # Keep only the labels of the k closest points
    return [distances[i][1] for i in range(k)]

# --- Example Usage ---
# Look up the 5 nearest neighbors of the first point in the test set
k = 5
test_sample = X_test_scaled[0]
neighbor_labels = get_neighbors(X_train_scaled, y_train, test_sample, k)

print(
    f"Test Sample: {test_sample}",
    f"Labels of the {k} nearest neighbors: {neighbor_labels}",
    f"True label of the test sample: {y_test[0]}",
    sep="\n",
)

Test Sample: [-1.72156775 -0.10821272 -1.40250384 -1.32327558]
Labels of the 5 nearest neighbors: [np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)]
True label of the test sample: 0


In [6]:
from collections import Counter

def predict(neighbors):
    """
    Returns the label that occurs most often among the given neighbor labels.

    When several labels share the highest count, the one that appears first
    in `neighbors` is returned. An empty `neighbors` raises IndexError.
    """
    # Tally how many times each label was voted for
    tally = Counter(neighbors)

    # most_common(1) yields a one-element list such as [(1, 4)],
    # i.e. label 1 received 4 votes.
    top_entries = tally.most_common(1)

    # Unpack the (label, count) pair of the winner; only the label is needed
    winning_label, _votes = top_entries[0]
    return winning_label

# --- Example Usage ---
# Reuse the neighbor_labels found in the previous step and take the majority vote
print(f"Neighbor labels: {neighbor_labels}")

final_prediction = predict(neighbor_labels)
print(
    f"Final Prediction: {final_prediction}",
    f"True Label: {y_test[0]}",
    sep="\n",
)

Neighbor labels: [np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)]
Final Prediction: 0
True Label: 0


In [7]:
# A list to store our predictions
predictions = []
k = 5  # The scikit-learn benchmark below is built with this same k

# Loop through each sample in the test set
for test_point in X_test_scaled:
    # 1. Find the neighbors for the current test point
    neighbor_labels = get_neighbors(X_train_scaled, y_train, test_point, k)

    # 2. Make a prediction based on the neighbors
    final_prediction = predict(neighbor_labels)

    # 3. Add the prediction to our list
    predictions.append(final_prediction)

# Accuracy = fraction of test samples whose prediction matches the true label.
# The list is converted to an array explicitly so the element-wise comparison
# does not depend on NumPy's reflected `==` handling of a plain list.
accuracy = np.mean(np.asarray(predictions) == y_test)

print("--- From-Scratch k-NN Model ---")
print(f"Accuracy: {accuracy:.4f}")

# Compute the benchmark rather than printing a hard-coded number, so the
# comparison stays valid if the data, the split or k change.
benchmark_model = KNeighborsClassifier(n_neighbors=k)
benchmark_model.fit(X_train_scaled, y_train)
benchmark_accuracy = benchmark_model.score(X_test_scaled, y_test)

print("\n--- Scikit-learn k-NN Benchmark ---")
print(f"Accuracy: {benchmark_accuracy:.4f}")

--- From-Scratch k-NN Model ---
Accuracy: 0.9333

--- Scikit-learn k-NN Benchmark ---
Accuracy: 0.9333
