In [10]:
import pandas as pd
import numpy as np

# Remote CSV sources; both are read with header=None, so columns are numbered 0..n-1.
IRIS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
MNIST_URL = "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/mnist_test.csv"

df_iris = pd.read_csv(IRIS_URL, header=None)
df_mnist = pd.read_csv(MNIST_URL, header=None)

print(df_iris.index.shape)
# Hold out a reproducible 20% random sample of the rows as the test set.
sample_iris = df_iris.sample(frac=0.2, random_state=1)
print(sample_iris.index.shape)

# Every row that was not sampled goes to the training set.
train_iris = df_iris.drop(sample_iris.index)

# The first four columns are used as features, the fifth as the label;
# `.values` turns each selection into a plain array (2-D for X, 1-D for y).
X_train, y_train = train_iris.iloc[:, :4].values, train_iris.iloc[:, 4].values
X_test, y_test = sample_iris.iloc[:, :4].values, sample_iris.iloc[:, 4].values

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

(150,)
(30,)
X_train shape: (120, 4)
y_train shape: (120,)
X_test shape: (30, 4)
y_test shape: (30,)


In [11]:
class KNearestNeighbors:
    """
    Brute-force k-nearest-neighbours classifier with majority voting.

    ``fit`` only memorises the training set. ``predict`` computes the
    distance from every query point to every training point, keeps the
    ``k`` closest ones and returns the most frequent label among them.

    Attributes
    ----------
    k : int
        Number of neighbours that take part in the vote.
    distance_metric : str
        One of "euclidean", "cosine" or "manhattan". Any other value
        falls back to the Euclidean distance (with a warning).
    X_train : ndarray or None
        Training points stored by ``fit`` (``None`` until then).
    y_train : ndarray or None
        Training labels stored by ``fit`` (``None`` until then).
    """

    def __init__(self, k, distance_metric="euclidean"):
        """
        Initialize the K-Nearest Neighbors classifier.

        Parameters
        ----------
        k : int
            Number of neighbors to consider. Must be at least 1.
        distance_metric : str, optional
            Distance metric to use ("euclidean" by default).

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        # k <= 0 would select no neighbours (k == 0) or silently drop the
        # farthest ones (negative slice), so reject it up front.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        Store the training data and corresponding labels.

        Inputs are converted with ``np.asarray`` so that lists, pandas
        DataFrames and pandas Series are indexed positionally later on
        (ndarrays are stored as-is, without copying).

        Parameters
        ----------
        X : array-like of shape (R, C)
            Training data points.
        y : array-like of shape (R,)
            Training labels.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """
        Predict labels for the given data points using the trained model.

        Parameters
        ----------
        X : array-like of shape (N, C)
            Test data points.

        Returns
        -------
        labels : ndarray of shape (N,)
            Predicted labels. An empty ``X`` yields an empty array.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If the training set is empty or the number of features in
            ``X`` differs from the training data.
        """
        import warnings  # local import: only needed for the metric fallback

        if self.X_train is None or self.y_train is None:
            raise RuntimeError("predict() called before fit(): no training data stored")

        # Work in floating point so unsigned/integer inputs cannot wrap around
        # when differences or squares are computed.
        train = np.asarray(self.X_train, dtype=float)
        labels = np.asarray(self.y_train)
        queries = np.asarray(X, dtype=float)

        if len(queries) == 0:
            return np.array([])
        if len(train) == 0:
            raise ValueError("cannot predict from an empty training set")

        # Flatten every sample to a 1-D feature vector (also covers 1-D input,
        # where each sample is a single scalar feature).
        train = train.reshape(len(train), -1)
        queries = queries.reshape(len(queries), -1)
        if queries.shape[1] != train.shape[1]:
            raise ValueError(
                f"X has {queries.shape[1]} features per point, "
                f"but the model was fitted with {train.shape[1]}"
            )

        # Select distance function based on metric
        metric_functions = {
            "euclidean": self.__eu_distance,
            "cosine": self.__cd_distance,
            "manhattan": self.__md_distance,
        }
        distance_func = metric_functions.get(self.distance_metric)
        if distance_func is None:
            # Keep the historical fallback, but make a typo visible.
            warnings.warn(
                f"Unknown distance_metric {self.distance_metric!r}; "
                "falling back to 'euclidean'",
                stacklevel=2,
            )
            distance_func = self.__eu_distance

        predictions = []
        for query in queries:
            # Distances from this query to all training points at once
            distances = distance_func(train, query)
            # Stable sort: equidistant neighbours keep their training order,
            # which makes the selection deterministic. If k exceeds the
            # training-set size, all training points vote.
            k_nearest_indices = np.argsort(distances, kind="stable")[:self.k]
            predictions.append(self.get_label(labels[k_nearest_indices]))

        return np.array(predictions)

    def get_label(self, sorted):
        """
        Return the most common label among the given neighbour labels.

        Parameters
        ----------
        sorted : array-like
            Labels of the selected neighbours. (The name shadows the
            built-in ``sorted`` inside this method; it is kept because it
            is part of the public signature.)

        Returns
        -------
        label
            The label with the highest count. On a tie, the first of the
            tied labels in the sorted order produced by ``np.unique`` wins.
        """
        values, counts = np.unique(sorted, return_counts=True)
        return values[np.argmax(counts)]

    def __eu_distance(self, points, q):
        """Euclidean distance from each row of ``points`` (R, C) to ``q`` (C,)."""
        return np.sqrt(np.sum((points - q) ** 2, axis=1))

    def __cd_distance(self, points, q):
        """
        Cosine distance (1 - cosine similarity) from each row of ``points``
        (R, C) to ``q`` (C,).

        A zero-length vector has no direction, so any pair involving one is
        assigned the distance 1.0 instead of dividing by zero.
        """
        dot_products = points @ q
        norm_points = np.linalg.norm(points, axis=1)
        norm_q = np.linalg.norm(q)
        degenerate = (norm_points == 0) | (norm_q == 0)
        # Substitute a harmless denominator where the real one would be zero
        denominators = np.where(degenerate, 1.0, norm_points * norm_q)
        return np.where(degenerate, 1.0, 1.0 - dot_products / denominators)

    def __md_distance(self, points, q):
        """Manhattan distance from each row of ``points`` (R, C) to ``q`` (C,)."""
        return np.sum(np.abs(points - q), axis=1)

In [14]:
# Fit a 5-neighbour Euclidean model on the training split and classify the test rows.
knn = KNearestNeighbors(distance_metric="euclidean", k=5)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Print predicted and true labels one after the other for a visual comparison.
print(predictions)
print(y_test)

# Accuracy is the fraction of test rows whose predicted label equals the true label.
matches = predictions == y_test
accuracy = np.mean(matches)
print(f"Accuracy: {accuracy:.4f}")

['Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-virginica']
['Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-virginica']
Ac