## Loading the data

In [None]:
import torch
from sklearn.datasets import load_breast_cancer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import mode
import random

# Fetch the breast cancer dataset and pull out the feature matrix and label vector
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer['data'], cancer['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer,
    y_cancer,
    test_size=0.2,
    random_state=42,
)

# Standardize both splits with per-feature statistics computed on the training
# split only, so nothing about the test rows leaks into the scaling.
X_train_mu = X_train.mean(axis=0)
X_train_std = X_train.std(axis=0)
X_train_normalized = (X_train - X_train_mu) / X_train_std
X_test_normalized = (X_test - X_train_mu) / X_train_std

## Implementing the classifier in PyTorch so it can run on the GPU (CUDA) instead of the CPU

In [None]:
class CudaKNNClassifier(torch.nn.Module):
  """Brute-force k-nearest-neighbours classifier built on PyTorch tensor ops.

  All query rows are classified in one batched computation: a pairwise
  Euclidean distance matrix, a top-k selection per query row, and a majority
  vote over the selected labels. The training data is stored on ``self.device``
  (CUDA when available, otherwise CPU).

  Args:
    k: Number of neighbours that take part in the vote.
  """

  def __init__(self, k=5):
    super(CudaKNNClassifier, self).__init__()
    self.k = k
    # Populated by fit(); None means the model has not been fitted yet.
    self.X_train = None
    self.y_train = None

  @property
  def device(self):
    """Return the CUDA device when one is available, otherwise the CPU device."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

  def _validate_input(self, X, y=None):
    """Check that X is a tensor and, when y is given, that row counts match.

    Raises:
      ValueError: If X is not a torch.Tensor, or X and y differ in their
        first dimension.
    """
    if not isinstance(X, torch.Tensor):
      raise ValueError("Input data must be a PyTorch tensor.")
    if y is not None and X.shape[0] != y.shape[0]:
      raise ValueError("Number of samples in X and y must be equal.")

  def fit(self, X_train, y_train):
    """Store the training set on ``self.device``.

    Args:
      X_train: Features of shape (n_train, n_features); a tensor or anything
        ``torch.as_tensor`` accepts (e.g. a NumPy array).
      y_train: Labels whose first dimension is n_train; tensor or array-like.

    Raises:
      ValueError: If X_train is not 2-D or the row counts of X_train and
        y_train differ.
    """
    # Accept NumPy arrays / lists as well as tensors.
    X_train = torch.as_tensor(X_train)
    y_train = torch.as_tensor(y_train)
    self._validate_input(X_train, y_train)
    if X_train.dim() != 2:
      raise ValueError("X_train must be 2-D with shape (n_samples, n_features).")
    # Distances need a floating-point dtype.
    if not X_train.is_floating_point():
      X_train = X_train.float()
    self.X_train = X_train.to(self.device)
    self.y_train = y_train.to(self.device)

  def predict(self, X_test):
    """Predict a label for every row of X_test.

    Args:
      X_test: Queries of shape (n_test, n_features); tensor or array-like.

    Returns:
      A tensor with one predicted label per query row, placed on the device
      X_test was on when it was passed in (CPU for non-tensor input).

    Raises:
      RuntimeError: If called before fit().
      ValueError: If X_test is not 2-D or its feature count differs from the
        training data.
    """
    if self.X_train is None or self.y_train is None:
      raise RuntimeError("fit() must be called before predict().")

    X_test = torch.as_tensor(X_test)
    self._validate_input(X_test, None)
    if X_test.dim() != 2 or X_test.shape[1] != self.X_train.shape[1]:
      raise ValueError("X_test must be 2-D with the same number of features as X_train.")

    # Compute where the training data lives (instead of dragging it to the
    # query's device) and match its dtype so the distance kernel sees one type.
    input_device = X_test.device
    queries = X_test.to(device=self.X_train.device, dtype=self.X_train.dtype)

    # Rows index queries, columns index training points: (n_test, n_train).
    # cdist returns only this matrix, without the intermediate
    # (n_test, n_train, n_features) tensor that explicit broadcasting builds.
    d_testTrain = torch.cdist(queries, self.X_train)

    # Clamp k so a model with fewer than k training rows still votes over
    # everything it has; largest=False selects the smallest distances.
    k = min(self.k, self.X_train.shape[0])
    _, idxs = torch.topk(d_testTrain, k, dim=1, largest=False)

    # Look up the neighbours' labels and take the most frequent one per query.
    y_train_topK = self.y_train[idxs]
    y_test_pred = torch.mode(y_train_topK, dim=1)[0]

    return y_test_pred.to(input_device)

In [None]:
# Example Usage
cudaModelKNN = CudaKNNClassifier(k=5)
# The classifier validates that its inputs are torch tensors, so wrap the NumPy
# splits first (torch.from_numpy shares memory with the array, no copy is made).
cudaModelKNN.fit(torch.from_numpy(X_train), torch.from_numpy(y_train))
y_pred = cudaModelKNN.predict(torch.from_numpy(X_test))

## Using a for loop over each sample

In [None]:
class MyKNN(torch.nn.Module):
  """k-nearest-neighbours classifier that predicts one query sample at a time.

  The training data is converted to tensors on ``self.device`` at construction
  time; ``predict_dataset`` then loops over the query rows in Python and
  classifies each with ``predict_sample``.

  Args:
    X_train: Training features, shape (num_samples, feature_dim).
    y_train: Training labels, one per training row.
    k: Number of neighbours that take part in the vote.
  """

  def __init__(self, X_train, y_train, k=3):
    super(MyKNN, self).__init__()
    self.k = k
    self.X_train = torch.tensor(X_train, dtype=torch.float).to(self.device)  # Move to device
    self.y_train = torch.tensor(y_train, dtype=torch.long).to(self.device)  # Move to device

  @property
  def device(self):
    """Return the CUDA device when one is available, otherwise the CPU device."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

  def _euclidean_distance(self, X, sample):
    """
    Compute the Euclidean distance between each row in X and a sample using PyTorch.

    Args:
      X: A torch.Tensor of shape (num_samples, feature_dim) representing the training data.
      sample: A torch.Tensor of shape (feature_dim,) representing a single sample.

    Returns:
      A torch.Tensor of shape (num_samples,) containing the Euclidean distances between
      each sample in X and the given sample.
    """
    # (1, feature_dim) broadcasts against every row of X.
    squared_diff = torch.sum((X - sample.unsqueeze(0)) ** 2, dim=1)
    return torch.sqrt(squared_diff)

  def predict_sample(self, sample, type='euclidean', n=None):
    """Predict the label of a single sample by majority vote of its k nearest neighbours.

    Args:
      sample: A torch.Tensor of shape (feature_dim,) on the same device as the
        training data.
      type: Distance metric name; only 'euclidean' is supported.
      n: Unused; accepted so existing callers keep working.

    Returns:
      A 0-dim torch.Tensor holding the predicted label.

    Raises:
      ValueError: If ``type`` is not 'euclidean'.
    """
    if type != 'euclidean':
      raise ValueError("Only Euclidean distance is supported in PyTorch implementation.")

    distances = self._euclidean_distance(self.X_train, sample)

    # Nearest neighbours are the *smallest* distances (largest=False). Clamp k
    # so a training set with fewer than k rows still votes over all of them.
    k = min(self.k, distances.shape[0])
    _, indices = torch.topk(distances, k, largest=False)
    y_train_neighbors = self.y_train[indices]

    # Majority vote: the most frequent label among the neighbours.
    unique_labels, counts = torch.unique(y_train_neighbors, return_counts=True)
    return unique_labels[torch.argmax(counts)]

  def predict_dataset(self, dataset, type='euclidean', n=None):
    """Predict labels for every row of ``dataset``.

    Args:
      dataset: Array-like of shape (num_queries, feature_dim).
      type: Distance metric name, forwarded to ``predict_sample``.
      n: Unused; forwarded to ``predict_sample`` for interface compatibility.

    Returns:
      A NumPy array with one predicted label per query row (empty when
      ``dataset`` has no rows).

    Raises:
      ValueError: If ``type`` is not 'euclidean' and the dataset is non-empty.
    """
    # Convert dataset to PyTorch tensor and move to device
    dataset_tensor = torch.tensor(dataset, dtype=torch.float).to(self.device)
    # torch.stack rejects an empty list, so answer the no-rows case directly.
    if dataset_tensor.shape[0] == 0:
      return self.y_train.new_empty(0).cpu().numpy()
    # Forward the metric so an unsupported request is reported, not ignored.
    predictions = torch.stack(
      [self.predict_sample(x, type=type, n=n) for x in dataset_tensor]
    )
    return predictions.cpu().numpy()  # Convert back to NumPy array

In [None]:
# Example usage: fit the per-sample (looping) classifier on the training split
# and label every row of the test split.
modelKNNLoop = MyKNN(X_train=X_train, y_train=y_train, k=3)
y_pred = modelKNNLoop.predict_dataset(dataset=X_test)

## Benchmarking

In [None]:
k = 21
X_rand = np.random.rand(5000, 300)
y_rand = np.zeros((5000, 1))
# Capture the training statistics before X_rand is overwritten so the test
# matrix can be put on the same scale as the training matrix.
X_rand_mu, X_rand_std = X_rand.mean(), X_rand.std()
X_rand = np.divide(np.subtract(X_rand, X_rand_mu), X_rand_std)
X_rand_test = np.divide(np.subtract(np.random.rand(1000, 300), X_rand_mu), X_rand_std)

# Example Usage Loop
modelKNNLoop = MyKNN(X_rand, y_rand, k=k)
y_pred = modelKNNLoop.predict_dataset(X_rand_test)

# Example Usage CUDA
cudaModelKNN = CudaKNNClassifier(k=k)
# Fit on the features (not the labels), passed as tensors because the classifier
# validates its inputs as torch tensors; labels are flattened to one per row.
cudaModelKNN.fit(torch.from_numpy(X_rand), torch.from_numpy(y_rand.ravel()))
y_pred = cudaModelKNN.predict(torch.from_numpy(X_rand_test))

In [None]:
print(f"KNNPredictor with for loop (k={k}): ")
%timeit -n 1 modelKNNLoop.predict_dataset(X_rand_test)

print(f"cudaModel KNN (k={k}): ")
%timeit -n 1 cudaModelKNN.predict(X_rand_test)