In [5]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

data = [[[255,0,0],"RED"], [[255,51,0],"RED"], [[255,102,0],"RED"], [[204,0,0],"RED"], [[190,0,20],"RED"], [[0,0,153],"BLUE"], [[0,51,204],"BLUE"], [[51,51,255],"BLUE"], [[0,102,255], "BLUE"], [[204,0,198],"BLUE"]]
target = np.array([[255,51,120]]) # Must be a 2D array (see explanation below)

# Separate features and labels as the fit() method expects feature vectors, not the label-data tuples. 
features = np.array([item[0] for item in data])
labels = np.array([item[1] for item in data])

knn = NearestNeighbors(n_neighbors=3, metric='euclidean') # Try n_neighbors = 1 or any k. Try metric = 'manhattan'
knn.fit(features) # fitting is on features only

distances, indices = knn.kneighbors(target)

#print(f"Distances: {distances}")
#print(f"Indices: {indices}")
print(f"Nearest labels: {labels[indices[0]]}")

Nearest labels: ['BLUE' 'RED' 'RED']


If you want direct classification, use KNeighborsClassifier:

In [6]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn_classifier.fit(features, labels)

prediction = knn_classifier.predict(target)
print(f"Predicted color: {prediction[0]}")

Predicted color: RED


Key characteristics:

    Optimized Algorithm: Uses ball trees/kd-trees for O(n log n) searches

    Parallel Processing: Automatically uses multiple CPU cores

    Distance Metrics: Supports 20+ metrics out-of-the-box

    Batch Processing: Can predict multiple targets at once

**Why target Must Be 2D in scikit-learn**

In scikit-learn, all input data must be 2-dimensional arrays where:

    Rows represent samples/data points

    Columns represent features/attributes

This is a core design principle in scikit-learn to maintain consistency across all algorithms. Here's why your target needs to be 2D:

1. Consistent Data Representation

Scikit-learn treats every input as a collection of samples - even single points:

    [255, 51, 120] → 1D array (interpreted as 3 features with unknown samples)

    [[255, 51, 120]] → 2D array (1 sample × 3 features)
    
2. Batch Processing Capability

Scikit-learn is optimized for processing multiple samples at once:

In [8]:
# Predicting 3 targets simultaneously
targets = np.array([
    [255, 51, 120],
    [0, 0, 153],
    [190, 0, 20]
])  # Shape: (3 samples, 3 features)

distances, indices = knn.kneighbors(targets)

print(f"Nearest labels: {labels[indices[0]]}")

Nearest labels: ['BLUE' 'RED' 'RED']


For Very Large Datasets add these parameters for better performance:

In [9]:
knn = NearestNeighbors(
    n_neighbors=3,
    metric='euclidean',
    algorithm='ball_tree',  # or 'kd_tree'
    n_jobs=-1  # Use all CPU cores
)

More to KNeighborsClassifier

In [13]:
# Naive approach (brute-force)
knn_brute = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
knn_brute.fit(features, labels)
brute_predictions = knn_brute.predict(target)
print("Brute-force predictions completed. Nearest neighbors are: ", brute_predictions)

# Using a k-d tree (faster for low dimensions)
knn_kd_tree = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree')
knn_kd_tree.fit(features, labels)
kdtree_predictions = knn_kd_tree.predict(target)
print("k-d tree predictions completed. Nearest neighbors are: ", kdtree_predictions)

# Using a Ball Tree (faster for higher dimensions)
knn_ball_tree = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree')
knn_ball_tree.fit(features, labels)
balltree_predictions = knn_ball_tree.predict(target)
print("Ball Tree predictions completed. Nearest neighbors are: ", balltree_predictions)

Brute-force predictions completed. Nearest neighbors are:  ['RED']
k-d tree predictions completed. Nearest neighbors are:  ['RED']
Ball Tree predictions completed. Nearest neighbors are:  ['RED']
