# K-Nearest Neighbors (K-NN)

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## 1. Importing the dataset

In [None]:
# Read the CSV, then split it by column position: every column except the
# last is a feature, the last column is the target.
dataset = pd.read_csv('Social_Network_Ads.csv')
feature_columns = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X, y = feature_columns.values, target_column.values

In [None]:
dataset.head()

In [None]:
y

### 1.1 Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
split_options = {'test_size': 0.25, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test

In [None]:
y_test

### 1.2 Feature Scaling

Standardizing data using [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) is important because many machine learning algorithms perform better or converge faster when features are on a similar scale. This process, which involves scaling features to have zero mean and unit variance, helps in reducing the bias that can be introduced by features with larger scales and improves the overall predictive performance of models, especially those sensitive to feature scaling like Support Vector Machines and k-Nearest Neighbors.


### 1.2.1 Demo 

In [None]:
from sklearn.preprocessing import StandardScaler
# Toy dataset: four samples with two features; each column holds two 0s and two 1s.
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
# Freshly constructed scaler; fit() is called on it in the next cell.
scaler = StandardScaler()

In [None]:
print(scaler.fit(data))

In [None]:
scaler.transform(data)

Z = (x - u) / s, where u (mean) = 0.5 and s (standard deviation) = 0.5 for each column of `data`

Z_new = (2-0.5)/0.5 = 1.5/0.5 = 3

In [None]:
scaler.transform([[2, 2]])

### 1.2.2 Back to Training dataset

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits (fitting on the test data would leak it).
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
X_train

In [None]:
X_test.dtype

## 2. Training the K-NN model on the Training set

In [None]:
class KNN():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only memorises the training data. For every query sample,
    ``predict`` measures the distance to each training sample and returns the
    most frequent label among the ``k`` closest ones.
    """

    def __init__(self, k):
        """Store the number of neighbours that vote on each prediction.

        Args:
            k: Number of nearest neighbours to consult; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError('k must be at least 1, got {}'.format(k))
        self.k = k
        print('Input k value: ', self.k)

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels.

        Args:
            X_train: Sequence of training samples, each a sequence of numeric
                feature values.
            y_train: Sequence of labels aligned with ``X_train`` by position.

        Raises:
            ValueError: If ``X_train`` and ``y_train`` differ in length.
        """
        if len(X_train) != len(y_train):
            raise ValueError(
                'X_train and y_train must have the same length, got {} and {}'
                .format(len(X_train), len(y_train))
            )
        self.x_train = X_train
        self.y_train = y_train

    def calculate_euclidean(self, sample1, sample2):
        """Return the Euclidean distance between two feature vectors.

        Args:
            sample1: Sequence of numeric feature values.
            sample2: Sequence of numeric feature values, same length as
                ``sample1``.

        Raises:
            ValueError: If the two samples have a different number of features.
        """
        if len(sample1) != len(sample2):
            raise ValueError(
                'samples must have the same number of features, got {} and {}'
                .format(len(sample1), len(sample2))
            )
        # Euclidean distance = sqrt(sum over i of (x1_i - x2_i)^2)
        squared_sum = sum(((a - b) ** 2 for a, b in zip(sample1, sample2)), 0.0)
        return np.sqrt(squared_sum)

    def nearest_neighbors(self, test_sample):
        """Return the labels of the ``k`` training samples closest to a query.

        Args:
            test_sample: Sequence of numeric feature values to look up.

        Returns:
            A list of ``k`` labels ordered from nearest to farthest neighbour.

        Raises:
            ValueError: If ``k`` exceeds the number of training samples.
        """
        if self.k > len(self.x_train):
            raise ValueError(
                'k={} exceeds the number of training samples ({})'
                .format(self.k, len(self.x_train))
            )

        # Pair every training label with its distance to the query sample.
        # Iterating with zip avoids positional indexing into the label
        # container, which is not reliable for every container type.
        labelled_distances = [
            (label, self.calculate_euclidean(train_sample, test_sample))
            for train_sample, label in zip(self.x_train, self.y_train)
        ]

        # list.sort is stable, so equidistant samples keep training-set order.
        labelled_distances.sort(key=lambda pair: pair[1])

        return [label for label, _ in labelled_distances[:self.k]]

    def predict(self, test_set):
        """Predict a label for every sample in ``test_set``.

        Args:
            test_set: Iterable of samples, each a sequence of numeric features.

        Returns:
            A list with one predicted label per sample, in input order.
        """
        predictions = []
        for test_sample in test_set:
            labels = self.nearest_neighbors(test_sample)
            # Majority vote. max() returns the first maximal item, so a tie
            # goes to the label whose first occurrence is nearest to the query.
            predictions.append(max(labels, key=labels.count))
        return predictions


In [None]:
# Build our from-scratch classifier with k = 5 and hand it the training split.
model = KNN(k=5)
model.fit(X_train, y_train)

### 2.1 Predicting the Test set results

In [None]:
# Classify every test sample with the from-scratch KNN model; the result is a
# list holding one predicted label per row of X_test.
predictions=model.predict(X_test)#our model's predictions

In [None]:
X_test

In [None]:
y_test

In [None]:
predictions

### 2.2 Visualising the Test set results (add your name to the plot titles)

In [None]:

# 'sc' is the fitted StandardScaler and 'X_test' is the scaled test set:
# undo the scaling so the axes show the original feature values.
X_set = sc.inverse_transform(X_test)
y_set = y_test

plt.figure(figsize=(8, 6))

# One point per test sample, coloured by its true label.
first_feature, second_feature = X_set[:, 0], X_set[:, 1]
plt.scatter(first_feature, second_feature, c=y_set, cmap='viridis', edgecolor='k', s=20)
plt.title("<Your name>+True Labels")
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.show()

In [None]:


# 'sc' is the fitted StandardScaler and 'X_test' is the scaled test set:
# undo the scaling so the axes show the original feature values, and pair
# the points with the labels predicted by our KNN model.
X_set, y_set = sc.inverse_transform(X_test), predictions

# Plotting the results
plt.figure(figsize=(8, 6))

# Plot the predicted labels (not the true ones) against the original values
plt.scatter(X_set[:, 0], X_set[:, 1], c=y_set, cmap='viridis', edgecolor='k', s=20)
plt.title("<Your name>+ KNN Predicted Labels")
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.show()


## 3. The scikit-learn approach

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Split the dataset into training and testing sets. X_train / X_test are kept
# unscaled here so they can be plotted in their original units afterwards.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# KNN is distance-based, so standardise the features before fitting. The
# scaler is fit on the training split only to avoid leaking test statistics.
knn_scaler = StandardScaler().fit(X_train)

# Initialize the KNN classifier with k=5
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier on the scaled training features
knn.fit(knn_scaler.transform(X_train), y_train)

# Make predictions on the (identically scaled) test set
y_pred = knn.predict(knn_scaler.transform(X_test))

### Please visualize the results and add your name to them

In [None]:
# Side-by-side view of the test set: true labels in the left panel,
# KNN predictions in the right panel.
plt.figure(figsize=(12, 6))

panels = [
    (y_test, "<Your Name>+True Labels"),
    (y_pred, "<Your Name>+ KNN Predicted Labels"),
]
for panel_index, (point_colors, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=point_colors, cmap='viridis', edgecolor='k', s=20)
    plt.title(panel_title)
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')

plt.show()


# 🤝 Practice Lab: K-Nearest Neighbors (KNN)

### 🧠 Objective
In this exercise, you will:
1. Build a simple KNN classifier.  
2. Predict class labels for new data points.  
3. Visualize decision boundaries and understand the role of **K**.  

---
## 🧩 Scenario

A teacher collected data about students’ **study hours** and **sleep hours**, along with whether they **passed** an exam.

| Student | Study_Hours | Sleep_Hours | Pass |
|----------|--------------|-------------|------|
| A | 2 | 9 | No |
| B | 3 | 8 | No |
| C | 4 | 7 | No |
| D | 6 | 6 | Yes |
| E | 7 | 5 | Yes |
| F | 8 | 4 | Yes |
| G | 9 | 3 | Yes |

You will use KNN to predict whether a new student is likely to **pass** based on their study and sleep hours.

---
## 🧠 Task Requirements

1. Create a small dataset as a pandas DataFrame.  
2. Split it into features (`Study_Hours`, `Sleep_Hours`) and labels (`Pass`).  
3. Train a **KNN classifier** using `KNeighborsClassifier` from scikit-learn.  
4. Predict the outcome for a new student (e.g., `Study_Hours=5`, `Sleep_Hours=6`).  
5. Try different values of **K** (e.g., 1, 3, 5) and compare results.  
6. Visualize the data points with color-coded classes.

---


In [None]:
 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
 
# Practice-lab records from the scenario table: position i in every list
# belongs to the same student (A through G, in table order).
data = {
    'Study_Hours': [2,3,4,6,7,8,9],
    'Sleep_Hours': [9,8,7,6,5,4,3],
    'Pass': ['No','No','No','Yes','Yes','Yes','Yes']
}

