<a href="https://www.kaggle.com/code/shag2003/knn-algorithms?scriptVersionId=172545574" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the exact dataset path can be copied into the read_csv call.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the preprocessed table and split it column-wise into NumPy arrays.
# NOTE(review): the last column is presumably the class label - confirm against the CSV.
dataset = pd.read_csv('/kaggle/input/phishing-urls/Preprocessed_data.csv')
X = dataset.iloc[:, :-1].values  # every column except the last -> feature matrix
y = dataset.iloc[:, -1].values  # last column only -> target vector

In [None]:
# Preview the first rows of the loaded table (displayed as the cell's output).
dataset.head()

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=0 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
# Training features straight after the split (not yet scaled).
print(X_train)

In [None]:
# Training targets.
print(y_train)

In [None]:
# Test features straight after the split (not yet scaled).
print(X_test)

In [None]:
# Test targets.
print(y_test)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # transform only (no refit) to avoid data leakage from the test set

In [None]:
# Training features after standardisation.
print(X_train)

In [None]:
# Element dtype of the scaled test matrix.
print(X_test.dtype)

## Training the kNN model on the Training set

In [None]:
from math import sqrt
class KNN():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Implemented in pure Python: every prediction computes the distance from
    the query to every stored training sample, so the cost per query is
    proportional to (number of training samples) x (number of features).
    """

    def __init__(self, k):
        """Store the number of neighbours ``k`` that take part in each vote."""
        self.k = k
        print(self.k)  # echoes the configured k, as the notebook output expects

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels (no model is built).

        Raises:
            ValueError: if the two sequences have different lengths.
        """
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train and y_train must have the same number of samples, "
                f"got {len(X_train)} and {len(y_train)}"
            )
        self.x_train = X_train
        self.y_train = y_train

    def calculate_euclidean(self, sample1, sample2):
        """Return sqrt(sum_i (sample1_i - sample2_i)^2) for two feature vectors."""
        return sqrt(sum((a - b) ** 2 for a, b in zip(sample1, sample2)))

    def nearest_neighbors(self, test_sample):
        """Return the labels of the k training samples closest to ``test_sample``.

        Labels are ordered nearest first. When k is larger than the training
        set, all training labels are returned instead of raising IndexError.
        """
        scored = [
            (label, self.calculate_euclidean(train_sample, test_sample))
            for train_sample, label in zip(self.x_train, self.y_train)
        ]
        # list.sort is stable, so equally distant samples keep training order.
        scored.sort(key=lambda pair: pair[1])
        # Slicing (rather than indexing 0..k-1) tolerates k > len(training set).
        return [label for label, _ in scored[:self.k]]

    def predict(self, test_set):
        """Return a list with the majority label of the k nearest neighbours per sample.

        On a tied vote the label that appears first in nearest-first order wins.
        """
        predictions = []
        for test_sample in test_set:
            labels = self.nearest_neighbors(test_sample)
            predictions.append(max(labels, key=labels.count))
        return predictions

In [None]:
# Hand-written KNN with k=5, given the scaled training data.
model=KNN(5) #our model
model.fit(X_train,y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# scikit-learn reference model with the same k, used as the baseline for comparison.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)#The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric.
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
# Test-set predictions of the scikit-learn classifier.
y_pred = classifier.predict(X_test)

In [None]:
# Test-set predictions of the hand-written KNN class.
predictions=model.predict(X_test) # our model's predictions

In [None]:
## Making the Confusion Matrix to compare both models

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of the scikit-learn classifier on the test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of the hand-written KNN on the test set.
cm = confusion_matrix(y_test, predictions)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, predictions)

In [None]:
## Adaptive KNN

In [None]:
# import numpy as np
from collections import Counter

class AdaptiveKNN:
    """k-NN classifier that chooses k per query point from a list of candidates.

    For every query the vote is evaluated once for each candidate k; the k
    whose majority label holds the largest share of its k votes (the most
    confident neighbourhood) is selected, and that majority label is the
    prediction. Ties between candidates go to the one listed first in
    ``k_values``.
    """

    def __init__(self, k_values):
        """Store the candidate neighbourhood sizes.

        Raises:
            ValueError: if ``k_values`` is empty or contains a value below 1.
        """
        if len(k_values) == 0 or any(k < 1 for k in k_values):
            raise ValueError("k_values must be a non-empty collection of integers >= 1")
        self.k_values = k_values

    def fit(self, X_train, y_train):
        """Memorise the training data (converted to arrays so lists are accepted)."""
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        n_train = len(self.X_train)
        # Cap candidates at the training-set size; dict.fromkeys removes the
        # duplicates this can create while keeping the caller's order.
        candidate_ks = list(dict.fromkeys(min(k, n_train) for k in self.k_values))
        largest_k = max(candidate_ks)

        y_pred = []
        for x in np.asarray(X_test):
            distances = np.linalg.norm(self.X_train - x, axis=1)
            # A stable sort keeps equally distant samples in training order.
            nearest = np.argsort(distances, kind="stable")[:largest_k]
            nearest_labels = [self.y_train[idx] for idx in nearest]

            best_label, best_confidence = None, -1.0
            for k in candidate_ks:
                label, votes = Counter(nearest_labels[:k]).most_common(1)[0]
                confidence = votes / k
                # Strict '>' keeps the earliest candidate when confidences tie.
                if confidence > best_confidence:
                    best_label, best_confidence = label, confidence
            y_pred.append(best_label)
        return y_pred


In [None]:
# Usage example:
aknn = AdaptiveKNN(k_values=[3, 5, 7])  # Define different k values
aknn.fit(X_train, y_train)  # Train the model
aKNN_pred = aknn.predict(X_test)  # Make predictions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of AdaptiveKNN on the test set.
cm = confusion_matrix(y_test, aKNN_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, aKNN_pred)

In [None]:
## Fuzzy KNN

In [None]:
class FuzzyKNN:
    """k-NN classifier whose neighbours vote with membership 1 / distance.

    The predicted label is the one with the largest summed membership among
    the k nearest training samples.
    """

    def __init__(self, k):
        """Store the number of neighbours ``k`` used for each prediction."""
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training data (converted to arrays so lists are accepted)."""
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        y_pred = []
        for x in np.asarray(X_test):
            distances = np.linalg.norm(self.X_train - x, axis=1)
            # A stable sort keeps equally distant samples in training order.
            nearest = np.argsort(distances, kind="stable")[:self.k]
            neighbor_distances = distances[nearest]

            exact_match = neighbor_distances == 0
            if exact_match.any():
                # 1 / 0 would give infinite membership. Let only the training
                # samples that coincide with the query vote, one vote each.
                memberships = exact_match.astype(float)
            else:
                memberships = 1.0 / neighbor_distances

            label_memberships = {}
            for idx, membership in zip(nearest, memberships):
                label = self.y_train[idx]
                label_memberships[label] = label_memberships.get(label, 0.0) + membership
            # On equal totals the label seen first (nearest neighbour) wins.
            y_pred.append(max(label_memberships, key=label_memberships.get))
        return y_pred


In [None]:
# Usage example:
fknn = FuzzyKNN(k=5)  # Define the number of nearest neighbors (k)
fknn.fit(X_train, y_train)  # Train the model
Fuzzy_pred = fknn.predict(X_test)  # Make predictions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of FuzzyKNN on the test set.
cm = confusion_matrix(y_test, Fuzzy_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, Fuzzy_pred)

In [None]:
## Weighted KNN

In [None]:
class WeightAdjustedKNN:
    """k-NN classifier whose neighbours vote with a distance-based kernel weight.

    The predicted label is the one with the largest summed weight among the
    k nearest training samples.
    """

    def __init__(self, k):
        """Store the number of neighbours ``k`` used for each prediction."""
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training data (converted to arrays so lists are accepted)."""
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

    def _kernel_function(self, distances):
        """Return inverse-distance weights; the 1e-5 offset avoids division by zero."""
        return 1 / (distances + 1e-5)

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        y_pred = []
        for x in np.asarray(X_test):
            distances = np.linalg.norm(self.X_train - x, axis=1)
            # A stable sort keeps equally distant samples in training order.
            nearest = np.argsort(distances, kind="stable")[:self.k]
            # Weight each neighbour by ITS OWN distance. Weighting the whole
            # training set and zipping with the neighbour labels would pair
            # neighbours with the weights of training rows 0..k-1.
            weights = self._kernel_function(distances[nearest])

            label_weights = {}
            for idx, weight in zip(nearest, weights):
                label = self.y_train[idx]
                label_weights[label] = label_weights.get(label, 0.0) + weight
            # On equal totals the label seen first (nearest neighbour) wins.
            y_pred.append(max(label_weights, key=label_weights.get))
        return y_pred


In [None]:
# Usage example:
wknn = WeightAdjustedKNN(k=5)  # Define the number of nearest neighbors (k)
wknn.fit(X_train, y_train)  # Train the model
wKNN_pred = wknn.predict(X_test)  # Make predictions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of WeightAdjustedKNN on the test set.
cm = confusion_matrix(y_test, wKNN_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, wKNN_pred)

In [None]:
## Hassanat distance KNN

In [None]:
class HassanatDistanceKNN:
    """k-NN classifier that ranks neighbours by the Hassanat distance.

    The Hassanat distance is a sum of per-feature terms, each bounded in
    [0, 1), which makes it insensitive to feature scale and outliers.
    """

    def __init__(self, k):
        """Store the number of neighbours ``k`` used for each prediction."""
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training data (converted to arrays so lists are accepted)."""
        self.X_train = np.asarray(X_train, dtype=float)
        self.y_train = np.asarray(y_train)

    def _hassanat_distance(self, x1, x2):
        """Return the Hassanat distance between ``x1`` and ``x2``.

        Per feature, with lo = min(a, b) and hi = max(a, b):
            lo >= 0:  1 - (1 + lo) / (1 + hi)
            lo <  0:  1 - (1 + lo + |lo|) / (1 + hi + |lo|)
        and the distance is the sum over features. The inputs broadcast, so
        ``x2`` may be a 2-D matrix; one distance per row is then returned.
        """
        x1 = np.asarray(x1, dtype=float)
        x2 = np.asarray(x2, dtype=float)
        lo = np.minimum(x1, x2)
        hi = np.maximum(x1, x2)
        # Shifting by |lo| when lo is negative moves the pair to start at 0,
        # which keeps both numerator and denominator >= 1 (no division by zero).
        shift = np.where(lo < 0, -lo, 0.0)
        per_feature = 1.0 - (1.0 + lo + shift) / (1.0 + hi + shift)
        return per_feature.sum(axis=-1)

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        y_pred = []
        for x in np.asarray(X_test, dtype=float):
            # One broadcast call scores the query against every training row.
            distances = self._hassanat_distance(x, self.X_train)
            # A stable sort keeps equally distant samples in training order.
            nearest = np.argsort(distances, kind="stable")[:self.k]
            labels = [self.y_train[idx] for idx in nearest]
            # max over the ordered list: on a tied vote the nearest label wins.
            y_pred.append(max(labels, key=labels.count))
        return y_pred


In [None]:
# Usage example:
hknn = HassanatDistanceKNN(k=5)  # Define the number of nearest neighbors (k)
hknn.fit(X_train, y_train)  # Train the model
hKNN_pred = hknn.predict(X_test)  # Make predictions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of HassanatDistanceKNN on the test set.
cm = confusion_matrix(y_test, hKNN_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, hKNN_pred)

In [None]:
## Mutual KNN

In [None]:
class MutualKNN:
    """k-NN classifier that only lets *mutual* nearest neighbours vote.

    A training sample t is a mutual neighbour of a query x when t is among
    the k nearest training samples of x AND x would be among the k nearest
    neighbours of t, i.e. dist(x, t) is no larger than the distance from t
    to its own k-th nearest training sample. If a query has no mutual
    neighbour, the plain k nearest neighbours vote instead.
    """

    def __init__(self, k):
        """Store the neighbourhood size ``k``.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training data and precompute each sample's k-NN radius.

        The precomputation compares every training sample with every other
        one, so its cost grows quadratically with the training-set size.
        """
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)
        self._neighborhood_radius = self._kth_neighbor_distances()

    def _kth_neighbor_distances(self):
        """Return, per training sample, the distance to its k-th nearest other sample."""
        n_train = len(self.X_train)
        # A sample has at most n_train - 1 other samples to be neighbours with.
        kth = min(self.k, n_train - 1)
        radii = np.full(n_train, np.inf)
        if kth < 1:
            return radii  # a lone training sample accepts every query
        for i, point in enumerate(self.X_train):
            distances = np.linalg.norm(self.X_train - point, axis=1)
            distances[i] = np.inf  # a sample is not its own neighbour
            radii[i] = np.partition(distances, kth - 1)[kth - 1]
        return radii

    def _mutual_neighbors(self, distances):
        """Return indices of the k nearest training samples that are mutual neighbours.

        ``distances`` holds the distance from one query to every training sample.
        """
        # A stable sort keeps equally distant samples in training order.
        nearest = np.argsort(distances, kind="stable")[:self.k]
        is_mutual = distances[nearest] <= self._neighborhood_radius[nearest]
        return nearest[is_mutual]

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        y_pred = []
        for x in np.asarray(X_test):
            distances = np.linalg.norm(self.X_train - x, axis=1)
            voters = self._mutual_neighbors(distances)
            if len(voters) == 0:
                # Isolated query: fall back to the ordinary k nearest neighbours.
                voters = np.argsort(distances, kind="stable")[:self.k]
            labels = [self.y_train[idx] for idx in voters]
            # max over the ordered list: on a tied vote the nearest label wins.
            y_pred.append(max(labels, key=labels.count))
        return y_pred


In [None]:
# Usage example:
mknn = MutualKNN(k=5)  # Define the number of nearest neighbors (k)
mknn.fit(X_train, y_train)  # Train the model
mKNN_pred = mknn.predict(X_test)  # Make predictions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of MutualKNN on the test set.
cm = confusion_matrix(y_test, mKNN_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, mKNN_pred)

In [None]:
## Ensemble KNN

In [None]:
class EnsembleApproachKNN:
    """Ensemble of k-NN voters for every odd k in 1..k_max.

    Each voter adds weight 1 / ln(k + 1) to the label of every one of its k
    nearest neighbours; the label with the largest total weight is predicted.
    """

    def __init__(self, k_max):
        """Store the largest neighbourhood size considered by the ensemble."""
        self.k_max = k_max

    def fit(self, X_train, y_train):
        """Memorise the training data (converted to arrays so lists are accepted)."""
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

    def _inverse_logarithm(self, k):
        """Return the vote weight 1 / ln(k + 1) of the voter that uses ``k`` neighbours."""
        return 1 / np.log(k + 1)

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        # The voters and their weights do not depend on the query point, so
        # they are built once instead of once per test sample.
        k_values = np.arange(1, self.k_max + 1, 2)
        weights = [self._inverse_logarithm(k) for k in k_values]

        y_pred = []
        for x in np.asarray(X_test):
            all_distances = np.linalg.norm(self.X_train - x, axis=1)
            # A stable sort keeps equally distant samples in training order.
            sorted_indices = np.argsort(all_distances, kind="stable")
            label_counts = {}
            for k, weight in zip(k_values, weights):
                for idx in sorted_indices[:k]:
                    label = self.y_train[idx]
                    label_counts[label] = label_counts.get(label, 0.0) + weight
            # On equal totals the label seen first (nearest neighbour) wins.
            y_pred.append(max(label_counts, key=label_counts.get))
        return y_pred

In [None]:
# Usage example:
eaknn = EnsembleApproachKNN(k_max=10)  # Define the maximum value of k
eaknn.fit(X_train, y_train)  # Train the model
EKNN_pred = eaknn.predict(X_test)  # Make predictions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of EnsembleApproachKNN on the test set.
cm = confusion_matrix(y_test, EKNN_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, EKNN_pred)

In [None]:
## Locally adaptive KNN with Discrimination class (LA-KNN):

In [None]:
# class LocallyAdaptiveKNN:
#     def __init__(self, k):
#         self.k = k
    
#     def fit(self, X_train, y_train):
#         self.X_train = X_train
#         self.y_train = y_train
    
#     def _calculate_discrimination_classes(self, X_train, y_train):
#         # Define discrimination classes based on majority and second majority class neighbors
#         # This is a simplified example, actual implementation may vary
#         discrimination_classes = []
#         for i in range(len(X_train)):
#             distances = np.linalg.norm(X_train - X_train[i], axis=1)
#             sorted_indices = np.argsort(distances)
#             k_nearest_labels = [y_train[idx] for idx in sorted_indices[1:self.k + 1]]  # Exclude itself
#             majority_class = max(set(k_nearest_labels), key=k_nearest_labels.count)
#             second_majority_class = sorted(set(k_nearest_labels), key=k_nearest_labels.count)[-2]
#             discrimination_classes.append((majority_class, second_majority_class))
#         return discrimination_classes
    
#     def _calculate_optimal_k_values(self, discrimination_classes):
#         # Define optimal k values based on discrimination classes
#         # This is a simplified example, actual implementation may vary
#         optimal_k_values = []
#         for majority_class, second_majority_class in discrimination_classes:
#             if majority_class == second_majority_class:
#                 optimal_k_values.append(self.k)
#             else:
#                 # Define your own rules to determine optimal k value
#                 optimal_k_values.append(self.k // 2)  # Example: use half of k
#         return optimal_k_values
    
#     def predict(self, X_test):
#         y_pred = []
#         discrimination_classes = self._calculate_discrimination_classes(self.X_train, self.y_train)
#         optimal_k_values = self._calculate_optimal_k_values(discrimination_classes)
#         for x, k in zip(X_test, optimal_k_values):
#             distances = np.linalg.norm(self.X_train - x, axis=1)
#             sorted_indices = np.argsort(distances)
#             nearest_neighbor_indices = sorted_indices[:k]
#             nearest_neighbor_labels = [self.y_train[idx] for idx in nearest_neighbor_indices]
#             prediction = max(set(nearest_neighbor_labels), key=nearest_neighbor_labels.count)
#             y_pred.append(prediction)
#         return y_pred

# # Usage example:
# # laknn = LocallyAdaptiveKNN(k=5)  # Define the number of nearest neighbors (k)
# # laknn.fit(X_train, y_train)  # Train the model
# # y_pred = laknn.predict(X_test)  # Make predictions


In [None]:
## Generalised mean distance KNN (GMD-KNN):

In [None]:
# import numpy as np

# class GeneralisedMeanDistanceKNN:
#     def __init__(self, k):
#         self.k = k
    
#     def fit(self, X_train, y_train):
#         self.X_train = X_train
#         self.y_train = y_train
    
#     def _compute_distance(self, x1, x2):
#         return np.linalg.norm(x1 - x2)
    
#     def _compute_mean_distance(self, X_train, y_train, x, k):
#         class_distances = {}
#         for label in set(y_train):
#             label_indices = np.where(y_train == label)[0]
#             distances = [self._compute_distance(x, X_train[idx]) for idx in label_indices]
#             sorted_distances = sorted(distances)[:k]
#             class_distances[label] = np.mean(sorted_distances)
#         return class_distances
    
#     def predict(self, X_test):
#         y_pred = []
#         for x in X_test:
#             mean_distances = self._compute_mean_distance(self.X_train, self.y_train, x, self.k)
#             prediction = min(mean_distances, key=mean_distances.get)
#             y_pred.append(prediction)
#         return y_pred

# # Usage example:
# # gmdknn = GeneralisedMeanDistanceKNN(k=5)  # Define the number of nearest neighbors (k)
# # gmdknn.fit(X_train, y_train)  # Train the model
# # y_pred = gmdknn.predict(X_test)  # Make predictions


In [None]:
## Ensemble approach KNN (EA-KNN):

In [None]:
# import numpy as np

# class EnsembleApproachKNN:
#     def __init__(self, k_max):
#         self.k_max = k_max
    
#     def fit(self, X_train, y_train):
#         self.X_train = X_train
#         self.y_train = y_train
    
#     def _weight_summation(self, distances, k_values):
#         weights = [1 / np.log(k + 1) for k in range(1, self.k_max + 1, 2)]
#         weighted_sum = 0
#         for k, distance in zip(k_values, distances):
#             if k <= self.k_max:
#                 weighted_sum += weights[(k-1) // 2] * distance
#         return weighted_sum
    
#     def predict(self, X_test):
#         y_pred = []
#         for x in X_test:
#             distances = np.linalg.norm(self.X_train - x, axis=1)
#             sorted_indices = np.argsort(distances)
#             k_values = np.arange(1, len(self.X_train) + 1, 2)
#             weighted_sums = [self._weight_summation(distances[sorted_indices][:k], k_values[:k]) for k in range(1, len(self.X_train) + 1)]
#             prediction = self.y_train[sorted_indices[np.argmin(weighted_sums)]]
#             y_pred.append(prediction)
#         return y_pred

# # Usage example:
# # eaknn = EnsembleApproachKNN(k_max=10)  # Define the maximum value of k
# # eaknn.fit(X_train, y_train)  # Train the model
# # y_pred = eaknn.predict(X_test)  # Make predictions


## Visualising the Test set results

In [None]:
from matplotlib.colors import ListedColormap

# Decision-boundary plot of the scikit-learn classifier over the test set, in
# original (unscaled) feature units. A 2-D boundary can only be drawn when the
# model was trained on exactly two features, so for any other feature count
# the plot is skipped instead of failing inside sc.transform.
X_set, y_set = sc.inverse_transform(X_test), y_test
if X_set.shape[1] != 2:
    print(f"Skipping decision-boundary plot: it needs exactly 2 features, but the data has {X_set.shape[1]}.")
else:
    class_colors = ListedColormap(('red', 'green'))
    # Build each axis from a fixed number of points with a 5% margin, so the
    # grid size (and the number of predictions) does not depend on the
    # numeric range of the features.
    grid_axes = []
    for col in range(2):
        low, high = X_set[:, col].min(), X_set[:, col].max()
        margin = 0.05 * (high - low) or 1.0  # constant feature: use a unit margin
        grid_axes.append(np.linspace(low - margin, high + margin, 300))
    X1, X2 = np.meshgrid(*grid_axes)
    grid_points = np.column_stack([X1.ravel(), X2.ravel()])
    # The classifier was fitted on scaled data, so scale the grid before predicting.
    grid_labels = classifier.predict(sc.transform(grid_points)).reshape(X1.shape)
    plt.contourf(X1, X2, grid_labels, alpha = 0.75, cmap = class_colors)
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
    plt.title('K-NN (Test set)')
    # X was taken from dataset.iloc[:, :-1], so features 0 and 1 are the
    # first two columns of the loaded table.
    plt.xlabel(dataset.columns[0])
    plt.ylabel(dataset.columns[1])
    plt.legend()
    plt.show()