# KNN

In [1]:
from typing import Tuple

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


In [2]:
def load_data(
    path: str,
    target_map: dict,
    num_criteria: int,
    test_size: float = 0.2,
    random_state: int = 1234,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Load a header-less CSV, relabel its target column and split the result
    into training and testing sets.

    The file is read without a header row, so pandas labels the columns
    0, 1, 2, ... by position. The first ``num_criteria`` columns are used as
    input features and the column labelled ``num_criteria`` is the target.

    Args:
        path (str): The path to the CSV file containing the data.
        target_map (dict): A dictionary mapping every raw target value to the
            label the model should learn (e.g. binary labels).
        num_criteria (int): The number of criteria (feature columns); this is
            also the label of the target column.
        test_size (float): Forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.2.
        random_state (int): Forwarded to ``train_test_split`` as
            ``random_state`` so the split is reproducible. Defaults to 1234.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``. The index is not reset after duplicates are
        dropped.

    Raises:
        KeyError: If the target column holds a value that is not a key of
            ``target_map``.
    """
    # Read the raw table; header=None makes pandas label the columns 0..n-1
    data = pd.read_csv(path, header=None)

    # Relabel the target column. Looking each value up through __getitem__
    # (instead of passing the dict itself to .map) keeps the KeyError for
    # unmapped values rather than silently turning them into NaN.
    data[num_criteria] = data[num_criteria].map(target_map.__getitem__)

    # Remove duplicate rows; this runs after relabelling, so rows that only
    # differed in their raw target value are collapsed as well
    data = data.drop_duplicates()

    # Split the data into input features and target variable
    data_input = data.iloc[:, :num_criteria]
    data_target = data[num_criteria]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        data_input, data_target, test_size=test_size, random_state=random_state
    )

    return (X_train, X_test, y_train, y_test)


# Dataset configuration: four criteria columns followed by the target column
num_criteria = 4
path = "data/employee selection.csv"

# Raw classes 1-5 collapse to label 0 and classes 6-9 to label 1
target_map = {raw_class: int(raw_class >= 6) for raw_class in range(1, 10)}

# Read, relabel, de-duplicate and split the data
X_train, X_test, y_train, y_test = load_data(
    path=path, target_map=target_map, num_criteria=num_criteria
)

In [3]:
# k-nearest-neighbours classifier that votes among the 9 closest training rows
model = KNeighborsClassifier(n_neighbors=9)

# fit() hands back the estimator; bind it to `_` to discard it
_ = model.fit(X=X_train, y=y_train)

# Training split: predict, score, report
y_pred = model.predict(X=X_train)
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred)
print(f"Accuracy train: {accuracy:.2%}")

# Test split: the same three steps on the held-out rows
y_pred = model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy test: {accuracy:.2%}")


Accuracy train: 91.18%
Accuracy test: 90.70%


In [4]:
# Explain the classification of one test-set alternative (positional index 0)
alternative_index = 0
# Double brackets keep the selection a one-row DataFrame, as the model expects
alternative = X_test.iloc[[alternative_index]]

# Query the fitted model for the nearest training rows and how far away they are
distances, neighbors = model.kneighbors(alternative)

# One row per neighbour: its training label next to its distance
neighbors_df = (
    y_train.iloc[neighbors[0]]
    .to_frame(name="class")
    .assign(distances=distances[0])
)

# The class the model assigns to the alternative
pred = model.predict(alternative)[0]

# Show the alternative itself, then its neighbours
print(f"Distances to alternative {alternative_index}:")
display(alternative)
print("Neighbors and distances:")
display(neighbors_df)

# Compare the ground truth with the prediction
print(f"True class: {y_test.iloc[alternative_index]}")
print(f"Predicted class: {pred}") # majority class of the neighbors


Distances to alternative 0:


Unnamed: 0,0,1,2,3
5,1.0,0.666667,0.666667,0.666667


Neighbors and distances:


Unnamed: 0,class,distances
344,1,0.111111
11,1,0.157135
258,1,0.157135
435,1,0.200308
203,1,0.200308
324,0,0.200309
25,1,0.222222
29,1,0.222222
52,1,0.229061


True class: 1
Predicted class: 1
