# KNN Classifier with Cross-Validation and Feature Selection

This notebook performs K-Nearest Neighbors classification on the breast cancer dataset.

It follows the correct ML workflow:
- Train/test split with stratification
- GridSearchCV on the training data only, tuning the number of selected features (`select__k`) and the number of neighbors (`knn__n_neighbors`)
- Final model training and test evaluation
- Prints selected feature names and plots confusion matrix

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay


In [None]:

# Load dataset.
# "id" and "Unnamed: 32" are not used as features. pandas assigns
# "Unnamed: N" names to header-less columns, so "Unnamed: 32" is presumably a
# CSV artifact -- errors="ignore" keeps this cell working on exports that
# lack either column instead of raising KeyError.
df = pd.read_csv("data.csv")
df = df.drop(columns=["id", "Unnamed: 32"], errors="ignore")
X = df.drop(columns=["diagnosis"])

# Encode the target as M -> 1, B -> 0. Series.map() turns any label missing
# from the mapping (including NaN) into NaN, so fail early with a clear
# message instead of the opaque cast error astype(int) would raise.
y_encoded = df["diagnosis"].map({"M": 1, "B": 0})
if y_encoded.isna().any():
    unexpected = sorted(df.loc[y_encoded.isna(), "diagnosis"].astype(str).unique())
    raise ValueError(f"Unexpected diagnosis labels: {unexpected}")
y = y_encoded.astype(int)

# Train/test split (once only); stratify=y keeps the class proportions the
# same in both partitions, and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)


In [None]:

# Pipeline: scale -> univariate feature selection (f_classif) -> KNN.
# Scaling and selection live inside the pipeline, so GridSearchCV re-fits
# them on each training fold rather than on the full training set.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("select", SelectKBest(score_func=f_classif)),
    ("knn", KNeighborsClassifier()),
])

# Hyperparameter grid: number of features kept and number of neighbors (1..15).
# Keys follow the "<step name>__<parameter>" convention of Pipeline.
param_grid = dict(
    select__k=[5, 10, 15, 20],
    knn__n_neighbors=[n for n in range(1, 16)],
)

# 5-fold cross-validated grid search, fitted on the training split only.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best parameters from GridSearchCV:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)


In [None]:

# Report the feature names kept by the "select" step of the best pipeline.
# get_support() returns a boolean mask over the input columns, which is used
# to index the original column labels.
selector = grid.best_estimator_["select"]
mask = selector.get_support(indices=False)
selected_feature_names = X.columns[mask]
print("Selected features:", selected_feature_names.tolist())


In [None]:

# Predict on the held-out test set using the best model found by the search.
y_pred = grid.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Pin the label order explicitly (0 = Benign, 1 = Malignant) so the matrix
# rows/columns always line up with display_labels, even if one class happens
# to be absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Benign", "Malignant"])
disp.plot()
# Title the display's own axes rather than relying on pyplot's current axes.
disp.ax_.set_title("Confusion Matrix on Test Set")
plt.show()
