In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)
# Load the raw dataset from disk.
df = pd.read_csv("data/data.csv")

# Discard the columns that are not used as model inputs.
df = df.drop(columns=["id", "Unnamed: 32"])

# Encode the target numerically: "M" (malignant) -> 1, "B" (benign) -> 0.
# Any label outside this mapping becomes NaN.
df["diagnosis"] = df["diagnosis"].map({"M": 1, "B": 0})

# Split the frame into the target and the feature matrix; the target must
# not appear among the inputs because it is what the model predicts.
y = df["diagnosis"]
X = df.drop(columns="diagnosis")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

# Standardise the features. The scaler learns its statistics from the
# training rows only and then applies them to both splits, so nothing from
# the test rows influences the fit. KNN is distance-based, so features need
# comparable scales.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def knn_classifier(
    *,
    n_neighbors: int = 5,
    weights: str = "distance",
    metric: str = "euclidean",
) -> dict[str, float]:
    """Train a k-nearest-neighbours classifier and score it on the test split.

    The model is fitted on the module-level ``X_train``/``y_train`` and
    evaluated on the module-level ``X_test``/``y_test``. Calling the function
    with no arguments reproduces the baseline configuration.

    Args:
        n_neighbors: Value forwarded to ``KNeighborsClassifier(n_neighbors=...)``.
        weights: Value forwarded to ``KNeighborsClassifier(weights=...)``.
        metric: Value forwarded to ``KNeighborsClassifier(metric=...)``.

    Returns:
        A dict with the keys ``"Accuracy"``, ``"AUC"``, ``"Precision"``,
        ``"Recall"``, ``"F1"`` and ``"MCC"``, each mapped to a built-in
        ``float``.
    """
    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
    )
    model.fit(X_train, y_train)

    # Hard class predictions feed the threshold-based metrics.
    y_pred = model.predict(X_test)
    # AUC needs a continuous score: use the probability column of the second
    # class in ``model.classes_`` (label 1 with the 0/1 target encoding).
    y_prob = model.predict_proba(X_test)[:, 1]

    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred),
    }
    # Convert every value to a built-in float so the dict prints and
    # serialises the same way whether a metric returns a NumPy scalar or not.
    return {name: float(value) for name, value in scores.items()}

# Train and evaluate the baseline KNN model, then display its metrics dict.
print(knn_classifier())


{'accuracy': 0.9736842105263158, 'auc': np.float64(0.9842735042735042), 'precision': 0.9736842105263158, 'recall': 0.9487179487179487, 'f1': 0.961038961038961, 'mcc': 0.9413574486632834}
