In [38]:
import pandas as pd

# Replace 'breast-cancer.csv' with your actual file path
df = pd.read_csv("breast-cancer.csv")
print(df.head())

import numpy as np
from sklearn.impute import SimpleImputer

# Assume missing values are represented by 0 in numerical columns.
# Replace 0 with np.nan for proper imputation.
df_numeric = df.copy()
columns_to_impute = df_numeric.columns.drop("diagnosis")  # adjust as needed
df_numeric[columns_to_impute] = df_numeric[columns_to_impute].replace(0, np.nan)

# Mean imputation
imputer_mean = SimpleImputer(strategy="mean")
X_mean = imputer_mean.fit_transform(df_numeric[columns_to_impute])

# Median imputation
imputer_median = SimpleImputer(strategy="median")
X_median = imputer_median.fit_transform(df_numeric[columns_to_impute])

# You can then attach the target variable back and evaluate a classifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the dataset (assume X and y are prepared after imputation)
X = X_mean  # or X_median
y = df["diagnosis"].map({'M':1, 'B':0}).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models
models = {
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "Decision Tree": DecisionTreeClassifier(max_depth=8, random_state=42)
}

# Define scalers
scalers = {
    "None": None,
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler()
}

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

for scaler_name, scaler in scalers.items():
    if scaler:
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled, X_test_scaled = X_train, X_test

    print(f"\nUsing {scaler_name}:")
    for model_name, model in models.items():
        model.fit(X_train_scaled, y_train)
        metrics = evaluate_model(model, X_test_scaled, y_test)
        print(f"{model_name} metrics: {metrics}")
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

results = {"Classifier": [], "Hyperparameter": [], "Value": [], "Accuracy": [], "Precision": [], "Recall": [], "F1 Score": []}

# KNN hyperparameter tuning
for k in [3, 9, 15, 21]:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    results["Classifier"].append("KNN")
    results["Hyperparameter"].append("n_neighbors")
    results["Value"].append(k)
    results["Accuracy"].append(metrics["Accuracy"])
    results["Precision"].append(metrics["Precision"])
    results["Recall"].append(metrics["Recall"])
    results["F1 Score"].append(metrics["F1 Score"])

# Decision Tree hyperparameter tuning
for depth in [2, 8, 14]:
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    results["Classifier"].append("Decision Tree")
    results["Hyperparameter"].append("max_depth")
    results["Value"].append(depth)
    results["Accuracy"].append(metrics["Accuracy"])
    results["Precision"].append(metrics["Precision"])
    results["Recall"].append(metrics["Recall"])
    results["F1 Score"].append(metrics["F1 Score"])

# AdaBoost hyperparameter tuning
for n in [10, 20, 30]:
    model = AdaBoostClassifier(n_estimators=n, random_state=42)
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    results["Classifier"].append("AdaBoost")
    results["Hyperparameter"].append("n_estimators")
    results["Value"].append(n)
    results["Accuracy"].append(metrics["Accuracy"])
    results["Precision"].append(metrics["Precision"])
    results["Recall"].append(metrics["Recall"])
    results["F1 Score"].append(metrics["F1 Score"])

# Random Forest hyperparameter tuning
for n in [10, 30, 50, 60]:
    model = RandomForestClassifier(n_estimators=n, random_state=42)
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_test, y_test)
    results["Classifier"].append("Random Forest")
    results["Hyperparameter"].append("n_estimators")
    results["Value"].append(n)
    results["Accuracy"].append(metrics["Accuracy"])
    results["Precision"].append(metrics["Precision"])
    results["Recall"].append(metrics["Recall"])
    results["F1 Score"].append(metrics["F1 Score"])

from sklearn.model_selection import KFold, cross_validate

cv = KFold(n_splits=10, shuffle=True, random_state=42)
models = {
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "KNN (k=9)": KNeighborsClassifier(n_neighbors=9),
    "Decision Tree (max_depth=2)": DecisionTreeClassifier(max_depth=2, random_state=42),
    "Decision Tree (max_depth=8)": DecisionTreeClassifier(max_depth=8, random_state=42)
}

for name, model in models.items():
    cv_results = cross_validate(model, X, y, cv=cv, scoring=['accuracy', 'f1'])
    mean_acc = cv_results['test_accuracy'].mean()
    std_acc = cv_results['test_accuracy'].std()
    mean_f1 = cv_results['test_f1'].mean()
    std_f1 = cv_results['test_f1'].std()
    print(f"\n{name}:")
    print(f"Mean Accuracy: {mean_acc:.4f}, Std Dev: {std_acc:.4f}")
    print(f"Mean F1 Score: {mean_f1:.4f}, Std Dev: {std_f1:.4f}")



         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  perimeter_worst  area_wor




KNN (k=3):
Mean Accuracy: 0.7734, Std Dev: 0.0683
Mean F1 Score: 0.6550, Std Dev: 0.0861

KNN (k=9):
Mean Accuracy: 0.7190, Std Dev: 0.0618
Mean F1 Score: 0.4874, Std Dev: 0.0813

Decision Tree (max_depth=2):
Mean Accuracy: 0.9209, Std Dev: 0.0265
Mean F1 Score: 0.8896, Std Dev: 0.0431

Decision Tree (max_depth=8):
Mean Accuracy: 0.9367, Std Dev: 0.0285
Mean F1 Score: 0.9117, Std Dev: 0.0444
