In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Fetch the breast-cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# Feature matrix as a DataFrame whose columns carry the dataset's feature names
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Target labels as a Series (row order matches X)
y = pd.Series(data=data.target)

In [3]:
# Quick overview of the data: dimensions, class balance, and a preview of the features
print(f"Dataset Shape: {X.shape}")
print(f"\nTarget Distribution:\n {y.value_counts()}")
print("\nFirst 5 rows of features:")
print(X.head(5))

Dataset Shape: (569, 30)

Target Distribution:
 1    357
0    212
Name: count, dtype: int64

First 5 rows of features:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only,
# and those same fitted statistics are then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Candidate classifiers keyed by the display name used in the reports.
# The first four are seeded with random_state=42; kNN is built with its defaults.
models = dict([
    ("Logistic Regression", LogisticRegression(random_state=42, max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Support Vector Machine", SVC(random_state=42)),
    ("k-Nearest Neighbors", KNeighborsClassifier()),
])

In [6]:
# Fit every candidate on the scaled training data, score it on the scaled
# test data, and collect one result record per model
results = []

for name, model in models.items():
    # Fit on the training partition, then predict the held-out partition
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Score the predictions; insertion order here fixes both the column order
    # of the stored record and the order of the printed lines
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }

    # Keep the record for the comparison table built later
    results.append({"Model": name, **metrics})

    # Per-model report, four decimal places per metric
    print(f"\n{name} Results:")
    for label, value in metrics.items():
        print(f"{label}: {value:.4f}")


Logistic Regression Results:
Accuracy: 0.9737
Precision: 0.9722
Recall: 0.9859
F1 Score: 0.9790

Decision Tree Results:
Accuracy: 0.9474
Precision: 0.9577
Recall: 0.9577
F1 Score: 0.9577

Random Forest Results:
Accuracy: 0.9649
Precision: 0.9589
Recall: 0.9859
F1 Score: 0.9722

Support Vector Machine Results:
Accuracy: 0.9825
Precision: 0.9726
Recall: 1.0000
F1 Score: 0.9861

k-Nearest Neighbors Results:
Accuracy: 0.9474
Precision: 0.9577
Recall: 0.9577
F1 Score: 0.9577


In [7]:
# One row per model, one column per recorded field
results_df = pd.DataFrame(data=results)

# Print the heading and the table (without the index column) in a single call
print("\nModel Comparison:", results_df.to_string(index=False), sep="\n")


Model Comparison:
                 Model  Accuracy  Precision   Recall  F1 Score
   Logistic Regression  0.973684   0.972222 0.985915  0.979021
         Decision Tree  0.947368   0.957746 0.957746  0.957746
         Random Forest  0.964912   0.958904 0.985915  0.972222
Support Vector Machine  0.982456   0.972603 1.000000  0.986111
   k-Nearest Neighbors  0.947368   0.957746 0.957746  0.957746


In [8]:
# Pick the rows with the highest and lowest accuracy.
# NOTE: idxmax/idxmin return the first matching label, so when several models
# share the extreme accuracy only the first of them is selected.
accuracy_by_row = results_df["Accuracy"]
best_model = results_df.loc[accuracy_by_row.idxmax()]
worst_model = results_df.loc[accuracy_by_row.idxmin()]

In [9]:
# Report the top-accuracy model: heading line followed by name and score
print(f"\nBest Model:\nModel: {best_model['Model']}, Accuracy: {best_model['Accuracy']:.4f}")


Best Model:
Model: Support Vector Machine, Accuracy: 0.9825


In [10]:
# Report the lowest-accuracy model: heading line followed by name and score
print(f"\nWorst Model:\nModel: {worst_model['Model']}, Accuracy: {worst_model['Accuracy']:.4f}")


Worst Model:
Model: Decision Tree, Accuracy: 0.9474
