In [8]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- Our 3 Contender Models! ---
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# --- Our Tools ---
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # For KNN/Logistic
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score

# --- Load the Data ---
# Pull the breast-cancer dataset bundled with scikit-learn and unpack the
# pieces the rest of the notebook uses.
data = load_breast_cancer()
X, y = data.data, data.target  # feature matrix, and the answer (0 or 1)
feature_names, target_names = data.feature_names, data.target_names

print("...Data Loaded!...")



...Data Loaded!...


In [3]:

# --- 2. EDA & Prep ---
# Build the inspection frame in a single expression: named feature columns
# plus the label appended as a 'target' column.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# First few rows, to eyeball the feature values.
print("\n--- Data Head ---")
print(df.head())

# Column dtypes and non-null counts (df.info() prints directly).
print("\n--- Data Info ---")
df.info()




--- Data Head ---
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst are

In [4]:

# --- 3. Split the Data ---
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the split is not stratified on y -- consider `stratify=y`
# if class balance between train and test matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining set has {len(X_train)} samples.")
print(f"Testing set has {len(X_test)} samples.")




Training set has 455 samples.
Testing set has 114 samples.


In [5]:

# --- 4. Scale the Data (for *some* models) ---
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits. The test set is never used for fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)  # transform only -- no re-fit on test data

print("\n...Data is split and scaled! Ready to build models!...")




...Data is split and scaled! Ready to build models!...


In [6]:

# --- 5. Train the Models ---
# Each estimator is constructed and fitted in one expression; `fit` returns
# the fitted estimator, so the model_* names end up bound to trained models.

# --- Model 1: Logistic Regression (Our Baseline) ---
print("\nTraining Logistic Regression...")
# max_iter raises the solver's iteration cap so it has room to converge.
model_lr = LogisticRegression(max_iter=10000, random_state=42).fit(
    X_train_scaled, y_train
)  # trained on SCALED features

# --- Model 2: K-Nearest Neighbors (K=5) ---
print("Training KNN...")
model_knn = KNeighborsClassifier(n_neighbors=5).fit(
    X_train_scaled, y_train
)  # trained on SCALED features

# --- Model 3: Random Forest (Our Powerhouse) ---
print("Training Random Forest...")
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)  # trained on UNSCALED features

print("\n...All 3 Models are Trained!...")




Training Logistic Regression...
Training KNN...
Training Random Forest...

...All 3 Models are Trained!...


In [9]:

# --- 6. Get Predictions ---
# Each model predicts on the same feature representation it was fitted on:
# scaled features for Logistic Regression and KNN, unscaled for Random Forest.
y_pred_lr = model_lr.predict(X_test_scaled)
y_pred_knn = model_knn.predict(X_test_scaled)
y_pred_rf = model_rf.predict(X_test)


# --- 7. Evaluate! ---
def _print_model_report(title, y_true, y_pred, class_names, banner_width=30):
    """Print one model's evaluation block.

    Emits a banner (a rule of '=' characters, the title, another rule),
    then the accuracy as a percentage with two decimals, then the
    scikit-learn classification report.

    Parameters
    ----------
    title : str
        Text printed between the two banner rules.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model being reported.
    class_names : sequence of str
        Display names passed to `classification_report` as `target_names`.
    banner_width : int, default 30
        Number of '=' characters in each banner rule.
    """
    rule = "=" * banner_width
    print("\n" + rule)
    print(title)
    print(rule)
    print(f"Accuracy: {accuracy_score(y_true, y_pred)*100:.2f}%")
    print(classification_report(y_true, y_pred, target_names=class_names))


# One (title, predictions) pair per model, so the report code exists once
# instead of being copy-pasted for each contender.
for _title, _y_pred in (
    (" MODEL 1: LOGISTIC REGRESSION ", y_pred_lr),
    (" MODEL 2: K-NEAREST NEIGHBORS (K=5) ", y_pred_knn),
    (" MODEL 3: RANDOM FOREST ", y_pred_rf),
):
    _print_model_report(_title, y_test, _y_pred, target_names)




 MODEL 1: LOGISTIC REGRESSION 
Accuracy: 97.37%
              precision    recall  f1-score   support

   malignant       0.98      0.95      0.96        43
      benign       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


 MODEL 2: K-NEAREST NEIGHBORS (K=5) 
Accuracy: 94.74%
              precision    recall  f1-score   support

   malignant       0.93      0.93      0.93        43
      benign       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


 MODEL 3: RANDOM FOREST 
Accuracy: 96.49%
              precision    recall  f1-score   support

   malignant       0.98      0.93      0.95        43
      benign       0.96      0.99      0.97        71

    accuracy                       