# Breast Cancer Prediction - Model Comparison
This notebook compares several ML models to predict the likelihood of having diabetes.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("Breast_cancer_data.csv")
df.head(20)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0
5,12.45,15.7,82.57,477.1,0.1278,0
6,18.25,19.98,119.6,1040.0,0.09463,0
7,13.71,20.83,90.2,577.9,0.1189,0
8,13.0,21.82,87.5,519.8,0.1273,0
9,12.46,24.04,83.97,475.9,0.1186,0


## Prepare the data

In [2]:
X = df.drop(columns=["diagnosis"])
y = df["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Train and evaluate models

In [3]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, proba)

    results.append((name, acc, auc))
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}, AUC: {auc:.4f}")
    print(classification_report(y_test, preds))
    print("-" * 40)

best_model_name, best_acc, best_auc = max(results, key=lambda x: x[2])
print(f"Best Model: {best_model_name} (AUC = {best_auc:.4f})")

best_model = models[best_model_name]


Model: Random Forest
Accuracy: 0.9474, AUC: 0.9920
              precision    recall  f1-score   support

           0       0.91      0.95      0.93        43
           1       0.97      0.94      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114

----------------------------------------
Model: Logistic Regression
Accuracy: 0.9298, AUC: 0.9882
              precision    recall  f1-score   support

           0       0.89      0.93      0.91        43
           1       0.96      0.93      0.94        71

    accuracy                           0.93       114
   macro avg       0.92      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114

----------------------------------------
Model: SVM
Accuracy: 0.9211, AUC: 0.9695
              precision    recall  f1-score   support

           0       1.00      0.79      0.88        43
      

## Make a Prediction with Best Model

In [4]:
input_data = {
    "mean_radius": 15.2,
    "mean_texture": 22.8,
    "mean_perimeter": 100.1,
    "mean_area": 728.1,
    "mean_smoothness": 0.09,
}

input_df = pd.DataFrame([input_data])
proba = best_model.predict_proba(input_df)[0][1]
print(f"Chance of having breast cancer: {proba * 100:.2f}%")
prediction = best_model.predict(input_df)[0]
print("Prediction:", "Has breast cancer" if prediction == 1 else "No breast cancer")


Chance of having breast cancer: 1.00%
Prediction: No breast cancer


In [5]:
import joblib
joblib.dump(best_model, "best_breast_cancer_model.pkl")

['best_breast_cancer_model.pkl']