In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
import shap
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [None]:
# Read the raw dataset CSV from the Kaggle input directory into `df`.
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# First look at the raw frame: its dimensions, the first five rows, and the column names.
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:\n", df.head())
print("\nColumn names:\n", df.columns.tolist())



In [None]:
# Data-quality check: null count for every column and the number of fully duplicated rows.
print("\nMissing values per column:\n", df.isnull().sum())
print("\nDuplicate rows:", df.duplicated().sum())



In [None]:
# Remove the "Unnamed: 32" column when it exists; errors="ignore" turns this into a
# no-op if the column is absent, so the cell can be re-run safely.
df = df.drop(columns=["Unnamed: 32"], errors="ignore")

In [None]:
# Remove the "id" column so it is not used as a model feature
# (presumably a record identifier — confirm against the dataset description).
# errors="ignore" makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=["id"], errors="ignore")

In [None]:
# Replace the diagnosis column with integer codes produced by a LabelEncoder.
# NOTE(review): LabelEncoder numbers classes in sorted order, so "B" -> 0 and "M" -> 1
# is expected here — confirm with label.classes_.
label = LabelEncoder()
df["diagnosis"] = label.fit_transform(df["diagnosis"])


In [None]:
# DEFINE FEATURES & TARGET
# y is the encoded label column; X is every remaining column of the frame.
y = df["diagnosis"]
X = df.drop(columns=["diagnosis"])


In [None]:
# Hold out 20% of the rows as the test partition. stratify=y asks for the class
# ratio to be preserved in both partitions; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Show how many training rows belong to each class before any resampling.
print("Before SMOTE:")
print("Training class distribution:")
print(y_train.value_counts())

In [None]:
# SMOTE OVERSAMPLING
# Only the training partition is resampled; X_test / y_test are left untouched.
# NOTE(review): SMOTE is expected to synthesise minority-class rows until the classes
# are balanced — confirm with the counts printed below.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("\nAfter SMOTE:")
print("Training class distribution:")
print(y_train_sm.value_counts())


In [None]:
# SCALING
# Fit the scaler on the resampled training data only, then apply that one fitted
# transform to both partitions so the test rows never influence the statistics.
scaler = StandardScaler().fit(X_train_sm)
X_train_scaled = scaler.transform(X_train_sm)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Sanity check: feature matrices and label vectors must agree on row counts
# within each partition.
print("\nFinal Train/Test Shapes:")
print("X_train:", X_train_scaled.shape)
print("X_test:", X_test_scaled.shape)
print("y_train:", y_train_sm.shape)
print("y_test:", y_test.shape)

In [None]:
# SAVE PREPROCESSED DATASETS
# Rebuild labelled frames from the scaled matrices, re-attaching the original
# feature names from X.
train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
# Labels are attached positionally: np.asarray() strips any pandas index so the
# assignment cannot be re-aligned by index (which would silently insert NaNs if
# y_train_sm's index ever differed from train_df's default 0..n-1 index).
train_df["diagnosis"] = np.asarray(y_train_sm)

test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
# Same positional attachment for the test labels, whose index comes from the
# original frame and therefore does not match test_df's fresh index.
test_df["diagnosis"] = np.asarray(y_test)

# index=False keeps the row index out of the files so the CSVs contain only
# the feature columns followed by "diagnosis".
train_df.to_csv("cleaned_train.csv", index=False)
test_df.to_csv("cleaned_test.csv", index=False)

print("\nSaved: cleaned_train.csv & cleaned_test.csv")

In [None]:

# Plot defaults used by the EDA cells that follow.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (6,4)


# Load preprocessed training data
# NOTE: this rebinds `df`, which previously held the raw (encoded) dataset.
df = pd.read_csv("cleaned_train.csv")
target = 'diagnosis'
features = df.columns[:-1]  # every column except the last one, which holds the label
print("Dataset shape:", df.shape)
print("\nClass distribution:\n", df[target].value_counts())


In [None]:
# COUNT PLOT
# Bar chart of rows per class, followed by a one-line textual summary.
plt.figure(figsize=(6,4))
sns.countplot(x=target, data=df, palette=["green","red"])
plt.title("Count of Benign (0) vs Malignant (1)")
plt.show()

counts = df[target].value_counts()
print(f" Insight (Count Plot):")
print(f"Benign count = {counts[0]}, Malignant count = {counts[1]}")

# Report which class (if any) is the larger one.
if counts[0] == counts[1]:
    print("Classes are perfectly balanced.\n")
elif counts[0] > counts[1]:
    print("Benign class is slightly more frequent in training data.\n")
else:
    print("Malignant class is slightly more frequent in training data.\n")

In [None]:
# FUNCTION TO CALCULATE PREDICTOR STRENGTH

def predictor_strength(df, feature, target):
    """
    Rate how well one feature separates the two classes using Cohen's d.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature column and the target column.
    feature : str
        Name of the numeric column to evaluate.
    target : str
        Name of the label column; rows are split on the values 0 and 1.

    Returns
    -------
    tuple
        ``(cohen_d, strength)`` where ``strength`` is "Strong Predictor"
        (d >= 0.8), "Medium Predictor" (d >= 0.5) or "Weak Predictor".

    Notes
    -----
    The pooled standard deviation is the square root of the plain average of
    the two class variances, so both classes are weighted equally.
    When neither class has any spread the ratio is undefined; in that case d is
    reported as 0.0 if the class means coincide (the feature carries no
    information) and as infinity if they differ (perfect separation).
    """
    class0 = df.loc[df[target] == 0, feature]
    class1 = df.loc[df[target] == 1, feature]
    mean_diff = abs(class1.mean() - class0.mean())
    pooled_std = np.sqrt((class1.std()**2 + class0.std()**2) / 2)

    if pooled_std == 0:
        # Avoid 0/0 -> NaN for constant features; keep x/0 -> inf explicit.
        cohen_d = 0.0 if mean_diff == 0 else float("inf")
    else:
        cohen_d = mean_diff / pooled_std

    if cohen_d >= 0.8:
        strength = "Strong Predictor"
    elif cohen_d >= 0.5:
        strength = "Medium Predictor"
    else:
        strength = "Weak Predictor"
    return cohen_d, strength


In [None]:
# HISTOGRAMS + AUTOMATIC INSIGHTS
# One figure per feature, each followed by its effect size and a short verdict.
print("HISTOGRAMS + INSIGHTS \n")
for feature in features:
    d, strength = predictor_strength(df, feature, target)

    # Per-class histogram of this feature with a KDE curve overlaid.
    plt.figure(figsize=(6,4))
    sns.histplot(df, x=feature, hue=target, kde=True, palette=["green","red"])
    plt.title(f"{feature} Distribution by Diagnosis")
    plt.show()

    # Choose the narrative sentence from the same 0.8 / 0.5 effect-size bands.
    if d >= 0.8:
        verdict = f"{feature} shows clear separation between classes → strong predictive feature.\n"
    elif d >= 0.5:
        verdict = f"{feature} shows moderate separation → medium predictive feature.\n"
    else:
        verdict = f"{feature} shows high overlap → weak predictive feature.\n"

    print(f" Insight (Histogram) for {feature}:")
    print(f"Cohen's d = {d:.2f} → {strength}")
    print(verdict)



In [None]:
# BOXPLOTS + AUTOMATIC INSIGHTS
# One boxplot per feature (grouped by class), each followed by its effect size.
print(" BOXPLOTS + INSIGHTS \n")
for feature in features:
    d, strength = predictor_strength(df, feature, target)

    # Side-by-side boxes: one per class value on the x axis.
    plt.figure(figsize=(6,4))
    sns.boxplot(x=target, y=feature, data=df, palette=["green","red"])
    plt.title(f"{feature} by Diagnosis")
    plt.show()

    # Choose the narrative sentence from the same 0.8 / 0.5 effect-size bands.
    if d >= 0.8:
        verdict = f"Boxplot shows malignant and benign medians clearly separated → strong predictor.\n"
    elif d >= 0.5:
        verdict = f"Boxplot shows some separation → medium predictor.\n"
    else:
        verdict = f"Boxplot shows overlap of distributions → weak predictor.\n"

    print(f" Insight (Boxplot) for {feature}:")
    print(f"Cohen's d = {d:.2f} → {strength}")
    print(verdict)


In [None]:
# CORRELATION HEATMAP + INSIGHTS
# Pairwise correlations across every column of the training frame (label included).
corr = df.corr()

plt.figure(figsize=(16,10))
sns.heatmap(corr, cmap="coolwarm", annot=False)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Top correlated features with target
# The target's own entry is removed first: its self-correlation is always 1.0 and
# would otherwise be reported as the top "feature".
target_corr = corr[target].drop(target).sort_values(ascending=False)
print("\nTop 10 features positively correlated with malignancy:")
print(target_corr.head(10))

# tail(10) is simply the ten lowest correlations in the ranking; whether they are
# actually negative depends on the data, so read the printed signs.
print("\nTop 10 features negatively correlated (benign indicators):")
print(target_corr.tail(10))

print("\n Insight (Correlation):")
print("Features highly positively correlated → strong indicators of malignancy.")
print("Features highly negatively correlated → strong indicators of benign.\n")


In [None]:
# Pairplot of the five features that rank highest in target_corr.
# The target is excluded from the ranking before taking the top five: it is
# already used as the hue, and its self-correlation of 1.0 would otherwise take
# one of the five slots. errors="ignore" makes this safe whether or not the
# target is present in target_corr.
top_features = target_corr.drop(target, errors="ignore").head(5).index
sns.pairplot(df, vars=top_features, hue=target, palette=["green","red"])
plt.show()
print(" Insight (Pairplot): shows interactions among top features and separation between classes.")


In [None]:
# LOAD PREPROCESSED DATA
# Read back the CSVs written earlier and rebind the train/test variables to them.
train_df = pd.read_csv("cleaned_train.csv")
test_df = pd.read_csv("cleaned_test.csv")

# In each partition, "diagnosis" is the label and every other column is a feature.
y_train = train_df["diagnosis"]
X_train = train_df.drop(columns=["diagnosis"])

y_test = test_df["diagnosis"]
X_test = test_df.drop(columns=["diagnosis"])

print("Train/Test shapes:", X_train.shape, X_test.shape)

In [None]:
# DEFINE MODELS
# Four classifiers keyed by display name; each receives random_state=42 so fits are repeatable.
# SVC is created with probability=True because predict_proba() is called on every model later.
# NOTE(review): use_label_encoder is deprecated/ignored in recent xgboost releases —
# confirm against the installed version.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [None]:
# TRAIN, PREDICT & EVALUATE
# Training, scoring and plotting all live in ONE loop body (and therefore one
# cell). When the metric / confusion-matrix steps sit in separate cells they
# run after the loop has finished and only ever see the last model's
# predictions, leaving results_list with a single row.
results_list = []

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict: hard labels for the threshold metrics, positive-class
    # probability for ROC-AUC.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)

    # One row per model for the comparison table built afterwards.
    results_list.append({
        "Model": name,
        "Accuracy": round(acc,3),
        "Precision": round(prec,3),
        "Recall": round(rec,3),
        "F1-Score": round(f1,3),
        "ROC-AUC": round(roc,3)
    })

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n {name} ")
    print("Confusion Matrix:")
    print(cm)

    # Heatmap of confusion matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
    

In [None]:

# Turn the collected per-model metric dicts into a table and print it.
results_df = pd.DataFrame(results_list)
print("\n MODEL COMPARISON TABLE ")
print(results_df)


In [None]:
# Classification report
# NOTE: y_pred is not recomputed here — it holds whatever the training loop assigned
# last, so this report covers only the final model iterated in that loop.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=3))


In [None]:
def evaluate(y_true, y_pred, model_name):
    """
    Print accuracy, precision, recall and F1 for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    model_name : str
        Heading printed above the metrics.

    Returns
    -------
    None
        The scores (rounded to 4 decimals) are printed, followed by a blank line.
    """
    print(f"--- {model_name} ---")
    scorers = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1-score :", f1_score),
    )
    for heading, scorer in scorers:
        print(heading, round(scorer(y_true, y_pred), 4))
    print()


In [None]:
# Logistic regression, evaluated with the default decision rule.
# Fit on X_train / y_train — the scaled, SMOTE-resampled data loaded from
# cleaned_train.csv — so the training features are on the same standardised
# scale as X_test. (X_train_sm holds the resampled but UNscaled features, which
# do not match the scaled test matrix.)
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)
evaluate(y_test, y_pred_lr, "Logistic Regression")


In [None]:
# RBF-kernel SVM; probability=True enables predict_proba() for the ROC-AUC
# computed in the comparison table.
# Fit on X_train / y_train — the scaled, SMOTE-resampled data loaded from
# cleaned_train.csv — so the training features are on the same standardised
# scale as X_test. (X_train_sm holds the resampled but UNscaled features, which
# do not match the scaled test matrix.)
svm_model = SVC(kernel="rbf", probability=True)
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
evaluate(y_test, y_pred_svm, "SVM")


In [None]:
# Random forest with 300 trees and a fixed seed.
# Fit on X_train / y_train — the scaled, SMOTE-resampled data loaded from
# cleaned_train.csv — so the split thresholds are learned on the same
# standardised scale as X_test. (X_train_sm holds the resampled but UNscaled
# features, which do not match the scaled test matrix.)
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
evaluate(y_test, y_pred_rf, "Random Forest")


In [None]:
# Gradient-boosted trees using log-loss as the evaluation metric.
# Fit on X_train / y_train — the scaled, SMOTE-resampled data loaded from
# cleaned_train.csv — so the split thresholds are learned on the same
# standardised scale as X_test. (X_train_sm holds the resampled but UNscaled
# features, which do not match the scaled test matrix.)
xgb = XGBClassifier(eval_metric="logloss")
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)
evaluate(y_test, y_pred_xgb, "XGBoost")


In [None]:
#MODEL COMPARISON + EXPLAINABILITY
# NOTE: `models` is rebound here from the estimator dict to a plain list of display names.
models = ["Logistic Regression", "SVM", "Random Forest", "XGBoost"]
preds = [y_pred_lr, y_pred_svm, y_pred_rf, y_pred_xgb]

# Metrics that are computed from hard labels, in table-column order.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-score": f1_score,
}

comparison = pd.DataFrame({
    "Model": models,
    **{column: [score(y_test, p) for p in preds] for column, score in label_metrics.items()},
    # ROC-AUC is computed from the positive-class probability, not the hard labels.
    "ROC-AUC": [
        roc_auc_score(y_test, fitted.predict_proba(X_test)[:,1])
        for fitted in (log_reg, svm_model, rf, xgb)
    ],
})

comparison


In [None]:
# Display how many test rows fall into each class.
y_test.value_counts()


In [None]:
# `model` is not defined in this cell: it is the loop variable left over from the
# earlier `for name, model in models.items()` loop, i.e. the last estimator iterated there.
y_prob = model.predict_proba(X_test)[:,1]
# Lower threshold to 0.3 to capture more malignant
# (rows with positive-class probability >= 0.3 are labelled 1, all others 0).
y_pred = (y_prob >= 0.3).astype(int)


In [None]:
# Re-create the stratified 80/20 split from the in-memory X and y.
# This rebinds X_train / X_test / y_train / y_test, replacing the versions that were
# loaded from the cleaned CSV files.
# NOTE(review): the import repeats the one in the notebook's import cell, and the
# following cell reloads the data and performs the same split again — this cell
# looks redundant; confirm before removing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:

# Start again from the raw CSV so this pipeline does not depend on earlier cells.
data = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")  
# Label vector: benign -> 0, malignant -> 1.
y = data["diagnosis"].map({"B":0, "M":1})  # encode
# Feature matrix: everything except the id, the label and the "Unnamed: 32" column
# (columns that are missing are skipped thanks to errors='ignore').
X = data.drop(columns=["id","diagnosis","Unnamed: 32"], errors='ignore')


# STRATIFIED TRAIN/TEST SPLIT
# 80/20 split with a fixed seed; stratify=y asks for the same class ratio in both partitions.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Print the per-class row counts of each partition to verify the stratification.
print("Train class distribution:\n", y_train.value_counts())
print("Test class distribution:\n", y_test.value_counts())


# SCALE FEATURES
# The scaler is fitted on the training partition only and then applied to both.
# NOTE: X_train and X_test are overwritten with the scaled arrays, so this cell
# is not idempotent — re-run the split above before re-running it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# APPLY SMOTE ON TRAINING SET ONLY
# Resampling runs on the already-scaled training matrix; X_test / y_test are not resampled.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Class counts after resampling (wrapped in a Series so value_counts() is available
# whatever container fit_resample returned).
print("\nAfter SMOTE:")
print(pd.Series(y_train_res).value_counts())


# DEFINE MODELS
# Fresh, unfitted estimators keyed by display name; this rebinds `models`.
# SVC is created with probability=True because predict_proba() is called on every model below.
# NOTE(review): use_label_encoder is deprecated/ignored in recent xgboost releases —
# confirm against the installed version.

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}


# TRAIN MODELS & EVALUATE
# Each model is fitted on the resampled training data and scored on the held-out
# set using a custom probability cut-off instead of the default decision rule.

results_list = []

threshold = 0.3  # adjust threshold to improve recall

for name, model in models.items():
    try:
        model.fit(X_train_res, y_train_res)

        # Positive-class probability, turned into a 0/1 label by the cut-off.
        y_prob = model.predict_proba(X_test)[:,1]
        y_pred = (y_prob >= threshold).astype(int)

        # Label-based metrics use y_pred; ROC-AUC uses the raw probability.
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc = roc_auc_score(y_test, y_prob)

        # One table row per model, every score rounded to 3 decimals.
        row = {"Model": name}
        for metric_label, metric_value in (
            ("Accuracy", acc),
            ("Precision", prec),
            ("Recall", rec),
            ("F1-Score", f1),
            ("ROC-AUC", roc),
        ):
            row[metric_label] = round(metric_value,3)
        results_list.append(row)

        print(f"{name} trained successfully.")

        # Confusion matrix at the chosen threshold, drawn as a heatmap.
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"{name} Confusion Matrix (threshold={threshold})")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # Per-class precision / recall / F1 breakdown.
        print(classification_report(y_test, y_pred, digits=3))

    except Exception as e:
        # A failure in one model is reported and the loop moves on to the next.
        print(f"Error training {name}: {e}")


# RESULTS TABLE
# One row per successfully trained model.

results_df = pd.DataFrame(results_list)
print("\n MODEL COMPARISON TABLE ")
print(results_df)

# Optional: visualize metrics
# Reshape wide -> long (one row per model/metric pair) so seaborn can group bars.
metrics = ["Accuracy","Precision","Recall","F1-Score","ROC-AUC"]
results_df_melted = pd.melt(
    results_df,
    id_vars="Model",
    value_vars=metrics,
    var_name="Metric",
    value_name="Score",
)

plt.figure(figsize=(12,6))
sns.barplot(x="Metric", y="Score", hue="Model", data=results_df_melted)
plt.title(f"Model Performance Comparison (threshold={threshold})")
plt.ylim(0,1)
plt.legend(loc="lower right")
plt.show()


# SAVE MODELS
# Each fitted estimator is pickled to "<model name, lower-cased, spaces -> underscores>.pkl"
# in the working directory.

for name, model in models.items():
    try:
        filename = name.replace(" ","_").lower() + ".pkl"
        joblib.dump(model, filename)
        print(f"{filename} saved.")
    except Exception as e:
        # Catch Exception rather than using a bare except so KeyboardInterrupt /
        # SystemExit still propagate, and report WHY the save failed.
        print(f"Could not save {name}: {e}")


In [None]:


# LOAD TRAINED MODELS
# Rebinds `models` to the estimators read back from the .pkl files in the working
# directory (file names follow the save loop's scheme: lower-cased display name with
# spaces replaced by underscores).
# NOTE: joblib.load unpickles arbitrary objects — only load files you created or trust.

models = {
    "Logistic Regression": joblib.load("logistic_regression.pkl"),
    "SVM": joblib.load("svm.pkl"),
    "Random Forest": joblib.load("random_forest.pkl"),
    "XGBoost": joblib.load("xgboost.pkl")
}


# LOAD TEST DATA
# Read the scaled test set written earlier and split it into features and label.
# NOTE(review): cleaned_test.csv was produced with a scaler fitted on the SMOTE-resampled
# training data, whereas the pickled models above were trained on data standardised
# by a scaler fitted BEFORE SMOTE. The two scalings are not identical — confirm this
# mismatch is acceptable or regenerate the test file with the models' scaler.

test_df = pd.read_csv("cleaned_test.csv")
X_test = test_df.drop("diagnosis", axis=1)
y_test = test_df["diagnosis"]


# ROC CURVES
# One curve per loaded model on a shared axis, each labelled with its AUC.

plt.figure(figsize=(10,7))

for name, model in models.items():
    # Positive-class probability drives the ROC computation.
    y_prob = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

# Dashed diagonal from (0,0) to (1,1) as a visual reference line.
plt.plot([0,1],[0,1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend(loc="lower right")
plt.show()


# FEATURE IMPORTANCE
# For each tree ensemble: pair every feature name with the model's importance
# score, sort from most to least important, and draw a horizontal bar chart.

# Random Forest
rf_model = models["Random Forest"]
rf_importances = rf_model.feature_importances_
rf_features = X_test.columns
rf_df = (
    pd.DataFrame({"Feature": rf_features, "Importance": rf_importances})
    .sort_values(by="Importance", ascending=False)
)

plt.figure(figsize=(12,6))
sns.barplot(data=rf_df, x="Importance", y="Feature")
plt.title("Random Forest Feature Importance")
plt.show()

# XGBoost
xgb_model = models["XGBoost"]
xgb_importances = xgb_model.feature_importances_
xgb_features = X_test.columns
xgb_df = (
    pd.DataFrame({"Feature": xgb_features, "Importance": xgb_importances})
    .sort_values(by="Importance", ascending=False)
)

plt.figure(figsize=(12,6))
sns.barplot(data=xgb_df, x="Importance", y="Feature")
plt.title("XGBoost Feature Importance")
plt.show()


# SHAP EXPLAINABILITY

# Only for XGBoost (fast and reliable)
# X_test is passed twice: once when constructing the explainer and once as the rows
# to explain.
explainer = shap.Explainer(xgb_model, X_test)
shap_values = explainer(X_test)

# Summary plot
# plot_type="bar" requests the bar-chart variant of the summary.
shap.summary_plot(shap_values, X_test, plot_type="bar")


# Force plot for the first test row (index 0): initjs() is called before rendering it.
# NOTE(review): explainer.expected_value is only available on some explainer classes —
# confirm that shap.Explainer resolves to one that exposes it for this model.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0].values, X_test.iloc[0])
