## IMPORT LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## DATA IMPORTING

In [None]:
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Deep Learning/CANCER DETECTION USING ANN/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


# **1) EDA & cleaning**
## what to check and why
**Goals:** confirm label distribution, spot missing values, check feature distributions and correlations (multicollinearity), detect outliers.

In [None]:
print(df.shape)

(569, 33)


In [None]:
print(df.columns.tolist())


['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']


In [None]:
print(df['diagnosis'].value_counts())

diagnosis
B    357
M    212
Name: count, dtype: int64


In [None]:
print(df['diagnosis'].unique())

['M' 'B']


In [None]:
print(df.isnull().sum().sort_values(ascending=False).head(10))

Unnamed: 32            569
id                       0
diagnosis                0
texture_mean             0
radius_mean              0
area_mean                0
smoothness_mean          0
compactness_mean         0
perimeter_mean           0
concave points_mean      0
dtype: int64


In [None]:
# drop useless columns
df = df.drop(columns=[c for c in ['id','Unnamed: 32'] if c in df.columns])


In [None]:
df['diagnosis'] = df['diagnosis'].map({'M':1,'B':0})
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# correlations (heatmap or numeric)
print(df.corr()['diagnosis'].sort_values(ascending=False).head(10))

diagnosis               1.000000
concave points_worst    0.793566
perimeter_worst         0.782914
concave points_mean     0.776614
radius_worst            0.776454
perimeter_mean          0.742636
area_worst              0.733825
radius_mean             0.730029
area_mean               0.708984
concavity_mean          0.696360
Name: diagnosis, dtype: float64


Steps:



*   Map label: M → 1, B → 0.
*   plit: Stratified train / val / test (e.g., 70/15/15) to preserve class balance.

*   Scale numeric features with StandardScaler.


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
X = df.drop(columns=['diagnosis']).values
y = df['diagnosis'].values

In [None]:
# stratified split: train / temp, then temp -> val/test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)


In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, callbacks


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, callbacks

def build_baseline(input_dim):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(32, activation='relu')(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc'),
                 tf.keras.metrics.Precision(name='precision'),
                 tf.keras.metrics.Recall(name='recall')]
    )
    return model

model = build_baseline(X_train.shape[1])

callbacks_list = [
    callbacks.EarlyStopping(monitor='val_auc', mode='max', patience=15, restore_best_weights=True),
    callbacks.ModelCheckpoint('best_model.h5', monitor='val_auc', mode='max', save_best_only=True),
    callbacks.ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.5, patience=6, min_lr=1e-6)
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=32,
    callbacks=callbacks_list
)


Epoch 1/200
[1m 1/13[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m53s[0m 4s/step - accuracy: 0.5312 - auc: 0.3009 - loss: 0.8476 - precision: 0.3000 - recall: 0.2727



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 68ms/step - accuracy: 0.7184 - auc: 0.6907 - loss: 0.5936 - precision: 0.6092 - recall: 0.5580 - val_accuracy: 0.8706 - val_auc: 0.9764 - val_loss: 0.4547 - val_precision: 0.8621 - val_recall: 0.7812 - learning_rate: 0.0010
Epoch 2/200
[1m 1/13[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 27ms/step - accuracy: 0.9062 - auc: 1.0000 - loss: 0.2073 - precision: 1.0000 - recall: 0.7857



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9235 - auc: 0.9804 - loss: 0.2403 - precision: 0.9333 - recall: 0.8679 - val_accuracy: 0.9294 - val_auc: 0.9909 - val_loss: 0.3163 - val_precision: 0.9643 - val_recall: 0.8438 - learning_rate: 0.0010
Epoch 3/200
[1m12/13[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.9505 - auc: 0.9928 - loss: 0.1580 - precision: 0.9392 - recall: 0.9372 



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9511 - auc: 0.9921 - loss: 0.1587 - precision: 0.9392 - recall: 0.9374 - val_accuracy: 0.9412 - val_auc: 0.9947 - val_loss: 0.2514 - val_precision: 0.9655 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 4/200
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9578 - auc: 0.9923 - loss: 0.1329 - precision: 0.9389 - recall: 0.9451 - val_accuracy: 0.9412 - val_auc: 0.9947 - val_loss: 0.2155 - val_precision: 0.9655 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 5/200
[1m 1/13[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 173ms/step - accuracy: 0.9688 - auc: 0.9960 - loss: 0.1326 - precision: 1.0000 - recall: 0.9231



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9677 - auc: 0.9931 - loss: 0.1225 - precision: 0.9957 - recall: 0.9183 - val_accuracy: 0.9529 - val_auc: 0.9965 - val_loss: 0.1879 - val_precision: 1.0000 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 6/200
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9558 - auc: 0.9896 - loss: 0.1353 - precision: 0.9514 - recall: 0.9260 - val_accuracy: 0.9529 - val_auc: 0.9965 - val_loss: 0.1724 - val_precision: 1.0000 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 7/200
[1m 1/13[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 28ms/step - accuracy: 0.9688 - auc: 1.0000 - loss: 0.0880 - precision: 0.8889 - recall: 1.0000



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9715 - auc: 0.9960 - loss: 0.0977 - precision: 0.9645 - recall: 0.9526 - val_accuracy: 0.9529 - val_auc: 0.9971 - val_loss: 0.1577 - val_precision: 1.0000 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 8/200
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9870 - auc: 0.9972 - loss: 0.0769 - precision: 0.9778 - recall: 0.9872 



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9863 - auc: 0.9969 - loss: 0.0782 - precision: 0.9774 - recall: 0.9857 - val_accuracy: 0.9529 - val_auc: 0.9976 - val_loss: 0.1417 - val_precision: 1.0000 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 9/200
[1m 1/13[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 28ms/step - accuracy: 1.0000 - auc: 1.0000 - loss: 0.0448 - precision: 1.0000 - recall: 1.0000



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9704 - auc: 0.9959 - loss: 0.0897 - precision: 0.9715 - recall: 0.9481 - val_accuracy: 0.9529 - val_auc: 0.9982 - val_loss: 0.1302 - val_precision: 1.0000 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 10/200
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9903 - auc: 0.9981 - loss: 0.0657 - precision: 0.9912 - recall: 0.9820 - val_accuracy: 0.9529 - val_auc: 0.9979 - val_loss: 0.1291 - val_precision: 1.0000 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 11/200
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9857 - auc: 0.9972 - loss: 0.0806 - precision: 0.9791 - recall: 0.9838 - val_accuracy: 0.9529 - val_auc: 0.9982 - val_loss: 0.1170 - val_precision: 1.0000 - val_recall: 0.8750 - learnin



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9907 - auc: 0.9991 - loss: 0.0551 - precision: 1.0000 - recall: 0.9733 - val_accuracy: 0.9765 - val_auc: 0.9988 - val_loss: 0.0877 - val_precision: 1.0000 - val_recall: 0.9375 - learning_rate: 5.0000e-04
Epoch 21/200
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9764 - auc: 0.9984 - loss: 0.0664 - precision: 0.9739 - recall: 0.9638 - val_accuracy: 0.9765 - val_auc: 0.9982 - val_loss: 0.0882 - val_precision: 1.0000 - val_recall: 0.9375 - learning_rate: 5.0000e-04
Epoch 22/200
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9886 - auc: 0.9995 - loss: 0.0498 - precision: 0.9863 - recall: 0.9832 - val_accuracy: 0.9765 - val_auc: 0.9979 - val_loss: 0.0883 - val_precision: 1.0000 - val_recall: 0.9375 -

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report, precision_recall_curve, average_precision_score

model = tf.keras.models.load_model('best_model.h5')
y_proba = model.predict(X_test).ravel()
y_pred = (y_proba >= 0.5).astype(int)   # default threshold

print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# Find threshold for target sensitivity (recall)
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# e.g., choose threshold that gives tpr >= 0.95 (if you need very few false negatives)
idx = np.where(tpr >= 0.95)[0][0]
best_thresh = thresholds[idx]
print("Threshold for 95% sensitivity:", best_thresh)




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 149ms/step
ROC AUC: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        32

    accuracy                           1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86

Confusion matrix:
 [[54  0]
 [ 0 32]]
Threshold for 95% sensitivity: 0.5535893


In [None]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
# pip install keras-tuner
import kerastuner as kt
from tensorflow import keras

def build_model(hp):
    inputs = keras.Input(shape=(X_train.shape[1],))
    x = inputs
    # pick number of layers
    for i in range(hp.Int("n_layers", 1, 3)):
        units = hp.Int(f"units_{i}", 32, 256, step=32)
        x = keras.layers.Dense(units, activation="relu")(x)
        if hp.Boolean(f"bn_{i}"):
            x = keras.layers.BatchNormalization()(x)
        if hp.Float(f"dropout_{i}", 0.0, 0.5, step=0.1) > 0:
            x = keras.layers.Dropout(hp.Float(f"dropout_{i}", 0.0, 0.5, step=0.1))(x)
    outputs = keras.layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=hp.Choice("lr", [1e-2, 1e-3, 1e-4])),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(name="auc")]
    )
    return model

tuner = kt.RandomSearch(build_model, objective='val_auc', max_trials=20, executions_per_trial=1, directory='tuner', project_name='bc')

tuner.search(X_train, y_train, epochs=50, validation_data=(X_val, y_val))
best = tuner.get_best_models(num_models=1)[0]


Trial 20 Complete [00h 00m 14s]
val_auc: 0.9935141801834106

Best val_auc So Far: 0.9988207817077637
Total elapsed time: 00h 04m 49s


In [155]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in skf.split(X, y):
    # scale within fold, build model, train, evaluate
    pass


In [157]:
import shap
# Use a small background set because KernelExplainer can be slow
explainer = shap.KernelExplainer(lambda x: model.predict(x).ravel(), X_train[:50])
shap_values = explainer.shap_values(X_test[:50])



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


  0%|          | 0/50 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m3294/3294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m3294/3294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m3294/3294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m3294/3294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m3294/3294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m3294/3294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m3294/3294[0m [32m━━━━━━━━━━