### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2025-07-28 19:13:37.375576: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-28 19:13:37.536209: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753744417.606389   18164 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753744417.640170   18164 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-28 19:13:37.820980: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

## Preprocessing

### Data Loading

In [2]:
train_df = pd.read_csv("CBIS-DDSM_Clean_Data/train_full.csv")
test_df = pd.read_csv("CBIS-DDSM_Clean_Data/test_full.csv")

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2123 entries, 0 to 2122
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   image_id          2123 non-null   object 
 1   image_type        2123 non-null   object 
 2   image_path        2123 non-null   object 
 3   series_uid        2123 non-null   object 
 4   subject_id        2123 non-null   object 
 5   study_uid         2123 non-null   object 
 6   breast_density    2123 non-null   float64
 7   breast_side       2123 non-null   object 
 8   image_view        2123 non-null   object 
 9   abnormality_type  2123 non-null   object 
 10  pathology         2123 non-null   object 
 11  split             2123 non-null   object 
dtypes: float64(1), object(11)
memory usage: 199.2+ KB


In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562 entries, 0 to 561
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   image_id          562 non-null    object 
 1   image_type        562 non-null    object 
 2   image_path        562 non-null    object 
 3   series_uid        562 non-null    object 
 4   subject_id        562 non-null    object 
 5   study_uid         562 non-null    object 
 6   breast_density    562 non-null    float64
 7   breast_side       562 non-null    object 
 8   image_view        562 non-null    object 
 9   abnormality_type  562 non-null    object 
 10  pathology         562 non-null    object 
 11  split             562 non-null    object 
dtypes: float64(1), object(11)
memory usage: 52.8+ KB


In [5]:
train_df.iloc[0]

image_id                                                     51547_00
image_type                                                       full
image_path          CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....
series_uid          1.3.6.1.4.1.9590.100.1.2.100131208110604806117...
subject_id                              Calc-Training_P_01107_LEFT_CC
study_uid           1.3.6.1.4.1.9590.100.1.2.113816182611334006337...
breast_density                                                    2.0
breast_side                                                      LEFT
image_view                                                         CC
abnormality_type                                        calcification
pathology                                                      BENIGN
split                                                           train
Name: 0, dtype: object

In [6]:
print("Size of training data", len(train_df))
print("Size of testing data", len(test_df))

Size of training data 2123
Size of testing data 562


### Encode labels

In [7]:
# create dataframe and transform(encodes) pathology labels
train_full_df = train_df
test_full_df = test_df
print("Non Encoded: ")
print(train_full_df["pathology"].unique())
print(test_full_df["pathology"].unique())
print()

train_full_df["label"] = LabelEncoder().fit_transform(train_full_df["pathology"]).astype(np.int32)
test_full_df["label"] = LabelEncoder().fit_transform(test_full_df["pathology"]).astype(np.int32)

print("Encoded: ")
print(train_full_df["label"].unique())
print(test_full_df["label"].unique())

Non Encoded: 
['BENIGN' 'MALIGNANT']
['MALIGNANT' 'BENIGN']

Encoded: 
[0 1]
[1 0]


### Split training data into validation and training sets

In [8]:
train_data, val_data = train_test_split(train_full_df, 
                                        test_size=0.15, 
                                        stratify=train_full_df["label"], 
                                        random_state=42
                                       )
test_data = test_full_df.copy()

print("Train set", len(train_data))
print("Validation set", len(val_data))
print("Test set", len(test_data))

Train set 1804
Validation set 319
Test set 562


## Create tensors with dataset 

### Preprocess images

In [9]:
# function for preprocess images
def img_preprocessing(path, img_size=(256, 256)):
    # load image
    image = load_img(path, color_mode='grayscale', target_size=img_size)

    # normalize greyscale values between 0-1
    image_arr = img_to_array(image) / 255.0 

    return image_arr

In [10]:
print(train_data["image_path"][0])

CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547_full.png


In [11]:
import os

path = train_data["image_path"][0]
print(path)
print("Exists:", os.path.exists(path))

CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547_full.png
Exists: True


### Create iterators(generators) after preprocessing 

In [12]:
# Generate dataset for processing 

# initiate generators
t_generator = ImageDataGenerator()
v_generator = ImageDataGenerator()

# setup generators
train_gen = t_generator.flow_from_dataframe(
    dataframe=train_data,
    x_col="image_path",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=True,
    seed=42
)
val_gen = v_generator.flow_from_dataframe(
    dataframe=val_data,
    x_col="image_path",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=False,
    seed=42
)


Found 1804 validated image filenames.
Found 319 validated image filenames.


In [13]:
test_generator = ImageDataGenerator()

# setup generators
test_gen = test_generator.flow_from_dataframe(
    dataframe=test_data,
    x_col="image_path",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=False,
    seed=42
)

Found 562 validated image filenames.


## Model 

### Model Architecture 

In [14]:
# build model architecture function
# ====== The following model architecture is based on  (Chollet, 2025, p. 216) =====
def custom_Xray_CNN(input_shape, classes):
    inputs = keras.Input(shape=input_shape)
    model = models.Sequential([
        inputs,
        layers.Rescaling(1./255),                                           
        layers.Conv2D(filters=32, kernel_size=3, activation='relu'),        # kernel size 3x3
        layers.MaxPool2D(pool_size=2),                                      # pool size 2x2

        layers.Conv2D(filters=64, kernel_size=3, activation='relu'),
        layers.MaxPool2D(pool_size=2),

        layers.Conv2D(filters=128, kernel_size=3, activation='relu'),
        layers.MaxPool2D(pool_size=2),

        layers.Flatten(),
        layers.Dense(classes, activation='softmax')
        
    ])
    model. compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy']
    )

    return model

### Train model

In [15]:
# train model with an input image of 256x256 and 3 class labels
model = custom_Xray_CNN((256, 256, 1), len(train_full_df["label"].unique()))

I0000 00:00:1753744606.428519   18164 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


### Fit Data to Model

In [None]:
# fit data to model
history = model.fit(train_gen, validation_data=val_gen, epochs=10)
history

  self._warn_if_super_not_called()


Epoch 1/10


I0000 00:00:1753744626.250482   18359 service.cc:148] XLA service 0x7360cc00a790 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753744626.250718   18359 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-07-28 19:17:06.296299: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1753744626.406881   18359 cuda_dnn.cc:529] Loaded cuDNN version 90501


[1m 2/57[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 39ms/step - accuracy: 0.4531 - loss: 0.7008  

I0000 00:00:1753744631.275974   18359 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 6s/step - accuracy: 0.5082 - loss: 0.6932 - val_accuracy: 0.4984 - val_loss: 0.6846
Epoch 2/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 6s/step - accuracy: 0.5747 - loss: 0.6696 - val_accuracy: 0.5517 - val_loss: 0.6822
Epoch 3/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 6s/step - accuracy: 0.6241 - loss: 0.6445 - val_accuracy: 0.5329 - val_loss: 0.6711
Epoch 4/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 6s/step - accuracy: 0.6805 - loss: 0.5962 - val_accuracy: 0.5768 - val_loss: 0.7161
Epoch 5/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 6s/step - accuracy: 0.7619 - loss: 0.4749 - val_accuracy: 0.5831 - val_loss: 0.7609
Epoch 6/10
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11778s[0m 210s/step - accuracy: 0.8072 - loss: 0.3951 - val_accuracy: 0.5831 - val_loss: 0.9505
Epoch 7/10
[1m57/57[0m [32m━━━━━━━━━━━

In [None]:
# save history
history_basic = pd.DataFrame(history.history)
history_basic.to_csv("Outputs/binary_history_basic.csv", index=False)

In [None]:
loss, accuracy = model.evaluate(test_gen)

In [None]:
print("Model Accuracy in Test Data", accuracy)