### Import Libraries

In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## Preprocessing

### Data Loading

In [44]:
train_df = pd.read_csv("CBIS-DDSM_Clean_Data/train_full.csv")
test_df = pd.read_csv("CBIS-DDSM_Clean_Data/test_full.csv")

In [45]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2443 entries, 0 to 2442
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   image_id          2443 non-null   object 
 1   image_type        2443 non-null   object 
 2   image_path        2443 non-null   object 
 3   series_uid        2443 non-null   object 
 4   subject_id        2443 non-null   object 
 5   study_uid         2443 non-null   object 
 6   breast_density    2443 non-null   float64
 7   breast_side       2443 non-null   object 
 8   image_view        2443 non-null   object 
 9   abnormality_type  2443 non-null   object 
 10  pathology         2443 non-null   object 
 11  split             2443 non-null   object 
dtypes: float64(1), object(11)
memory usage: 229.2+ KB


In [46]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641 entries, 0 to 640
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   image_id          641 non-null    object 
 1   image_type        641 non-null    object 
 2   image_path        641 non-null    object 
 3   series_uid        641 non-null    object 
 4   subject_id        641 non-null    object 
 5   study_uid         641 non-null    object 
 6   breast_density    641 non-null    float64
 7   breast_side       641 non-null    object 
 8   image_view        641 non-null    object 
 9   abnormality_type  641 non-null    object 
 10  pathology         641 non-null    object 
 11  split             641 non-null    object 
dtypes: float64(1), object(11)
memory usage: 60.2+ KB


In [47]:
train_df.iloc[0]

image_id                                                     51547_00
image_type                                                       full
image_path          CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....
series_uid          1.3.6.1.4.1.9590.100.1.2.100131208110604806117...
subject_id                              Calc-Training_P_01107_LEFT_CC
study_uid           1.3.6.1.4.1.9590.100.1.2.113816182611334006337...
breast_density                                                    2.0
breast_side                                                      LEFT
image_view                                                         CC
abnormality_type                                        calcification
pathology                                                      BENIGN
split                                                           train
Name: 0, dtype: object

In [48]:
print("Size of training data", len(train_df))
print("Size of testing data", len(test_df))

Size of training data 2443
Size of testing data 641


### Encode labels

In [49]:
# create dataframe and transform(encodes) pathology labels
train_full_df = train_df
test_full_df = test_df
print("Non Encoded: ")
print(train_full_df["pathology"].unique())
print(test_full_df["pathology"].unique())
print()

train_full_df["label"] = LabelEncoder().fit_transform(train_full_df["pathology"]).astype(np.int32)
test_full_df["label"] = LabelEncoder().fit_transform(test_full_df["pathology"]).astype(np.int32)

print("Encoded: ")
print(train_full_df["label"].unique())
print(test_full_df["label"].unique())

Non Encoded: 
['BENIGN' 'MALIGNANT' 'BENIGN_WITHOUT_CALLBACK']
['MALIGNANT' 'BENIGN' 'BENIGN_WITHOUT_CALLBACK']

Encoded: 
[0 2 1]
[2 0 1]


### Split training data into validation and training sets

In [50]:
train_data, val_data = train_test_split(train_full_df, 
                                        test_size=0.15, 
                                        stratify=train_full_df["label"], 
                                        random_state=42
                                       )
test_data = test_full_df.copy()

print("Train set", len(train_data))
print("Validation set", len(val_data))
print("Test set", len(test_data))

Train set 2076
Validation set 367
Test set 641


## Create tensors with dataset 

### Preprocess images

In [51]:
# function for preprocess images
def img_preprocessing(path, img_size=(256, 256)):
    # load image
    image = load_img(path, color_mode='grayscale', target_size=img_size)

    # normalize greyscale values between 0-1
    image_arr = img_to_array(image) / 255.0 

    return image_arr

In [52]:
print(train_data["image_path"][0])

CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547_full.png


In [53]:
import os

path = train_data["image_path"][0]
print(path)
print("Exists:", os.path.exists(path))

CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547_full.png
Exists: True


### Create tensors after preprocessing 

In [54]:
# Generate dataset for processing 

# initiate generators
t_generator = ImageDataGenerator()
v_generator = ImageDataGenerator()

# setup generators
train_gen = t_generator.flow_from_dataframe(
    dataframe=train_data,
    x_col="image_path",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=True,
    seed=42
)
val_gen = v_generator.flow_from_dataframe(
    dataframe=val_data,
    x_col="image_path",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=False,
    seed=42
)


Found 2076 validated image filenames.
Found 367 validated image filenames.


In [55]:
test_generator = ImageDataGenerator()

# setup generators
test_gen = test_generator.flow_from_dataframe(
    dataframe=test_data,
    x_col="image_path",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=False,
    seed=42
)

Found 641 validated image filenames.


## Model 

### Model Architecture 

In [56]:
# build model architecture function
# ====== The following model architecture is based on  (Chollet, 2025, p. 216) =====
def custom_Xray_CNN(input_shape, classes):
    inputs = keras.Input(shape=input_shape)
    model = models.Sequential([
        inputs,
        layers.Rescaling(1./255),                                           
        layers.Conv2D(filters=32, kernel_size=3, activation='relu'),        # kernel size 3x3
        layers.MaxPool2D(pool_size=2),                                      # pool size 2x2

        layers.Conv2D(filters=64, kernel_size=3, activation='relu'),
        layers.MaxPool2D(pool_size=2),

        layers.Conv2D(filters=128, kernel_size=3, activation='relu'),
        layers.MaxPool2D(pool_size=2),

        layers.Flatten(),
        layers.Dense(classes, activation='softmax')
        
    ])
    model. compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy']
    )

    return model

### Train model

In [57]:
# train model with an input image of 256x256 and 3 class labels
model = custom_Xray_CNN((256, 256, 1), len(train_full_df["label"].unique()))

### Fit Data to Model

In [58]:
# fit data to model
history = model.fit(train_gen, validation_data=val_gen, epochs=10)
history

  self._warn_if_super_not_called()


Epoch 1/10


I0000 00:00:1753269740.275704    5763 service.cc:148] XLA service 0x779768009da0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753269740.276506    5763 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-07-23 07:22:20.430738: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1753269740.744329    5763 cuda_dnn.cc:529] Loaded cuDNN version 90501


[1m 2/65[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 42ms/step - accuracy: 0.1953 - loss: 1.152728

I0000 00:00:1753269746.121394    5763 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 5s/step - accuracy: 0.4216 - loss: 1.0184 - val_accuracy: 0.4414 - val_loss: 0.9361
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 6s/step - accuracy: 0.4886 - loss: 0.9112 - val_accuracy: 0.4741 - val_loss: 0.9786
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 5s/step - accuracy: 0.5795 - loss: 0.8200 - val_accuracy: 0.4986 - val_loss: 0.9374
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 6s/step - accuracy: 0.6753 - loss: 0.6957 - val_accuracy: 0.4877 - val_loss: 0.9493
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 6s/step - accuracy: 0.7504 - loss: 0.5544 - val_accuracy: 0.5232 - val_loss: 1.1286
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 5s/step - accuracy: 0.8050 - loss: 0.4342 - val_accuracy: 0.5341 - val_loss: 1.2171
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7797f43dd3d0>

In [59]:
# save history
history_basic = pd.DataFrame(history.history)
history_basic.to_csv("full_history_basic.csv", index=False)

In [60]:
loss, accuracy = model.evaluate(test_gen)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 5s/step - accuracy: 0.5324 - loss: 2.4364


In [61]:
print("Model Accuracy in Test Data", accuracy)

Model Accuracy in Test Data 0.5226209163665771
