### Import Libraries

In [122]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## Preprocessing

### Data Loading

In [106]:
train_df = pd.read_csv("CBIS-DDSM_Clean_Data/train_descriptions.csv")
test_df = pd.read_csv("CBIS-DDSM_Clean_Data/test_descriptions.csv")

In [107]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864 entries, 0 to 2863
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   breast_density    2864 non-null   int64 
 1   breast_side       2864 non-null   object
 2   image_view        2864 non-null   object
 3   abnormality_type  2864 non-null   object
 4   pathology         2864 non-null   object
 5   series_uid        2864 non-null   object
 6   images_new_paths  2864 non-null   object
 7   image_type        2864 non-null   object
dtypes: int64(1), object(7)
memory usage: 179.1+ KB


In [108]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   breast_density    704 non-null    int64 
 1   breast_side       704 non-null    object
 2   image_view        704 non-null    object
 3   abnormality_type  704 non-null    object
 4   pathology         704 non-null    object
 5   series_uid        704 non-null    object
 6   images_new_paths  704 non-null    object
 7   image_type        704 non-null    object
dtypes: int64(1), object(7)
memory usage: 44.1+ KB


In [109]:
train_df.iloc[0]

breast_density                                                      3
breast_side                                                      LEFT
image_view                                                         CC
abnormality_type                                                 mass
pathology                                                   MALIGNANT
series_uid          1.3.6.1.4.1.9590.100.1.2.342386194811267636608...
images_new_paths    CBIS-DDSM_Clean_Data/images_png/1.3.6.1.4.1.95...
image_type                                      full mammogram images
Name: 0, dtype: object

In [110]:
print("Size of training data", len(train_df))
print("Size of testing data", len(test_df))

Size of training data 2864
Size of testing data 704


### Encode labels

In [123]:
# create dataframe and transform(encodes) pathology labels
train_full_df = train_df
test_full_df = test_df
print("Non Encoded: ")
print(train_full_df["pathology"].unique())
print(test_full_df["pathology"].unique())
print()

train_full_df["label"] = LabelEncoder().fit_transform(train_full_df["pathology"]).astype(np.int32)
test_full_df["label"] = LabelEncoder().fit_transform(test_full_df["pathology"]).astype(np.int32)

print("Encoded: ")
print(train_full_df["label"].unique())
print(test_full_df["label"].unique())

Non Encoded: 
['MALIGNANT' 'BENIGN' 'BENIGN_WITHOUT_CALLBACK']
['MALIGNANT' 'BENIGN' 'BENIGN_WITHOUT_CALLBACK']

Encoded: 
[2 0 1]
[2 0 1]


### Split Training data into validation and training sets

In [124]:
train_data, val_data = train_test_split(train_full_df, 
                                        test_size=0.15, 
                                        stratify=train_full_df["label"], 
                                        random_state=42
                                       )
test_data = test_full_df.copy()

print("Train set", len(train_data))
print("Validation set", len(val_data))
print("Test set", len(test_data))

Train set 2434
Validation set 430
Test set 704


## Create Tensors with dataset 

### Preprocess images

In [113]:
# function for preprocess images
def img_preprocessing(path, img_size=(256, 256)):
    # load image
    image = load_img(path, color_mode='grayscale', target_size=img_size)

    # normalize greyscale values between 0-1
    image_arr = img_to_array(image) / 255.0 

    return image_arr

In [125]:
print(train_data["images_new_paths"][0])

CBIS-DDSM_Clean_Data/images_png/1.3.6.1.4.1.9590.100.1.2.3423861948112676366086941325904829245151-1.png


In [126]:
import os
path = train_data["images_new_paths"].iloc[0]
print(path)
print("Exists:", os.path.exists(path))

CBIS-DDSM_Clean_Data/images_png/1.3.6.1.4.1.9590.100.1.2.3394399359126144235388160858318961541681-1.png
Exists: True


### Create tensors after preprocessing 

In [127]:
# Generate dataset for processing 

# initiate generators
t_generator = ImageDataGenerator()
v_generator = ImageDataGenerator()

# setup generators
train_gen = t_generator.flow_from_dataframe(
    dataframe=train_data,
    x_col="images_new_paths",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=True,
    seed=42
)
val_gen = v_generator.flow_from_dataframe(
    dataframe=val_data,
    x_col="images_new_paths",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=False,
    seed=42
)


Found 2434 validated image filenames.
Found 430 validated image filenames.


In [151]:
test_generator = ImageDataGenerator()

# setup generators
test_gen = test_generator.flow_from_dataframe(
    dataframe=test_data,
    x_col="images_new_paths",
    y_col="label",
    target_size=(256, 256),
    color_mode="grayscale",
    class_mode="raw",
    batch_size=32,
    shuffle=False,
    seed=42
)

Found 704 validated image filenames.


## Model 

### Model Architecture 

In [128]:
# build model architecture function
# ====== The following model architecture is based on  (Chollet, 2025, p. 216) =====
def custom_Xray_CNN(input_shape, classes):
    inputs = keras.Input(shape=input_shape)
    model = models.Sequential([
        inputs,
        layers.Rescaling(1./255),                                           
        layers.Conv2D(filters=32, kernel_size=3, activation='relu'),        # kernel size 3x3
        layers.MaxPool2D(pool_size=2),                                      # pool size 2x2

        layers.Conv2D(filters=64, kernel_size=3, activation='relu'),
        layers.MaxPool2D(pool_size=2),

        layers.Conv2D(filters=128, kernel_size=3, activation='relu'),
        layers.MaxPool2D(pool_size=2),

        layers.Flatten(),
        layers.Dense(classes, activation='softmax')
        
    ])
    model. compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy']
    )

    return model

### Train model

In [129]:
# train model with an input image of 256x256 and 3 class labels
model = custom_Xray_CNN((256, 256, 1), len(train_full_df["label"].unique()))

### Fit Data to Model

In [149]:
# fit data to model
history = model.fit(train_gen, validation_data=val_gen, epochs=10)
history

Epoch 1/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.5254 - loss: 0.9053

  self._warn_if_super_not_called()


[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 5s/step - accuracy: 0.5256 - loss: 0.9052 - val_accuracy: 0.5233 - val_loss: 0.9287
Epoch 2/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 5s/step - accuracy: 0.6460 - loss: 0.7531 - val_accuracy: 0.5767 - val_loss: 0.8822
Epoch 3/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 5s/step - accuracy: 0.7464 - loss: 0.6081 - val_accuracy: 0.5581 - val_loss: 0.9728
Epoch 4/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 4s/step - accuracy: 0.8113 - loss: 0.4467 - val_accuracy: 0.5581 - val_loss: 1.0843
Epoch 5/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 4s/step - accuracy: 0.8637 - loss: 0.3446 - val_accuracy: 0.5791 - val_loss: 1.2640
Epoch 6/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4011s[0m 53s/step - accuracy: 0.8986 - loss: 0.2527 - val_accuracy: 0.5860 - val_loss: 1.3706
Epoch 7/10
[1m77/77[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x76ae341cbd90>

In [150]:
# save history
history_basic = pd.DataFrame(history.history)
history_basic.to_csv("history_basic.csv", index=False)

In [152]:
loss, accuracy = model.evaluate(test_gen)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 5s/step - accuracy: 0.5098 - loss: 2.4139


In [153]:
print("Model Accuracy in Test Data", accuracy)

Model Accuracy in Test Data 0.4857954680919647
