## Phase 2: Find Processing Techniques

In [14]:
# %reload_ext autoreload
# %autoreload 2

In [2]:
# Smoke test: confirm the notebook can import and call code from the local Utils package
from Utils.test_class_func import Test_py, test_py

print(Test_py("My_Test_Class").print_(), "\n")
print(test_py("My_Test_Function"))

Class -> Try if python utils connects to notebook: My_Test_Class 

Function -> Try if python utils connects to notebook: My_Test_Function


In [1]:
# import functions 
from Utils.preporcessing_utils import data_loading 
from Utils.preporcessing_utils import labels_encoding
from Utils.preporcessing_utils import split_data
from Utils.preporcessing_utils import image_iterators
from Utils.preporcessing_utils import ablation
from Utils.models_utils import Basic_Custom_CNN
from Utils.evaluation_utils import Evaluation
from Utils.save_data_utils import Save_Data

2025-08-21 21:42:12.084380: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-21 21:42:12.298282: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755826932.322211   69218 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755826932.333239   69218 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-21 21:42:12.383390: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# import libraries
from tensorflow.keras import backend as K

### Pipeline Workflow

#### Data Preparation and Preprocessing

In [3]:
# Load the train and test metadata tables from their CSV files
train_df, test_df = data_loading("train_full.csv", "test_full.csv")

In [4]:
# Encode the pathology labels; both dataframes are replaced by the versions returned here
train_df, test_df = labels_encoding(train_df, test_df)

In [5]:
# Inspect which columns are available after label encoding
train_df.columns

Index(['image_id', 'image_type', 'image_path', 'series_uid', 'subject_id',
       'study_uid', 'breast_density', 'breast_side', 'image_view',
       'abnormality_type', 'pathology', 'split', 'label'],
      dtype='object')

In [6]:
# Split into train / validation / test sets.
# NOTE(review): 0.11 looks like the fraction of the original training data held out for
# validation (the recorded output shows 234 of 1889 + 234 cases) — confirm against split_data.
train_data, val_data, test_data = split_data(train_df, test_df, 0.11)

Train set: 1889 cases, 70.35 %
Validation set: 234 cases, 8.72 %
Test set: 562 cases, 20.93 %


#### Iteration 1: Finding the best preprocessing technique using a custom CNN

In [7]:
# Experiment configuration shared by every run in this phase
project_phase = "P2"  # phase tag passed to Save_Data.add_model_data for each run
epochs = 10           # number of epochs handed to Basic_Custom_CNN

# Preprocessing switches handed to ablation() to build the technique groups
options = [
    "apply_background_removal",
    "apply_crop",
    "apply_noise_reduction",
    "apply_contrast_enhancement",
    "apply_edge_enhancement",
    "apply_lbp_texturizer",
]

# Ground-truth labels of the test split, used when calculating metrics
y_true = test_data["label"]

In [8]:
# Build the named groups of preprocessing switches to compare. The recorded output shows a
# baseline (all off), an all-on group, and groups with a single technique disabled.
techniques_groups = ablation(options)

In [9]:
# Display the generated groups: {group name: {option name: bool}}
techniques_groups

{'Baseline Basic Preporcessing': {'apply_background_removal': False,
  'apply_crop': False,
  'apply_noise_reduction': False,
  'apply_contrast_enhancement': False,
  'apply_edge_enhancement': False,
  'apply_lbp_texturizer': False},
 'All Preporcessing Techniques': {'apply_background_removal': True,
  'apply_crop': True,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Background removal': {'apply_background_removal': False,
  'apply_crop': True,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Crop': {'apply_background_removal': True,
  'apply_crop': False,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Noise reduction': {'apply_background_removal': True,
  'apply_crop': True,
  'apply_noise_reduction': F

In [12]:
# NOTE(review): looks like a debug shortcut that trims every split (and y_true) to a few rows
# for a quick end-to-end run — confirm, and keep it commented out for full training.
# train_data = train_data[:20]
# val_data = val_data[:10]
# test_data = test_data[:10]
# y_true = y_true[:10]

In [10]:
# Train, save, evaluate and record one custom CNN per group of preprocessing techniques
for technique_name, techniques in techniques_groups.items():

    # Human-readable identifier for this run
    model_name = f"Custom CNN {epochs} - {technique_name}"
    print(f"Training {model_name}")

    # Clear Keras state left over from the previous run before building a new model
    K.clear_session()

    # Build the train / validation / test iterators using this group's preprocessing switches
    train_generator, val_generator, test_generator = image_iterators(
        (train_data, val_data, test_data),
        is_resnet_vgg=False,
        preprocessing_techniques=techniques,
    )

    # Instantiate the model wrapper, create its architecture, then fit it
    model_instance = Basic_Custom_CNN(input_shape=(256, 256, 1), num_classes=2, epochs=epochs)
    model_instance.architecture()
    history = model_instance.train_model(train_generator, val_gen=val_generator)

    # Save the trained model; the file name is the run name lower-cased with underscores
    name = f"{model_name.lower().replace(' ', '_')}.keras"
    model_path = model_instance.save_model(models_directory="Models", model_file=name)

    # Predict on the test iterator and compute metrics against the true test labels
    evaluation = Evaluation(model_instance.get_model())
    y_probs = evaluation.predict(test_generator)
    metrics = evaluation.calculate_metrics(y_true, y_probs)

    # Labels dictionary reported by the evaluation helper
    y_labels = evaluation.get_labels()

    # Record this run (history, metrics, model path) in the shared JSON output
    save_data = Save_Data(file_name="models_data.json", out_directory="Outputs")
    save_data.add_model_data(model_name, model_path, epochs, history, metrics, y_labels, project_phase, comments="")
    save_data.save_model_data()


Training Custom CNN 10 - Baseline Basic Preporcessing
Found 1889 validated image filenames.
Found 234 validated image filenames.
Found 562 validated image filenames.


I0000 00:00:1755827112.637563   69218 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
  self._warn_if_super_not_called()


Epoch 1/10


I0000 00:00:1755827130.085777   69390 service.cc:148] XLA service 0x748be800a700 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755827130.086879   69390 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-08-21 21:45:30.231696: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1755827130.584095   69390 cuda_dnn.cc:529] Loaded cuDNN version 90501
E0000 00:00:1755827131.391814   69390 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m 1/60[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:37[0m 13s/step - accuracy: 0.5312 - auc: 0.4286 - loss: 0.6943 - precision: 0.4286 - recall: 0.2143

I0000 00:00:1755827135.242243   69390 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m366s[0m 6s/step - accuracy: 0.5212 - auc: 0.5108 - loss: 0.6929 - precision: 0.5262 - recall: 0.6163 - val_accuracy: 0.4744 - val_auc: 0.5311 - val_loss: 0.6929 - val_precision: 0.4800 - val_recall: 0.1983
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m434s[0m 7s/step - accuracy: 0.5058 - auc: 0.5438 - loss: 0.6910 - precision: 0.5333 - recall: 0.4365 - val_accuracy: 0.4786 - val_auc: 0.5094 - val_loss: 0.6956 - val_precision: 0.4977 - val_recall: 0.8926
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m429s[0m 7s/step - accuracy: 0.5656 - auc: 0.6038 - loss: 0.6794 - precision: 0.5611 - recall: 0.8085 - val_accuracy: 0.5000 - val_auc: 0.5249 - val_loss: 0.6910 - val_precision: 0.5110 - val_recall: 0.7686
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 6s/step - accuracy: 0.6071 - auc: 0.6368 - loss: 0.6733 - precision: 0.5863 - recall: 0.6975 - val_accuracy: 

  self._warn_if_super_not_called()


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 8s/step - accuracy: 0.5273 - auc: 0.4832 - loss: 0.6929 - precision: 0.5273 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5177 - val_loss: 0.6927 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 6s/step - accuracy: 0.5079 - auc: 0.5036 - loss: 0.6933 - precision: 0.5074 - recall: 0.9956 - val_accuracy: 0.5171 - val_auc: 0.5381 - val_loss: 0.6927 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 7s/step - accuracy: 0.5336 - auc: 0.4816 - loss: 0.6925 - precision: 0.5336 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5365 - val_loss: 0.6928 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 6s/step - accuracy: 0.5217 - auc: 0.5291 - loss: 0.6923 - precision: 0.5217 - recall: 1.0000 - val

  self._warn_if_super_not_called()


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m459s[0m 8s/step - accuracy: 0.4923 - auc: 0.5000 - loss: 0.6932 - precision: 0.4503 - recall: 0.1888 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6930 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 7s/step - accuracy: 0.5132 - auc: 0.4952 - loss: 0.6929 - precision: 0.5132 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5330 - val_loss: 0.6925 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 8s/step - accuracy: 0.5268 - auc: 0.5137 - loss: 0.6925 - precision: 0.5268 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5196 - val_loss: 0.6923 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 6s/step - accuracy: 0.5125 - auc: 0.5193 - loss: 0.6927 - precision: 0.5110 - recall: 0.9817 - val

  self._warn_if_super_not_called()


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 8s/step - accuracy: 0.5078 - auc: 0.5006 - loss: 0.6932 - precision: 0.2984 - recall: 0.1872 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6927 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 7s/step - accuracy: 0.5168 - auc: 0.4859 - loss: 0.6929 - precision: 0.5168 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.4701 - val_loss: 0.6926 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 6s/step - accuracy: 0.5162 - auc: 0.4987 - loss: 0.6928 - precision: 0.5162 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6928 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 7s/step - accuracy: 0.4870 - auc: 0.4741 - loss: 0.6937 - precision: 0.4870 - recall: 1.0000 - val

  self._warn_if_super_not_called()


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 6s/step - accuracy: 0.4795 - auc: 0.5000 - loss: 0.6932 - precision: 0.4804 - recall: 0.6485 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6927 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 6s/step - accuracy: 0.5101 - auc: 0.5065 - loss: 0.6931 - precision: 0.5101 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6927 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m459s[0m 8s/step - accuracy: 0.5132 - auc: 0.4979 - loss: 0.6929 - precision: 0.5132 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.4823 - val_loss: 0.6926 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 7s/step - accuracy: 0.5219 - auc: 0.4925 - loss: 0.6924 - precision: 0.5219 - recall: 1.0000 - val

  self._warn_if_super_not_called()


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m487s[0m 8s/step - accuracy: 0.5196 - auc: 0.4974 - loss: 0.6926 - precision: 0.5260 - recall: 0.9432 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6929 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 7s/step - accuracy: 0.5298 - auc: 0.5285 - loss: 0.6929 - precision: 0.5310 - recall: 0.9882 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6931 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 7s/step - accuracy: 0.5276 - auc: 0.4955 - loss: 0.6930 - precision: 0.5276 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6928 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m540s[0m 9s/step - accuracy: 0.5197 - auc: 0.4999 - loss: 0.6927 - precision: 0.5197 - recall: 1.0000 - val

  self._warn_if_super_not_called()


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m451s[0m 7s/step - accuracy: 0.5088 - auc: 0.4873 - loss: 0.6930 - precision: 0.5237 - recall: 0.8325 - val_accuracy: 0.4829 - val_auc: 0.5000 - val_loss: 0.6932 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m425s[0m 7s/step - accuracy: 0.4871 - auc: 0.5000 - loss: 0.6932 - precision: 0.3236 - recall: 0.2566 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6931 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 7s/step - accuracy: 0.5252 - auc: 0.5000 - loss: 0.6931 - precision: 0.5252 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6930 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 7s/step - accuracy: 0.5167 - auc: 0.5027 - loss: 0.6929 - precision: 0.5167 - recall: 1.00

  self._warn_if_super_not_called()


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 9s/step - accuracy: 0.5026 - auc: 0.4884 - loss: 0.6931 - precision: 0.5118 - recall: 0.9428 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6927 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m477s[0m 8s/step - accuracy: 0.5117 - auc: 0.4542 - loss: 0.6937 - precision: 0.5117 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5003 - val_loss: 0.6927 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m398s[0m 7s/step - accuracy: 0.5116 - auc: 0.4947 - loss: 0.6931 - precision: 0.5116 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6928 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 7s/step - accuracy: 0.5069 - auc: 0.4866 - loss: 0.6936 - precision: 0.5069 - recall: 1.0000 - val

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# NOTE: `evaluation` and `test_generator` are the objects left behind by the training loop,
# so this cell plots the LAST technique group that was trained. Run that loop first.

# 1. Get predictions and ground-truth labels for the test set as flat 1-D arrays.
#    np.asarray guards against list-like labels: with a plain list, `labels == 1` is a
#    scalar False and the class selection below would silently come back empty.
y_probs = np.asarray(evaluation.predict(test_generator)).ravel()
y_true = np.asarray(test_generator.labels).ravel()

# 2. Fail loudly if there is not exactly one probability per label (for example a
#    multi-column model output), instead of an opaque boolean-index error further down.
if y_probs.shape != y_true.shape:
    raise ValueError(
        f"Expected one predicted probability per test label, got {y_probs.shape[0]} "
        f"predictions for {y_true.shape[0]} labels"
    )

# 3. Separate probabilities by class
pos_probs = y_probs[y_true == 1]  # predicted probs for actual positives
neg_probs = y_probs[y_true == 0]  # predicted probs for actual negatives

# 4. Plot histograms on shared bin edges over [0, 1]. Passing bins=20 to each call lets
#    every class pick its own edges from its own data range, so the bars would not line up.
bin_edges = np.linspace(0.0, 1.0, 21)  # 20 equal-width bins

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(neg_probs, bins=bin_edges, alpha=0.6, label="Class 0 (negatives)")
ax.hist(pos_probs, bins=bin_edges, alpha=0.6, label="Class 1 (positives)")
ax.axvline(0.5, color='red', linestyle='--', label="Decision threshold 0.5")

ax.set_xlabel("Predicted probability for class 1")
ax.set_ylabel("Count")
ax.set_title("Distribution of predicted probabilities")
ax.legend()
plt.show()