## Phase 2: Find Processing Techniques

In [1]:
# %reload_ext autoreload
# %autoreload 2

In [2]:
# Connectivity check: import one class and one function from the Utils package
# and call each, to confirm the notebook can reach the project's helper modules.
from Utils.test_class_func import Test_py, test_py

print(Test_py("My_Test_Class").print_(), "\n")
print(test_py("My_Test_Function"))

Class -> Try if python utils connects to notebook: My_Test_Class 

Function -> Try if python utils connects to notebook: My_Test_Function


In [2]:
# import functions 
from Utils.preporcessing_utils import data_loading 
from Utils.preporcessing_utils import labels_encoding
from Utils.preporcessing_utils import split_data
from Utils.preporcessing_utils import image_iterators
from Utils.preporcessing_utils import ablation
from Utils.models_utils import Basic_Custom_CNN
from Utils.evaluation_utils import Evaluation
from Utils.save_data_utils import Save_Data

2025-08-20 00:11:47.931813: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-20 00:11:47.982346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755663108.015094   19767 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755663108.021424   19767 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-20 00:11:48.063644: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
# import libraries
from tensorflow.keras import backend as K
from tensorflow.keras.utils import set_random_seed

### Pipeline Workflow

#### Data Preparation and Preprocessing

In [4]:
# Load the train and test tables from their CSV files.
# NOTE(review): the file names are passed without a directory; where they are
# resolved from is decided inside data_loading — confirm there.
train_df, test_df = data_loading("train_full.csv", "test_full.csv")

In [5]:
# Encode the pathology labels for both frames (the column listing in the next
# cell shows a 'label' column alongside 'pathology').
# NOTE(review): this rebinds train_df/test_df to the encoded result, so running
# the cell a second time feeds already-encoded frames back into labels_encoding.
# Re-run the loading cell first if this cell needs to be executed again.
train_df, test_df = labels_encoding(train_df, test_df)

In [6]:
# Inspect the columns available on the training frame after label encoding
train_df.columns

Index(['image_id', 'image_type', 'image_path', 'series_uid', 'subject_id',
       'study_uid', 'breast_density', 'breast_side', 'image_view',
       'abnormality_type', 'pathology', 'split', 'label'],
      dtype='object')

In [7]:
# Split into train / validation / test sets.
# 0.11 is presumably the fraction of train_df held out for validation
# (the printed summary shows 234 validation rows out of 1889 + 234) —
# confirm against split_data.
train_data, val_data, test_data = split_data(train_df, test_df, 0.11)

Train set: 1889 cases, 70.35 %
Validation set: 234 cases, 8.72 %
Test set: 562 cases, 20.93 %


#### Iteration 1: Finding the best preprocessing technique using a custom CNN

In [8]:
# Phase tag passed along when each model's results are saved
project_phase = "P2"

# Preprocessing switches handed to the ablation helper
options = [
    "apply_background_removal",
    "apply_crop",
    "apply_noise_reduction",
    "apply_contrast_enhancement",
    "apply_edge_enhancement",
    "apply_lbp_texturizer",
]

# Ground-truth labels of the test split, used when computing metrics
y_true = test_data["label"]

In [9]:
# Build the groups of techniques to try: a dict mapping a group name to a
# {option: bool} flag dict. As far as the displayed output shows, it contains a
# baseline with every option off, one with every option on, and then groups
# with a single option switched off.
techniques_groups = ablation(options)

In [10]:
# Display the generated groups to check the flag combinations before training
techniques_groups

{'Baseline Basic Preporcessing': {'apply_background_removal': False,
  'apply_crop': False,
  'apply_noise_reduction': False,
  'apply_contrast_enhancement': False,
  'apply_edge_enhancement': False,
  'apply_lbp_texturizer': False},
 'All Preporcessing Techniques': {'apply_background_removal': True,
  'apply_crop': True,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Background removal': {'apply_background_removal': False,
  'apply_crop': True,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Crop': {'apply_background_removal': True,
  'apply_crop': False,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Noise reduction': {'apply_background_removal': True,
  'apply_crop': True,
  'apply_noise_reduction': F

In [None]:
# Seed re-applied at the start of every run so that each preprocessing group
# starts from the same RNG state; otherwise differences between groups are
# mixed with differences in random initialisation/shuffling.
SEED = 42

# Iterate through the technique groups, training and evaluating one model per group
for technique_name, techniques in techniques_groups.items():

    # create model name
    model_name = "Custom CNN - " + technique_name
    print("Training " + model_name)

    # reset Keras global state before creating a new model
    K.clear_session()

    # Re-seed the Python, NumPy and backend random generators for this run.
    # NOTE: some GPU kernels can still be non-deterministic; see
    # tf.config.experimental.enable_op_determinism if bit-exact runs are needed.
    set_random_seed(SEED)

    # Create image iterators that apply this group's preprocessing techniques
    train_generator, val_generator, test_generator = image_iterators(
        (train_data, val_data, test_data),
        is_resnet_vgg=False,
        preprocessing_techniques=techniques,
    )

    # initiate model class
    model_instance = Basic_Custom_CNN(input_shape=(256, 256, 1), num_classes=2, epochs=10)

    # create model architecture
    model_instance.architecture()

    # train model
    history = model_instance.train_model(train_generator, val_gen=val_generator)

    # save model and get path (file name derived from the group name)
    name = technique_name.lower().replace(" ", "_") + ".keras"
    model_path = model_instance.save_model(models_directory="Models", model_file=name)

    # evaluate model by making predictions on the test iterator
    evaluation = Evaluation(model_instance.get_model())
    y_probs = evaluation.predict(test_generator)

    # calculate metrics
    # assumes test_generator yields samples in the same row order as
    # test_data (i.e. no shuffling), so y_true lines up with y_probs — TODO confirm
    metrics = evaluation.calculate_metrics(y_true, y_probs)

    # get labels dictionary
    y_labels = evaluation.get_labels()

    # save data
    # NOTE(review): a new Save_Data is built on every iteration; presumably it
    # appends to the existing models_data.json rather than overwriting — confirm.
    save_data = Save_Data(file_name="models_data.json", out_directory="Outputs")
    save_data.add_model_data(model_name, model_path, history, metrics, y_labels, project_phase, comments="")
    save_data.save_model_data()


Training Custom CNN - Baseline Basic Preporcessing
Found 1889 validated image filenames.
Found 234 validated image filenames.
Found 562 validated image filenames.


I0000 00:00:1755663252.962258   19767 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
  self._warn_if_super_not_called()


Epoch 1/10


I0000 00:00:1755663268.510683   19939 service.cc:148] XLA service 0x70b764009480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755663268.510782   19939 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-08-20 00:14:28.593192: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1755663268.818522   19939 cuda_dnn.cc:529] Loaded cuDNN version 90501
E0000 00:00:1755663273.325003   19939 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m 2/60[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 105ms/step - accuracy: 0.6562 - loss: 0.6915 

I0000 00:00:1755663273.991845   19939 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m58/60[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m11s[0m 6s/step - accuracy: 0.5244 - loss: 0.6978