## Phase 2: Find Processing Techniques

In [20]:
# Re-import edited modules (e.g. the Utils helpers) automatically before each cell
# runs, so changes to the .py files take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# Smoke test: confirm the notebook can import from the Utils package and call into it
from Utils.test_class_func import Test_py, test_py

# class-based helper
class_check = Test_py("My_Test_Class").print_()
print(class_check, "\n")

# function-based helper
function_check = test_py("My_Test_Function")
print(function_check)

Class -> Try if python utils connects to notebook: My_Test_Class 

Function -> Try if python utils connects to notebook: My_Test_Function


In [1]:
# import functions 
from Utils.preporcessing_utils import data_loading 
from Utils.preporcessing_utils import labels_encoding
from Utils.preporcessing_utils import split_data
from Utils.preporcessing_utils import image_iterators
from Utils.preporcessing_utils import ablation
from Utils.models_utils import Basic_Custom_CNN
from Utils.evaluation_utils import Evaluation
from Utils.save_data_utils import Save_Data

2025-08-20 11:53:23.883264: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-20 11:53:23.915993: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755705203.934492   30737 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755705203.940276   30737 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-20 11:53:23.968992: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# import libraries
from tensorflow.keras import backend as K

### Pipeline Workflow

#### Data Preparation and Preprocessing

In [3]:
# Load the train and test tables from their CSV files via the project helper
train_df, test_df = data_loading("train_full.csv", "test_full.csv")

In [4]:
# Encode the pathology labels of both frames via the project helper.
# NOTE(review): this rebinds train_df/test_df, so re-running the cell feeds the
# already-encoded frames back into labels_encoding — confirm that it is idempotent.
train_df, test_df = labels_encoding(train_df, test_df)

In [5]:
# Inspect which columns are available after label encoding
train_df.columns

Index(['image_id', 'image_type', 'image_path', 'series_uid', 'subject_id',
       'study_uid', 'breast_density', 'breast_side', 'image_view',
       'abnormality_type', 'pathology', 'split', 'label'],
      dtype='object')

In [6]:
# Split into train / validation / test sets (split_data prints the resulting sizes).
# NOTE(review): 0.11 looks like the validation fraction carved out of train_df
# (the printed summary reports 234 validation vs 1889 training cases) — confirm in split_data.
train_data, val_data, test_data = split_data(train_df, test_df, 0.11)

Train set: 1889 cases, 70.35 %
Validation set: 234 cases, 8.72 %
Test set: 562 cases, 20.93 %


#### Iteration 1: Finding the best preprocessing technique using a custom CNN

In [7]:
# Tag stored with every result saved by this notebook
project_phase = "P2"

# Preprocessing switches explored by the ablation study (one flag per technique)
options = [
    "apply_background_removal",
    "apply_crop",
    "apply_noise_reduction",
    "apply_contrast_enhancement",
    "apply_edge_enhancement",
    "apply_lbp_texturizer",
]

# Ground-truth labels of the test split, used when scoring each trained model
y_true = test_data["label"]

In [9]:
# Build the named groups of preprocessing flags to train with. The next cell
# displays them: a baseline (all off), all techniques on, and "No <technique>" variants.
techniques_groups = ablation(options)

In [10]:
# Show the generated mapping: group name -> {technique flag: enabled?}
techniques_groups

{'Baseline Basic Preporcessing': {'apply_background_removal': False,
  'apply_crop': False,
  'apply_noise_reduction': False,
  'apply_contrast_enhancement': False,
  'apply_edge_enhancement': False,
  'apply_lbp_texturizer': False},
 'All Preporcessing Techniques': {'apply_background_removal': True,
  'apply_crop': True,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Background removal': {'apply_background_removal': False,
  'apply_crop': True,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Crop': {'apply_background_removal': True,
  'apply_crop': False,
  'apply_noise_reduction': True,
  'apply_contrast_enhancement': True,
  'apply_edge_enhancement': True,
  'apply_lbp_texturizer': True},
 'No Noise reduction': {'apply_background_removal': True,
  'apply_crop': True,
  'apply_noise_reduction': F

In [11]:
# Ablation sweep: for every group of preprocessing flags, train a fresh custom CNN,
# save it, score it on the test set and persist the results.
#
# All names assigned inside the loop stay in the notebook namespace; later cells read
# `evaluation`, `test_generator`, `train_generator`, `val_generator`, `metrics` and
# `y_probs`, i.e. the values from the last iteration that executed.
#
# NOTE(review): the recorded run stopped with
# "AttributeError: module 'posixpath' has no attribute 'exist'". That points to an
# `os.path.exist` call (should be `os.path.exists`) in a helper outside this notebook —
# presumably in save_model or Save_Data; the traceback is truncated, so confirm there.
for technique_name, techniques in techniques_groups.items():
    
    # human-readable model name, also used as the key when saving results
    model_name = "Custom CNN - " + technique_name
    print("Training " + model_name)
    
    # reset the global Keras state left over from the previous iteration's model
    K.clear_session()
    
    # build the train/val/test image iterators with this group's preprocessing flags
    # (is_resnet_vgg=False: presumably skips ResNet/VGG-specific input handling — confirm in image_iterators)
    train_generator, val_generator, test_generator = image_iterators((train_data, val_data, test_data), 
                                                    is_resnet_vgg=False,
                                                    preprocessing_techniques=techniques
                                                  )
    
    # instantiate the model wrapper: 256x256 single-channel input, 2 classes, 10 epochs
    model_instance = Basic_Custom_CNN(input_shape=(256, 256, 1), num_classes=2, epochs=10)
    
    # build the model architecture
    model_instance.architecture()
    
    # train on the training iterator, validating on the validation iterator
    history = model_instance.train_model(train_generator, val_gen=val_generator)
    
    # save the model under Models/<group_name_in_snake_case>.keras and keep the returned path
    name = technique_name.lower().replace(" ", "_") + ".keras"
    model_path = model_instance.save_model(models_directory="Models", model_file=name)

    # predict on the test iterator with the trained model
    evaluation = Evaluation(model_instance.get_model())
    y_probs = evaluation.predict(test_generator)

    # score the predictions against y_true (the test-split labels defined in the variables cell)
    metrics = evaluation.calculate_metrics(y_true, y_probs)

    # get labels dictionary
    y_labels = evaluation.get_labels()

    # record this run's results in Outputs/models_data.json
    save_data = Save_Data(file_name="models_data.json", out_directory="Outputs")
    save_data.add_model_data(model_name, model_path, history, metrics, y_labels, project_phase, comments="")
    save_data.save_model_data()


Training Custom CNN - Baseline Basic Preporcessing
Found 1889 validated image filenames.
Found 234 validated image filenames.
Found 562 validated image filenames.


I0000 00:00:1755705390.946366   30737 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
  self._warn_if_super_not_called()


Epoch 1/10


I0000 00:00:1755705414.483434   30943 service.cc:148] XLA service 0x70c67800ac90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755705414.483494   30943 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-08-20 11:56:54.515680: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1755705414.662412   30943 cuda_dnn.cc:529] Loaded cuDNN version 90501


[1m 1/60[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14:22[0m 15s/step - accuracy: 0.5000 - auc: 0.5000 - loss: 0.6932 - precision: 0.5000 - recall: 0.1250

I0000 00:00:1755705418.956087   30943 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m514s[0m 8s/step - accuracy: 0.5252 - auc: 0.5000 - loss: 0.6931 - precision: 0.5294 - recall: 0.9290 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6931 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 6s/step - accuracy: 0.5268 - auc: 0.4994 - loss: 0.6928 - precision: 0.5268 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6930 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 7s/step - accuracy: 0.5084 - auc: 0.5036 - loss: 0.6930 - precision: 0.5084 - recall: 1.0000 - val_accuracy: 0.5171 - val_auc: 0.5000 - val_loss: 0.6926 - val_precision: 0.5171 - val_recall: 1.0000
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 8s/step - accuracy: 0.5346 - auc: 0.4881 - loss: 0.6917 - precision: 0.5346 - recall: 1.0000 - val_accuracy: 

AttributeError: module 'posixpath' has no attribute 'exist'

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot how the predicted class-1 probabilities are distributed for each true class.
# Relies on `evaluation` and `test_generator` left in the namespace by the last
# executed iteration of the training loop.

# 1. Get predictions for the test set and flatten the (n, 1) sigmoid output to (n,)
y_probs = np.asarray(evaluation.predict(test_generator)).ravel()

# 2. True labels as a 1-D array. np.asarray guards against `labels` being a plain
#    list, where `y_true == 1` would be a single bool instead of an element-wise mask.
#    Assumes test_generator yields samples in `labels` order (no shuffling) — TODO confirm.
y_true = np.asarray(test_generator.labels).ravel()
assert y_probs.shape == y_true.shape, (
    f"{y_probs.shape[0]} predictions vs {y_true.shape[0]} labels"
)

# 3. Separate probabilities by true class
pos_probs = y_probs[y_true == 1]  # predicted probs for actual positives
neg_probs = y_probs[y_true == 0]  # predicted probs for actual negatives

# 4. Plot both histograms on the SAME bin edges. With a bare `bins=20` each call
#    derives its edges from its own min/max, so the overlaid bars are not comparable.
prob_range = (0.0, 1.0)
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(neg_probs, bins=20, range=prob_range, alpha=0.6, label="Class 0 (negatives)")
ax.hist(pos_probs, bins=20, range=prob_range, alpha=0.6, label="Class 1 (positives)")
ax.axvline(0.5, color='red', linestyle='--', label="Decision threshold 0.5")

ax.set_xlabel("Predicted probability for class 1")
ax.set_ylabel("Count")
ax.set_title("Distribution of predicted probabilities")
ax.legend()
plt.show()

In [21]:
# Sanity check: print the first five raw predictions for the test set.
# Note: this runs a full prediction pass over test_generator again.
y_p = evaluation.predict(test_generator)
print(y_p[:5])

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 6s/step
[[0.51568824]
 [0.42866644]
 [0.57756436]
 [0.48283404]
 [0.39064458]]


In [12]:
# Show the metrics dict from the last loop iteration that reached calculate_metrics.
# In the recorded output the first confusion-matrix column is all zeros, i.e. every
# test case was predicted as class 1 (hence recall 1.0 and specificity 0.0).
metrics

{'confusion_matrix': array([[  0, 302],
        [  0, 260]]),
 'accuracy': 0.4626334519572954,
 'precision': 0.4626334519572954,
 'recall': 1.0,
 'f1_score': 0.6326034063260341,
 'roc_auc': 0.5280883851248089,
 'specificity': np.float64(0.0),
 'fpr': np.float64(1.0),
 'fnr': np.float64(0.0)}

In [14]:
import numpy as np

# Count how many test cases carry each label value and print them as {label: count}
unique, counts = np.unique(test_data["label"], return_counts=True)
label_counts = {label: count for label, count in zip(unique, counts)}
print(label_counts)

{np.int32(0): np.int64(302), np.int32(1): np.int64(260)}


In [None]:
# Peek at the first ten predicted probabilities
y_probs[:10]

In [None]:
# Compare the label distribution of the train and test splits; each line prints
# a (unique label values, counts) tuple
print(np.unique(train_data["label"], return_counts=True))
print(np.unique(test_data["label"], return_counts=True))

In [18]:
# Check the label distribution held by each image iterator from the training loop,
# printed as (unique label values, counts)
print("Train:", np.unique(train_generator.labels, return_counts=True))
print("Val:", np.unique(val_generator.labels, return_counts=True))
print("Test:", np.unique(test_generator.labels, return_counts=True))

Train: (array([0, 1], dtype=int32), array([912, 977]))
Val: (array([0, 1], dtype=int32), array([113, 121]))
Test: (array([0, 1], dtype=int32), array([302, 260]))


In [19]:
# Pull one batch from the training iterator to check the pixel value range
# and the format of the labels it yields
x_batch, y_batch = next(iter(train_generator))
print("X batch range:", x_batch.min(), "to", x_batch.max())
print("Y batch sample:", y_batch[:10])

X batch range: 0.0 to 1.0
Y batch sample: [1 0 1 0 1 1 0 0 1 1]
