In [None]:
import os

# Switch off oneDNN custom operations so TensorFlow results are exactly
# reproducible. Run this cell before the first TensorFlow import (later cells
# pull TensorFlow in indirectly); TensorFlow logs a hint about this variable
# at import time when it is not set.
os.environ.update(TF_ENABLE_ONEDNN_OPTS="0")

In [1]:
import mlflow
from experiments_config import (
    CommonConfig,
    MlflowConfig,
    DatasetConfig,
    ModelConfig,
    ModelTrainingConfig,
    ModelEvaluationConfig,
)

# Build one instance of every configuration section. The remaining cells read
# all of their settings from these objects. Constructors run top to bottom in
# the order listed on the right-hand side.
(
    common_config,
    mlflow_config,
    dataset_config,
    model_config,
    training_config,
    evaluation_config,
) = (
    CommonConfig(),
    MlflowConfig(),
    DatasetConfig(),
    ModelConfig(),
    ModelTrainingConfig(),
    ModelEvaluationConfig(),
)

2025-09-19 18:02:01.332049: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-19 18:02:01.369950: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-19 18:02:02.223657: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
I0000 00:00:1758297723.050573   82954 gpu_device.cc:2020] Created device /job:lo

In [2]:
# Optionally have MLflow record system metrics alongside the run; the switch
# lives in the MLflow section of the experiment config.
system_metrics_requested = mlflow_config.ENABLE_SYSTEM_METRICS_LOGGING
if system_metrics_requested:
    mlflow.enable_system_metrics_logging()

In [3]:
# Point MLflow at the configured tracking server and select the experiment
# that all subsequent runs are recorded under.
tracking_uri = mlflow_config.MLFLOW_TRACKING_URI
experiment_name = mlflow_config.MLFLOW_EXPERIMENT_NAME

mlflow.set_tracking_uri(tracking_uri)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1758296474574, experiment_id='2', last_update_time=1758296474574, lifecycle_stage='active', name='Smart_Recycling_AI', tags={'dataset': 'garbage-dataset-v1',
 'framework': 'tensorflow-keras',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.note.content': ' This experiment focuses on Smart Recycling using '
                        'computer vision. \n'
                        '    The goal is to classify waste items into 8 '
                        'categories: battery, biological, clothes, glass, '
                        'metal, paper, plastic, and trash. \n'
                        '    Different model architectures and augmentation '
                        'strategies are tested to evaluate their performance '
                        'and identify the best approach \n'
                        '    for robust and scalable recycling classification.',
 'num_classes': '8',
 'project_name': 'smart-recycli

In [None]:
# Turn on TensorFlow autologging using the options from the MLflow config.
mlflow.tensorflow.autolog(**mlflow_config.MLFLOW_TENSORFLOW_AUTOLOG_CONFIG)

# Tags attached to the MLflow run, applied in the order listed:
#   - "mlflow.runName":      custom run name for easier identification in the UI
#   - "dataset":             which dataset this run was trained on
#   - "mlflow.note.content": free-text description of the run
run_tags = {
    "mlflow.runName": mlflow_config.MLFLOW_RUN_NAME,
    "dataset": dataset_config.DATASET,
    "mlflow.note.content": mlflow_config.MLFLOW_RUN_DESCRIPTION,
}
for tag_key, tag_value in run_tags.items():
    mlflow.set_tag(tag_key, tag_value)

In [5]:
from smart_recycling.utils import get_tensorflow_dataset

# Load the train / val / test splits as TensorFlow datasets.

# Keyword arguments that are identical for all three splits.
shared_loader_kwargs = dict(
    image_size=dataset_config.IMAGE_SIZE,
    label_mode=dataset_config.LABEL_MODE,
    seed=common_config.SEED,
)

# Training split: the only one that is shuffled.
train_dataset = get_tensorflow_dataset(
    image_folder=f"{dataset_config.DATASET_FOLDER}/train",
    batch_size=dataset_config.TRAIN_BATCH_SIZE,
    shuffle=True,
    **shared_loader_kwargs,
)

# Validation split: kept in a fixed order.
val_dataset = get_tensorflow_dataset(
    image_folder=f"{dataset_config.DATASET_FOLDER}/val",
    batch_size=dataset_config.VALIDATION_BATCH_SIZE,
    shuffle=False,
    **shared_loader_kwargs,
)

# Test split: must stay unshuffled because the evaluation cell later pairs
# predictions with labels and file paths by position.
test_dataset = get_tensorflow_dataset(
    image_folder=f"{dataset_config.DATASET_FOLDER}/test",
    batch_size=dataset_config.TEST_BATCH_SIZE,
    shuffle=False,
    **shared_loader_kwargs,
)

Found 14819 files belonging to 8 classes.
Found 1973 files belonging to 8 classes.
Found 2970 files belonging to 8 classes.


In [6]:
import tensorflow as tf

# Optionally switch Keras to mixed precision before compiling.
# Reference: https://www.tensorflow.org/guide/mixed_precision
# NOTE(review): the global policy is set here, after ModelConfig() was already
# instantiated in an earlier cell. If model_config.MODEL is built eagerly at
# that point, its layers may not pick up "mixed_float16" — confirm.
use_mixed_precision = model_config.ENABLE_MIXED_PRECISION
if use_mixed_precision:
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Take the model from the config and compile it with the configured
# optimizer, loss and metrics.
model = model_config.MODEL
model.compile(
    optimizer=model_config.OPTIMIZER, loss=model_config.LOSS, metrics=model_config.METRICS
)

In [7]:
from smart_recycling.utils import compute_class_weights
import numpy as np

# Default: no class weighting. Replaced below when the training config asks
# for weights to counter class imbalance.
class_weight = None

if training_config.COMPUTE_CLASS_WEIGHTS:
    # One full pass over the training set just to collect the label batches.
    label_batches = [labels for _, labels in train_dataset]
    # argmax over axis 1 turns each label row into a class index
    # (assumes one-hot encoded labels — TODO confirm against LABEL_MODE).
    train_class_ids = np.argmax(np.concatenate(label_batches, axis=0), axis=1)
    class_weight = compute_class_weights(
        train_class_ids,
        class_weight=training_config.CLASS_WEIGHTING_METHOD,
    )

2025-09-19 17:43:55.033299: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
# Actually train the model.
# Epoch count and callbacks come from the training config; class_weight is
# None unless class weighting was computed in the previous cell.
fit_options = dict(
    validation_data=val_dataset,
    epochs=training_config.EPOCHS,
    callbacks=training_config.TRAINING_CALLBACKS,
    class_weight=class_weight,
)

history = model.fit(train_dataset, **fit_options)

2025-09-19 17:44:05.893054: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1/50


2025-09-19 17:44:10.972051: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91300


[1m462/464[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - f1_score: 0.7345 - loss: 0.9533



[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 26ms/step - f1_score: 0.8283 - loss: 0.6015 - val_f1_score: 0.9289 - val_loss: 0.2384 - learning_rate: 0.0010
Epoch 2/50
[1m463/464[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - f1_score: 0.9015 - loss: 0.3266



[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9044 - loss: 0.3182 - val_f1_score: 0.9432 - val_loss: 0.1899 - learning_rate: 0.0010
Epoch 3/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - f1_score: 0.9139 - loss: 0.2832



[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9165 - loss: 0.2768 - val_f1_score: 0.9479 - val_loss: 0.1730 - learning_rate: 0.0010
Epoch 4/50
[1m463/464[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - f1_score: 0.9261 - loss: 0.2358



[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9264 - loss: 0.2399 - val_f1_score: 0.9480 - val_loss: 0.1723 - learning_rate: 0.0010
Epoch 5/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9314 - loss: 0.2222 - val_f1_score: 0.9434 - val_loss: 0.1847 - learning_rate: 0.0010
Epoch 6/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9331 - loss: 0.2103 - val_f1_score: 0.9425 - val_loss: 0.1867 - learning_rate: 0.0010
Epoch 7/50
[1m462/464[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - f1_score: 0.9387 - loss: 0.2056



[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 25ms/step - f1_score: 0.9391 - loss: 0.1965 - val_f1_score: 0.9509 - val_loss: 0.1694 - learning_rate: 0.0010
Epoch 8/50
[1m462/464[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - f1_score: 0.9400 - loss: 0.1839



[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9412 - loss: 0.1853 - val_f1_score: 0.9544 - val_loss: 0.1567 - learning_rate: 0.0010
Epoch 9/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - f1_score: 0.9431 - loss: 0.1762 - val_f1_score: 0.9495 - val_loss: 0.1655 - learning_rate: 0.0010
Epoch 10/50
[1m462/464[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - f1_score: 0.9434 - loss: 0.1788



[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9452 - loss: 0.1734 - val_f1_score: 0.9552 - val_loss: 0.1546 - learning_rate: 0.0010
Epoch 11/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - f1_score: 0.9468 - loss: 0.1670 - val_f1_score: 0.9515 - val_loss: 0.1726 - learning_rate: 0.0010
Epoch 12/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9496 - loss: 0.1601 - val_f1_score: 0.9482 - val_loss: 0.1698 - learning_rate: 0.0010
Epoch 13/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - f1_score: 0.9498 - loss: 0.1561 - val_f1_score: 0.9565 - val_loss: 0.1569 - learning_rate: 0.0010
Epoch 14/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - f1_score: 0.9508 - loss: 0.1523 - val_f1_score: 0.9541 - val_loss: 0.1610 - learning_rate: 0.0010
Epoch 15/50
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

2025-09-19 17:47:55.460253: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 794ms/step




In [9]:
from smart_recycling.utils import save_model_history


# Optionally hand the training history to the project helper, which stores the
# history plot and the history as JSON in MLflow.
history_export_requested = evaluation_config.SAVE_MODEL_HISTORY
if history_export_requested:
    save_model_history(history)

In [10]:
from smart_recycling.utils import (
    save_prediction_time,
    save_confusion_matrix,
    save_prediction_csv,
)

# Optionally evaluate the model on the test set and log the artifacts selected
# in the evaluation config to MLflow.
if evaluation_config.INCLUDE_EVALUATION_ON_TEST_SET:
    test_results = model.evaluate(test_dataset, return_dict=True)

    # Log the test results to MLflow with a "test_" prefix.
    for name, value in test_results.items():
        mlflow.log_metric(f"test_{name}", value)

    # Reset on every execution so that a value left in the kernel by an earlier
    # run of this cell can never be picked up silently below.
    y_probs = None

    # Optionally save the prediction time to MLflow (in milliseconds, per the
    # original note). The helper's return value is reused below as the
    # predicted class probabilities.
    if evaluation_config.SAVE_PREDICTION_TIME:
        y_probs = save_prediction_time(model, test_dataset)

    # The confusion matrix and the prediction CSVs both need predicted labels,
    # true labels, file paths and class names.
    needs_predictions = (
        evaluation_config.SAVE_CONFUSION_MATRIX or evaluation_config.SAVE_PREDICTION_CSV
    )

    if needs_predictions:
        # The timing step is optional, so predictions may not exist yet.
        # Previously this path raised NameError (or reused stale predictions)
        # whenever SAVE_PREDICTION_TIME was disabled.
        if y_probs is None:
            y_probs = model.predict(test_dataset)

        y_pred = np.argmax(y_probs, axis=1)

        # Collect the labels batch by batch; this relies on the test dataset
        # being unshuffled so that the order matches y_probs.
        # argmax over axis 1 assumes one-hot encoded labels — TODO confirm
        # against dataset_config.LABEL_MODE.
        y_true = np.concatenate([y for x, y in test_dataset], axis=0)
        y_true = np.argmax(y_true, axis=1)

        file_paths = test_dataset.file_paths
        class_names = test_dataset.class_names

    # Optionally save the confusion matrix to MLflow as a plot.
    if evaluation_config.SAVE_CONFUSION_MATRIX:
        save_confusion_matrix(
            y_true,
            y_pred,
            class_names,
        )

    # Optionally save two CSV files: one with all predictions and one with the
    # misclassified samples.
    if evaluation_config.SAVE_PREDICTION_CSV:
        save_prediction_csv(file_paths, y_true, y_pred, y_probs, class_names)

[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - f1_score: 0.9395 - loss: 0.1948
