In [1]:
import os

# Needed for a bit more reproducibility of results when using TensorFlow.
# The variable is set in the very first cell so that it is already in place
# when the following cells load TensorFlow.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

In [2]:
import mlflow
from experiments_config import (
    CommonConfig,
    MlflowConfig,
    DatasetConfig,
    ModelConfig,
    ModelTrainingConfig,
    ModelEvaluationConfig,
)

# Instantiate every configuration group once; the remaining cells read all of
# their settings (MLflow, dataset, model, training, evaluation) from these
# objects.
# NOTE(review): the recorded output of this cell shows TensorFlow initialising
# and the EfficientNetV2-B0 weights being downloaded, so the import and/or one
# of these constructors appears to build the model already at this point —
# confirm against experiments_config.py.
common_config = CommonConfig()
mlflow_config = MlflowConfig()
dataset_config = DatasetConfig()
model_config = ModelConfig()
training_config = ModelTrainingConfig()
evaluation_config = ModelEvaluationConfig()

2025-09-21 17:52:13.399934: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1758469935.066820  178369 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9171 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070, pci bus id: 0000:01:00.0, compute capability: 8.9


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-b0_notop.h5
[1m24274472/24274472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [3]:
# Optionally switch on MLflow's system metrics logging; the behaviour is
# controlled by a flag in the MLflow config so it can be turned off per
# experiment.
if mlflow_config.ENABLE_SYSTEM_METRICS_LOGGING:
    mlflow.enable_system_metrics_logging()

In [4]:
# Set the tracking URI and experiment for subsequent runs.
# Both values come from the MLflow config. `set_experiment` is the last
# expression of the cell, so the notebook displays the Experiment it returns.

mlflow.set_tracking_uri(mlflow_config.MLFLOW_TRACKING_URI)
mlflow.set_experiment(mlflow_config.MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1758296474574, experiment_id='2', last_update_time=1758296474574, lifecycle_stage='active', name='Smart_Recycling_AI', tags={'dataset': 'garbage-dataset-v1',
 'framework': 'tensorflow-keras',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.note.content': ' This experiment focuses on Smart Recycling using '
                        'computer vision. \n'
                        '    The goal is to classify waste items into 8 '
                        'categories: battery, biological, clothes, glass, '
                        'metal, paper, plastic, and trash. \n'
                        '    Different model architectures and augmentation '
                        'strategies are tested to evaluate their performance '
                        'and identify the best approach \n'
                        '    for robust and scalable recycling classification.',
 'num_classes': '8',
 'project_name': 'smart-recycli

In [5]:
# Tags attached to the MLflow run, applied in this order:
#   - "mlflow.runName":      custom run name for easier identification in the UI
#   - "dataset":             the dataset the run was trained on
#   - "mlflow.note.content": free-text description of the run
# NOTE(review): no run is started explicitly in the visible cells; these calls
# appear to rely on the MLflow fluent API creating the active run implicitly —
# confirm this is intended.
run_tags = (
    ("mlflow.runName", mlflow_config.MLFLOW_RUN_NAME),
    ("dataset", dataset_config.DATASET),
    ("mlflow.note.content", mlflow_config.MLFLOW_RUN_DESCRIPTION),
)

for tag_key, tag_value in run_tags:
    mlflow.set_tag(tag_key, tag_value)

In [6]:
import git

# Optionally record the current git commit on the run for better traceability.
if mlflow_config.MLFLOW_LOG_GIT_SHA:
    # search_parent_directories=True lets the lookup start from a sub-folder of
    # the repository instead of requiring the repository root.
    sha = git.Repo(search_parent_directories=True).head.object.hexsha
    mlflow.set_tag("git_commit", sha)

In [7]:
# Log the experiments_config.py alongside the run for future auditability.
# `artifact_path` is the destination *directory* inside the run's artifact
# store, not the name of the uploaded file: the file keeps its own name inside
# that directory. A plain directory name is used so the artifact tree does not
# contain a folder that looks like a Python file.
mlflow.log_artifact(
    common_config.PATH_TO_CONFIG_FILE,
    artifact_path="config",
)

In [8]:
from smart_recycling.utils import get_tensorflow_dataset

# Load the training and validation splits as TensorFlow datasets.

# Settings that are identical for both splits.
shared_dataset_kwargs = dict(
    image_size=dataset_config.IMAGE_SIZE,
    label_mode=dataset_config.LABEL_MODE,
    seed=common_config.SEED,
)

# Training split: shuffled.
train_dataset = get_tensorflow_dataset(
    image_folder=f"{dataset_config.DATASET_FOLDER}/train",
    batch_size=dataset_config.TRAIN_BATCH_SIZE,
    shuffle=True,
    **shared_dataset_kwargs,
)

# Validation split: not shuffled.
val_dataset = get_tensorflow_dataset(
    image_folder=f"{dataset_config.DATASET_FOLDER}/val",
    batch_size=dataset_config.VALIDATION_BATCH_SIZE,
    shuffle=False,
    **shared_dataset_kwargs,
)

Found 14819 files belonging to 8 classes.
Found 1973 files belonging to 8 classes.


In [9]:
import tensorflow as tf

# Set up mixed precision if requested by the model config.
# Reference: https://www.tensorflow.org/guide/mixed_precision
# NOTE(review): the global policy is only the default for layers created after
# it has been set. The recorded output of the cell that creates the config
# objects shows the EfficientNetV2-B0 weights being downloaded there, which
# suggests `model_config.MODEL` is already built before this point — in that
# case this switch would not affect the model. Confirm, and if so set the
# policy before the model is constructed.
if model_config.ENABLE_MIXED_PRECISION:
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The model itself, as well as optimizer, loss and metrics, all come from the
# model config.
model = model_config.MODEL

model.compile(
    optimizer=model_config.OPTIMIZER,
    loss=model_config.LOSS,
    metrics=model_config.METRICS,
)

In [10]:
from smart_recycling.utils import compute_class_weights, get_true_labels

# Class weights are optional and meant to counter class imbalance. When the
# feature is disabled, `class_weight` stays None and is handed to `fit` as such.
if training_config.COMPUTE_CLASS_WEIGHTS:
    # Weights are derived from the labels of the training split only.
    y_true = get_true_labels(train_dataset)
    class_weight = compute_class_weights(
        y_true,
        class_weight=training_config.CLASS_WEIGHTING_METHOD,
    )
else:
    class_weight = None

2025-09-21 17:52:23.691972: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
# Train the model.
# Epoch count and callbacks come from the training config; `class_weight` is
# either None or the weights computed in the previous cell.
fit_options = {
    "validation_data": val_dataset,
    "epochs": training_config.EPOCHS,
    "callbacks": training_config.TRAINING_CALLBACKS,
    "class_weight": class_weight,
}

# Keep the returned history object; it is needed when saving the model history.
history = model.fit(train_dataset, **fit_options)

Epoch 1/50


2025-09-21 17:52:30.900450: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91300


[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 71ms/step - f1_score: 0.8242 - loss: 0.7100 - val_f1_score: 0.9354 - val_loss: 0.2524 - learning_rate: 0.0010
Epoch 2/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 60ms/step - f1_score: 0.8932 - loss: 0.3786 - val_f1_score: 0.9424 - val_loss: 0.2029 - learning_rate: 0.0010
Epoch 3/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 61ms/step - f1_score: 0.9072 - loss: 0.3152 - val_f1_score: 0.9498 - val_loss: 0.1776 - learning_rate: 0.0010
Epoch 4/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 61ms/step - f1_score: 0.9163 - loss: 0.2895 - val_f1_score: 0.9521 - val_loss: 0.1670 - learning_rate: 0.0010
Epoch 5/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 60ms/step - f1_score: 0.9192 - loss: 0.2766 - val_f1_score: 0.9487 - val_loss: 0.1637 - learning_rate: 0.0010
Epoch 6/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

In [12]:
# Optionally log the trained model to MLflow.

if mlflow_config.MLFLOW_LOG_MODEL:
    # Take the first image of one training batch as the input example that is
    # used to infer the signature when logging the model.
    first_batch = train_dataset.take(1)
    for images, _labels in first_batch:
        input_example = images[:1].numpy()
        break

    mlflow.tensorflow.log_model(
        model,
        input_example=input_example,
        **mlflow_config.MLFLOW_LOG_MODEL_CONFIG,
    )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


In [13]:
from smart_recycling.utils import (
    save_prediction_time,
    save_confusion_matrix,
    save_prediction_csv,
    save_model_history,
)
import numpy as np

# Optionally evaluate the model on the test set and log the results to MLflow.
# Every artefact below (history, prediction time, confusion matrix, prediction
# CSVs) is controlled by its own flag in the evaluation config.
if evaluation_config.INCLUDE_EVALUATION_ON_TEST_SET:
    # Load the test dataset.
    test_dataset = get_tensorflow_dataset(
        image_folder=f"{dataset_config.DATASET_FOLDER}/test",
        image_size=dataset_config.IMAGE_SIZE,
        batch_size=dataset_config.TEST_BATCH_SIZE,
        label_mode=dataset_config.LABEL_MODE,
        # Shuffle must stay off: predictions, true labels and file paths are
        # collected in separate passes and have to line up element by element.
        shuffle=False,
        seed=common_config.SEED,
    )

    test_results = model.evaluate(test_dataset, return_dict=True)
    # Log the test results to MLflow with a "test_" prefix.
    for name, value in test_results.items():
        mlflow.log_metric(f"test_{name}", value)

    if evaluation_config.SAVE_MODEL_HISTORY:
        save_model_history(history)

    # Reset on every execution so predictions left over from an earlier run of
    # this cell (possibly from a different model) can never be reused silently.
    y_probs = None

    # Optionally save the prediction time to MLflow (in milliseconds). The
    # helper's return value is reused as the predicted class probabilities.
    if evaluation_config.SAVE_PREDICTION_TIME:
        y_probs = save_prediction_time(model, test_dataset)

    # The confusion matrix and the prediction CSVs need the predicted and true
    # labels, plus the file paths and class names of the test dataset.
    needs_predictions = (
        evaluation_config.SAVE_CONFUSION_MATRIX
        or evaluation_config.SAVE_PREDICTION_CSV
    )
    if needs_predictions:
        # Previously `y_probs` only existed when SAVE_PREDICTION_TIME was
        # enabled, so turning that flag off while keeping one of the flags
        # above on raised a NameError. Predict directly in that case.
        if y_probs is None:
            y_probs = model.predict(test_dataset)

        y_pred = np.argmax(y_probs, axis=1)

        y_true = get_true_labels(test_dataset)

        file_paths = test_dataset.file_paths
        class_names = test_dataset.class_names

    # Optionally save the confusion matrix to MLflow as a plot.
    if evaluation_config.SAVE_CONFUSION_MATRIX:
        save_confusion_matrix(
            y_true,
            y_pred,
            class_names,
        )

    # Optionally save two CSV files: one with all predictions and one with the
    # misclassified samples.
    if evaluation_config.SAVE_PREDICTION_CSV:
        save_prediction_csv(file_paths, y_true, y_pred, y_probs, class_names)

Found 2970 files belonging to 8 classes.
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - f1_score: 0.9503 - loss: 0.1656


2025-09-21 17:59:58.133206: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
