# Evaluation Script to Compare Semantic Segmentation Models

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports changed
# modules before each cell runs, so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from pprint import pprint

import dagshub
import pandas as pd
from mlflow.client import MlflowClient

from model.mlflow import download_all_runs

In [None]:
# DagsHub repository (owner / name) passed to dagshub.init below.
USER_NAME = 'matejfric'
REPO_NAME = 'driver-seg'

# Initialise DagsHub with MLflow enabled before creating the MLflow client.
# NOTE(review): presumably this points MLflow at the repository's tracking
# server; confirm against the dagshub documentation.
dagshub.init(REPO_NAME, USER_NAME, mlflow=True)  # type: ignore
client = MlflowClient()

# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Directory that receives the exported LaTeX tables.
OUTPUT_DIR = Path('outputs')
OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# Query the experiments visible to the client and print their names.
experiments = client.search_experiments()
experiment_names = [exp.name for exp in experiments]
pprint(experiment_names)

In [None]:
# Collect the runs of the listed experiments into `runs_df`.
runs_df = download_all_runs(experiments=experiments, client=client)

# Report the number of downloaded runs and preview the first rows.
n_runs = len(runs_df)
print(f'Total runs: {n_runs}')
runs_df.head()

In [None]:
# Keep only runs tagged with the chosen dataset and logged with image size 224.
# The image size is compared as the string '224', not as a number.
has_dataset_tag = runs_df['tag.Dataset'] == '2025-04-23-driver-segmentation-dataset'
has_image_size = runs_df['image_size'] == '224'

# Copy so later edits to the selection do not touch `runs_df`.
df_sel = runs_df.loc[has_dataset_tag & has_image_size].copy()
print(f'Selected runs: {len(df_sel)}')
df_sel.head()

## Test Set and Validation Set

In [None]:
# Column groups used to assemble the result tables.

# Columns naming the two parts of each model.
MODEL_COLUMNS = ['encoder', 'decoder']

# One '<part>_params' column per model part.
PARAMS_COLUMNS = [f'{part}_params' for part in ('encoder', 'decoder')]

# Metric columns: test-set scores first, then validation-set scores; within
# each split the Jaccard index precedes the F1 score.
METRICS_COLUMS = [
    f'metric.{split}_{score}'
    for split in ('test', 'valid')
    for score in ('jaccard_index', 'f1_score')
]

In [None]:
# Build the test/validation results table from the selected runs.

# One row per distinct combination of model, parameter counts and metrics.
df_table = (
    df_sel[MODEL_COLUMNS + PARAMS_COLUMNS + METRICS_COLUMS].copy().drop_duplicates()
)

# Express the parameter counts in millions, rounded to one decimal place.
df_table[PARAMS_COLUMNS] = df_table[PARAMS_COLUMNS].astype(float).div(1e6).round(1)

# Replace the raw identifiers with display names.
df_table['decoder'] = df_table['decoder'].replace(
    {'unet': 'U-Net', 'unetplusplus': 'UNet++'}
)
for raw_prefix, display_prefix in (
    ('efficientnet-b', 'EfficientNet-B'),
    ('resnet', 'ResNet'),
    ('mit_b', 'MixViT-B'),
):
    # regex=False: these are literal substrings, so the result must not depend
    # on the pandas version's default for the `regex` argument.
    df_table['encoder'] = df_table['encoder'].str.replace(
        raw_prefix, display_prefix, regex=False
    )

# Append the parameter count to each name, e.g. '<name> (<count>M)'.
df_table[PARAMS_COLUMNS] = df_table[PARAMS_COLUMNS].astype(str)
df_table['encoder'] += ' (' + df_table['encoder_params'] + 'M)'
df_table['decoder'] += ' (' + df_table['decoder_params'] + 'M)'

df_export = df_table.drop(columns=PARAMS_COLUMNS).sort_values(
    by=['encoder', 'decoder'], ascending=True
)

# Top-level header names; `test_set` is reused by the sorting cell below.
arch = 'Architecture'
test_set = 'Test set'
val_set = 'Validation set'

# Map every source column to its two-level header by NAME rather than by
# position. METRICS_COLUMS is rebound further down in this notebook; if this
# cell is re-run afterwards, a positional assignment would silently label the
# wrong metrics, whereas this lookup fails loudly with a KeyError.
column_headers = {
    'encoder': (arch, 'Encoder (\\#params)'),
    'decoder': (arch, 'Decoder (\\#params)'),
    'metric.test_jaccard_index': (test_set, 'IoU'),
    'metric.test_f1_score': (test_set, 'F1 score'),
    'metric.valid_jaccard_index': (val_set, 'IoU'),
    'metric.valid_f1_score': (val_set, 'F1 score'),
}
df_export.columns = pd.MultiIndex.from_tuples(
    [column_headers[column] for column in df_export.columns]
)

In [None]:
# Rank the table by test-set IoU, then test-set F1 score, highest first,
# and display the result.
ranking_keys = [(test_set, 'IoU'), (test_set, 'F1 score')]
df_export = df_export.sort_values(by=ranking_keys, ascending=False)  # type: ignore
df_export

In [None]:
# Write the test/validation table to a LaTeX file inside OUTPUT_DIR.
# The caption is passed as a (full caption, short caption) pair.
full_caption = 'Performance comparison of semantic segmentation models on test and validation sets for different encoder-decoder combinations. Sorted by test set performance.'
short_caption = 'Performance comparison of semantic segmentation models'

df_export.to_latex(
    buf=OUTPUT_DIR / 'table-sem-seg-results.tex',
    index=False,
    float_format='%.4f',
    multicolumn_format='c',
    # One right-aligned column specifier per table column.
    column_format='r' * df_export.shape[1],
    caption=(full_caption, short_caption),
    label='tab:sem-seg-results',
    position='t',
)

## Test Set With Precision and Recall

In [None]:
# Build the test-set table that also reports precision and recall.
# NOTE: this rebinds METRICS_COLUMS, df_table, df_export, arch and test_set,
# replacing the values assigned by the earlier cells.
METRICS_COLUMS = [
    'metric.test_jaccard_index',
    'metric.test_f1_score',
    'metric.test_precision',
    'metric.test_recall',
]
selected_columns = MODEL_COLUMNS + PARAMS_COLUMNS + METRICS_COLUMS
df_table = df_sel[selected_columns].copy().drop_duplicates()

# Parameter counts in millions, rounded to one decimal place.
params_in_millions = df_table[PARAMS_COLUMNS].astype(float) / 1e6
df_table[PARAMS_COLUMNS] = params_in_millions.round(1)

# Replace the raw identifiers with display names.
decoder_names = {'unet': 'U-Net', 'unetplusplus': 'UNet++'}
df_table['decoder'] = df_table['decoder'].replace(decoder_names)

encoder_prefixes = (
    ('efficientnet-b', 'EfficientNet-B'),
    ('resnet', 'ResNet'),
    ('mit_b', 'MixViT-B'),
)
for raw_prefix, display_prefix in encoder_prefixes:
    df_table['encoder'] = df_table['encoder'].str.replace(raw_prefix, display_prefix)

# Append the parameter count to each name, e.g. '<name> (<count>M)'.
df_table[PARAMS_COLUMNS] = df_table[PARAMS_COLUMNS].astype(str)
df_table['encoder'] = df_table['encoder'] + ' (' + df_table['encoder_params'] + 'M)'
df_table['decoder'] = df_table['decoder'] + ' (' + df_table['decoder_params'] + 'M)'

df_export = df_table.drop(columns=PARAMS_COLUMNS)
df_export = df_export.sort_values(by=['encoder', 'decoder'], ascending=True)

# Two-level headers, listed in the same order as the remaining columns
# (MODEL_COLUMNS followed by METRICS_COLUMS).
arch = 'Architecture'
test_set = 'Test set'
header_tuples = [
    (arch, 'Encoder (\\#params)'),
    (arch, 'Decoder (\\#params)'),
    (test_set, 'IoU'),
    (test_set, 'F1 score'),
    (test_set, 'Precision'),
    (test_set, 'Recall'),
]
df_export.columns = pd.MultiIndex.from_tuples(header_tuples)

# Rank by test-set IoU, then test-set F1 score, highest first, and display.
ranking_keys = [(test_set, 'IoU'), (test_set, 'F1 score')]
df_export = df_export.sort_values(by=ranking_keys, ascending=False)  # type: ignore
df_export

In [None]:
# Write the precision/recall table to a LaTeX file inside OUTPUT_DIR.
# The caption is passed as a (full caption, short caption) pair.
df_export.to_latex(
    OUTPUT_DIR / 'table-sem-seg-results-pr.tex',
    index=False,
    float_format='%.4f',
    multicolumn_format='c',
    # One right-aligned column specifier per table column.
    column_format='r' * len(df_export.columns),
    caption=(
        'Performance comparison of semantic segmentation models on the test set, ranked by IoU, for different encoder-decoder combinations.',
        'Performance comparison of semantic segmentation models',
    ),
    # Must differ from the label written to 'table-sem-seg-results.tex':
    # a shared label is reported by LaTeX as multiply defined when both tables
    # are included in one document, and \ref would resolve ambiguously.
    label='tab:sem-seg-results-pr',
    position='t',
)