# Comparative Analysis

- Analysis of experiments logged into MLflow.
- The data is downloaded as a CSV file from the MLflow GUI.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import torch

from corrosion import CorrosionModel

# Notebook-wide display and plotting defaults.
pd.options.display.max_columns = 500  # show wide tables without eliding columns
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.size'] = 14

# When True, the plotting cells also write their figures to PDF files.
SAVE_FIGURES = True

In [None]:
# Load the runs table exported from MLflow; the path is relative to the
# working directory of the notebook.
df = pd.read_csv('runs46.csv')
# Preview the first rows.
df.head()

In [None]:
# List all column names of the loaded table.
df.columns

In [None]:
# Column groups used by the cells below.
# NOTE: the names are spelled 'COLUMS' (sic); they are kept as-is because
# other cells reference them by these exact names.

# Run bookkeeping columns; dropped from the table before the analysis.
META_COLUMS = [
    'Start Time',
    'Duration',
    'Run ID',
    'Source Type',
    'Source Name',
    'User',
    'Status',
    'Dataset',
    'Commit ID',
    'Branch',
]

# Hyperparameter columns that are dropped together with the metadata
# (presumably identical across all runs -- TODO confirm).
CONST_COLUMNS = [
    'early_stopping',
    'image_size',
    'learning_rate',
    'max_epochs',
    'monitor',
    'patience',
]

# Metric columns, in the order they appear in the exported tables.
METRICS_COLUMS = [
    'test_f1s',
    'test_jaccard',
    'val_f1s',
    'val_jaccard',
]

In [None]:
# Remove the metadata and constant-hyperparameter columns from the table.
# NOTE: running this cell a second time raises a KeyError because the
# columns are already gone.
df = df.drop(columns=[*META_COLUMS, *CONST_COLUMNS])
df.head()

In [None]:
# Earlier runs did not log these values, so their NaNs are replaced with the
# defaults listed here.
fill_defaults = {
    'augmentation': True,
    'batch_size': 8,
    'encoder_weights': 'imagenet',
    'frozen_encoder': True,
}

for column, default in fill_defaults.items():
    df[column] = df[column].fillna(default)

In [None]:
# Count the remaining missing values per column, largest count first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Show the three runs with the highest test F1 score.
df.sort_values(by='test_f1s', ascending=False).head(3)

## Model Parameters

- The number of parameters (weights) of each model is not logged, so we compute it here and add it to the table.

In [None]:
def count_parameters(model: torch.nn.Module) -> dict:
    """Count the parameters of *model*, split by whether they require gradients.

    Args:
        model: Module whose ``parameters()`` are inspected.

    Returns:
        A dict with the keys ``'trainable'`` (elements of parameters with
        ``requires_grad=True``), ``'non-trainable'`` (all other parameters)
        and ``'total'`` (the sum of both).
    """
    counts = {'trainable': 0, 'non-trainable': 0}
    for param in model.parameters():
        bucket = 'trainable' if param.requires_grad else 'non-trainable'
        counts[bucket] += param.numel()
    counts['total'] = counts['trainable'] + counts['non-trainable']
    return counts

In [None]:
# Instantiate every decoder/encoder combination once and record its
# parameter counts as (decoder, encoder, trainable, non-trainable, total).
# NOTE(review): which parameters count as non-trainable depends on how
# CorrosionModel sets requires_grad by default -- confirm against its code.
decoders = ['unet', 'unetplusplus']
encoders = ['resnet18', 'resnet34', 'resnet50', 'resnet101']
encoders.extend(f'efficientnet-b{n}' for n in range(8))
params_list = []

for decoder in decoders:
    for encoder in encoders:
        # encoder_weights=None: presumably skips loading pretrained weights,
        # which are not needed for counting -- TODO confirm.
        model = CorrosionModel(
            decoder,
            encoder,
            in_channels=3,
            out_classes=1,
            batch_size_dict={},
            encoder_weights=None,
        )
        d = count_parameters(model)
        row = (decoder, encoder, d['trainable'], d['non-trainable'], d['total'])
        params_list.append(row)

In [None]:
# Tabulate the collected counts: one row per decoder/encoder combination.
param_table_columns = [
    'decoder',
    'encoder',
    'trainable_params',
    'non_trainable_params',
    'total_params',
]
df_params = pd.DataFrame(params_list, columns=param_table_columns)
df_params.head(3)

In [None]:
# Every column whose name contains 'params' holds a parameter count.
PARAMS_COLUMNS = [name for name in df_params.columns if 'params' in name]

# Express the counts in millions, rounded to one decimal place.
# NOTE: not idempotent -- re-running this cell divides by 1e6 again.
counts_in_millions = df_params[PARAMS_COLUMNS].astype(float) / 1e6
df_params[PARAMS_COLUMNS] = counts_in_millions.round(1)
df_params.head(3)

In [None]:
# Attach the parameter counts to every run via its (decoder, encoder) pair.
# validate='many_to_one' makes pandas raise if a pair occurs more than once
# in df_params.
df = pd.merge(
    df,
    df_params,
    how='left',
    on=['decoder', 'encoder'],
    validate='many_to_one',
)
df.head(3)

## EfficientNet

In [None]:
# Select runs whose encoder starts with 'efficientnet', whose decoder is
# 'unet' or 'unetplusplus' and whose run name contains 'imagenet'.
# na=False makes rows with a missing encoder/name count as "no match"
# instead of putting NaN into the boolean mask.
df_sel = df[
    (df['encoder'].str.startswith('efficientnet', na=False))
    & (df['decoder'].isin(['unet', 'unetplusplus']))
    & (df['Name'].str.contains('imagenet', na=False))
].sort_values(by='test_f1s', ascending=False)

# Order the rows by the numeric encoder suffix (B0, B1, ...) for the x-axis,
# then shorten the encoder names to 'B<n>' for the tick labels.
df_sorted = df_sel.sort_values(
    'encoder', key=lambda x: x.str.extract(r'(\d+)')[0].astype(int)
)
df_sorted['encoder'] = df_sorted['encoder'].str.replace(
    'efficientnet-b', 'B', regex=False
)

# Get unique decoders for plotting
decoders = sorted(df_sorted['decoder'].unique())
# Legend text per decoder. Binding the label to each line (instead of passing
# a fixed list to plt.legend) keeps the legend correct even when only one
# decoder is present in the selection.
decoder_labels = {'unet': 'U-Net', 'unetplusplus': 'UNet++'}

plt.figure(figsize=(12, 7))

for decoder in decoders:
    subset = df_sorted[df_sorted['decoder'] == decoder]
    plt.plot(
        subset['encoder'],
        subset['test_f1s'],
        marker='o',
        linestyle='-' if decoder == 'unetplusplus' else '--',
        linewidth=2,
        label=decoder_labels.get(decoder, decoder),
    )

# Annotate the highest score
max_row = df_sorted.loc[df_sorted['test_f1s'].idxmax()]
max_encoder = max_row['encoder']
max_test_f1s = max_row['test_f1s']
plt.annotate(
    f'{max_test_f1s:.3f}',
    xy=(max_encoder, max_test_f1s),  # type: ignore
    # Place the text slightly above the annotated point
    xytext=(max_encoder, max_test_f1s + 0.01),  # type: ignore
    arrowprops=dict(facecolor='black', shrink=0.1),
    ha='center',
)

plt.legend(title='Decoder', frameon=True)
plt.xlabel('EfficientNet Encoder')
plt.ylabel('Test F1 Scores')

if SAVE_FIGURES:
    plt.savefig('efficientnet_unet_unetplusplus.pdf')

plt.show()

In [None]:
# Build the export table from the selected runs (best test F1 first).
table_columns = ['decoder', 'encoder', *METRICS_COLUMS, *PARAMS_COLUMNS]
df_table = df_sel[table_columns].copy()

# Human-readable architecture names.
df_table['decoder'] = df_table['decoder'].replace(
    {'unet': 'U-Net', 'unetplusplus': 'UNet++'}
)
df_table['encoder'] = df_table['encoder'].str.replace(
    'efficientnet-b', 'EfficientNet-B'
)

# Append the parameter counts (in millions) to the names, e.g. '... (5.3M)'.
# NOTE(review): the non-trainable count is shown as the encoder size and the
# trainable count as the decoder size; presumably this only holds for runs
# with a frozen encoder -- confirm.
df_table[PARAMS_COLUMNS] = df_table[PARAMS_COLUMNS].astype(str)
df_table['encoder'] = (
    df_table['encoder'] + ' (' + df_table['non_trainable_params'] + 'M)'
)
df_table['decoder'] = (
    df_table['decoder'] + ' (' + df_table['trainable_params'] + 'M)'
)
df_export = df_table.drop(columns=PARAMS_COLUMNS)

# Two-level header; total parameter count and Dice loss are not exported.
arch, test_set, val_set = 'Architecture', 'Test set', 'Validation set'
df_export.columns = pd.MultiIndex.from_tuples(
    [
        (arch, 'Decoder (\\#params)'),
        (arch, 'Encoder (\\#params)'),
        (test_set, 'F1 score'),
        (test_set, 'IoU'),
        (val_set, 'F1 score'),
        (val_set, 'IoU'),
    ]
)
df_export.head()

In [None]:
# Write the EfficientNet table to a LaTeX file.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_latex.html
latex_options = dict(
    index=False,
    float_format='%.3f',
    multicolumn_format='c',
    caption='Best performing models with EfficientNet encoder and U-Net or UNet++ decoder',
    label='tab:efficientnet_unet_unetplusplus',
)
df_export.to_latex('table-efficientnet.tex', **latex_options)

## ResNet

In [None]:
# Hard-coded ResNet encoder sizes keyed by network depth (as a string).
# NOTE(review): no active code in the visible part of this notebook reads this
# dict; only a commented-out line in the ResNet table cell uses such a
# mapping, and it refers to it as `n_params_resnet`.
default_params_resnet = {
    '18': '11.2M',  # 14.3M Total params with U-net
    '34': '21.3M',  # 24.4M Total params with U-net
    '50': '23.5M',  # 32.5M Total params with U-net
    '101': '42.5M',  # 51.5M Total params with U-net
}

In [None]:
# Select runs whose encoder starts with 'resnet', whose decoder is 'unet' or
# 'unetplusplus' and whose run name contains 'imagenet-with-augmentation';
# drop exact duplicate rows and keep the ten best by test F1.
# na=False makes rows with a missing encoder/name count as "no match"
# instead of putting NaN into the boolean mask.
df_sel = (
    df[
        (df['encoder'].str.startswith('resnet', na=False))
        & (df['decoder'].isin(['unet', 'unetplusplus']))
        & (df['Name'].str.contains('imagenet-with-augmentation', na=False))
    ]
    .drop_duplicates()
    .sort_values(by='test_f1s', ascending=False)
    .head(10)
)

# Order the rows by numeric network depth (18, 34, 50, 101) for the x-axis,
# then rewrite the names as 'ResNet-<depth>' for the tick labels.
df_sorted = df_sel.sort_values(
    'encoder', key=lambda x: x.str.extract(r'(\d+)')[0].astype(int)
)
df_sorted['encoder'] = df_sorted['encoder'].str.replace(
    'resnet', 'ResNet-', regex=False
)

# Get unique decoders for plotting
decoders = sorted(df_sorted['decoder'].unique())
# Legend text per decoder. Binding the label to each line (instead of passing
# a fixed list to plt.legend) keeps the legend correct even when only one
# decoder is present in the selection.
decoder_labels = {'unet': 'U-Net', 'unetplusplus': 'UNet++'}

plt.figure(figsize=(12, 7))

for decoder in decoders:
    subset = df_sorted[df_sorted['decoder'] == decoder]
    plt.plot(
        subset['encoder'],
        subset['test_f1s'],
        marker='o',
        linestyle='-' if decoder == 'unetplusplus' else '--',
        linewidth=2,
        label=decoder_labels.get(decoder, decoder),
    )

# Annotate the highest score
max_row = df_sorted.loc[df_sorted['test_f1s'].idxmax()]
max_encoder = max_row['encoder']
max_test_f1s = max_row['test_f1s']
plt.annotate(
    f'{max_test_f1s:.3f}',
    xy=(max_encoder, max_test_f1s),  # type: ignore
    # Place the text slightly above the annotated point
    xytext=(max_encoder, max_test_f1s + 0.01),  # type: ignore
    arrowprops=dict(facecolor='black', shrink=0.1),
    ha='center',
)

plt.legend(title='Decoder', frameon=True)
plt.xlabel('Encoder')
plt.ylabel('Test F1 Scores')

if SAVE_FIGURES:
    plt.savefig('resnet_unet_unetplusplus.pdf')

plt.show()

In [None]:
# Build the export table from the selected runs (best test F1 first).
table_columns = ['decoder', 'encoder', *METRICS_COLUMS, *PARAMS_COLUMNS]
df_table = df_sel[table_columns].copy()

# Human-readable architecture names.
df_table['decoder'] = df_table['decoder'].replace(
    {'unet': 'U-Net', 'unetplusplus': 'UNet++'}
)
df_table['encoder'] = df_table['encoder'].str.replace('resnet', 'ResNet-')

# Append the parameter counts (in millions) to the names, e.g. '... (11.2M)'.
# NOTE(review): the non-trainable count is shown as the encoder size and the
# trainable count as the decoder size; presumably this only holds for runs
# with a frozen encoder -- confirm.
df_table[PARAMS_COLUMNS] = df_table[PARAMS_COLUMNS].astype(str)
df_table['encoder'] = (
    df_table['encoder'] + ' (' + df_table['non_trainable_params'] + 'M)'
)
df_table['decoder'] = (
    df_table['decoder'] + ' (' + df_table['trainable_params'] + 'M)'
)
df_export = df_table.drop(columns=PARAMS_COLUMNS)

# Two-level header; Dice loss is not exported.
arch, test_set, val_set = 'Architecture', 'Test set', 'Validation set'
df_export.columns = pd.MultiIndex.from_tuples(
    [
        (arch, 'Decoder (\\#params)'),
        (arch, 'Encoder (\\#params)'),
        (test_set, 'F1 score'),
        (test_set, 'IoU'),
        (val_set, 'F1 score'),
        (val_set, 'IoU'),
    ]
)
df_export.head(10)

In [None]:
# Write the ResNet table to a LaTeX file.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_latex.html
latex_options = dict(
    index=False,
    float_format='%.3f',
    multicolumn_format='c',
    caption='Comparison of models with ResNet encoder and U-Net or UNet++ decoder.',
    label='tab:resnet_unet_unetplusplus',
)
df_export.to_latex('table-resnet.tex', **latex_options)