Models that are evaluated here:
- `models/post-vgg-aug-model.keras`
- `models/post-vgg-model.keras`
- `models/vgg-ft-model.keras`

In [103]:
import keras
import numpy as np
from tabulate import tabulate
from torchvision.ops import Permute
from torch.utils.data import DataLoader

from data import ICImagesDataset
from config import CFG

### Load model(s)

In [100]:
post_vgg = keras.models.load_model("models/post-vgg-model.keras")

post_vgg_aug = keras.models.load_model("models/post-vgg-aug-model.keras")
post_vgg_aug_2 = keras.models.load_model("models/experiments/post-vgg-aug-low-lr-xavier-init-model.keras")
post_vgg_aug_3 = keras.models.load_model("models/experiments/post-vgg-aug-rescaled-1em4.keras")

vgg_ft_aug = keras.models.load_model("models/vgg-ft-model.keras")
vgg_ft_aug_2 = keras.models.load_model("models/experiments/vgg_ft-6_layers-100_epochs-nodecay-1e-05_lr.keras")

### Load test data and set up metric utilities

In [49]:
move_channels_inward_fn = Permute([1, 2, 0])

# load test dataset
test_dataset = ICImagesDataset(
    annotations=CFG.annotations_fname.format("test"),
    directory=CFG.images_dirname.format("test"),
    transform=move_channels_inward_fn
)
test_dataloader = DataLoader(test_dataset, batch_size=CFG.val_batch_size, shuffle=True)

In [88]:
def top_k_accuracy(model, dataloader, k=3):
    met = keras.metrics.TopKCategoricalAccuracy(k=k)
    if isinstance(dataloader, DataLoader):
        for images, gt in dataloader:
            preds = model.predict(images, verbose=0)
            met.update_state(gt, preds)
    elif isinstance(dataloader, tuple):
        # when input has been vectorized in one go, pass as tuple of (np.ndarray, np.ndarray)
        features, labels = dataloader[0], dataloader[1]
        for vec, gt in zip(features, labels):
            preds = model.predict(np.expand_dims(vec, axis=0), verbose=0)            
            met.update_state(np.expand_dims(gt, axis=0), preds)
    return met.result().numpy()

def accuracy(model, dataloader):
    met = keras.metrics.CategoricalAccuracy()
    if isinstance(dataloader, DataLoader):
        for images, gt in dataloader:
            preds = model.predict(images, verbose=0)
            met.update_state(gt, preds)
    elif isinstance(dataloader, tuple):
        # when input has been vectorized in one go, pass as tuple of (np.ndarray, np.ndarray)
        features, labels = dataloader[0], dataloader[1]
        for vec, gt in zip(features, labels):
            preds = model.predict(np.expand_dims(vec, axis=0), verbose=0)            
            met.update_state(np.expand_dims(gt, axis=0), preds)
    return met.result().numpy()

def f1_score(model, dataloader, avg=None):
    met = keras.metrics.F1Score(average=avg)
    if isinstance(dataloader, DataLoader):
        for images, gt in dataloader:
            preds = model.predict(images, verbose=0)
            met.update_state(gt, preds)
    elif isinstance(dataloader, tuple):
        # when input has been vectorized in one go, pass as tuple of (np.ndarray, np.ndarray)
        features, labels = dataloader[0], dataloader[1]
        for vec, gt in zip(features, labels):
            preds = model.predict(np.expand_dims(vec, axis=0), verbose=0)            
            met.update_state(np.expand_dims(gt, axis=0), preds)
    return met.result().numpy()

## Evaluate each approach (Post-VGG, Post-VGG-Aug, VGG-FT-Aug)

#### Post-VGG

In [90]:
# will have to vectorize test data with VGG base to use post-VGG model
vgg_base = keras.applications.vgg16.VGG16(
    weights="imagenet",
    include_top=False,
    input_shape=(CFG.input_shape, CFG.input_shape, 3)
)

def vgg_extract_features(dataloader):
    all_features = []; all_labels = []
    for images, labels in dataloader:
        preprocessed = keras.applications.vgg16.preprocess_input(images)
        features = vgg_base.predict(preprocessed, verbose=0)        
        all_features.append(features)
        all_labels.append(labels)

    return np.concatenate(all_features), np.concatenate(all_labels)

vgg_extracted_test_features, vgg_test_labels = vgg_extract_features(test_dataloader)

In [93]:
print('MODEL: Post-VGG')

print("Test accuracy:", accuracy(post_vgg, (vgg_extracted_test_features, vgg_test_labels)))
print("Top-3 test accuracy:", top_k_accuracy(post_vgg, (vgg_extracted_test_features, vgg_test_labels)))
print("F1 score (macro averaged):", f1_score(post_vgg, (vgg_extracted_test_features, vgg_test_labels), avg="macro"))

MODEL: Post-VGG
Test accuracy: 0.48
Top-3 test accuracy: 0.70431376
F1 score (macro averaged): 0.3885477


#### Post-VGG-Aug
Reporting multiple experiments below with hyperparameter modifications (`post_vgg_aug`, `post_vgg_aug_2`, `post_vgg_aug_3`). Final table is built with best performing approach.

In [94]:
print('MODEL: Post-VGG-Aug')

print("Test accuracy:", accuracy(post_vgg_aug, test_dataloader))
print("Top-3 test accuracy:", top_k_accuracy(post_vgg_aug, test_dataloader))
print("F1 score (macro averaged):", f1_score(post_vgg_aug, test_dataloader, avg="macro"))

MODEL: Post-VGG-Aug
Test accuracy: 0.5058824
Top-3 test accuracy: 0.7207843
F1 score (macro averaged): 0.40812707


In [96]:
print('MODEL: Post-VGG-Aug v2 (lower learning rate of 1e-4 + Xavier weights initialization)')

print("Test accuracy:", accuracy(post_vgg_aug_2, test_dataloader))
print("Top-3 test accuracy:", top_k_accuracy(post_vgg_aug_2, test_dataloader))
print("F1 score (macro averaged):", f1_score(post_vgg_aug_3, test_dataloader, avg="macro"))

MODEL: Post-VGG-Aug v2 (lower learning rate of 1e-4 + Xavier weights initialization)
Test accuracy: 0.49411765
Top-3 test accuracy: 0.7262745
F1 score (macro averaged): 0.44843793


In [97]:
print('MODEL: Post-VGG-Aug v3 (input rescaling + lower learning rate of 1e-4 + Xavier weights initialization)')

print("Test accuracy:", accuracy(post_vgg_aug_3, test_dataloader))
print("Top-3 test accuracy:", top_k_accuracy(post_vgg_aug_3, test_dataloader))
print("F1 score (macro averaged):", f1_score(post_vgg_aug_3, test_dataloader, avg="macro"))

MODEL: Post-VGG-Aug v3 (input rescaling + lower learning rate of 1e-4 + Xavier weights initialization)
Test accuracy: 0.5427451
Top-3 test accuracy: 0.7513726
F1 score (macro averaged): 0.44843793


#### VGG-FT-Aug
Reporting 2 best experiments below with several hyperparameter modifications. Training logs for all experiments can be seen at https://wandb.ai/muhammadali/ic-chip-net.

In [99]:
print('MODEL: VGG-FT-Aug')

print("Test accuracy:", accuracy(vgg_ft_aug, test_dataloader))
print("Top-3 test accuracy:", top_k_accuracy(vgg_ft_aug, test_dataloader))
print("F1 score (macro averaged):", f1_score(vgg_ft_aug, test_dataloader, avg="macro"))

MODEL: VGG-FT-Aug
Test accuracy: 0.74666667
Top-3 test accuracy: 0.89098036
F1 score (macro averaged): 0.668292


In [101]:
print('MODEL: VGG-FT-Aug v2')

print("Test accuracy:", accuracy(vgg_ft_aug_2, test_dataloader))
print("Top-3 test accuracy:", top_k_accuracy(vgg_ft_aug_2, test_dataloader))
print("F1 score (macro averaged):", f1_score(vgg_ft_aug_2, test_dataloader, avg="macro"))

MODEL: VGG-FT-Aug v2
Test accuracy: 0.7584314
Top-3 test accuracy: 0.88941175
F1 score (macro averaged): 0.6835555


### Build final comparison table

In [121]:
# build table
table = [
    # columns
    ["Model", "Accuracy", "Top-3 Accuracy", "Macro F1"],
    [
        "Post-VGG",
        accuracy(post_vgg, (vgg_extracted_test_features, vgg_test_labels)),
        top_k_accuracy(post_vgg, (vgg_extracted_test_features, vgg_test_labels)),
        f1_score(post_vgg, (vgg_extracted_test_features, vgg_test_labels), avg="macro")
    ],
    [
        "Post-VGG-Aug",
        accuracy(post_vgg_aug_3, test_dataloader),
        top_k_accuracy(post_vgg_aug_3, test_dataloader),
        f1_score(post_vgg_aug_3, test_dataloader, avg="macro")
    ],
    [
        "VGG-FT-Aug",
        accuracy(vgg_ft_aug_2, test_dataloader),
        top_k_accuracy(vgg_ft_aug_2, test_dataloader),
        f1_score(vgg_ft_aug_2, test_dataloader, avg="macro")
    ]
]

In [123]:
print(tabulate(table))

------------  -------------------  ------------------  ------------------
Model         Accuracy             Top-3 Accuracy      Macro F1
Post-VGG      0.47999998927116394  0.7043137550354004  0.3885476887226105
Post-VGG-Aug  0.5427451133728027   0.7513725757598877  0.4484379291534424
VGG-FT-Aug    0.7584313750267029   0.8894117474555969  0.6835554838180542
------------  -------------------  ------------------  ------------------


## 🎆 Final Performance Table 🎆 

| Model    | Accuracy | Top-3 Accuracy | Macro F1
| -------- | -------   | ------ | -----|
Post-VGG  | 47.999 | 70.431 | 38.854|
Post-VGG-Aug |  54.274 |  75.137 | 44.843 |
**VGG-FT-Aug**  |  **75.843** |  **88.941**  | **68.355** |