# Models Evaluator
Evaluate all the models on all the test datasets

**Authors**

[Marco Alecci](https://github.com/MarcoAlecci)

[Francesco Marchiori](https://github.com/FrancescoMarchiori)

[Luca Martinelli](https://github.com/luca-martinelli-09)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/luca-martinelli-09/orco-gan/blob/main/modelEvaluator.ipynb)

## General Setup

In [1]:
import os

# When no ./datasets folder exists in the working directory (e.g. on a fresh
# Colab runtime), clone the project repository and move into it.
# NOTE(review): presumably the cloned repository provides ./datasets, which the
# settings below read through a relative path — confirm.
if not os.path.exists("./datasets"):
    !git clone "https://github.com/luca-martinelli-09/learn-the-art.git"

    %cd learn-the-art/

In [2]:
import sys
# True when the google.colab module is already loaded, i.e. we run inside Colab
IN_COLAB = 'google.colab' in sys.modules

# Google Drive folder holding the trained models; stays None outside Colab
googleModelsDir = None

if IN_COLAB:
  # Mount Google Drive so files stored there become readable under /content/drive
  from google.colab import drive
  drive.mount('/content/drive')

  # NOTE(review): account-specific Drive path — adjust when running from another Drive
  googleModelsDir = "/content/drive/MyDrive/Università/Magistrale/II Anno/I Semestre/Advanced Topics in Computer and Network Security/Project/Models"

In [3]:
import torch
import pandas as pd
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np

# Show the library versions in the notebook output
print("PyTorch Version:", torch.__version__)
print("Torchvision Version:", torchvision.__version__)

PyTorch Version: 1.10.1
Torchvision Version: 0.11.2


  warn(f"Failed to load image Python extension: {e}")


In [4]:
# Detect if we have a GPU available
print("CUDA available:", torch.cuda.is_available())
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

CUDA available: True


### Set a manual seed

In [5]:
SEED = 151836

def setSeed(seed):
    """Seed every random number generator the notebook may rely on.

    Seeds Python's built-in ``random`` module, NumPy and PyTorch (CPU and all
    CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed shared by all the generators.
    """
    # Local import so the function works regardless of the notebook's import cell
    import random

    # Ask cuDNN for deterministic algorithms and disable its auto-tuner, which
    # could otherwise select different kernels from one run to the next
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


setSeed(SEED)

## Utils

In [6]:
def printGPUStats():
    """Print the device in use and, on CUDA, its name and memory usage in GB."""
    print('Using device:', device)
    print()

    # Nothing else to report unless we are running on a CUDA device
    if device.type != 'cuda':
        return

    bytesPerGB = 1024 ** 3

    print(torch.cuda.get_device_name(0))
    print('[💻 MEMORY USAGE]')

    allocatedGB = round(torch.cuda.memory_allocated(0) / bytesPerGB, 1)
    print('[📌 ALLOCATED]', allocatedGB, 'GB')

    reservedGB = round(torch.cuda.memory_reserved(0) / bytesPerGB, 1)
    print('[🧮 CACHED]', reservedGB, 'GB')


In [7]:
def getSubDirs(dir):
    """Return the names (not the full paths) of the directories directly inside `dir`."""
    subDirs = []
    for entry in os.listdir(dir):
        if os.path.isdir(os.path.join(dir, entry)):
            subDirs.append(entry)
    return subDirs

In [8]:
def getClassPercents(sizes):
    """Turn per-class sample counts into integer percentages of their total.

    Each percentage is rounded on its own, so the values are not guaranteed to
    add up to exactly 100.
    """
    total = np.array(sizes).sum()
    return [int(round((count / total) * 100)) for count in sizes]

In [9]:
def getBestScores(hist, key, min=False):
    """Find the entry of `hist` with the best value stored under `key`.

    Args:
        hist: sequence of dict-like entries, each providing `key`.
        key: name of the score to compare.
        min: when True the lowest value wins, otherwise the highest.

    Returns:
        (entry, index) of the selected element of `hist`.
    """
    values = np.array([entry[key] for entry in hist])
    select = np.argmin if min else np.argmax
    bestIndex = select(values)

    return hist[bestIndex], bestIndex

In [10]:
def getMeanAndSDT(dataloader):
    """Compute the per-channel mean and standard deviation of a dataset.

    Each batch contributes proportionally to the number of samples it holds,
    so a smaller final batch does not skew the statistics.

    Args:
        dataloader: iterable yielding ``(data, target)`` pairs. ``data`` is
            reduced over dims 0, 2 and 3, so it is expected to be a 4-D batch
            with the channels on dim 1.

    Returns:
        ``(mean, std)`` tensors holding one value per channel.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    channels_sum, channels_squared_sum, num_samples = 0, 0, 0
    for data, _ in dataloader:
        batch_size = data.size(0)

        # Mean over batch, height and width, but not over the channels;
        # weighted by the batch size so every sample counts the same
        channels_sum += torch.mean(data, dim=[0, 2, 3]) * batch_size
        channels_squared_sum += torch.mean(data**2, dim=[0, 2, 3]) * batch_size
        num_samples += batch_size

    if num_samples == 0:
        raise ValueError("getMeanAndSDT received a dataloader without samples")

    mean = channels_sum / num_samples

    # std = sqrt(E[X^2] - (E[X])^2); rounding can push the variance slightly
    # below zero for (almost) constant channels, so clamp it before the root
    variance = channels_squared_sum / num_samples - mean ** 2
    std = variance.clamp(min=0) ** 0.5

    return mean, std

## Settings

In [11]:
# Root folder of the datasets; its sub-folders are iterated as datasets below
datasetsDir = "./datasets"
# Trained models: the Google Drive folder when it is set, a local folder otherwise
modelsDir = googleModelsDir if googleModelsDir else "./models"
# Root folder of the adversarial samples
adversarialsDir = "./adversarial_samples"

inputSize = 224 # Specified for alexnet, resnet, vgg

## Models evaluations

In [12]:
def getScores(labels, predicted):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Args:
        labels: tensor of ground-truth labels. The arithmetic below treats the
            values as 0/1 with 1 being the positive class.
        predicted: tensor of predicted labels, aligned with `labels`.

    Returns:
        ``(acc, precision, recall, f1)`` as tensors. A ratio whose denominator
        is zero (e.g. precision when nothing is predicted positive) is
        reported as 0 instead of NaN.
    """
    def safeDivide(numerator, denominator):
        # Plain division, with the undefined 0-denominator case mapped to 0
        ratio = numerator / denominator
        return torch.where(denominator == 0, torch.zeros_like(ratio), ratio)

    acc = torch.sum(predicted == labels) / len(predicted)

    # Confusion-matrix counts obtained through 0/1 arithmetic
    tp = (labels * predicted).sum()
    tn = ((1 - labels) * (1 - predicted)).sum()
    fp = ((1 - labels) * predicted).sum()
    fn = (labels * (1 - predicted)).sum()

    precision = safeDivide(tp, tp + fp)
    recall = safeDivide(tp, tp + fn)

    f1 = safeDivide(2 * (precision * recall), precision + recall)

    return acc, precision, recall, f1

In [13]:
def evaluateModel(model, dataloader):
    """Run `model` over `dataloader` and score its predictions.

    The model is put in evaluation mode, every batch is moved to the global
    `device`, and the predicted class is the index of the highest output.

    Args:
        model: network called on each batch of inputs.
        dataloader: iterable yielding ``(inputs, labels)`` batches.

    Returns:
        dict with the keys "acc", "precision", "recall" and "f1", each holding
        the corresponding score as a NumPy value.
    """
    model.eval()

    # Running collections of every prediction and target seen so far
    allPreds = torch.tensor([]).to(device, non_blocking=True)
    allTargets = torch.tensor([]).to(device, non_blocking=True)

    for batchInputs, batchLabels in dataloader:
        batchInputs = batchInputs.to(device, non_blocking=True)
        batchLabels = batchLabels.to(device, non_blocking=True)

        # Inference only: no gradients are needed
        with torch.no_grad():
            batchPreds = torch.max(model(batchInputs), 1).indices

        allPreds = torch.cat([allPreds, batchPreds], dim=0)
        allTargets = torch.cat([allTargets, batchLabels], dim=0)

    scoreNames = ("acc", "precision", "recall", "f1")
    scoreValues = getScores(allTargets, allPreds)

    return {name: value.cpu().numpy()
            for name, value in zip(scoreNames, scoreValues)}

### Information about models

In [14]:
print("[🧠 MODELS INFORMATION]")

# One summary row per saved checkpoint; turned into a DataFrame at the end
modelsInformation = []

# Folder layout walked here: modelsDir/<dataset>/<model type>/<checkpoint file>.
# Listings are sorted so the report order is deterministic, consistent with
# the sorted walk used when the models are evaluated.
for dataset in sorted(getSubDirs(modelsDir)):
    print("\n" + "-" * 15)
    print("[🗃️ DATASET] {}".format(dataset))

    datasetDir = os.path.join(modelsDir, dataset)

    for modelType in sorted(getSubDirs(datasetDir)):
        print("\n[🧮 MODEL TYPE] {}".format(modelType))

        modelsTypeDir = os.path.join(datasetDir, modelType)

        for model in sorted(os.listdir(modelsTypeDir)):
            print("\n\t[🧠 MODEL] {}".format(model))

            path = os.path.join(modelsTypeDir, model)

            # Only metadata is read in this cell, so the checkpoint is mapped
            # to the CPU: no GPU memory is used and the cell also runs on
            # machines without CUDA.
            # NOTE(review): torch.load unpickles arbitrary objects — only load
            # checkpoints coming from a trusted source.
            checkpoint = torch.load(path, map_location="cpu")

            history = checkpoint["scores_history"]

            bestScore, i = getBestScores(history, "f1")
            classBalancing = getClassPercents(checkpoint["dataset_sizes"])
            balancingStr = "/".join([str(x) for x in classBalancing])

            modelsInformation.append({
                "dataset": dataset,
                "model": checkpoint["model_name"],
                "epochs": len(history),
                "balancing": balancingStr,
                "f-score": bestScore["f1"],
            })

            print("\tModel:", checkpoint["model_name"])
            print("\tEpochs:", len(history))
            print("\tBalancing:", classBalancing)
            print("\tBest epoch:", i)
            print("\tBest F-Score:", bestScore["f1"])
            print("\tHistory:", [float(x["f1"]) for x in history])

modelsInformationDF = pd.DataFrame(modelsInformation)

printGPUStats()

[🧠 MODELS INFORMATION]

---------------
[🗃️ DATASET] bing

[🧮 MODEL TYPE] alexnet

	[🧠 MODEL] alexnet_20_80_all.pt
	Model: alexnet
	Epochs: 38
	Balancing: [20, 80]
	Best epoch: 17
	Best F-Score: 0.9436486
	History: [0.9077625274658203, 0.923933207988739, 0.929311990737915, 0.9321871995925903, 0.9340866208076477, 0.9332079291343689, 0.9357277750968933, 0.9391635060310364, 0.9417383074760437, 0.9417383074760437, 0.942638635635376, 0.942638635635376, 0.942638635635376, 0.942638635635376, 0.942638635635376, 0.942638635635376, 0.942638635635376, 0.9436485767364502, 0.9436485767364502, 0.9427481293678284, 0.9427481293678284, 0.9428571462631226, 0.9419600367546082, 0.9419600367546082, 0.9419600367546082, 0.9419600367546082, 0.9419600367546082, 0.9401708841323853, 0.9392789602279663, 0.9383886456489563, 0.9383886456489563, 0.9383886456489563, 0.9383886456489563, 0.9383886456489563, 0.9383886456489563, 0.9383886456489563, 0.9383886456489563, 0.9383886456489563]

	[🧠 MODEL] alexnet_30_70_all.pt


In [15]:
# Display the per-checkpoint summary table built in the previous cell
modelsInformationDF

Unnamed: 0,dataset,model,epochs,balancing,f-score
0,bing,alexnet,38,20/80,0.9436486
1,bing,alexnet,24,30/70,0.9522863
2,bing,alexnet,23,40/60,0.94939274
3,bing,alexnet,42,50/50,0.95114654
4,bing,resnet,49,20/80,0.9659201
5,bing,resnet,25,30/70,0.98513377
6,bing,resnet,33,40/60,0.98701304
7,bing,resnet,26,50/50,0.98217815
8,bing,vgg,28,20/80,0.9765166
9,bing,vgg,38,30/70,0.98128074


### Evaluations

In [16]:
def evaluateModelsOnDataset(datasetFolder, datasetInfo):
    """Evaluate every checkpoint found under `modelsDir` on one image folder.

    The folder is read once to print the number of images per class and to
    compute the per-channel mean/std used for normalisation. Every loadable
    file under `modelsDir` is then evaluated on the normalised images.

    Args:
        datasetFolder: path of the image folder, read through torchvision's
            ImageFolder and the project's ImageLimitedDataset.
        datasetInfo: dict with the keys "dataset", "math", "attack", "model"
            and "balancing" describing the evaluated folder; the values are
            copied into every result row.

    Returns:
        List with one dict per evaluated model, holding the dataset
        description, the model description and its "acc", "precision",
        "recall" and "f1" scores.
    """
    modelsEvals = []

    # Get the images and calculate mean and standard deviation
    imageDataset = torchvision.datasets.ImageFolder(
        datasetFolder, transform=transforms.Compose([transforms.ToTensor()]))

    targets = np.array(imageDataset.targets)
    for cls in imageDataset.classes:
        cls_index = imageDataset.class_to_idx[cls]
        num_cls = np.count_nonzero(targets == cls_index)

        print("\t[🧮 # ELEMENTS] {}: {}".format(cls, num_cls))

    imageDataloader = DataLoader(imageDataset, batch_size=128)

    mean, std = getMeanAndSDT(imageDataloader)

    # Setup for normalization
    # NOTE(review): Resize with an int rescales the shorter edge only, and the
    # statistics above come from the un-resized images; this presumably relies
    # on the images already being square and equally sized — confirm.
    dataTransform = transforms.Compose([
        transforms.Resize(inputSize),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    testDataset = ImageLimitedDataset(
        datasetFolder, transform=dataTransform, use_cache=True, check_images=False)

    # Re-seed so the shuffled batch order is the same for every dataset
    setSeed(SEED)
    testDataLoader = DataLoader(
        testDataset, batch_size=64, shuffle=True, num_workers=0, pin_memory=True)

    # Evaluate every model
    for root, _, fnames in sorted(os.walk(modelsDir, followlinks=True)):
        for fname in sorted(fnames):
            path = os.path.join(root, fname)

            try:
                # map_location lets checkpoints saved on a GPU load on any
                # machine, CPU-only ones included.
                # NOTE(review): torch.load unpickles arbitrary objects — keep
                # only trusted files under modelsDir.
                modelData = torch.load(path, map_location=device)
            except Exception as error:
                # Not a loadable checkpoint (stray or corrupted file): report
                # it and keep going. Interrupts are deliberately not caught.
                print("\n[⚠️ SKIPPED] {}: {}".format(path, error))
                continue

            modelDataset = modelData["dataset"]
            modelName = modelData["model_name"]
            modelPercents = "/".join([str(x)
                                     for x in getClassPercents(modelData["dataset_sizes"])])

            print()
            print("[🧮 EVALUATING] {} - {} {}".format(
                modelDataset,
                modelName,
                modelPercents
            ))

            modelToTest = modelData["model"]
            modelToTest = modelToTest.to(device, non_blocking=True)

            scores = evaluateModel(modelToTest, testDataLoader)

            modelsEvals.append({
                    "dataset": datasetInfo["dataset"],
                    "isMath": datasetInfo["math"],
                    "attack": datasetInfo["attack"],
                    "advModel": datasetInfo["model"],
                    "advBalancing": datasetInfo["balancing"],

                    "model": modelName,
                    "modelDataset": modelDataset,
                    "balancing": modelPercents,
                    "acc": scores["acc"],
                    "precision": scores["precision"],
                    "recall": scores["recall"],
                    "f1": scores["f1"],
                })

            print("\tAcc: {:.4f}".format(scores["acc"]))
            print("\tPre: {:.4f}".format(scores["precision"]))
            print("\tRec: {:.4f}".format(scores["recall"]))
            print("\tF-Score: {:.4f}".format(scores["f1"]))

            # Drop the references first: while the model is still referenced
            # its memory is in use and empty_cache() cannot release it
            del modelToTest, modelData
            torch.cuda.empty_cache()

    return modelsEvals


In [17]:
# Project-local dataset class used by evaluateModelsOnDataset
from imageLimitedDataset import ImageLimitedDataset

# Collects the result rows produced by the evaluation cells below
modelsEvals = []

In [18]:
print("[🧠 MODELS EVALUATION - NO ATTACKS]")

# Evaluate models on test folders
for dataset in getSubDirs(datasetsDir):
    print("\n" + "-" * 15)
    print("[🗃️ TEST DATASET] {}".format(dataset))
    
    # The clean test images of a dataset are read from <datasetsDir>/<dataset>/test
    datasetDir = os.path.join(datasetsDir, dataset)
    testDir = os.path.join(datasetDir, "test")

    # No attack is involved here, so every attack-related field is left empty
    advDatasetInfo = {
        "dataset": dataset,
        "math": None,
        "attack": None,
        "balancing": None,
        "model": None,
    }

    evals = evaluateModelsOnDataset(testDir, advDatasetInfo)
    modelsEvals.extend(evals)

[🧠 MODELS EVALUATION - NO ATTACKS]

---------------
[🗃️ TEST DATASET] bing
	[🧮 # ELEMENTS] cat: 1000
	[🧮 # ELEMENTS] dog: 1000

[🧮 EVALUATING] bing - alexnet 20/80
	Acc: 0.9230
	Pre: 0.8790
	Rec: 0.9810
	F-Score: 0.9272

[🧮 EVALUATING] bing - alexnet 30/70
	Acc: 0.9400
	Pre: 0.9288
	Rec: 0.9530
	F-Score: 0.9408

[🧮 EVALUATING] bing - alexnet 40/60
	Acc: 0.9460
	Pre: 0.9579
	Rec: 0.9330
	F-Score: 0.9453

[🧮 EVALUATING] bing - alexnet 50/50
	Acc: 0.9485
	Pre: 0.9419
	Rec: 0.9560
	F-Score: 0.9489

[🧮 EVALUATING] bing - resnet 20/80
	Acc: 0.9545
	Pre: 0.9228
	Rec: 0.9920
	F-Score: 0.9561

[🧮 EVALUATING] bing - resnet 30/70
	Acc: 0.9695
	Pre: 0.9572
	Rec: 0.9830
	F-Score: 0.9699

[🧮 EVALUATING] bing - resnet 40/60
	Acc: 0.9680
	Pre: 0.9689
	Rec: 0.9670
	F-Score: 0.9680

[🧮 EVALUATING] bing - resnet 50/50
	Acc: 0.9700
	Pre: 0.9590
	Rec: 0.9820
	F-Score: 0.9704

[🧮 EVALUATING] bing - vgg 20/80
	Acc: 0.9710
	Pre: 0.9486
	Rec: 0.9960
	F-Score: 0.9717

[🧮 EVALUATING] bing - vgg 30/70
	Acc: 0.973

In [19]:
print("[🧠 MODELS EVALUATION - NON MATH ATTACKS]")

# Evaluate models on non math attacks folders
# Folder layout walked here: <adversarialsDir>/<dataset>/nonMath/<attack>
for dataset in getSubDirs(adversarialsDir):
    datasetDir = os.path.join(adversarialsDir, dataset)
    nonMathAdvDir = os.path.join(datasetDir, "nonMath")

    # A dataset may have no non-math samples; skip it instead of failing on the
    # missing folder (same guard as the math attacks cell)
    if not os.path.exists(nonMathAdvDir):
        continue

    for attack in getSubDirs(nonMathAdvDir):
        print("\n" + "-" * 15)
        print("[🗃️ ADVERSARIAL DATASET] {}/{}".format(dataset, attack))

        attackDir = os.path.join(nonMathAdvDir, attack)

        # Non-math samples are not tied to a model, so the adversarial model
        # and balancing fields stay empty
        advDatasetInfo = {
            "dataset": dataset,
            "math": False,
            "attack": attack,
            "balancing": None,
            "model": None,
        }

        evals = evaluateModelsOnDataset(attackDir, advDatasetInfo)
        modelsEvals.extend(evals)


[🧠 MODELS EVALUATION - NON MATH ATTACKS]

---------------
[🗃️ ADVERSARIAL DATASET] bing/BoxBlur
	[🧮 # ELEMENTS] cat: 1000
	[🧮 # ELEMENTS] dog: 1000

[🧮 EVALUATING] bing - alexnet 20/80
	Acc: 0.9325
	Pre: 0.9179
	Rec: 0.9500
	F-Score: 0.9337

[🧮 EVALUATING] bing - alexnet 30/70
	Acc: 0.9305
	Pre: 0.9353
	Rec: 0.9250
	F-Score: 0.9301

[🧮 EVALUATING] bing - alexnet 40/60
	Acc: 0.9390
	Pre: 0.9631
	Rec: 0.9130
	F-Score: 0.9374

[🧮 EVALUATING] bing - alexnet 50/50
	Acc: 0.9380
	Pre: 0.9433
	Rec: 0.9320
	F-Score: 0.9376

[🧮 EVALUATING] bing - resnet 20/80
	Acc: 0.9535
	Pre: 0.9274
	Rec: 0.9840
	F-Score: 0.9549

[🧮 EVALUATING] bing - resnet 30/70
	Acc: 0.9615
	Pre: 0.9520
	Rec: 0.9720
	F-Score: 0.9619

[🧮 EVALUATING] bing - resnet 40/60
	Acc: 0.9625
	Pre: 0.9695
	Rec: 0.9550
	F-Score: 0.9622

[🧮 EVALUATING] bing - resnet 50/50
	Acc: 0.9620
	Pre: 0.9565
	Rec: 0.9680
	F-Score: 0.9622

[🧮 EVALUATING] bing - vgg 20/80
	Acc: 0.9595
	Pre: 0.9315
	Rec: 0.9920
	F-Score: 0.9608

[🧮 EVALUATING] bing - 

In [20]:
print("[🧠 MODELS EVALUATION - MATH ATTACKS]")

# Evaluate models on math attacks folders
# Folder layout walked here:
#   <adversarialsDir>/<dataset>/math/<attack>/<advModel>/<advBalancing>
for dataset in getSubDirs(adversarialsDir):
    datasetDir = os.path.join(adversarialsDir, dataset)
    mathAdvDir = os.path.join(datasetDir, "math")

    # Skip the datasets that have no math attacks folder
    if not os.path.exists(mathAdvDir):
        continue

    for attack in getSubDirs(mathAdvDir):
        attackDir = os.path.join(mathAdvDir, attack)

        for advModel in getSubDirs(attackDir):
            advModelDir = os.path.join(attackDir, advModel)

            for advBalancing in getSubDirs(advModelDir):
                advDatasetDir = os.path.join(advModelDir, advBalancing)

                print("\n" + "-" * 15)
                print("[🗃️ ADVERSARIAL DATASET] {}/{}/{}/{}".format(dataset, attack, advModel, advBalancing))

                # Balancing folder names use "_" (e.g. 20_80); store them in the
                # "20/80" form used for the evaluated models
                advDatasetInfo = {
                    "dataset": dataset,
                    "math": True,
                    "attack": attack,
                    "balancing": advBalancing.replace("_", "/"),
                    "model": advModel,
                }

                evals = evaluateModelsOnDataset(advDatasetDir, advDatasetInfo)
                modelsEvals.extend(evals)

[🧠 MODELS EVALUATION - MATH ATTACKS]

---------------
[🗃️ ADVERSARIAL DATASET] bing/CW/alexnet/20_80
	[🧮 # ELEMENTS] cat: 100
	[🧮 # ELEMENTS] dog: 100

[🧮 EVALUATING] bing - alexnet 20/80
	Acc: 0.9150
	Pre: 0.8609
	Rec: 0.9900
	F-Score: 0.9209

[🧮 EVALUATING] bing - alexnet 30/70
	Acc: 0.9300
	Pre: 0.8909
	Rec: 0.9800
	F-Score: 0.9333

[🧮 EVALUATING] bing - alexnet 40/60
	Acc: 0.9450
	Pre: 0.9495
	Rec: 0.9400
	F-Score: 0.9447

[🧮 EVALUATING] bing - alexnet 50/50
	Acc: 0.9300
	Pre: 0.9135
	Rec: 0.9500
	F-Score: 0.9314

[🧮 EVALUATING] bing - resnet 20/80
	Acc: 0.9500
	Pre: 0.9091
	Rec: 1.0000
	F-Score: 0.9524

[🧮 EVALUATING] bing - resnet 30/70
	Acc: 0.9700
	Pre: 0.9519
	Rec: 0.9900
	F-Score: 0.9706

[🧮 EVALUATING] bing - resnet 40/60
	Acc: 0.9750
	Pre: 0.9703
	Rec: 0.9800
	F-Score: 0.9751

[🧮 EVALUATING] bing - resnet 50/50
	Acc: 0.9750
	Pre: 0.9612
	Rec: 0.9900
	F-Score: 0.9754

[🧮 EVALUATING] bing - vgg 20/80
	Acc: 0.9700
	Pre: 0.9519
	Rec: 0.9900
	F-Score: 0.9706

[🧮 EVALUATING] bing

In [21]:
# Gather every evaluation row (clean, non-math and math attacks) into one table
modelsEvalsDF = pd.DataFrame(modelsEvals)

In [22]:
# Display the evaluations table
modelsEvalsDF

Unnamed: 0,dataset,isMath,attack,advModel,advBalancing,model,modelDataset,balancing,acc,precision,recall,f1
0,bing,,,,,alexnet,bing,20/80,0.92300004,0.87903225,0.981,0.9272212
1,bing,,,,,alexnet,bing,30/70,0.94000006,0.9288499,0.953,0.94077
2,bing,,,,,alexnet,bing,40/60,0.94600004,0.95790553,0.933,0.9452887
3,bing,,,,,alexnet,bing,50/50,0.94850004,0.94187194,0.956,0.94888335
4,bing,,,,,resnet,bing,20/80,0.9545,0.9227907,0.992,0.95614463
...,...,...,...,...,...,...,...,...,...,...,...,...
7447,google,True,FGSM,vgg,50/50,resnet,google,50/50,0.905,0.9550562,0.85,0.8994709
7448,google,True,FGSM,vgg,50/50,vgg,google,20/80,0.7,0.6886792,0.73,0.70873785
7449,google,True,FGSM,vgg,50/50,vgg,google,30/70,0.74,0.7553192,0.71,0.73195875
7450,google,True,FGSM,vgg,50/50,vgg,google,40/60,0.705,0.7356322,0.64,0.68449193


### Save evaluations

In [23]:
# Write the evaluations to a CSV file in the working directory; with the
# default to_csv settings the DataFrame index is saved as the first column
modelsEvalsDF.to_csv("modelsEvaluations.csv")