# Explainability of Image classifier

This notebook contains the implementation of our project. It is organized into the following sections:

## Import Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')
import cv2
import requests
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from pytorch_grad_cam.utils.image import show_cam_on_image,deprocess_image,preprocess_image
import matplotlib.pyplot as plt
from IPython import display
import os
import shutil
import torch
import torchvision
import random
import torch.nn as nn
from torchvision       import transforms, datasets, models
import typing 
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.backends.cudnn as cudnn
from rise import *
from PIL import ImageFile, Image
# Let PIL load image files that are truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Seed the torch and NumPy global RNGs so runs are repeatable.
torch.manual_seed(12345)
np.random.seed(12345)
# NOTE(review): this is a cuDNN setting, but the device selected below is MPS or CPU — confirm it is needed.
cudnn.benchmark = True
%matplotlib inline
# Use the MPS backend when it is available, otherwise fall back to the CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')



In [2]:
#Read dog csv file.
# NOTE(review): these absolute paths are machine-specific; point them at your local copies.
dog_df = pd.read_csv("/Users/omer/Desktop/AMMI/Research_project/Implementation/Project_XAI/dogs_classes_ImageNet.csv")
class_indx=dog_df['ImageNet original class index '].to_list()
imagenet_dataset = datasets.ImageFolder("/Users/omer/Desktop/AMMI/Research_project/Implementation/Image")
# The class folders are named with integers that index rows of the CSV. Take the
# names from ImageFolder (directories only, in the order that defines the target
# indices) instead of os.listdir, so a missing or extra stray file such as
# .DS_Store cannot break the parsing and position k always matches target k.
class_names=[int(name) for name in imagenet_dataset.classes]
# Original ImageNet class index of every dataset class, in target order.
class_indxs= [class_indx[idx] for idx in class_names]
# The full index list is not needed any more; only class_indxs is used below.
class_indx=None
# Readable names of all CSV classes, and of the classes present in the dataset.
class_nam = dog_df['class name'].to_list()
class_nam1 = [ class_nam[idx] for idx in class_names]

In [3]:
class_nam1  # display the readable names of the selected classes, e.g. ['Chihuahua', 'Japanese spaniel', 'Maltese dog, Maltese terrier, Maltese']

['Chihuahua', 'Japanese spaniel', 'Maltese dog, Maltese terrier, Maltese']

In [4]:
class_indxs  # display the original ImageNet indices of the selected classes, e.g. [151, 152, 153]

[151, 152, 153]

## Test function

In [5]:
def test(model, data_loader):
    """Measures the accuracy of a model on a data set."""

    # Make sure the model is in evaluation mode.
    model.eval()
    correct = 0

    # We do not need to maintain intermediate activations while testing.
    with torch.no_grad():

        # Loop over test data.
        for data, target in data_loader:

            # Forward pass.
            output = model(data.to(device))

            # Get the label corresponding to the highest predicted probability.
            pred = output.argmax(dim=1, keepdim=True)

            # Count number of correct predictions.
            correct += pred.cpu().eq(target.view_as(pred)).sum().item()

    # Print test accuracy.
    percent = 100.0 * correct / len(data_loader.dataset)
    print(f"Accuracy: {correct}/{len(data_loader.dataset)} ({percent:.0f}%)")
    return percent

In [6]:
%matplotlib inline
def show(img):
    """Render a PyTorch tensor as a matplotlib image with grid and axes hidden."""
    # Bring the tensor to host memory and reorder its axes (0, 1, 2) -> (1, 2, 0) for imshow.
    pixels = img.cpu().detach().numpy().transpose((1, 2, 0))
    plt.imshow(pixels, interpolation="nearest")
    plt.grid(False)
    axes = plt.gca()
    axes.axis("off")


def display_thumb(img):
    """Show ``img`` in the notebook after resizing it with ``transforms.Resize(224)``."""
    to_thumbnail = transforms.Resize(224)
    thumbnail = to_thumbnail(img)
    display.display(thumbnail)

## Some of the dataset



In [None]:
# Re-seed Python's RNG so the same three samples are drawn every time this cell runs.
random.seed(12345)
print("ImageNet classes:", *imagenet_dataset.classes)

# Show a random image and the corresponding target.
for i in range(3):
    # Each item is an (image, class index) pair; no transform is set on this dataset.
    img, target = random.choice(imagenet_dataset)
    print(
        "Label of image: %d (%s). Original size: %s"
        # The class folder name is an integer used to look up the readable name in class_nam.
        % (target, class_nam[int(imagenet_dataset.classes[target])], img.size)
    )

    # Reduce image size by half to fit the images on the page :)
    display.display(img.resize((img.size[0] // 2, img.size[1] // 2)))


## Data Preparation

In [8]:
# This is the default transform used in ImageNet models:
# resize, centre-crop to 224, convert to a tensor and normalize each channel.
inference_transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# For visualization purposes we'll create a separate transform that operates in image space
# (same resize and crop, but no tensor conversion or normalization).
inference_transform_show = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
    ]
)

# Reload our dataset with this transform.
transformed_imagenet_dataset = datasets.ImageFolder(
    "/Users/omer/Desktop/AMMI/Research_project/Implementation/Image",
    transform=inference_transform,
)

# Batches of 16 in a fixed order (no shuffling), used for the accuracy checks below.
transformed_imagenet_loader = torch.utils.data.DataLoader(
    transformed_imagenet_dataset, batch_size=16, shuffle=False
)

## Pre-trained models:

Load AlexNet

In [9]:
# Load a pretrained model.
imagenet_alexnet = torchvision.models.alexnet(weights='IMAGENET1K_V1')

# The loaded model is trained to classify images across 1000 classes. Keep only the rows of the
# final layer's weight and bias that belong to the ImageNet indices in class_indxs, so the model
# outputs one score per class of our dataset.

imagenet_alexnet.classifier[6].weight.data = imagenet_alexnet.classifier[6].weight.data[class_indxs]
imagenet_alexnet.classifier[6].bias.data = imagenet_alexnet.classifier[6].bias.data[class_indxs]

# Move the trimmed model to the selected device.
imagenet_alexnet = imagenet_alexnet.to(device)

Test AlexNet on the dataset

In [7]:
# Check the model accuracy: prints the number of correct predictions and returns the percentage.
test(imagenet_alexnet, transformed_imagenet_loader)

Accuracy: 547/559 (98%)


97.85330948121646

Mistakes AlexNet made

In [None]:
imagenet_alexnet.eval()  # make sure the model is in evaluation mode
mistak_alexnet = []  # dataset indices of the misclassified samples
# Inference only: with autograd disabled no graph is recorded for the forward passes.
with torch.no_grad():
    for i in tqdm(range(len(transformed_imagenet_dataset))):
        tensor, target = transformed_imagenet_dataset[i]
        tensor = tensor.to(device)

        # Add a batch dimension for the model, drop it again and take the top-scoring class.
        _, prediction = imagenet_alexnet(tensor.unsqueeze(0)).squeeze(0).cpu().max(-1)
        # Plain int so it compares with `target` and indexes the class list directly.
        prediction = int(prediction)
        if prediction != target:
            mistak_alexnet.append(i)
            print(
                "Img id=%d. Excpected class %s, but predicted class %s."
                % (
                    i,
                    class_nam[int(imagenet_dataset.classes[target])],
                    class_nam[int(imagenet_dataset.classes[prediction])],
                )
            )
            # Decode the untransformed image only for the samples that are actually shown.
            img, _ = imagenet_dataset[i]
            display_thumb(img)
        

Load ResNet50

In [11]:
# Load a pretrained model.
imagenet_resnet = torchvision.models.resnet50(weights='IMAGENET1K_V1')

# The loaded model is trained to classify images across 1000 classes. Keep only the rows of the
# final fully connected layer that belong to the ImageNet indices in class_indxs, so the model
# outputs one score per class of our dataset.

imagenet_resnet.fc.weight.data = imagenet_resnet.fc.weight.data[class_indxs]
imagenet_resnet.fc.bias.data = imagenet_resnet.fc.bias.data[class_indxs]

# Move the trimmed model to the selected device.
imagenet_resnet = imagenet_resnet.to(device)


Test ResNet50 on the dataset

In [10]:
# Check the model accuracy: prints the number of correct predictions and returns the percentage.
test(imagenet_resnet, transformed_imagenet_loader)

Accuracy: 558/559 (100%)


99.82110912343471

Mistakes ResNet50 made

In [None]:
imagenet_resnet.eval()  # make sure the model is in evaluation mode
mistak_resnet50 = []  # dataset indices of the misclassified samples
# Inference only: with autograd disabled no graph is recorded for the forward passes.
with torch.no_grad():
    for i in tqdm(range(len(transformed_imagenet_dataset))):
        tensor, target = transformed_imagenet_dataset[i]
        tensor = tensor.to(device)

        # Add a batch dimension for the model, drop it again and take the top-scoring class.
        _, prediction = imagenet_resnet(tensor.unsqueeze(0)).squeeze(0).cpu().max(-1)
        # Plain int so it compares with `target` and indexes the class list directly.
        prediction = int(prediction)
        if prediction != target:
            mistak_resnet50.append(i)
            print(
                "Img id=%d. Excpected class %s, but predicted class %s."
                % (
                    i,
                    class_nam[int(imagenet_dataset.classes[target])],
                    class_nam[int(imagenet_dataset.classes[prediction])],
                )
            )
            # Decode the untransformed image only for the samples that are actually shown.
            img, _ = imagenet_dataset[i]
            display_thumb(img)
        

Load ResNet152

In [13]:
# Load a pretrained model.
imagenet_resnet152 = torchvision.models.resnet152(weights='IMAGENET1K_V1')

# The loaded model is trained to classify images across 1000 classes. Keep only the rows of the
# final fully connected layer that belong to the ImageNet indices in class_indxs, so the model
# outputs one score per class of our dataset.

imagenet_resnet152.fc.weight.data = imagenet_resnet152.fc.weight.data[class_indxs]
imagenet_resnet152.fc.bias.data = imagenet_resnet152.fc.bias.data[class_indxs]

# Move the trimmed model to the selected device.
imagenet_resnet152 = imagenet_resnet152.to(device)

Test ResNet152 on the dataset

In [14]:
# Check the model accuracy: prints the number of correct predictions and returns the percentage.
test(imagenet_resnet152, transformed_imagenet_loader)

Accuracy: 557/559 (100%)


99.6422182468694

Mistakes ResNet152 made

In [None]:
imagenet_resnet152.eval()  # make sure the model is in evaluation mode
mistak_resnet152 = []  # dataset indices of the misclassified samples
# Inference only: with autograd disabled no graph is recorded for the forward passes.
with torch.no_grad():
    for i in tqdm(range(len(transformed_imagenet_dataset))):
        tensor, target = transformed_imagenet_dataset[i]
        tensor = tensor.to(device)

        # Add a batch dimension for the model, drop it again and take the top-scoring class.
        _, prediction = imagenet_resnet152(tensor.unsqueeze(0)).squeeze(0).cpu().max(-1)
        # Plain int so it compares with `target` and indexes the class list directly.
        prediction = int(prediction)
        if prediction != target:
            mistak_resnet152.append(i)
            print(
                "Img id=%d. Excpected class %s, but predicted class %s."
                % (
                    i,
                    class_nam[int(imagenet_dataset.classes[target])],
                    class_nam[int(imagenet_dataset.classes[prediction])],
                )
            )
            # Decode the untransformed image only for the samples that are actually shown.
            img, _ = imagenet_dataset[i]
            display_thumb(img)
        

Load ResNeXt101

In [16]:
# Load a pretrained model through torch.hub (pytorch/vision at tag v0.10.0).
imagenet_ResNeXt101 = torch.hub.load('pytorch/vision:v0.10.0', 'resnext101_32x8d', pretrained=True)

# The loaded model is trained to classify images across 1000 classes. Keep only the rows of the
# final fully connected layer that belong to the ImageNet indices in class_indxs, so the model
# outputs one score per class of our dataset.

imagenet_ResNeXt101.fc.weight.data = imagenet_ResNeXt101.fc.weight.data[class_indxs]
imagenet_ResNeXt101.fc.bias.data = imagenet_ResNeXt101.fc.bias.data[class_indxs]

# Move the trimmed model to the selected device.
imagenet_ResNeXt101 = imagenet_ResNeXt101.to(device)

Using cache found in /Users/omer/.cache/torch/hub/pytorch_vision_v0.10.0


Test ResNeXt101 on the dataset

In [17]:
# Check the model accuracy: prints the number of correct predictions and returns the percentage.
test(imagenet_ResNeXt101, transformed_imagenet_loader)

Accuracy: 558/559 (100%)


99.82110912343471

Mistakes ResNeXt101 made

In [None]:
imagenet_ResNeXt101.eval()  # make sure the model is in evaluation mode
mistak_resNext101 = []  # dataset indices of the misclassified samples
# Inference only: with autograd disabled no graph is recorded for the forward passes.
with torch.no_grad():
    for i in tqdm(range(len(transformed_imagenet_dataset))):
        tensor, target = transformed_imagenet_dataset[i]
        tensor = tensor.to(device)

        # Add a batch dimension for the model, drop it again and take the top-scoring class.
        _, prediction = imagenet_ResNeXt101(tensor.unsqueeze(0)).squeeze(0).cpu().max(-1)
        # Plain int so it compares with `target` and indexes the class list directly.
        prediction = int(prediction)
        if prediction != target:
            mistak_resNext101.append(i)
            print(
                "Img id=%d. Excpected class %s, but predicted class %s."
                % (
                    i,
                    class_nam[int(imagenet_dataset.classes[target])],
                    class_nam[int(imagenet_dataset.classes[prediction])],
                )
            )
            # Decode the untransformed image only for the samples that are actually shown.
            img, _ = imagenet_dataset[i]
            display_thumb(img)
        

Load EfficientNet_b4

In [19]:
# Load a pretrained model through torch.hub (NVIDIA/DeepLearningExamples, torchhub branch).
imagenet_EfficientNet_b4 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_efficientnet_b4', pretrained=True)

# The loaded model is trained to classify images across 1000 classes. Keep only the rows of the
# classifier's fully connected layer that belong to the ImageNet indices in class_indxs, so the
# model outputs one score per class of our dataset.

imagenet_EfficientNet_b4.classifier.fc.weight.data = imagenet_EfficientNet_b4.classifier.fc.weight.data[class_indxs]
imagenet_EfficientNet_b4.classifier.fc.bias.data = imagenet_EfficientNet_b4.classifier.fc.bias.data[class_indxs]

# Move the trimmed model to the selected device.
imagenet_EfficientNet_b4 = imagenet_EfficientNet_b4.to(device)

Using cache found in /Users/omer/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


Test EfficientNet_b4 on the dataset

In [20]:
# Check the model accuracy: prints the number of correct predictions and returns the percentage.
test(imagenet_EfficientNet_b4, transformed_imagenet_loader)

Accuracy: 551/559 (99%)


98.56887298747763

Mistakes EfficientNet_b4 made

In [None]:
imagenet_EfficientNet_b4.eval()  # make sure the model is in evaluation mode
mistak_effic = []  # dataset indices of the misclassified samples
# Inference only: with autograd disabled no graph is recorded for the forward passes.
with torch.no_grad():
    for i in tqdm(range(len(transformed_imagenet_dataset))):
        tensor, target = transformed_imagenet_dataset[i]
        tensor = tensor.to(device)

        # Add a batch dimension for the model, drop it again and take the top-scoring class.
        _, prediction = imagenet_EfficientNet_b4(tensor.unsqueeze(0)).squeeze(0).cpu().max(-1)
        # Plain int so it compares with `target` and indexes the class list directly.
        prediction = int(prediction)
        if prediction != target:
            mistak_effic.append(i)
            print(
                "Img id=%d. Excpected class %s, but predicted class %s."
                % (
                    i,
                    class_nam[int(imagenet_dataset.classes[target])],
                    class_nam[int(imagenet_dataset.classes[prediction])],
                )
            )
            # Decode the untransformed image only for the samples that are actually shown.
            img, _ = imagenet_dataset[i]
            display_thumb(img)
        

Load ViT

In [22]:
# Load a pretrained model.
imagenet_ViT =  torchvision.models.vit_b_16(weights='IMAGENET1K_V1')
# The loaded model is trained to classify images across 1000 classes. Keep only the rows of the
# classification head that belong to the ImageNet indices in class_indxs, so the model outputs
# one score per class of our dataset.
imagenet_ViT.heads.head.weight.data = imagenet_ViT.heads.head.weight.data[class_indxs]
imagenet_ViT.heads.head.bias.data = imagenet_ViT.heads.head.bias.data[class_indxs]

# Move the trimmed model to the selected device.
imagenet_ViT = imagenet_ViT.to(device)

Test ViT on the dataset

In [23]:
# Check the model accuracy: prints the number of correct predictions and returns the percentage.
test(imagenet_ViT, transformed_imagenet_loader)

Accuracy: 558/559 (100%)


99.82110912343471

Mistakes ViT made

In [None]:
imagenet_ViT.eval()  # make sure the model is in evaluation mode
mistak_vit = []  # dataset indices of the misclassified samples
# Inference only: with autograd disabled no graph is recorded for the forward passes.
with torch.no_grad():
    for i in tqdm(range(len(transformed_imagenet_dataset))):
        tensor, target = transformed_imagenet_dataset[i]
        tensor = tensor.to(device)

        # Add a batch dimension for the model, drop it again and take the top-scoring class.
        _, prediction = imagenet_ViT(tensor.unsqueeze(0)).squeeze(0).cpu().max(-1)
        # Plain int so it compares with `target` and indexes the class list directly.
        prediction = int(prediction)
        if prediction != target:
            mistak_vit.append(i)
            print(
                "Img id=%d. Excpected class %s, but predicted class %s."
                % (
                    i,
                    class_nam[int(imagenet_dataset.classes[target])],
                    class_nam[int(imagenet_dataset.classes[prediction])],
                )
            )
            # Decode the untransformed image only for the samples that are actually shown.
            img, _ = imagenet_dataset[i]
            display_thumb(img)
        

## Grad-CAM:

In this approach, we forward-pass an image through the CNN to obtain the score of its desired label. We then set the gradients of all classes to zero, except for the desired class, whose gradient is set to one, and backpropagate this signal to the rectified convolutional feature maps of interest. Finally, we combine these feature maps to show where the model focused to produce its prediction.

For AlexNet

In [None]:
imagenet_alexnet.eval()
# Dataset index of the sample to explain (named so it does not shadow the builtin `id`).
sample_id = 36
# One fetch returns the PIL image and its label, so the file is decoded only once.
pil_img, label = imagenet_dataset[sample_id]
img = np.array(pil_img)
# NOTE(review): this stretches the full image to 224x224, whereas inference_transform uses
# Resize(256) + CenterCrop(224) — confirm the difference is intended.
img = cv2.resize(img, (224, 224))
img = np.float32(img) / 255
input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# As usual for classification, the target is the logit output
# before softmax, for that category.
targets = [ClassifierOutputTarget(label)]
target_layers = [imagenet_alexnet.features]
with GradCAM(model=imagenet_alexnet, target_layers=target_layers) as cam:
    grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
    cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
# Original image (left) next to the heat-map overlay (right); shown as the cell output.
images = np.hstack((np.uint8(255*img) , cam_image))
Image.fromarray(images)

In [26]:
# Grad-CAM overlays for a hand-picked set of dataset indices.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_alexnet.features]
with GradCAM(model=imagenet_alexnet, target_layers=target_layers) as cam:
    for i in [54,142,136,117,198,153,191,220,248]:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

In [27]:
#Mistakes AlexNet made: Grad-CAM overlays for the misclassified samples, explained for their true class.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_alexnet.features]
with GradCAM(model=imagenet_alexnet, target_layers=target_layers) as cam:
    for i in mistak_alexnet:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

For ResNet50

In [None]:
imagenet_resnet.eval()
# Dataset index of the sample to explain (named so it does not shadow the builtin `id`).
sample_id = 36
# One fetch returns the PIL image and its label, so the file is decoded only once.
pil_img, label = imagenet_dataset[sample_id]
img = np.array(pil_img)
# NOTE(review): this stretches the full image to 224x224, whereas inference_transform uses
# Resize(256) + CenterCrop(224) — confirm the difference is intended.
img = cv2.resize(img, (224, 224))
img = np.float32(img) / 255
input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# As usual for classification, the target is the logit output
# before softmax, for that category.
targets = [ClassifierOutputTarget(label)]
target_layers = [imagenet_resnet.layer4]
with GradCAM(model=imagenet_resnet, target_layers=target_layers) as cam:
    grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
    cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
# Original image (left) next to the heat-map overlay (right); shown as the cell output.
images = np.hstack((np.uint8(255*img) , cam_image))
Image.fromarray(images)

In [28]:
# Grad-CAM overlays for a hand-picked set of dataset indices.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_resnet.layer4]
with GradCAM(model=imagenet_resnet, target_layers=target_layers) as cam:
    for i in [54,142,136,117,198,153,191,220,248]:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

    

In [29]:
#Mistakes ResNet50 made: Grad-CAM overlays for the misclassified samples, explained for their true class.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_resnet.layer4]
with GradCAM(model=imagenet_resnet, target_layers=target_layers) as cam:
    for i in mistak_resnet50:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

For ResNet152

In [None]:
imagenet_resnet152.eval()
# Dataset index of the sample to explain (named so it does not shadow the builtin `id`).
sample_id = 36
# One fetch returns the PIL image and its label, so the file is decoded only once.
pil_img, label = imagenet_dataset[sample_id]
img = np.array(pil_img)
# NOTE(review): this stretches the full image to 224x224, whereas inference_transform uses
# Resize(256) + CenterCrop(224) — confirm the difference is intended.
img = cv2.resize(img, (224, 224))
img = np.float32(img) / 255
input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# As usual for classification, the target is the logit output
# before softmax, for that category.
targets = [ClassifierOutputTarget(label)]
target_layers = [imagenet_resnet152.layer4]
with GradCAM(model=imagenet_resnet152, target_layers=target_layers) as cam:
    grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
    cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
# Original image (left) next to the heat-map overlay (right); shown as the cell output.
images = np.hstack((np.uint8(255*img) , cam_image))
Image.fromarray(images)

In [30]:
# Grad-CAM overlays for a hand-picked set of dataset indices.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_resnet152.layer4]
with GradCAM(model=imagenet_resnet152, target_layers=target_layers) as cam:
    for i in [54,142,136,117,198,153,191,220,248]:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

In [31]:
#Mistakes ResNet152 made: Grad-CAM overlays for the misclassified samples, explained for their true class.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_resnet152.layer4]
with GradCAM(model=imagenet_resnet152, target_layers=target_layers) as cam:
    for i in mistak_resnet152:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

For ResNeXt101

In [None]:
imagenet_ResNeXt101.eval()
# Dataset index of the sample to explain (named so it does not shadow the builtin `id`).
sample_id = 36
# One fetch returns the PIL image and its label, so the file is decoded only once.
pil_img, label = imagenet_dataset[sample_id]
img = np.array(pil_img)
# NOTE(review): this stretches the full image to 224x224, whereas inference_transform uses
# Resize(256) + CenterCrop(224) — confirm the difference is intended.
img = cv2.resize(img, (224, 224))
img = np.float32(img) / 255
input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# As usual for classification, the target is the logit output
# before softmax, for that category.
targets = [ClassifierOutputTarget(label)]
target_layers = [imagenet_ResNeXt101.layer4]
with GradCAM(model=imagenet_ResNeXt101, target_layers=target_layers) as cam:
    grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
    cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
# Original image (left) next to the heat-map overlay (right); shown as the cell output.
images = np.hstack((np.uint8(255*img) , cam_image))
Image.fromarray(images)

In [32]:
# Grad-CAM overlays for a hand-picked set of dataset indices.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_ResNeXt101.layer4]
with GradCAM(model=imagenet_ResNeXt101, target_layers=target_layers) as cam:
    for i in [54,142,136,117,198,153,191,220,248]:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

In [33]:
#Mistakes ResNeXt101 made: Grad-CAM overlays for the misclassified samples, explained for their true class.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_ResNeXt101.layer4]
with GradCAM(model=imagenet_ResNeXt101, target_layers=target_layers) as cam:
    for i in mistak_resNext101:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

For EfficientNet_b4

In [None]:
imagenet_EfficientNet_b4.eval()
# Dataset index of the sample to explain (named so it does not shadow the builtin `id`).
sample_id = 36
# One fetch returns the PIL image and its label, so the file is decoded only once.
pil_img, label = imagenet_dataset[sample_id]
img = np.array(pil_img)
# NOTE(review): this stretches the full image to 224x224, whereas inference_transform uses
# Resize(256) + CenterCrop(224) — confirm the difference is intended.
img = cv2.resize(img, (224, 224))
img = np.float32(img) / 255
input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# As usual for classification, the target is the logit output
# before softmax, for that category.
targets = [ClassifierOutputTarget(label)]
target_layers = [imagenet_EfficientNet_b4.layers[6]]
with GradCAM(model=imagenet_EfficientNet_b4, target_layers=target_layers) as cam:
    grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
    cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
# Original image (left) next to the heat-map overlay (right); shown as the cell output.
images = np.hstack((np.uint8(255*img) , cam_image))
Image.fromarray(images)

In [34]:
# Grad-CAM overlays for a hand-picked set of dataset indices.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_EfficientNet_b4.layers[6]]
with GradCAM(model=imagenet_EfficientNet_b4, target_layers=target_layers) as cam:
    for i in [54,142,136,117,198,153,191,220,248]:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

In [35]:
#Mistakes EfficientNet made: Grad-CAM overlays for the misclassified samples, explained for their true class.
# The target layer and the CAM object do not depend on the sample, so build them once.
target_layers = [imagenet_EfficientNet_b4.layers[6]]
with GradCAM(model=imagenet_EfficientNet_b4, target_layers=target_layers) as cam:
    for i in mistak_effic:
        # One fetch returns the PIL image and its label, so the file is decoded only once.
        pil_img, label = imagenet_dataset[i]
        img = np.array(pil_img)
        img = cv2.resize(img, (224, 224))
        img = np.float32(img) / 255
        input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # As usual for classification, the target is the logit output
        # before softmax, for that category.
        targets = [ClassifierOutputTarget(label)]
        grayscale_cams = cam(input_tensor=input_tensor.to(device), targets=targets)
        cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
        # Original image (left) next to the heat-map overlay (right).
        images = np.hstack((np.uint8(255*img) , cam_image))
        Image.fromarray(images).show()

## Randomized Image Sampling for Explanations (RISE)

For AlexNet

In [9]:
# Append a softmax so the model outputs class probabilities. The name is rebound in place,
# so only wrap when it has not been wrapped yet: re-running this cell must not stack a
# second softmax on top of the first.
if not (isinstance(imagenet_alexnet, nn.Sequential) and isinstance(imagenet_alexnet[-1], nn.Softmax)):
    imagenet_alexnet = nn.Sequential(imagenet_alexnet, nn.Softmax(dim=1))
imagenet_alexnet = imagenet_alexnet.to(device).eval()
# The weights are frozen for this section; no gradients are needed for them.
for p in imagenet_alexnet.parameters():
    p.requires_grad = False
explainer = RISE(imagenet_alexnet)
#Generate masks.
explainer.generate_masks(Num_mask=6000, mask_size=8, prob=0.1)

100%|██████████| 6000/6000 [00:08<00:00, 706.54it/s]


In [10]:
def show_result(img: torch.Tensor,target:int) -> None:
    """Plot an image next to its RISE saliency map for the class AlexNet predicts.

    Reads the notebook-level ``explainer`` and ``imagenet_alexnet`` at call
    time. Later sections redefine ``show_result`` for other models; the most
    recently executed definition is the one that runs.

    Args:
        img: Preprocessed image tensor without a batch dimension (an item of
            ``transformed_imagenet_dataset``).
        target: Ground-truth class index of ``img`` in ``imagenet_dataset.classes``.
    """
    #Make predictions with the created masks.
    saliency = explainer(img.to(device)).cpu().numpy()
    img = img[None]
    #To get probability and prediction for the original image without masks.
    # Score the `img` argument itself (not a notebook global), so the title always
    # describes the image that is plotted.
    prob , prediction = imagenet_alexnet(img.to(device)).squeeze(0).cpu().max(-1)
    # Plain Python scalars for string formatting and for indexing lists and arrays.
    prob, prediction = float(prob), int(prediction)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2,1)
    plt.axis('off')
    #The original image.
    tensor_imshow(img[0])

    plt.subplot(1, 2,2)
    plt.axis('off')
    tensor_imshow(img[0])
    # Presumably `saliency` holds one map per class — confirm against the rise module.
    sal = saliency[prediction]
    plt.imshow(sal, cmap='jet', alpha=0.5)
    plt.colorbar(fraction=0.046, pad=0.04)
    plt.suptitle("Img Prob.=%.2f. Excpected class: %s; Predicted class: %s."
            % (
                100*prob,
                class_nam[int(imagenet_dataset.classes[target])],
                class_nam[int(imagenet_dataset.classes[prediction])],
            ))
   
    plt.show()

In [None]:
# Explain one sample: fetch the preprocessed tensor and label of image 36, then plot its saliency map.
i=36
tensor, target = transformed_imagenet_dataset[i]    
show_result(tensor,target)

In [None]:
# Plot saliency maps for a hand-picked set of dataset indices.
for i in [54,142,198,153,191,248]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Plot saliency maps for a second hand-picked set of dataset indices.
for i in [136,117,220]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
#Mistakes AlexNet made: plot saliency maps for the dataset indices collected in mistak_alexnet.
for i in mistak_alexnet:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Re-seed Python's RNG so the same 30 random samples are explained on every run.
random.seed(12345)
for i in range(30):
    tensor, target = random.choice(transformed_imagenet_dataset)    
    show_result(tensor,target)

For ResNet50

In [19]:
# Append a softmax so the model outputs class probabilities. The name is rebound in place,
# so only wrap when it has not been wrapped yet: re-running this cell must not stack a
# second softmax on top of the first.
if not (isinstance(imagenet_resnet, nn.Sequential) and isinstance(imagenet_resnet[-1], nn.Softmax)):
    imagenet_resnet = nn.Sequential(imagenet_resnet, nn.Softmax(dim=1))
imagenet_resnet = imagenet_resnet.to(device).eval()
# The weights are frozen for this section; no gradients are needed for them.
for p in imagenet_resnet.parameters():
    p.requires_grad = False
explainer = RISE(imagenet_resnet)
#Generate masks.
explainer.generate_masks(Num_mask=6000, mask_size=8, prob=0.1)

100%|██████████| 6000/6000 [00:08<00:00, 695.34it/s]


In [20]:
def show_result(img: torch.Tensor,target:int) -> None:
    """Plot an image next to its RISE saliency map for the class ResNet50 predicts.

    Reads the notebook-level ``explainer`` and ``imagenet_resnet`` at call
    time. Other sections define ``show_result`` for other models; the most
    recently executed definition is the one that runs.

    Args:
        img: Preprocessed image tensor without a batch dimension (an item of
            ``transformed_imagenet_dataset``).
        target: Ground-truth class index of ``img`` in ``imagenet_dataset.classes``.
    """
    #Make predictions with the created masks.
    saliency = explainer(img.to(device)).cpu().numpy()
    img = img[None]
    #To get probability and prediction for the original image without masks.
    # Score the `img` argument itself (not a notebook global), so the title always
    # describes the image that is plotted.
    prob , prediction = imagenet_resnet(img.to(device)).squeeze(0).cpu().max(-1)
    # Plain Python scalars for string formatting and for indexing lists and arrays.
    prob, prediction = float(prob), int(prediction)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2,1)
    plt.axis('off')
    #The original image.
    tensor_imshow(img[0])

    plt.subplot(1, 2,2)
    plt.axis('off')
    tensor_imshow(img[0])
    # Presumably `saliency` holds one map per class — confirm against the rise module.
    sal = saliency[prediction]
    plt.imshow(sal, cmap='jet', alpha=0.5)
    plt.colorbar(fraction=0.046, pad=0.04)
    plt.suptitle("Img Prob.=%.2f. Excpected class: %s; Predicted class: %s."
            % (
                100*prob,
                class_nam[int(imagenet_dataset.classes[target])],
                class_nam[int(imagenet_dataset.classes[prediction])],
            ))
   
    plt.show()

In [None]:
# Explain one sample: fetch the preprocessed tensor and label of image 36, then plot its saliency map.
i=36
tensor, target = transformed_imagenet_dataset[i]    
show_result(tensor,target)

In [None]:
# Plot saliency maps for a hand-picked set of dataset indices.
for i in [54,142,198,153,191,248]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Plot saliency maps for a second hand-picked set of dataset indices.
for i in [136,117,220]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
#Mistakes ResNet50 made: plot saliency maps for the dataset indices collected in mistak_resnet50.
for i in mistak_resnet50:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Re-seed Python's RNG so the same 30 random samples are explained on every run.
random.seed(12345)
for i in range(30):
    tensor, target = random.choice(transformed_imagenet_dataset)    
    show_result(tensor,target)

For ResNet152

In [29]:
# Append a softmax so the model outputs class probabilities. The name is rebound in place,
# so only wrap when it has not been wrapped yet: re-running this cell must not stack a
# second softmax on top of the first.
if not (isinstance(imagenet_resnet152, nn.Sequential) and isinstance(imagenet_resnet152[-1], nn.Softmax)):
    imagenet_resnet152 = nn.Sequential(imagenet_resnet152, nn.Softmax(dim=1))
imagenet_resnet152 = imagenet_resnet152.to(device).eval()
# The weights are frozen for this section; no gradients are needed for them.
for p in imagenet_resnet152.parameters():
    p.requires_grad = False
explainer = RISE(imagenet_resnet152)
#Generate masks.
explainer.generate_masks(Num_mask=6000, mask_size=8, prob=0.1)

100%|██████████| 6000/6000 [00:08<00:00, 685.18it/s]


In [30]:
def show_result(img: torch.Tensor,target:int) -> None:
    """Plot an image next to its RISE saliency map for the class ResNet152 predicts.

    Reads the notebook-level ``explainer`` and ``imagenet_resnet152`` at call
    time. Other sections define ``show_result`` for other models; the most
    recently executed definition is the one that runs.

    Args:
        img: Preprocessed image tensor without a batch dimension (an item of
            ``transformed_imagenet_dataset``).
        target: Ground-truth class index of ``img`` in ``imagenet_dataset.classes``.
    """
    #Make predictions with the created masks.
    saliency = explainer(img.to(device)).cpu().numpy()
    img = img[None]
    #To get probability and prediction for the original image without masks.
    # Score the `img` argument itself (not a notebook global), so the title always
    # describes the image that is plotted.
    prob , prediction = imagenet_resnet152(img.to(device)).squeeze(0).cpu().max(-1)
    # Plain Python scalars for string formatting and for indexing lists and arrays.
    prob, prediction = float(prob), int(prediction)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2,1)
    plt.axis('off')
    #The original image.
    tensor_imshow(img[0])

    plt.subplot(1, 2,2)
    plt.axis('off')
    tensor_imshow(img[0])
    # Presumably `saliency` holds one map per class — confirm against the rise module.
    sal = saliency[prediction]
    plt.imshow(sal, cmap='jet', alpha=0.5)
    plt.colorbar(fraction=0.046, pad=0.04)
    plt.suptitle("Img Prob.=%.2f. Excpected class: %s; Predicted class: %s."
            % (
                100*prob,
                class_nam[int(imagenet_dataset.classes[target])],
                class_nam[int(imagenet_dataset.classes[prediction])],
            ))
   
    plt.show()

In [None]:
# Explain one sample: fetch the preprocessed tensor and label of image 36, then plot its saliency map.
i=36
tensor, target = transformed_imagenet_dataset[i]    
show_result(tensor,target)

In [None]:
# Plot saliency maps for a hand-picked set of dataset indices.
for i in [54,142,198,153,191,248]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Plot saliency maps for a second hand-picked set of dataset indices.
for i in [136,117,220]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
#Mistakes ResNet152 made: plot saliency maps for the dataset indices collected in mistak_resnet152.
for i in mistak_resnet152:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Re-seed Python's RNG so the same 30 random samples are explained on every run.
random.seed(12345)
for i in range(30):
    tensor, target = random.choice(transformed_imagenet_dataset)    
    show_result(tensor,target)

For ResNeXt101

In [39]:
# Append a softmax so the model outputs class probabilities. The name is rebound in place,
# so only wrap when it has not been wrapped yet: re-running this cell must not stack a
# second softmax on top of the first.
if not (isinstance(imagenet_ResNeXt101, nn.Sequential) and isinstance(imagenet_ResNeXt101[-1], nn.Softmax)):
    imagenet_ResNeXt101 = nn.Sequential(imagenet_ResNeXt101, nn.Softmax(dim=1))
imagenet_ResNeXt101 = imagenet_ResNeXt101.to(device).eval()
# The weights are frozen for this section; no gradients are needed for them.
for p in imagenet_ResNeXt101.parameters():
    p.requires_grad = False
explainer = RISE(imagenet_ResNeXt101)
#Generate masks.
explainer.generate_masks(Num_mask=6000, mask_size=8, prob=0.1)

100%|██████████| 6000/6000 [00:08<00:00, 713.75it/s]


In [40]:
def show_result(img: torch.Tensor, target: int) -> None:
    """Plot an image next to its RISE saliency map for the ResNeXt101 prediction.

    The left panel shows the image; the right panel shows the same image with
    the saliency map of the predicted class drawn on top. The figure title
    reports the top score (multiplied by 100) together with the expected and
    predicted class names.

    Relies on notebook-level names: `explainer`, `imagenet_ResNeXt101`,
    `device`, `tensor_imshow`, `class_nam` and `imagenet_dataset`.

    Args:
        img: A single image tensor without a batch axis (the batch axis is
            added here). Presumably (C, H, W) as yielded by
            `transformed_imagenet_dataset` -- TODO confirm.
        target: Ground-truth class index into `imagenet_dataset.classes`.
    """
    # Saliency maps for `img`; indexed by class below.
    saliency = explainer(img.to(device)).cpu().numpy()

    # Classify the *argument* itself (not a notebook-level global), so the
    # prediction, title and overlay always describe the image being plotted.
    batch = img[None]
    top_prob, top_class = imagenet_ResNeXt101(batch.to(device)).squeeze(0).cpu().max(-1)
    # Plain Python scalars make the indexing and "%.2f" formatting explicit.
    prob = top_prob.item()
    prediction = top_class.item()

    plt.figure(figsize=(10, 5))

    # Left panel: the original image.
    plt.subplot(1, 2, 1)
    plt.axis('off')
    tensor_imshow(img)

    # Right panel: the image with the predicted class' saliency map overlaid.
    plt.subplot(1, 2, 2)
    plt.axis('off')
    tensor_imshow(img)
    sal = saliency[prediction]
    plt.imshow(sal, cmap='jet', alpha=0.5)
    plt.colorbar(fraction=0.046, pad=0.04)

    # Dataset folder names are numeric strings that index into `class_nam`.
    plt.suptitle("Img Prob.=%.2f. Excpected class: %s; Predicted class: %s."
            % (
                100*prob,
                class_nam[int(imagenet_dataset.classes[target])],
                class_nam[int(imagenet_dataset.classes[prediction])],
            ))

    plt.show()

In [None]:
# Explain one hand-picked sample (dataset index 36) with the current explainer.
i=36
tensor, target = transformed_imagenet_dataset[i]    
show_result(tensor,target)

In [None]:
# Explain a hand-picked set of samples; one figure is drawn per dataset index.
for i in [54,142,198,153,191,248]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain a second hand-picked set of samples; one figure per dataset index.
for i in [136,117,220]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain the samples ResNeXt101 got wrong.
# NOTE(review): `mistak_resNext101` is defined earlier in the notebook; presumably
# it holds the dataset indices of the misclassified samples -- confirm.
for i in mistak_resNext101:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain 30 randomly drawn samples. Re-seeding right before the draw makes the
# selection repeatable whenever this cell is re-run; the loop index is unused.
random.seed(12345)
for i in range(30):
    tensor, target = random.choice(transformed_imagenet_dataset)    
    show_result(tensor,target)

For EfficientNet_b4

In [49]:
# Append a Softmax over dim=1 to the EfficientNet_b4 model and switch it to eval mode.
# The guard keeps this cell idempotent: the assignment is self-referential, so
# re-running it unguarded would stack another Softmax on the already-wrapped model.
if not (
    isinstance(imagenet_EfficientNet_b4, nn.Sequential)
    and len(imagenet_EfficientNet_b4) > 0
    and isinstance(imagenet_EfficientNet_b4[-1], nn.Softmax)
):
    imagenet_EfficientNet_b4 = nn.Sequential(imagenet_EfficientNet_b4, nn.Softmax(dim=1))
imagenet_EfficientNet_b4 = imagenet_EfficientNet_b4.to(device).eval()

# Inference only: freeze every parameter.
for p in imagenet_EfficientNet_b4.parameters():
    p.requires_grad = False

# Rebind the notebook-level `explainer` to this model; the `show_result` cell
# below reads it.
explainer = RISE(imagenet_EfficientNet_b4)
# Pre-generate the 6000 random masks used by the explainer.
# NOTE(review): `mask_size` / `prob` semantics live in the project's rise.py -- confirm.
explainer.generate_masks(Num_mask=6000, mask_size=8, prob=0.1)

  0%|          | 0/6000 [00:00<?, ?it/s]

100%|██████████| 6000/6000 [00:08<00:00, 686.78it/s]


In [50]:
def show_result(img: torch.Tensor, target: int) -> None:
    """Plot an image next to its RISE saliency map for the EfficientNet_b4 prediction.

    The left panel shows the image; the right panel shows the same image with
    the saliency map of the predicted class drawn on top. The figure title
    reports the top score (multiplied by 100) together with the expected and
    predicted class names.

    Relies on notebook-level names: `explainer`, `imagenet_EfficientNet_b4`,
    `device`, `tensor_imshow`, `class_nam` and `imagenet_dataset`.

    Args:
        img: A single image tensor without a batch axis (the batch axis is
            added here). Presumably (C, H, W) as yielded by
            `transformed_imagenet_dataset` -- TODO confirm.
        target: Ground-truth class index into `imagenet_dataset.classes`.
    """
    # Saliency maps for `img`; indexed by class below.
    saliency = explainer(img.to(device)).cpu().numpy()

    # Classify the *argument* itself (not a notebook-level global), so the
    # prediction, title and overlay always describe the image being plotted.
    batch = img[None]
    top_prob, top_class = imagenet_EfficientNet_b4(batch.to(device)).squeeze(0).cpu().max(-1)
    # Plain Python scalars make the indexing and "%.2f" formatting explicit.
    prob = top_prob.item()
    prediction = top_class.item()

    plt.figure(figsize=(10, 5))

    # Left panel: the original image.
    plt.subplot(1, 2, 1)
    plt.axis('off')
    tensor_imshow(img)

    # Right panel: the image with the predicted class' saliency map overlaid.
    plt.subplot(1, 2, 2)
    plt.axis('off')
    tensor_imshow(img)
    sal = saliency[prediction]
    plt.imshow(sal, cmap='jet', alpha=0.5)
    plt.colorbar(fraction=0.046, pad=0.04)

    # Dataset folder names are numeric strings that index into `class_nam`.
    plt.suptitle("Img Prob.=%.2f. Excpected class: %s; Predicted class: %s."
            % (
                100*prob,
                class_nam[int(imagenet_dataset.classes[target])],
                class_nam[int(imagenet_dataset.classes[prediction])],
            ))

    plt.show()

In [None]:
# Explain one hand-picked sample (dataset index 36) with the current explainer.
i=36
tensor, target = transformed_imagenet_dataset[i]    
show_result(tensor,target)

In [None]:
# Explain a hand-picked set of samples; one figure is drawn per dataset index.
for i in [54,142,198,153,191,248]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain a second hand-picked set of samples; one figure per dataset index.
for i in [136,117,220]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain the samples EfficientNet_b4 got wrong.
# NOTE(review): `mistak_effic` is defined earlier in the notebook; presumably
# it holds the dataset indices of the misclassified samples -- confirm.
for i in mistak_effic:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain 30 randomly drawn samples. Re-seeding right before the draw makes the
# selection repeatable whenever this cell is re-run; the loop index is unused.
random.seed(12345)
for i in range(30):
    tensor, target = random.choice(transformed_imagenet_dataset)    
    show_result(tensor,target)

For ViT

In [58]:
# Append a Softmax over dim=1 to the ViT model and switch it to eval mode.
# The guard keeps this cell idempotent: the assignment is self-referential, so
# re-running it unguarded would stack another Softmax on the already-wrapped model.
if not (
    isinstance(imagenet_ViT, nn.Sequential)
    and len(imagenet_ViT) > 0
    and isinstance(imagenet_ViT[-1], nn.Softmax)
):
    imagenet_ViT = nn.Sequential(imagenet_ViT, nn.Softmax(dim=1))
imagenet_ViT = imagenet_ViT.to(device).eval()

# Inference only: freeze every parameter.
for p in imagenet_ViT.parameters():
    p.requires_grad = False

# Rebind the notebook-level `explainer` to this model; the `show_result` cell
# below reads it.
explainer = RISE(imagenet_ViT)
# Pre-generate the 6000 random masks used by the explainer.
# NOTE(review): `mask_size` / `prob` semantics live in the project's rise.py -- confirm.
explainer.generate_masks(Num_mask=6000, mask_size=8, prob=0.1)

  0%|          | 0/6000 [00:00<?, ?it/s]

100%|██████████| 6000/6000 [00:08<00:00, 713.22it/s]


In [59]:
def show_result(img: torch.Tensor, target: int) -> None:
    """Plot an image next to its RISE saliency map for the ViT prediction.

    The left panel shows the image; the right panel shows the same image with
    the saliency map of the predicted class drawn on top. The figure title
    reports the top score (multiplied by 100) together with the expected and
    predicted class names.

    Relies on notebook-level names: `explainer`, `imagenet_ViT`, `device`,
    `tensor_imshow`, `class_nam` and `imagenet_dataset`.

    Args:
        img: A single image tensor without a batch axis (the batch axis is
            added here). Presumably (C, H, W) as yielded by
            `transformed_imagenet_dataset` -- TODO confirm.
        target: Ground-truth class index into `imagenet_dataset.classes`.
    """
    # Saliency maps for `img`; indexed by class below.
    saliency = explainer(img.to(device)).cpu().numpy()

    # Classify the *argument* itself (not a notebook-level global), so the
    # prediction, title and overlay always describe the image being plotted.
    batch = img[None]
    top_prob, top_class = imagenet_ViT(batch.to(device)).squeeze(0).cpu().max(-1)
    # Plain Python scalars make the indexing and "%.2f" formatting explicit.
    prob = top_prob.item()
    prediction = top_class.item()

    plt.figure(figsize=(10, 5))

    # Left panel: the original image.
    plt.subplot(1, 2, 1)
    plt.axis('off')
    tensor_imshow(img)

    # Right panel: the image with the predicted class' saliency map overlaid.
    plt.subplot(1, 2, 2)
    plt.axis('off')
    tensor_imshow(img)
    sal = saliency[prediction]
    plt.imshow(sal, cmap='jet', alpha=0.5)
    plt.colorbar(fraction=0.046, pad=0.04)

    # Dataset folder names are numeric strings that index into `class_nam`.
    plt.suptitle("Img Prob.=%.2f. Excpected class: %s; Predicted class: %s."
            % (
                100*prob,
                class_nam[int(imagenet_dataset.classes[target])],
                class_nam[int(imagenet_dataset.classes[prediction])],
            ))

    plt.show()

In [None]:
# Explain one hand-picked sample (dataset index 36) with the current explainer.
i=36
tensor, target = transformed_imagenet_dataset[i]    
show_result(tensor,target)

In [None]:
# Explain a hand-picked set of samples; one figure is drawn per dataset index.
for i in [54,142,198,153,191,248]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain a second hand-picked set of samples; one figure per dataset index.
for i in [136,117,220]:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain the samples ViT got wrong.
# NOTE(review): `mistak_vit` is defined earlier in the notebook; presumably
# it holds the dataset indices of the misclassified samples -- confirm.
for i in mistak_vit:
    tensor, target = transformed_imagenet_dataset[i]    
    show_result(tensor,target)

In [None]:
# Explain 30 randomly drawn samples. Re-seeding right before the draw makes the
# selection repeatable whenever this cell is re-run; the loop index is unused.
random.seed(12345)
for i in range(30):
    tensor, target = random.choice(transformed_imagenet_dataset)    
    show_result(tensor,target)

## References:

1- The dataset was downloaded from: https://www.dropbox.com/sh/yrfmp7hwa2w9gxz/AAATMrfWNLctPq1vnRa3mtZPa?dl=0 (accessed on August 14, 2023).

About the data: https://github.com/megvii-research/FSSD_OoD_Detection/issues/1

2- For RISE implementation, we relied mainly on this repository: https://github.com/eclique/RISE (accessed on August 14, 2023)

3- For Grad-CAM, we relied mainly on this repository: https://github.com/jacobgil/pytorch-grad-cam (accessed on August 14, 2023)



