In [1]:
import cv2
import numpy as np
import pandas as pd
import torch
import timm
import time

from face_alignment.detection.blazeface import blazeface_detector
from pathlib import Path
from PIL import Image
from transformers import (
    ViTForImageClassification, 
    ResNetForImageClassification,
    ViTImageProcessor, 
    ConvNextImageProcessor, 
    PreTrainedModel, 
    PretrainedConfig)
from typing import Tuple

In [2]:
class VggConfig(PretrainedConfig):
    model_type = "vgg16"
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

class myvgg16(PreTrainedModel):
    config_class = VggConfig

    def __init__(self, model, config):
        super().__init__(config)
        self.model = model
    def forward(self, pixel_values, labels=None):
        logits = self.model(pixel_values)
        if labels is not None:
            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
    
class InceptionConfig(PretrainedConfig):
    model_type = "Inception V3"
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

class myinceptionv3(PreTrainedModel):
    config_class = InceptionConfig

    def __init__(self, model, config):
        super().__init__(config)
        self.model = model
    def forward(self, pixel_values, labels=None):
        logits = self.model(pixel_values)
        if labels is not None:
            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}

In [3]:
# = Data (Images + Face masks)
IMAGES_PATH = Path(r'D:\ComputerScience\Mestrado\data\datasets\Occlusion\images')
MASKS_PATH = Path(r'D:\ComputerScience\Mestrado\results\Occlusion\sam')

# = Trained models
VIT_PATH = Path(r'D:\ComputerScience\Mestrado\results\classification_models\ViT')
RESNET_PATH = Path(r'D:\ComputerScience\Mestrado\results\classification_models\Resnet50')
VGG_PATH = Path(r'D:\ComputerScience\Mestrado\results\classification_models\Vgg16')
INCEPTION_PATH = Path(r'D:\ComputerScience\Mestrado\results\classification_models\InceptionV3')

# = Mapping variables
ID2LABEL = {0: 'com_dor', 1: 'sem_dor'}
gt = {
    'f1': 'com_dor',
    'f2': 'sem_dor',
    'f3': 'com_dor',
    'f4': 'com_dor',
    'f5': 'com_dor',
    'f6': 'sem_dor',
    'f7': 'com_dor',
    'f8': 'sem_dor',
    'f9': 'sem_dor',
    'f10': 'sem_dor',
}

# = Image paths
img_files = sorted(list(IMAGES_PATH.glob('*')), key=lambda a: int(a.stem[1:]))[:-1]

# = Output
accuracies = cols = {'ViT': [], 'Resnet50': [], 'Vgg16': [], 'InceptionV3': []}
accuracies_index = []

In [4]:
def load_vit(training_set_name:str) -> Tuple[ViTForImageClassification,ViTImageProcessor]:
    model_path = VIT_PATH / training_set_name
    model = ViTForImageClassification.from_pretrained(model_path)
    processor = ViTImageProcessor.from_pretrained(model_path)
    return model, processor

def load_resnet(training_set_name:str) -> Tuple[ResNetForImageClassification]:
    model_path = RESNET_PATH / training_set_name
    model = ResNetForImageClassification.from_pretrained(model_path)
    processor = ConvNextImageProcessor.from_pretrained(model_path)
    return model, processor

def load_vgg(training_set_name:str):
    vgg = timm.create_model('vgg16.tv_in1k', pretrained=True, num_classes=2)
    data_config = timm.data.resolve_model_data_config(vgg)
    vggcfg = VggConfig()
    model = myvgg16(vgg, vggcfg)
    model.load_state_dict(torch.load(VGG_PATH / training_set_name / 'pytorch_model.bin', map_location=torch.device('cpu')))
    transforms = timm.data.create_transform(**data_config, is_training=False)
    return model, transforms

def load_inception(training_set_name:str):
    vgg = timm.create_model('inception_v3.tv_in1k', pretrained=True, num_classes=2)
    data_config = timm.data.resolve_model_data_config(vgg)
    vggcfg = VggConfig()
    model = myvgg16(vgg, vggcfg)
    model.load_state_dict(torch.load(INCEPTION_PATH / training_set_name / 'pytorch_model.bin', map_location=torch.device('cpu')))
    transforms = timm.data.create_transform(**data_config, is_training=False)
    return model, transforms

In [5]:
def process_images_vit(image_paths, model, processor, crop=False, mask=False):
    images = []
    for img_path in image_paths:
        img = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)
        if crop or mask:
            mask_img = cv2.imread(str(MASKS_PATH / img_path.with_suffix('.png').name), 0)
            if mask_img is None:
                print(f'ViT: mask not found for {img_path.name}... skipping')
                continue
            masked_img = cv2.bitwise_and(img, img, mask=mask_img)
            rows, cols = np.where(mask_img>0)
            img = masked_img if mask else img
            img = img[min(rows):max(rows), min(cols):max(cols)]
        images.append(img)
    start = time.time()
    inputs = processor(images=images, return_tensors="pt") # pt = torch.tensor
    logits = model(**inputs).logits
    print(f"ViT avg time = {(time.time()-start)/len(image_paths)}s")
    return [model.config.id2label[lgt.argmax(-1).item()] for lgt in logits]

def process_images_resnet(image_paths, model, processor, crop=False, mask=False):
    images = []
    for img_path in image_paths:
        img = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)
        if crop or mask:
            mask_img = cv2.imread(str(MASKS_PATH / img_path.with_suffix('.png').name), 0)
            if mask_img is None:
                print(f'Resnet50: mask not found for {img_path.name}... skipping')
                continue
            masked_img = cv2.bitwise_and(img, img, mask=mask_img)
            rows, cols = np.where(mask_img>0)
            img = masked_img if mask else img
            img = img[min(rows):max(rows), min(cols):max(cols)]
        images.append(img)
    start = time.time()
    inputs = processor(images=images, return_tensors="pt") # pt = torch.tensor
    logits = model(**inputs).logits
    print(f"Resnet50 avg time = {(time.time()-start)/len(image_paths)}s")
    return [model.config.id2label[lgt.argmax(-1).item()] for lgt in logits]

def process_images_vgg(image_paths, model, transforms, crop=False, mask=False):
    results = []
    times = []
    for img_path in image_paths:
        img = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)
        if crop or mask:
            mask_img = cv2.imread(str(MASKS_PATH / img_path.with_suffix('.png').name), 0)
            if mask_img is None:
                print(f'VGG: mask not found for {img_path.name}... skipping')
                continue
            masked_img = cv2.bitwise_and(img, img, mask=mask_img)
            rows, cols = np.where(mask_img>0)
            img = masked_img if mask else img
            img = img[min(rows):max(rows), min(cols):max(cols)]
        start = time.time()
        input_data = transforms(Image.fromarray(img)).unsqueeze(0)
        prediction = model.model(input_data)
        times.append(time.time()-start)
        results.append(prediction)
    print(f"VGG avg time = {np.mean(times)}s")
    return [ID2LABEL[lgt.argmax(-1).item()] for lgt in results]

def process_images_inception(image_paths, model, transforms, crop=False, mask=False):
    results = []
    times = []
    for img_path in image_paths:
        img = cv2.cvtColor(cv2.imread(str(img_path)), cv2.COLOR_BGR2RGB)
        if crop or mask:
            mask_img = cv2.imread(str(MASKS_PATH / img_path.with_suffix('.png').name), 0)
            if mask_img is None:
                print(f'InceptionV3: mask not found for {img_path.name}... skipping')
                continue
            masked_img = cv2.bitwise_and(img, img, mask=mask_img)
            rows, cols = np.where(mask_img>0)
            img = masked_img if mask else img
            img = img[min(rows):max(rows), min(cols):max(cols)]
        start = time.time()
        input_data = transforms(Image.fromarray(img)).unsqueeze(0)
        prediction = model.model(input_data)
        times.append(time.time()-start)
        results.append(prediction)
    print(f"InceptionV3 avg time = {np.mean(times)}s")
    return [ID2LABEL[lgt.argmax(-1).item()] for lgt in results]

---

## IMAGEM COMPLETA

In [6]:
# === IMAGEM COMPLETA === #
DATASET = 'UNIFESP360'
vit_model, vit_processor = load_vit(DATASET)
resnet_model, resnet_processor = load_resnet(DATASET)
vgg_model, vgg_transforms = load_vgg(DATASET)
inception_model, inception_transforms = load_inception(DATASET)

In [7]:
vit_predictions = process_images_vit(img_files, vit_model, vit_processor, crop=False, mask=False)
resnet_predictions = process_images_resnet(img_files, vit_model, vit_processor, crop=False, mask=False)
vgg_predictions = process_images_vgg(img_files, vgg_model, vgg_transforms, crop=False, mask=False)
inception_predictions = process_images_inception(img_files, inception_model, inception_transforms, crop=False, mask=False)

cols = {'ViT': [], 'Resnet50': [], 'Vgg16': [], 'InceptionV3': [], 'Ground Truth': []}
index = []
for f, vit_pred, resnet_pred, vgg_pred, inception_pred in zip(img_files, vit_predictions, resnet_predictions, vgg_predictions, inception_predictions):
    index.append(f.stem)
    cols['ViT'].append(vit_pred)
    cols['Resnet50'].append(resnet_pred)
    cols['Vgg16'].append(vgg_pred)
    cols['InceptionV3'].append(inception_pred)
    cols['Ground Truth'].append(gt[f.stem])
    print(f)

df = pd.DataFrame(cols, index=index)
display(df)
acc = lambda df, col: (df[col] == df['Ground Truth']).sum()/len(df)*100
print(f"Correct predictions:\n- ViT: {acc(df,'ViT'):.2f}%\n- Resnet50: {acc(df,'Resnet50'):.2f}%\n- Vgg16: {acc(df,'Vgg16'):.2f}%\n- InceptionV3: {acc(df,'InceptionV3'):.2f}%")

accuracies['ViT'].append(acc(df,'ViT'))
accuracies['Resnet50'].append(acc(df,'Resnet50'))
accuracies['Vgg16'].append(acc(df,'Vgg16'))
accuracies['InceptionV3'].append(acc(df,'InceptionV3'))
accuracies_index.append('Imagem Completa')

ViT avg time = 0.29108532269795734s
Resnet50 avg time = 0.3042195638020833s
VGG avg time = 0.21370326148139107s
InceptionV3 avg time = 0.1442345513237847s
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f1.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f2.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f3.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f4.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f5.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f6.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f7.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f8.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f9.jpg


Unnamed: 0,ViT,Resnet50,Vgg16,InceptionV3,Ground Truth
f1,sem_dor,sem_dor,com_dor,com_dor,com_dor
f2,sem_dor,sem_dor,sem_dor,sem_dor,sem_dor
f3,com_dor,com_dor,com_dor,sem_dor,com_dor
f4,com_dor,com_dor,com_dor,sem_dor,com_dor
f5,com_dor,com_dor,sem_dor,sem_dor,com_dor
f6,sem_dor,sem_dor,com_dor,sem_dor,sem_dor
f7,com_dor,com_dor,com_dor,sem_dor,com_dor
f8,com_dor,com_dor,com_dor,sem_dor,sem_dor
f9,com_dor,com_dor,com_dor,sem_dor,sem_dor


Correct predictions:
- ViT: 66.67%
- Resnet50: 66.67%
- Vgg16: 55.56%
- InceptionV3: 55.56%


---

## IMAGEM DA FACE

In [8]:
# === IMAGEM COMPLETA === #
DATASET = 'UNIFESP360Faces'
vit_model, vit_processor = load_vit(DATASET)
resnet_model, resnet_processor = load_resnet(DATASET)
vgg_model, vgg_transforms = load_vgg(DATASET)
inception_model, inception_transforms = load_inception(DATASET)

In [9]:
vit_predictions = process_images_vit(img_files, vit_model, vit_processor, crop=True, mask=False)
resnet_predictions = process_images_resnet(img_files, vit_model, vit_processor, crop=True, mask=False)
vgg_predictions = process_images_vgg(img_files, vgg_model, vgg_transforms, crop=True, mask=False)
inception_predictions = process_images_inception(img_files, inception_model, inception_transforms, crop=True, mask=False)

cols = {'ViT': [], 'Resnet50': [], 'Vgg16': [], 'InceptionV3': [], 'Ground Truth': []}
index = []
for f, vit_pred, resnet_pred, vgg_pred, inception_pred in zip(img_files, vit_predictions, resnet_predictions, vgg_predictions, inception_predictions):
    index.append(f.stem)
    cols['ViT'].append(vit_pred)
    cols['Resnet50'].append(resnet_pred)
    cols['Vgg16'].append(vgg_pred)
    cols['InceptionV3'].append(inception_pred)
    cols['Ground Truth'].append(gt[f.stem])
    print(f)
df = pd.DataFrame(cols, index=index)
display(df)
acc = lambda df, col: (df[col] == df['Ground Truth']).sum()/len(df)*100
print(f"Correct predictions:\n- ViT: {acc(df,'ViT'):.2f}%\n- Resnet50: {acc(df,'Resnet50'):.2f}%\n- Vgg16: {acc(df,'Vgg16'):.2f}%\n- InceptionV3: {acc(df,'InceptionV3'):.2f}%")

accuracies['ViT'].append(acc(df,'ViT'))
accuracies['Resnet50'].append(acc(df,'Resnet50'))
accuracies['Vgg16'].append(acc(df,'Vgg16'))
accuracies['InceptionV3'].append(acc(df,'InceptionV3'))
accuracies_index.append('Face')

ViT avg time = 0.26304962899949813s
Resnet50 avg time = 0.2758337656656901s
VGG avg time = 0.20919283231099448s
InceptionV3 avg time = 0.11867854330274794s
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f1.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f2.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f3.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f4.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f5.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f6.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f7.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f8.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f9.jpg


Unnamed: 0,ViT,Resnet50,Vgg16,InceptionV3,Ground Truth
f1,com_dor,com_dor,sem_dor,com_dor,com_dor
f2,sem_dor,sem_dor,sem_dor,com_dor,sem_dor
f3,com_dor,com_dor,com_dor,com_dor,com_dor
f4,com_dor,com_dor,sem_dor,com_dor,com_dor
f5,sem_dor,sem_dor,com_dor,com_dor,com_dor
f6,com_dor,com_dor,sem_dor,com_dor,sem_dor
f7,com_dor,com_dor,sem_dor,com_dor,com_dor
f8,com_dor,com_dor,com_dor,com_dor,sem_dor
f9,com_dor,com_dor,com_dor,com_dor,sem_dor


Correct predictions:
- ViT: 55.56%
- Resnet50: 55.56%
- Vgg16: 44.44%
- InceptionV3: 55.56%


---

## IMAGEM DA FACE C/ MASCARA APLICADA

In [10]:
# === IMAGEM COMPLETA === #
DATASET = 'UNIFESP360FacesMasked'
vit_model, vit_processor = load_vit(DATASET)
resnet_model, resnet_processor = load_resnet(DATASET)
vgg_model, vgg_transforms = load_vgg(DATASET)
inception_model, inception_transforms = load_inception(DATASET)

In [11]:
vit_predictions = process_images_vit(img_files, vit_model, vit_processor, crop=True, mask=True)
resnet_predictions = process_images_resnet(img_files, vit_model, vit_processor, crop=True, mask=True)
vgg_predictions = process_images_vgg(img_files, vgg_model, vgg_transforms, crop=True, mask=True)
inception_predictions = process_images_inception(img_files, inception_model, inception_transforms, crop=True, mask=True)

cols = {'ViT': [], 'Resnet50': [], 'Vgg16': [], 'InceptionV3': [], 'Ground Truth': []}
index = []
for f, vit_pred, resnet_pred, vgg_pred, inception_pred in zip(img_files, vit_predictions, resnet_predictions, vgg_predictions, inception_predictions):
    index.append(f.stem)
    cols['ViT'].append(vit_pred)
    cols['Resnet50'].append(resnet_pred)
    cols['Vgg16'].append(vgg_pred)
    cols['InceptionV3'].append(inception_pred)
    cols['Ground Truth'].append(gt[f.stem])
    print(f)

df = pd.DataFrame(cols, index=index)
display(df)
acc = lambda df, col: (df[col] == df['Ground Truth']).sum()/len(df)*100
print(f"Correct predictions:\n- ViT: {acc(df,'ViT'):.2f}%\n- Resnet50: {acc(df,'Resnet50'):.2f}%\n- Vgg16: {acc(df,'Vgg16'):.2f}%\n- InceptionV3: {acc(df,'InceptionV3'):.2f}%")

accuracies['ViT'].append(acc(df,'ViT'))
accuracies['Resnet50'].append(acc(df,'Resnet50'))
accuracies['Vgg16'].append(acc(df,'Vgg16'))
accuracies['InceptionV3'].append(acc(df,'InceptionV3'))
accuracies_index.append('Face+Mascara')

ViT avg time = 0.26592792405022514s
Resnet50 avg time = 0.28234680493672687s
VGG avg time = 0.20776237381829155s
InceptionV3 avg time = 0.1143656571706136s
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f1.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f2.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f3.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f4.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f5.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f6.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f7.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f8.jpg
D:\ComputerScience\Mestrado\data\datasets\Occlusion\images\f9.jpg


Unnamed: 0,ViT,Resnet50,Vgg16,InceptionV3,Ground Truth
f1,com_dor,com_dor,com_dor,com_dor,com_dor
f2,sem_dor,sem_dor,com_dor,com_dor,sem_dor
f3,com_dor,com_dor,com_dor,com_dor,com_dor
f4,com_dor,com_dor,com_dor,com_dor,com_dor
f5,com_dor,com_dor,com_dor,com_dor,com_dor
f6,com_dor,com_dor,sem_dor,com_dor,sem_dor
f7,com_dor,com_dor,com_dor,com_dor,com_dor
f8,com_dor,com_dor,com_dor,com_dor,sem_dor
f9,com_dor,com_dor,com_dor,com_dor,sem_dor


Correct predictions:
- ViT: 66.67%
- Resnet50: 66.67%
- Vgg16: 66.67%
- InceptionV3: 55.56%


In [12]:
pd.DataFrame(accuracies, index=accuracies_index)

Unnamed: 0,ViT,Resnet50,Vgg16,InceptionV3
Imagem Completa,66.666667,66.666667,55.555556,55.555556
Face,55.555556,55.555556,44.444444,55.555556
Face+Mascara,66.666667,66.666667,66.666667,55.555556
