<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/VisualBert_EndoVis18_VQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Downloading the VQA EndoVis18 Dataset https://drive.google.com/file/d/1WGdztykX3nW6pi_BKp4rO8nA7ESNRfVN/view?usp=sharing
!gdown --id 1WGdztykX3nW6pi_BKp4rO8nA7ESNRfVN

# Unzipping the VQA EndoVis18 Dataset\
!unzip -q EndoVis-18-VQA.zip

Downloading...
From (original): https://drive.google.com/uc?id=1WGdztykX3nW6pi_BKp4rO8nA7ESNRfVN
From (redirected): https://drive.google.com/uc?id=1WGdztykX3nW6pi_BKp4rO8nA7ESNRfVN&confirm=t&uuid=256bd529-525c-4dd4-aacc-781f0c9bf4a7
To: /content/EndoVis-18-VQA.zip
100% 2.70G/2.70G [00:32<00:00, 82.7MB/s]


In [2]:
from torch.utils.data import Dataset
from PIL import Image
import os
import glob
import torchvision.transforms as transforms
from torchvision import models
from torch import nn
from transformers import ViTFeatureExtractor, AutoFeatureExtractor

class EndoVis18VQAGPTClassification(Dataset):
    '''
    	seq: train_seq  = [2, 3, 4, 6, 7, 9, 10, 11, 12, 14, 15]
    	     val_seq    = [1, 5, 16]
    	folder_head     = 'dataset/EndoVis-18-VQA/seq_'
    	folder_tail     = '/vqa/Classification/*.txt'
    '''
    def __init__(self, seq, folder_head, folder_tail, transform=None):

        self.transform = transforms.Compose([
                    transforms.Resize((224,224)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
                    ])



        # files, question and answers
        filenames = []
        for curr_seq in seq: filenames = filenames + glob.glob(folder_head + str(curr_seq) + folder_tail)
        self.vqas = []
        for file in filenames:
            file_data = open(file, "r")
            lines = [line.strip("\n") for line in file_data if line != "\n"]
            file_data.close()
            for line in lines: self.vqas.append([file, line])
        print('Total files: %d | Total question: %.d' %(len(filenames), len(self.vqas)))

        # Labels
        self.labels = ['kidney', 'Idle', 'Grasping', 'Retraction', 'Tissue_Manipulation',
                        'Tool_Manipulation', 'Cutting', 'Cauterization', 'Suction',
                        'Looping', 'Suturing', 'Clipping', 'Staple', 'Ultrasound_Sensing',
                        'left-top', 'right-top', 'left-bottom', 'right-bottom']

    def __len__(self):
        return len(self.vqas)

    def __getitem__(self, idx):
        loc = self.vqas[idx][0].split('/')

        # img
        img_loc = os.path.join(loc[0],loc[1],'left_frames',loc[-1].split('_')[0]+'.png')
        if self.transform:
            img = Image.open(img_loc)
            img = self.transform(img)

        # question and answer
        question = self.vqas[idx][1].split('|')[0]
        label = self.labels.index(str(self.vqas[idx][1].split('|')[1]))

        return img, question, label

train_seq = [2, 3, 4, 6, 7, 9, 10, 11, 12, 14, 15]
val_seq = [1, 5, 16]

folder_head = 'EndoVis-18-VQA/seq_'
folder_tail = '/vqa/Classification/*.txt'

train_dataset = EndoVis18VQAGPTClassification(train_seq, folder_head, folder_tail)
print(len(train_dataset))
imgs, question, label = train_dataset[0]
# train_dataloader = DataLoader(dataset=train_dataset, batch_size= args.batch_size, shuffle=True, num_workers=8)
# val_dataset = EndoVis18VQAGPTClassification(val_seq, folder_head, folder_tail, model_ver=args.model_ver)
# val_dataloader = DataLoader(dataset=val_dataset, batch_size= args.batch_size, shuffle=False, num_workers=8)



Total files: 1560 | Total question: 9014
9014


In [3]:
import torch
import os
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_fscore_support

class AverageMeter(object):
    """
    Keeps track of most recent, average, sum, and count of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def adjust_learning_rate(optimizer, shrink_factor):
    """
    Shrinks learning rate by a specified factor.

    :param optimizer: optimizer whose learning rate must be shrunk.
    :param shrink_factor: factor in interval (0, 1) to multiply learning rate with.
    """

    print("\nDECAYING learning rate.")
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))


def save_clf_checkpoint(checkpoint_dir, epoch, epochs_since_improvement, model, optimizer, Acc, final_args):
    """
    Saves model checkpoint.
    """
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'Acc': Acc,
             'model': model,
             'optimizer': optimizer,
             'final_args': final_args}
    filename = checkpoint_dir + 'Best.pth.tar'
    torch.save(state, filename)

def calc_acc(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    return acc

def calc_classwise_acc(y_true, y_pred):
    matrix = confusion_matrix(y_true, y_pred)
    classwise_acc = matrix.diagonal()/matrix.sum(axis=1)
    return classwise_acc

def calc_map(y_true, y_scores):
    mAP = average_precision_score(y_true, y_scores,average=None)
    return mAP

def calc_precision_recall_fscore(y_true, y_pred):
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division = 1)
    return(precision, recall, fscore)


def seed_everything(seed=27):
    '''
    Set random seed for reproducible experiments
    Inputs: seed number
    '''
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
import torch
from torch import nn
from transformers import VisualBertModel, VisualBertConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


'''
VisualBert Classification Model
'''
class VisualBertClassification(nn.Module):
    '''
    VisualBert Classification Model
    vocab_size    = tokenizer length
    encoder_layer = 6
    n_heads       = 8
    num_class     = number of class in dataset
    '''
    def __init__(self, vocab_size, layers, n_heads, num_class = 10):
        super(VisualBertClassification, self).__init__()
        VBconfig = VisualBertConfig(vocab_size= vocab_size, visual_embedding_dim = 512, num_hidden_layers = layers, num_attention_heads = n_heads, hidden_size = 2048)
        self.VisualBertEncoder = VisualBertModel(VBconfig)
        self.classifier = nn.Linear(VBconfig.hidden_size, num_class)
        self.dropout = nn.Dropout(VBconfig.hidden_dropout_prob)
        self.num_labels = num_class

        ## image processing
        self.img_feature_extractor = models.resnet18(weights=True)
        new_fc = nn.Sequential(*list(self.img_feature_extractor.fc.children())[:-1])
        self.img_feature_extractor.fc = new_fc

    def forward(self, inputs, img):
        # prepare visual embedding
        visual_embeds = self.img_feature_extractor(img)
        visual_embeds = torch.unsqueeze(visual_embeds, dim=1)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).to(device)
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float).to(device)

        # append visual features to text
        inputs.update({
                        "visual_embeds": visual_embeds,
                        "visual_token_type_ids": visual_token_type_ids,
                        "visual_attention_mask": visual_attention_mask,
                        "output_attentions": True
                        })

        inputs['input_ids'] = inputs['input_ids'].to(device)
        inputs['token_type_ids'] = inputs['token_type_ids'].to(device)
        inputs['attention_mask'] = inputs['attention_mask'].to(device)
        inputs['visual_token_type_ids'] = inputs['visual_token_type_ids'].to(device)
        inputs['visual_attention_mask'] = inputs['visual_attention_mask'].to(device)

        '----------------- VQA -----------------'
        index_to_gather = inputs['attention_mask'].sum(1) - 2  # as in original code # 6

        outputs = self.VisualBertEncoder(**inputs)
        sequence_output = outputs[0] # [1, 33, 2048]

        # TO-CHECK: From the original code
        index_to_gather = (index_to_gather.unsqueeze(-1).unsqueeze(-1).expand(index_to_gather.size(0), 1, sequence_output.size(-1))) #  [1, 1, 2048]

        pooled_output = torch.gather(sequence_output, 1, index_to_gather) # [1, 33, 2048]

        pooled_output = self.dropout(pooled_output) # [1, 1, 2048]
        logits = self.classifier(pooled_output) # [1, 1, 8]
        reshaped_logits = logits.view(-1, self.num_labels) # [1, 8]
        return reshaped_logits


import os
import sys
import argparse
from torch import nn
from torch import optim
import torch.utils.data
import torch.nn.functional as F
from torch.utils.data  import DataLoader
from transformers import BertTokenizer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def get_arg():
    parser = argparse.ArgumentParser(description='VisualQuestionAnswerClassification')

    # VB Model parameters
    parser.add_argument('--emb_dim',        type=int,   default=300,                                help='dimension of word embeddings.')
    parser.add_argument('--n_heads',        type=int,   default=8,                                  help='Multi-head attention.')
    parser.add_argument('--dropout',        type=float, default=0.1,                                help='dropout')
    parser.add_argument('--encoder_layers', type=int,   default=6,                                  help='the number of layers of encoder in Transformer.')

    # Training parameters
    parser.add_argument('--epochs',         type=int,   default=2,                                 help='number of epochs to train for (if early stopping is not triggered).') #80, 26
    parser.add_argument('--batch_size',     type=int,   default=64,                                 help='batch_size')
    parser.add_argument('--workers',        type=int,   default=1,                                  help='for data-loading; right now, only 1 works with h5pys.')
    parser.add_argument('--print_freq',     type=int,   default=100,                                help='print training/validation stats every __ batches.')

    # existing checkpoint
    parser.add_argument('--checkpoint',     default=None,                                           help='path to checkpoint, None if none.')

    parser.add_argument('--lr',             type=float, default=0.00001,                           help='0.000005, 0.00001, 0.000005')
    parser.add_argument('--checkpoint_dir', default= 'checkpoints/efvlegpt2Swin/m18_v1_z_qf_',            help='med_vqa_c/m18/c80/m18_vid/c80_vid') #clf_v1_2_1x1/med_vqa_c3
    parser.add_argument('--dataset_type',   default= 'm18',                                          help='med_vqa/m18/c80/m18_vid/c80_vid')
    parser.add_argument('--dataset_cat',    default= 'cat1',                                        help='cat1/cat2/cat3')
    parser.add_argument('--tokenizer_ver',  default= 'gpt2v1',                                      help='btv2/btv3/gpt2v1')
    parser.add_argument('--question_len',   default= 25,                                            help='25')
    parser.add_argument('--model_ver',      default= 'efvlegpt2Swin',                                          help='vb/vbrm/efvlegpt2rs18/efvlegpt2Swin/"')  #vrvb/gpt2rs18/gpt2ViT/gpt2Swin/biogpt2rs18/vilgpt2vqa/efgpt2rs18gr/efvlegpt2Swingr
    parser.add_argument('--model_subver',   default= 'v1',                                          help='V0,v1/v2/v3/v4')
    parser.add_argument('--vis_pos_emb',   default= 'zeroes',                                           help='None, zeroes, pos')
    parser.add_argument('--patch_size',     default= 5,                                             help='1/2/3/4/5')

    parser.add_argument('--num_class',      default= 2,                                             help='25')
    # parser.add_argument('--temporal_size',  default= 1,                                             help='1/2/3/4/5')
    parser.add_argument('--validate',       default=False,                                          help='When only validation required False/True')

    if 'ipykernel' in sys.modules:
        args = parser.parse_args([])
    else:
        args = parser.parse_args()
    return args

def train(args, train_dataloader, model, criterion, optimizer, epoch, tokenizer, device):

    model.train()
    total_loss = 0.0
    label_true = None
    label_pred = None
    label_score = None

    for i, (imgs, q, labels) in enumerate(train_dataloader,0):
        questions = []
        for question in q: questions.append(question)
        inputs = tokenizer(questions, padding="max_length",max_length= args.question_len, return_tensors="pt")
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(inputs, imgs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        total_loss += loss.item()
        # print('Loss:',loss.item())

        scores, predicted = torch.max(F.softmax(outputs, dim=1).data, 1)
        label_true = labels.data.cpu() if label_true == None else torch.cat((label_true, labels.data.cpu()), 0)
        label_pred = predicted.data.cpu() if label_pred == None else torch.cat((label_pred, predicted.data.cpu()), 0)
        label_score = scores.data.cpu() if label_score == None else torch.cat((label_score, scores.data.cpu()), 0)

    # loss and acc
    acc, c_acc = calc_acc(label_true, label_pred), calc_classwise_acc(label_true, label_pred)
    precision, recall, fscore = calc_precision_recall_fscore(label_true, label_pred)
    print('Train: epoch: %d loss: %.6f | Acc: %.6f | Precision: %.6f | Recall: %.6f | FScore: %.6f' %(epoch, total_loss, acc, precision, recall, fscore))
    return acc


def validate(args, val_loader, model, criterion, epoch, tokenizer, device, save_output = False):

    model.eval()
    total_loss = 0.0
    label_true = None
    label_pred = None
    label_score = None

    with torch.no_grad():
        for i, ( imgs, q, labels) in enumerate(val_loader,0):
            questions = []
            for question in q: questions.append(question)
            inputs = tokenizer(questions, padding="max_length",max_length=args.question_len, return_tensors="pt")
            imgs, labels = imgs.to(device), labels.to(device)

            # model forward pass
            outputs = model(inputs, imgs)

            # loss
            loss = criterion(outputs,labels)
            total_loss += loss.item()
            scores, predicted = torch.max(F.softmax(outputs, dim=1).data, 1)
            label_true = labels.data.cpu() if label_true == None else torch.cat((label_true, labels.data.cpu()), 0)
            label_pred = predicted.data.cpu() if label_pred == None else torch.cat((label_pred, predicted.data.cpu()), 0)
            label_score = scores.data.cpu() if label_score == None else torch.cat((label_score, scores.data.cpu()), 0)

    acc = calc_acc(label_true, label_pred)
    c_acc = 0.0
    precision, recall, fscore = calc_precision_recall_fscore(label_true, label_pred)
    print('Test: epoch: %d loss: %.6f | Acc: %.6f | Precision: %.6f | Recall: %.6f | FScore: %.6f' %(epoch, total_loss, acc, precision, recall, fscore))

    return (acc, c_acc, precision, recall, fscore)

if __name__ == '__main__':
    args = get_arg()
    args.checkpoint_dir = 'checkpoints/VB_RN18'
    os.makedirs(args.checkpoint_dir, exist_ok=True)
    seed_everything()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    start_epoch = 1
    best_epoch = [0]
    best_results = [0.0]
    epochs_since_improvement = 0
    final_args = {"emb_dim": args.emb_dim, "n_heads": args.n_heads, "dropout": args.dropout, "encoder_layers": args.encoder_layers}
    train_seq = [2, 3, 4, 6, 7, 9, 10, 11, 12, 14, 15]
    val_seq = [1, 5, 16]
    args.num_class = 18

    folder_head = 'EndoVis-18-VQA/seq_'
    folder_tail = '/vqa/Classification/*.txt'

    train_dataset = EndoVis18VQAGPTClassification(train_seq, folder_head, folder_tail)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size= args.batch_size, shuffle=True, num_workers=8)
    val_dataset = EndoVis18VQAGPTClassification(val_seq, folder_head, folder_tail)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size= args.batch_size, shuffle=False, num_workers=8)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = VisualBertClassification(vocab_size=len(tokenizer), layers=args.encoder_layers, n_heads=args.n_heads, num_class = args.num_class)
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    for epoch in range(start_epoch, args.epochs):

            if epochs_since_improvement > 0 and epochs_since_improvement % 5 == 0:
                adjust_learning_rate(optimizer, 0.8)

            train_acc = train(args, train_dataloader=train_dataloader, model = model, criterion=criterion, optimizer=optimizer, epoch=epoch, tokenizer = tokenizer, device = device)
            test_acc, test_c_acc, test_precision, test_recall, test_fscore = validate(args, val_loader=val_dataloader, model = model, criterion=criterion, epoch=epoch, tokenizer = tokenizer, device = device)

            if test_acc >= best_results[0]:
                print('Best Epoch:', epoch)
                epochs_since_improvement = 0
                best_results[0] = test_acc
                best_epoch[0] = epoch
                save_clf_checkpoint(args.checkpoint_dir, epoch, epochs_since_improvement, model, optimizer, best_results[0], final_args)
            else:
                epochs_since_improvement += 1
                print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))




Total files: 1560 | Total question: 9014
Total files: 447 | Total question: 2769


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 138MB/s]


Train: epoch: 1 loss: 257.440868 | Acc: 0.658753 | Precision: 0.591419 | Recall: 0.382544 | FScore: 0.403261




Test: epoch: 1 loss: 96.836967 | Acc: 0.591188 | Precision: 0.744209 | Recall: 0.320708 | FScore: 0.317160
Best Epoch: 1


RuntimeError: Parent directory checkpoints/efvlegpt2Swin does not exist.