In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-2.11.0-py3-none-any.whl (674 kB)
[K     |████████████████████████████████| 674 kB 8.7 MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17
  Downloading regex-2020.6.8-cp37-cp37m-manylinux2010_x86_64.whl (661 kB)
[K     |████████████████████████████████| 661 kB 10.0 MB/s eta 0:00:01
[?25hCollecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 9.7 MB/s eta 0:00:01
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 11.2 MB/s eta 0:00:01
Collecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp37-cp37m-manylinux1_x86_64.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 8.7 MB/s eta 0:00:01
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setu

# Image Classification

## Using convnet(resnet32) as fixed feature extractor
We use a resnet model, remove the last layer and include a fully connected layer with dropout

In [11]:
%run preprocessing.ipynb    
%run utils.ipynb

print('done')

done


In [3]:
'''
adapted from https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html#further-learning 
and https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py
'''
# 
class FixedFeatureClassifier(nn.Module):
    def __init__(self, args):
        super().__init__()
        model_conv = torchvision.models.resnet34(pretrained=True)
        in_features_fc = model_conv.fc.in_features
        modules = list(model_conv.children())[:-1] # remove fc layer
        self.model_conv = nn.Sequential(*modules)
        # freeze
        for param in self.model_conv.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(args.dropout_prob)
        self.classifier = nn.Linear(in_features_fc, args.num_labels)
    
    def forward(self, x):
        out = self.model_conv(x)
        out = torch.flatten(out, start_dim=1)
        out = self.dropout(out)
        out = self.classifier(out)
    
        return out

import time
import os
import copy
from tqdm.notebook import tqdm, trange

def train_model(args, model, criterion, optimizer, scheduler, num_epochs=25, resume=False):
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    
    if resume:
        checkpoint = load_checkpoint(args)
        epoch = checkpoint['epoch']
        best_f1 = checkpoint['best_score']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    while epoch < num_epochs: 
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            
            running_preds = None
            out_label_ids = None

            # Iterate over data.
            for inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                if running_preds is None:
                    running_preds =  probas(outputs)
                    out_label_ids = labels.cpu().detach().numpy()
                    print('OK')
                else:
                    running_preds = np.append(running_preds, probas(outputs), axis=0)
                    out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
            print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(phase, epoch_loss, epoch_f1))
            
            # Save last epoch model
            if phase == 'val':
                epoch += 1    
                save_checkpoint(args, epoch, best_f1, model, optimizer, scheduler)
            
            # Save best model
            if phase == 'val' and epoch_f1 > best_f1:
                best_f1 = epoch_f1
                save_checkpoint(args, epoch, best_f1, model, optimizer, scheduler, best=True)
                
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model

from sklearn.metrics import f1_score, classification_report

def evaluate_model(args, model, criterion, dataloaders, split, load_weights=False, load=False, do_classification_report=False):
    print('evaluate :', args.model_name_or_path)
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    if load or load_weights:
        checkpoint = load_checkpoint(args, best=True)
        best_f1 = checkpoint['best_score']
        print('best val f1_micro', best_f1)
    if load_weights:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        return

    model.eval()   # Set model to evaluate mode

    running_loss = 0.0

    running_preds = None
    out_label_ids = None

    # Iterate over data.
    for inputs, labels in tqdm(dataloaders[split]):
        inputs = inputs.to(device)
        labels = labels.to(device)


        # forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # statistics
        running_loss += loss.item() * inputs.size(0)
        if running_preds is None:
            running_preds =  probas(outputs)
            out_label_ids = labels.cpu().detach().numpy()
            print('OK')
        else:
            running_preds = np.append(running_preds, probas(outputs), axis=0)
            out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)

    epoch_loss = running_loss / dataset_sizes[split]
    epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
    print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(split, epoch_loss, epoch_f1))
    if do_classification_report:
        print(classification_report(out_label_ids, running_preds, target_names=get_mmimdb_labels(), digits=3))
    

    time_elapsed = time.time() - since
    print('Evaluation complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model

In [4]:
import logging

from tqdm.notebook import tqdm, trange
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


args = Args(model_name_or_path='resnet34fixedfeature', image_only=True, use_transformed_tensors=True)
device = torch.device('cuda')  
labels = get_mmimdb_labels()
set_seed(args)
tokenizer = None # no tokenizer for only images

model = FixedFeatureClassifier(args)
model = model.to(device)

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /home/miaortizma/.cache/torch/checkpoints/resnet34-333f7ec4.pth


HBox(children=(FloatProgress(value=0.0, max=87306240.0), HTML(value='')))




In [5]:
train_dataset = load_examples(args, None)
label_frequences = train_dataset.get_label_frequencies()
label_frequences = [label_frequences[l] for l in labels]
label_weights = (
    torch.tensor(label_frequences, device=device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) 

#we are only training the classifier parameters
optimizer_conv = optim.SGD(model.classifier.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

In [6]:
from torch.utils.data import DataLoader
image_datasets = {x: load_examples(args, None, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=64,
                                             shuffle=True, num_workers=0, collate_fn=collate_fn_image_only)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}

In [None]:
model_conv = train_model(args, model, criterion, optimizer_conv, exp_lr_scheduler, num_epochs=50, resume=True)

Epoch 30/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1511 F1_micro: 0.3472


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1153 F1_micro: 0.4108

Epoch 31/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1463 F1_micro: 0.3479


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1144 F1_micro: 0.4055

Epoch 32/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1445 F1_micro: 0.3472


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1153 F1_micro: 0.4111

Epoch 33/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1442 F1_micro: 0.3482


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1153 F1_micro: 0.4101

Epoch 34/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1448 F1_micro: 0.3486


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1149 F1_micro: 0.4092

Epoch 35/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1457 F1_micro: 0.3483


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1149 F1_micro: 0.4078

Epoch 36/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1452 F1_micro: 0.3464


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1144 F1_micro: 0.4095

Epoch 37/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1476 F1_micro: 0.3467


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1149 F1_micro: 0.4071

Epoch 38/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1512 F1_micro: 0.3453


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1150 F1_micro: 0.4120

Epoch 39/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1470 F1_micro: 0.3473


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1152 F1_micro: 0.4102

Epoch 40/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1502 F1_micro: 0.3461


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1145 F1_micro: 0.4103

Epoch 41/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1486 F1_micro: 0.3474


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1150 F1_micro: 0.4095

Epoch 42/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1501 F1_micro: 0.3462


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1149 F1_micro: 0.4102

Epoch 43/49
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK


In [7]:
_ = evaluate_model(args, model, criterion, dataloaders, 'test', load_weights=True, do_classification_report=True)

evaluate : resnet34fixedfeature
best val f1_micro 0.4192967160709096


HBox(children=(FloatProgress(value=0.0, max=122.0), HTML(value='')))

OK

test Loss: 1.1304 F1_micro: 0.4138
              precision    recall  f1-score   support

       Crime      0.282     0.359     0.316      1163
       Drama      0.584     0.848     0.692      4142
    Thriller      0.383     0.544     0.450      1567
      Action      0.341     0.509     0.408      1044
      Comedy      0.530     0.667     0.591      2611
     Romance      0.346     0.481     0.402      1590
 Documentary      0.208     0.362     0.264       629
       Short      0.073     0.303     0.118       142
     Mystery      0.175     0.190     0.182       617
     History      0.089     0.203     0.124       345
      Family      0.332     0.510     0.402       518
   Adventure      0.263     0.435     0.328       821
     Fantasy      0.193     0.403     0.261       585
      Sci-Fi      0.263     0.466     0.336       586
     Western      0.178     0.529     0.266       210
      Horror      0.323     0.582     0.415       825
       Sport      0.071     0.408     0.12

  _warn_prf(average, modifier, msg_start, len(result))


## Fine Tunning Classifier (resnet32)
for fine tunning we load the weights of the fixed feature extractor classifier and unfreeze 1 module of the resnet (3 convolutional blocks) and continue training

In [3]:
class FineTunningClassifier(nn.Module):
    def __init__(self, args):
        super().__init__()
        model_conv = torchvision.models.resnet34(pretrained=True)
        in_features_fc = model_conv.fc.in_features
        modules = list(model_conv.children())[:-1] # remove fc layer
        
        # freeze all but the last module of the ResNet
        for module in modules[:-1]:
            for param in module.parameters():
                param.requires_grad = False
        
        self.model_conv = nn.Sequential(*modules)
        self.dropout = nn.Dropout(args.dropout_prob)
        self.classifier = nn.Linear(in_features_fc, args.num_labels)
        
    def forward(self, x):
        out = self.model_conv(x)
        out = torch.flatten(out, start_dim=1)
        out = self.dropout(out)
        out = self.classifier(out)
    
        return out

In [4]:
args = Args(model_name_or_path='resnet34finetunning', image_only=True, use_transformed_tensors=True)
device = torch.device('cuda')        
set_seed(args)
tokenizer = None # no tokenizer for only images

model = FineTunningClassifier(args)
#print(model)
model = model.to(device)

In [5]:
labels = get_mmimdb_labels()
train_dataset = load_examples(args, None)
label_frequences = train_dataset.get_label_frequencies()
label_frequences = [label_frequences[l] for l in labels]
label_weights = (
    torch.tensor(label_frequences, device=device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) 

#we are only training the unfrozen parameters
optimizer_conv = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

In [6]:
from torch.utils.data import DataLoader

image_datasets = {x: load_examples(args, None, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: DataLoader(image_datasets[x], batch_size=64,
                                             shuffle=True, num_workers=0, collate_fn=collate_fn_image_only)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}

In [112]:
checkpoint = load_checkpoint(Args(model_name_or_path='resnet34fixedfeature'), best=True)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [8]:
model_finetunned = train_model(args, model, criterion, optimizer_conv, exp_lr_scheduler, num_epochs=100, resume=True)

Epoch 80/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1174 F1_micro: 0.3642


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1000 F1_micro: 0.4163

Epoch 81/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1184 F1_micro: 0.3636


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0995 F1_micro: 0.4140

Epoch 82/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1172 F1_micro: 0.3633


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1000 F1_micro: 0.4168

Epoch 83/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1130 F1_micro: 0.3651


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0993 F1_micro: 0.4153

Epoch 84/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1160 F1_micro: 0.3639


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0994 F1_micro: 0.4136

Epoch 85/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1165 F1_micro: 0.3612


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0996 F1_micro: 0.4149

Epoch 86/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1134 F1_micro: 0.3648


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0992 F1_micro: 0.4138

Epoch 87/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1169 F1_micro: 0.3646


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0991 F1_micro: 0.4166

Epoch 88/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1108 F1_micro: 0.3650


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1000 F1_micro: 0.4143

Epoch 89/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1162 F1_micro: 0.3629


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0988 F1_micro: 0.4139

Epoch 90/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1204 F1_micro: 0.3613


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0996 F1_micro: 0.4150

Epoch 91/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1161 F1_micro: 0.3631


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0996 F1_micro: 0.4127

Epoch 92/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1171 F1_micro: 0.3619


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0996 F1_micro: 0.4178

Epoch 93/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1181 F1_micro: 0.3626


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0986 F1_micro: 0.4152

Epoch 94/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1206 F1_micro: 0.3626


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0990 F1_micro: 0.4129

Epoch 95/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1171 F1_micro: 0.3630


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0997 F1_micro: 0.4145

Epoch 96/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1176 F1_micro: 0.3626


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0999 F1_micro: 0.4165

Epoch 97/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1166 F1_micro: 0.3627


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0996 F1_micro: 0.4192

Epoch 98/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1152 F1_micro: 0.3657


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0998 F1_micro: 0.4146

Epoch 99/99
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.1183 F1_micro: 0.3636


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0998 F1_micro: 0.4162

Training complete in 9m 16s


In [11]:
_ = evaluate_model(args, model, criterion, dataloaders, 'test', load_weights=True, do_classification_report=True)

evaluate : resnet34finetunning
best val f1_micro 0.4191541578677016


HBox(children=(FloatProgress(value=0.0, max=122.0), HTML(value='')))

OK

test Loss: 1.1098 F1_micro: 0.4115
              precision    recall  f1-score   support

       Crime      0.291     0.389     0.333      1163
       Drama      0.603     0.819     0.694      4142
    Thriller      0.394     0.542     0.456      1567
      Action      0.340     0.547     0.420      1044
      Comedy      0.555     0.649     0.599      2611
     Romance      0.364     0.456     0.405      1590
 Documentary      0.200     0.418     0.270       629
       Short      0.070     0.437     0.121       142
     Mystery      0.161     0.287     0.206       617
     History      0.099     0.232     0.139       345
      Family      0.285     0.569     0.380       518
   Adventure      0.250     0.479     0.328       821
     Fantasy      0.196     0.439     0.271       585
      Sci-Fi      0.240     0.541     0.332       586
     Western      0.140     0.610     0.227       210
      Horror      0.322     0.577     0.413       825
       Sport      0.090     0.403     0.14

## Finetunning Resnet152
Following [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf)(Kiela, et. al) we train a model similar to the one presented earlier but changing resnet32 to resnet152. Also, we include a different training routine, we use a regular Adam optimizer with frozen convolutional layers for the first 4 epochs, then we unfreeze the layers and keep training.

In this approach we unfreeze the whole net

For this experiment we also use pretransformed image tensors, this speed ups the training a lot.

The transformations applied to the images are as follows:
```
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),

```
The transformations parameters are obtained from ImageNet.

In [1]:
%run preprocessing.ipynb    
%run utils.ipynb

print('done')

done


In [2]:
class FineTunningClassifier(nn.Module):
    def __init__(self, args):
        super().__init__()
        model_conv = torchvision.models.resnet152(pretrained=True)
        in_features_fc = model_conv.fc.in_features
        modules = list(model_conv.children())[:-1] # remove fc layer
        self.model_conv = nn.Sequential(*modules)
        
        self.dropout = nn.Dropout(args.dropout_prob)
        self.classifier = nn.Linear(in_features_fc, args.num_labels)
    
    def freeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = False
    def unfreeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = True
        
    def forward(self, x):
        out = self.model_conv(x)
        out = torch.flatten(out, start_dim=1)
        out = self.dropout(out)
        out = self.classifier(out)
    
        return out

In [3]:
args = Args(model_name_or_path='resnet152finetunning', 
            image_only=True, 
            use_transformed_tensors=True)
device = torch.device('cuda')        
set_seed(args)
tokenizer = None # no tokenizer for only images

model = FineTunningClassifier(args)
model = model.to(device)

labels = get_mmimdb_labels()
train_dataset = load_examples(args, None)
label_frequences = train_dataset.get_label_frequencies()
label_frequences = [label_frequences[l] for l in labels]
label_weights = (
    torch.tensor(label_frequences, device=device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) 

optimizer_conv = optim.Adam(model.parameters(), lr=1e-4)

In [4]:
from torch.utils.data import DataLoader

image_datasets = {x: load_examples(args, None, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: DataLoader(image_datasets[x],
                             batch_size=16,
                             shuffle=True, num_workers=0,
                             collate_fn=collate_fn_image_only)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}

In [5]:
import time
import os
import copy
from tqdm.notebook import tqdm, trange
from sklearn.metrics import f1_score

def train_model(args, model, criterion, optimizer, num_epochs=25, resume=False):
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    
    if resume:
        checkpoint = load_checkpoint(args)
        epoch = checkpoint['epoch']
        best_f1 = checkpoint['best_score']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    while epoch < num_epochs: 
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        if epoch == 3:
            model.unfreeze_conv()
            print('Model Unfrozen')

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            
            running_preds = None
            out_label_ids = None

            # Iterate over data.
            for inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                if running_preds is None:
                    running_preds =  probas(outputs)
                    out_label_ids = labels.cpu().detach().numpy()
                    print('OK')
                else:
                    running_preds = np.append(running_preds, probas(outputs), axis=0)
                    out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
            print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(phase, epoch_loss, epoch_f1))
            
            # Save last epoch model
            if phase == 'val':
                epoch += 1    
                save_checkpoint(args, epoch, best_f1, model, optimizer)
            
            # Save best model
            if phase == 'val' and epoch_f1 > best_f1:
                best_f1 = epoch_f1
                save_checkpoint(args, epoch, best_f1, model, optimizer, best=True)
                
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model

In [6]:
model.freeze_conv()
model_finetunned = train_model(args, model, criterion, optimizer_conv, num_epochs=30, resume=False)

Epoch 0/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 1.2619 F1_micro: 0.3181


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.1729 F1_micro: 0.4424

Epoch 1/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 1.1765 F1_micro: 0.3685


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.1249 F1_micro: 0.4127

Epoch 2/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 1.1468 F1_micro: 0.3754


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.1037 F1_micro: 0.4012

Epoch 3/29
----------
Model Unfrozen


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 1.1396 F1_micro: 0.3593


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.0976 F1_micro: 0.3756

Epoch 4/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 1.0366 F1_micro: 0.3922


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.0718 F1_micro: 0.3956

Epoch 5/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.9383 F1_micro: 0.4243


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.1303 F1_micro: 0.4262

Epoch 6/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.8296 F1_micro: 0.4665


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.2199 F1_micro: 0.3915

Epoch 7/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.7174 F1_micro: 0.5109


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.3240 F1_micro: 0.3865

Epoch 8/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.6066 F1_micro: 0.5592


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.4318 F1_micro: 0.4211

Epoch 9/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.5318 F1_micro: 0.6009


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.6088 F1_micro: 0.4234

Epoch 10/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.4761 F1_micro: 0.6300


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.7658 F1_micro: 0.4176

Epoch 11/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.4196 F1_micro: 0.6646


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 1.9272 F1_micro: 0.4178

Epoch 12/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.3993 F1_micro: 0.6780


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.0656 F1_micro: 0.4365

Epoch 13/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.3605 F1_micro: 0.7033


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.1617 F1_micro: 0.4390

Epoch 14/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.3302 F1_micro: 0.7222


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.5451 F1_micro: 0.4458

Epoch 15/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.3150 F1_micro: 0.7357


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.4587 F1_micro: 0.4382

Epoch 16/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.2828 F1_micro: 0.7577


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.5414 F1_micro: 0.4513

Epoch 17/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.2784 F1_micro: 0.7623


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.7114 F1_micro: 0.4480

Epoch 18/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.2577 F1_micro: 0.7812


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.8321 F1_micro: 0.4299

Epoch 19/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.2731 F1_micro: 0.7711


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.9298 F1_micro: 0.4360

Epoch 20/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.2324 F1_micro: 0.8005


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.9475 F1_micro: 0.4374

Epoch 21/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.2104 F1_micro: 0.8169


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 2.9237 F1_micro: 0.4485

Epoch 22/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.1966 F1_micro: 0.8277


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 3.3503 F1_micro: 0.4294

Epoch 23/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK

train Loss: 0.2148 F1_micro: 0.8141


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

OK

val Loss: 3.1360 F1_micro: 0.4241

Epoch 24/29
----------


HBox(children=(FloatProgress(value=0.0, max=972.0), HTML(value='')))

OK



KeyboardInterrupt: 

In [7]:
from sklearn.metrics import f1_score, classification_report

def evaluate_model(args, model, criterion, dataloaders, split, load_weights=False, load=False, do_classification_report=False):
    print('evaluate :', args.model_name_or_path)
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    if load or load_weights:
        checkpoint = load_checkpoint(args, best=True)
        best_f1 = checkpoint['best_score']
        print('best val f1_micro', best_f1)
    if load_weights:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        return

    model.eval()   # Set model to evaluate mode

    running_loss = 0.0

    running_preds = None
    out_label_ids = None

    # Iterate over data.
    for inputs, labels in tqdm(dataloaders[split]):
        inputs = inputs.to(device)
        labels = labels.to(device)


        # forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # statistics
        running_loss += loss.item() * inputs.size(0)
        if running_preds is None:
            running_preds =  probas(outputs)
            out_label_ids = labels.cpu().detach().numpy()
            print('OK')
        else:
            running_preds = np.append(running_preds, probas(outputs), axis=0)
            out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)

    epoch_loss = running_loss / dataset_sizes[split]
    epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
    print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(split, epoch_loss, epoch_f1))
    if do_classification_report:
        print(classification_report(out_label_ids, running_preds, target_names=get_mmimdb_labels(), digits=3))
    

    time_elapsed = time.time() - since
    print('Evaluation complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model

In [8]:
_ = evaluate_model(args, model, criterion, dataloaders, 'test', load_weights=True, do_classification_report=True)

evaluate : resnet152finetunning
best val f1_micro 0.45127017639446976


HBox(children=(FloatProgress(value=0.0, max=488.0), HTML(value='')))

OK

test Loss: 2.6343 F1_micro: 0.4412
              precision    recall  f1-score   support

       Crime      0.286     0.390     0.330      1163
       Drama      0.618     0.792     0.694      4142
    Thriller      0.386     0.609     0.473      1567
      Action      0.323     0.470     0.383      1044
      Comedy      0.569     0.613     0.590      2611
     Romance      0.357     0.420     0.386      1590
 Documentary      0.258     0.280     0.269       629
       Short      0.167     0.310     0.217       142
     Mystery      0.165     0.232     0.193       617
     History      0.096     0.133     0.112       345
      Family      0.527     0.390     0.448       518
   Adventure      0.297     0.404     0.343       821
     Fantasy      0.227     0.267     0.245       585
      Sci-Fi      0.265     0.396     0.317       586
     Western      0.439     0.329     0.376       210
      Horror      0.396     0.485     0.436       825
       Sport      0.115     0.173     0.13

  _warn_prf(average, modifier, msg_start, len(result))


# Image Classification results analysis

We trained 3 models for different number of epochs until each one converged, for this modality and the following we use f1-micro as the performance measure on which the best instance of a model is chosen. 

## Experiment with learning rate 1e-3

The first two models, resnet32 as fixed feature extractor and with fine tunning perform almost equally, with the fixed feature extractor performing slightly better. The model with resnet152 unfreezing all the layers performs the worst. We could interpret this as unfreezing the layers "damaging" the quality of the weights or with not having enough data to train the model, along with the omitting of some implementation details like optimizer, gradient norm clip, gradient accumulation, etc.

We choose the fixed feature extractor model for the multimodal model.

The performance of the chosen model is similar to the one of the best visual model found in the original paper:

- The weighted f1-score is better for our model (**0.437** vs 0.410)
- Samples average is better for the model in the original paper (0.411 vs **0.429**)
- The micro f1-score is better for the model in the original paper (0.414 vs **0.437**)
- The macro f1-score is better for our model (**0.293** vs 0.283)

## Experiment with learning rate 1e-4

It seemed odd to us that resnet152 would perform worse than resnet32 so we decided to rerun the experiment only for this model but with a lower learning rate.

resnet152 performed much better using the previously described freezing schedule, we suspect than rerunning the same experiment with resnet32 would perform better than the fixed feature counterpart but not reach the performance of resnet152.

Our model performed slightly better than the best visual model of the original paper:

- The weighted f1-score is better for our model (**0.444** vs 0.410)
- Samples average is better for our model (**0.435** vs 0.429)
- The micro f1-score is better for our model (**0.441** vs 0.437)
- The macro f1-score is better for our model (**0.316** vs 0.283)


The residual architecture of resnet allows us to train a deeper model in comparison to vgg, Kiela et. Al reports even better results with a 0.447 f1-micro but they also do experiments with a sooner moment of unfreezing and a lower learning rate (1e-5). 

We can observe clearly that a high learning rate is too much for fine tunning, effectively "damaging" the weights and the learnt representations of resnet.



# Text Classification

As a base model we use a transfomer tokenizer as an embedding layer based on a pretained bert model,

- [BERT word embeddings tutorial](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/)
- [BERT transformers docs](https://huggingface.co/transformers/model_doc/bert.html?highlight=bertmodel)


The benefit of using a BERT embedding is that word vectors are context-aware, instead of always returning the same vector for a word, the vector changes in accordance to the context. Even one word that appears multiple times in one sentence will have different vector representations based on the position and semantic meaning, with appearances with similar semantic meaning having similar vector representations. BERT model is pretained using the english version of Wikipedia.



Following the pre-trained embedding layer we use a single LSTM layer to produce a context vector which is then feed into an fully connected layer with dropout

In [1]:
%run preprocessing.ipynb    
%run utils.ipynb

print('done')

done


In [2]:
import torch.nn as nn
import torch
from transformers import BertModel

class RnnClassifier(nn.Module):
    def __init__(self, dropout_prob, hidden_size, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.rnn = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_labels)
    
    def freeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def unfreeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = True
        
    def forward(self, input_ids, attention_mask):
        (h_t, p_o) = self.bert(input_ids, attention_mask)
        # change to batch second
        h_t = h_t.permute(1, 0, 2)
        out, (h_t, c_t) = self.rnn(h_t)
        # get output for only last state
        out = out[-1] 
        out = self.dropout(out)
        out = self.classifier(out)
        
        return out

In [3]:
args = Args(model_name_or_path='lstm_classifier',
            tokenizer_name='bert-base-uncased',
            text_only=True)
labels = get_mmimdb_labels()

device = torch.device('cuda')      

In [4]:
rnnargs = {'dropout_prob': 0.5, 'num_labels': 23, 'hidden_size': 768}
model = RnnClassifier(**rnnargs)
model = model.to(device)

In [5]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [6]:
train_dataset = load_examples(args, tokenizer)
label_frequences = train_dataset.get_label_frequencies()
label_frequences = [label_frequences[l] for l in labels]
label_weights = (
    torch.tensor(label_frequences, device=device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) 

#we are only training the classifier parameters
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [7]:
from tqdm.notebook import tqdm, trange

from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

text_datasets = {x: load_examples(args, tokenizer, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: torch.utils.data.DataLoader(text_datasets[x], batch_size=32,
                                             shuffle=True, num_workers=0, collate_fn=collate_fn_text_only)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(text_datasets[x]) for x in ['train', 'val', 'test']}

We have to update the training routine since bert embedding needs two inputs (tokenized text and mask)

In [14]:
import time

def train_model(args, model, criterion, optimizer, num_epochs=25, resume=False):
    print('Training', args.model_name_or_path)
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    
    if resume:
        checkpoint = load_checkpoint(args)
        epoch = checkpoint['epoch']
        best_f1 = checkpoint['best_score']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    while epoch < num_epochs: 
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        if epoch == 3:
            model.unfreeze_bert()
            print('Bert Unfrozen')

        

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            
            running_preds = None
            out_label_ids = None

            # Iterate over data.
            for (input_ids, attention_mask), labels in tqdm(dataloaders[phase]):
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * input_ids.size(0)
                if running_preds is None:
                    running_preds =  probas(outputs)
                    out_label_ids = labels.cpu().detach().numpy()
                    print('OK')
                else:
                    running_preds = np.append(running_preds, probas(outputs), axis=0)
                    out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
            print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(
                phase, epoch_loss, epoch_f1))
            
            # Save last epoch model
            if phase == 'val':
                epoch += 1    
                save_checkpoint(args, epoch, best_f1, model, optimizer)
            
            # Save best model
            if phase == 'val' and epoch_f1 > best_f1:
                best_f1 = epoch_f1
                save_checkpoint(args, epoch, best_f1, model, optimizer, best=True)
                
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model


In [None]:
model_rnn = train_model(args, model, criterion, optimizer, exp_lr_scheduler, num_epochs=30, resume=True)

Training lstm_classifier
Epoch 1/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 1.0292 F1_micro: 0.4619


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.9209 F1_micro: 0.4947

Epoch 2/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.8705 F1_micro: 0.4959


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.8346 F1_micro: 0.4811

Epoch 3/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.7883 F1_micro: 0.5165


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.7699 F1_micro: 0.5396

Epoch 4/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.7361 F1_micro: 0.5357


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.7848 F1_micro: 0.5192

Epoch 5/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.6898 F1_micro: 0.5492


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.7412 F1_micro: 0.5703

Epoch 6/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.6560 F1_micro: 0.5611


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.7212 F1_micro: 0.5546

Epoch 7/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.6065 F1_micro: 0.5829


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.7174 F1_micro: 0.5735

Epoch 8/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.6003 F1_micro: 0.5872


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.7106 F1_micro: 0.5676

Epoch 9/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.5926 F1_micro: 0.5883


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 0.7127 F1_micro: 0.5699

Epoch 10/29
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK


In [2]:
import time
from sklearn.metrics import classification_report

def evaluate_model(args, model, criterion, dataloaders, split, load_weights=False, load=False, do_classification_report=False):
    print('evaluate :', args.model_name_or_path)
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    if load or load_weights:
        checkpoint = load_checkpoint(args, best=True)
        best_f1 = checkpoint['best_score']
        print('validation best f1_micro', best_f1)
    if load_weights:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        return

    
    model.eval()   # Set model to evaluate mode

    running_loss = 0.0

    running_preds = None
    out_label_ids = None

    # Iterate over data.
    for (input_ids, attention_mask), labels in tqdm(dataloaders[split]):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        # forward
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # statistics
        running_loss += loss.item() * input_ids.size(0)
        if running_preds is None:
            running_preds =  probas(outputs)
            out_label_ids = labels.cpu().detach().numpy()
        else:
            running_preds = np.append(running_preds, probas(outputs), axis=0)
            out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)

    epoch_loss = running_loss / dataset_sizes[split]
    epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
    print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(split, epoch_loss, epoch_f1))
    if do_classification_report:
        print(classification_report(out_label_ids, running_preds, target_names=get_mmimdb_labels(), digits=3))


    time_elapsed = time.time() - since
    print('Evaluation complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model

In [13]:
_ = evaluate_model(args, model, criterion, dataloaders, 'test', load_weights=True, do_classification_report=True)

evaluate : lstm_classifier
validation best f1_micro 0.5763797540749214


HBox(children=(FloatProgress(value=0.0, max=244.0), HTML(value='')))


test Loss: 0.7126 F1_micro: 0.5710
              precision    recall  f1-score   support

       Crime      0.516     0.760     0.615      1163
       Drama      0.683     0.836     0.752      4142
    Thriller      0.493     0.777     0.603      1567
      Action      0.466     0.782     0.584      1044
      Comedy      0.593     0.687     0.637      2611
     Romance      0.451     0.637     0.528      1590
 Documentary      0.600     0.903     0.721       629
       Short      0.129     0.627     0.214       142
     Mystery      0.288     0.705     0.409       617
     History      0.226     0.812     0.354       345
      Family      0.389     0.736     0.509       518
   Adventure      0.394     0.736     0.513       821
     Fantasy      0.300     0.749     0.428       585
      Sci-Fi      0.432     0.834     0.570       586
     Western      0.584     0.890     0.706       210
      Horror      0.480     0.850     0.614       825
       Sport      0.561     0.817     0.665  

  _warn_prf(average, modifier, msg_start, len(result))


# BertClassifier
For this second model instead of using bert as a word embedding we use the pooling of the word vectors and feed it directly into a linear layer for classification.

In [3]:
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, dropout_prob, num_labels, hidden_size):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_labels)
        
    def freeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def unfreeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = True
    
    def forward(self, input_ids, attention_mask):
        (h_t, p_o) = self.bert(input_ids, attention_mask)
        out = self.dropout(p_o)
        out = self.classifier(out)
        
        return out

In [4]:
args = Args(model_name_or_path='bert_freezing_classifier',
            tokenizer_name='bert-base-uncased',
            text_only=True)
labels = get_mmimdb_labels()

device = torch.device('cuda')      

In [5]:
bertargs = {'dropout_prob': 0.5, 'num_labels': 23, 'hidden_size': 768}
model = BertClassifier(**bertargs)
model = model.to(device)

In [6]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [7]:
train_dataset = load_examples(args, tokenizer)
label_frequences = train_dataset.get_label_frequencies()
label_frequences = [label_frequences[l] for l in labels]
label_weights = (
    torch.tensor(label_frequences, device=device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) 

optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [9]:
from tqdm.notebook import tqdm, trange

from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

text_datasets = {x: load_examples(args, tokenizer, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: torch.utils.data.DataLoader(text_datasets[x], batch_size=32,
                                             shuffle=True, num_workers=0, collate_fn=collate_fn_text_only)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(text_datasets[x]) for x in ['train', 'val', 'test']}

In [15]:
model.freeze_bert()
model_bert = train_model(args, model, criterion, optimizer, num_epochs=40, resume=False)

Training bert_freezing_classifier
Epoch 0/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.2937 F1_micro: 0.2849


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.2619 F1_micro: 0.3770

Epoch 1/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.2793 F1_micro: 0.3032


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.2488 F1_micro: 0.3804

Epoch 2/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 1.2681 F1_micro: 0.3124


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.2362 F1_micro: 0.3958

Epoch 3/39
----------
Bert Unfrozen


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.8613 F1_micro: 0.4773


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 0.7343 F1_micro: 0.5412

Epoch 4/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.6259 F1_micro: 0.5770


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 0.7155 F1_micro: 0.5650

Epoch 5/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.4903 F1_micro: 0.6402


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 0.7817 F1_micro: 0.5764

Epoch 6/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.3987 F1_micro: 0.6904


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 0.7967 F1_micro: 0.6058

Epoch 7/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.3245 F1_micro: 0.7372


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 0.8617 F1_micro: 0.6088

Epoch 8/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.2753 F1_micro: 0.7711


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 0.9502 F1_micro: 0.6033

Epoch 9/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.2415 F1_micro: 0.7979


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.0717 F1_micro: 0.6286

Epoch 10/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.2053 F1_micro: 0.8266


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.1679 F1_micro: 0.6295

Epoch 11/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.1892 F1_micro: 0.8410


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.2628 F1_micro: 0.6214

Epoch 12/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.1617 F1_micro: 0.8642


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.2052 F1_micro: 0.6063

Epoch 13/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.1387 F1_micro: 0.8819


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.4317 F1_micro: 0.6315

Epoch 14/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.1250 F1_micro: 0.8958


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.4908 F1_micro: 0.6281

Epoch 15/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.1186 F1_micro: 0.9013


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.5822 F1_micro: 0.6238

Epoch 16/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.1058 F1_micro: 0.9126


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.6156 F1_micro: 0.6322

Epoch 17/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.0829 F1_micro: 0.9336


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.6171 F1_micro: 0.6225

Epoch 18/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.0840 F1_micro: 0.9322


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.7481 F1_micro: 0.6191

Epoch 19/39
----------


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

OK

train Loss: 0.0890 F1_micro: 0.9273


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

OK

val Loss: 1.8032 F1_micro: 0.6107


KeyboardInterrupt: 

In [10]:
_ = evaluate_model(args, model, criterion, dataloaders, 'test', load_weights=True, do_classification_report=True)

evaluate : bert_freezing_classifier
validation best f1_micro 0.6321589096708713


HBox(children=(FloatProgress(value=0.0, max=244.0), HTML(value='')))


test Loss: 1.6189 F1_micro: 0.6272
              precision    recall  f1-score   support

       Crime      0.562     0.688     0.618      1163
       Drama      0.740     0.785     0.762      4142
    Thriller      0.545     0.660     0.597      1567
      Action      0.542     0.668     0.599      1044
      Comedy      0.645     0.661     0.653      2611
     Romance      0.525     0.480     0.501      1590
 Documentary      0.753     0.820     0.785       629
       Short      0.404     0.282     0.332       142
     Mystery      0.396     0.522     0.450       617
     History      0.398     0.528     0.454       345
      Family      0.534     0.604     0.567       518
   Adventure      0.497     0.622     0.553       821
     Fantasy      0.468     0.544     0.503       585
      Sci-Fi      0.661     0.737     0.697       586
     Western      0.740     0.814     0.776       210
      Horror      0.662     0.719     0.689       825
       Sport      0.580     0.796     0.671  

  _warn_prf(average, modifier, msg_start, len(result))


# Text Classification Results Analysis


## learning rate 1e-3 experiment

The trained model using only the bert representation performs slightly better than applying an LSTM layer to the bert representation. The model with only bert reached 0.58 f1-micro score, nearing to the performance of 0.595 of the original paper. 

This shows that LSTM is able to use bert features, but the ability to use attention is lost on the last layer which could explain the slight drop in performance.

## learning rate 1e-4 experiment

After seeing the performance improvements of the visual model with the new training schedule (frezzing lower model for first epochs and then unfreezing it all) we decided to rerun the experiment for the only bert model, we suspect that bert + lstm model would perfom slightly worse than only bert with this training schedule.

With this adjusment our model outperforms the best text model of the original paper (MaxOutMLP_w2v), the results are as follows:


- The weighted f1-score is better for our model (**0.628** vs 0.588)
- Samples average is better for our model (**0.624** vs 0.592)
- The micro f1-score is better for our model (**0.627** vs 0.595)
- The macro f1-score is better for our model (**0.570** vs 0.488)

# Multimodal Baseline 

For a multimodal baseline we implement a simple concatenation of the best image and text representations and feed them into a single linear layer with dropout. Thus, we obtain a 2048 + 768 dimensions representation for our multimodal classifier **BertConcat**

We train this model with the same freezing schedule, for the first 3 epochs the modal feature extractors are frozen and then they are unfrozen.

In [1]:
%run preprocessing.ipynb    
%run utils.ipynb

print('done')

done


In [2]:
from transformers import BertModel

class BertConcat(nn.Module):
    def __init__(self, visual_model_name, text_model_name):
        super().__init__()
        
        # load the best textual and visual models
        bertargs = {'dropout_prob': 0.5, 'num_labels': 23, 'hidden_size': 768}
        bertclassifier = BertClassifier(**bertargs)
        
        args = Args(model_name_or_path=text_model_name)
        checkpoint = load_checkpoint(args, best=True)
        bertclassifier.load_state_dict(checkpoint['model_state_dict'])
        
        
        args = Args(model_name_or_path=visual_model_name)
        resnetclassifier = FineTunningClassifier(args)
        
        checkpoint = load_checkpoint(args, best=True)
        resnetclassifier.load_state_dict(checkpoint['model_state_dict'])
        
        # use only the base models, ignore the linear layers of each
        in_dimension = resnetclassifier.classifier.in_features + bertclassifier.classifier.in_features
        
        self.bert = bertclassifier.bert
        self.model_conv = resnetclassifier.model_conv
        
        self.dropout = nn.Dropout(args.dropout_prob)
        self.classifier = nn.Linear(in_dimension, args.num_labels)
    
    def forward(self, input_ids, attention_mask, image_tensor):
        conv_out = self.model_conv(image_tensor)
        conv_out = torch.flatten(conv_out, start_dim=1)
        
        (h_t, p_o) = self.bert(input_ids, attention_mask)
        
        concat = torch.cat((conv_out, p_o), dim=1)
        out = self.dropout(concat)
        out = self.classifier(out)
        return out
    
    def freeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def unfreeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = True
            
    def freeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = False
            
    def unfreeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = True


class BertClassifier(nn.Module):
    def __init__(self, dropout_prob, num_labels, hidden_size):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_labels)
        
    def freeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def unfreeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = True
    
    def forward(self, input_ids, attention_mask):
        (h_t, p_o) = self.bert(input_ids, attention_mask)
        out = self.dropout(p_o)
        out = self.classifier(out)
        
        return out
    
class FineTunningClassifier(nn.Module):
    def __init__(self, args):
        super().__init__()
        model_conv = torchvision.models.resnet152(pretrained=True)
        in_features_fc = model_conv.fc.in_features
        modules = list(model_conv.children())[:-1] # remove fc layer
        self.model_conv = nn.Sequential(*modules)
        self.dropout = nn.Dropout(args.dropout_prob)
        self.classifier = nn.Linear(in_features_fc, args.num_labels)
    
    def freeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = False
    def unfreeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = True
        
    def forward(self, x):
        out = self.model_conv(x)
        out = torch.flatten(out, start_dim=1)
        out = self.dropout(out)
        out = self.classifier(out)
    
        return out

In [3]:
args = Args(model_name_or_path='bertconcat', use_transformed_tensors=True)
labels = get_mmimdb_labels()
device = torch.device('cuda') 

In [4]:
model = BertConcat(
    visual_model_name='resnet152finetunning',
    text_model_name='bert_freezing_classifier', 
)
model = model.to(device)

In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [6]:
train_dataset = load_examples(args, tokenizer)
label_frequences = train_dataset.get_label_frequencies()
label_frequences = [label_frequences[l] for l in labels]
label_weights = (
    torch.tensor(label_frequences, device=device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) 

optimizer = optim.Adam(model.parameters(), lr=(5 * 1e-5))

In [7]:
from tqdm.notebook import tqdm, trange

from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

text_datasets = {x: load_examples(args, tokenizer, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: torch.utils.data.DataLoader(text_datasets[x], batch_size=32,
                                             shuffle=True, num_workers=0, collate_fn=collate_fn_modal)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(text_datasets[x]) for x in ['train', 'val', 'test']}

In [2]:
import time

def train_model(args, model, criterion, optimizer, num_epochs=25, resume=False):
    print('Training', args.model_name_or_path)
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    
    if resume:
        checkpoint = load_checkpoint(args)
        epoch = checkpoint['epoch']
        best_f1 = checkpoint['best_score']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    while epoch < num_epochs: 
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        #if epoch == 6:
           # model.unfreeze_bert()
            #model.unfreeze_conv()
            #print('Bert Unfrozen')
        #if epoch == 3:
           # model.unfreeze_conv()

        

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            
            running_preds = None
            out_label_ids = None

            # Iterate over data.
            for (input_ids, attention_mask, image_tensor), labels in tqdm(dataloaders[phase]):
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                image_tensor = image_tensor.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(input_ids, attention_mask, image_tensor)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * input_ids.size(0)
                if running_preds is None:
                    running_preds =  probas(outputs)
                    out_label_ids = labels.cpu().detach().numpy()
                    print('OK')
                else:
                    running_preds = np.append(running_preds, probas(outputs), axis=0)
                    out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
            print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(
                phase, epoch_loss, epoch_f1))
            
            # Save last epoch model
            if phase == 'val':
                epoch += 1    
                save_checkpoint(args, epoch, best_f1, model, optimizer)
            
            # Save best model
            if phase == 'val' and epoch_f1 > best_f1:
                best_f1 = epoch_f1
                save_checkpoint(args, epoch, best_f1, model, optimizer, best=True)
                
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model


In [9]:
model.freeze_bert()
model.freeze_conv()
model_bert = train_model(args, model, criterion, optimizer, num_epochs=24, resume=False)

Training bertconcat
Epoch 0/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.5340 F1_micro: 0.8094


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.0425 F1_micro: 0.6171

Epoch 1/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.1881 F1_micro: 0.9244


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.1929 F1_micro: 0.6293

Epoch 2/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.1241 F1_micro: 0.9392


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.3125 F1_micro: 0.6383

Epoch 3/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0964 F1_micro: 0.9465


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.4271 F1_micro: 0.6388

Epoch 4/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0800 F1_micro: 0.9500


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.5021 F1_micro: 0.6433

Epoch 5/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0698 F1_micro: 0.9533


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.6174 F1_micro: 0.6417

Epoch 6/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0631 F1_micro: 0.9558


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.7095 F1_micro: 0.6429

Epoch 7/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0578 F1_micro: 0.9568


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.7835 F1_micro: 0.6464

Epoch 8/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0534 F1_micro: 0.9593


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.8515 F1_micro: 0.6469

Epoch 9/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0510 F1_micro: 0.9591


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.9370 F1_micro: 0.6448

Epoch 10/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0481 F1_micro: 0.9603


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.9894 F1_micro: 0.6463

Epoch 11/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0458 F1_micro: 0.9615


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.0747 F1_micro: 0.6473

Epoch 12/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0450 F1_micro: 0.9617


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.1518 F1_micro: 0.6470

Epoch 13/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0438 F1_micro: 0.9626


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.1418 F1_micro: 0.6489

Epoch 14/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0423 F1_micro: 0.9625


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.2253 F1_micro: 0.6480

Epoch 15/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0417 F1_micro: 0.9618


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.2731 F1_micro: 0.6482

Epoch 16/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0411 F1_micro: 0.9628


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.3249 F1_micro: 0.6492

Epoch 17/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0397 F1_micro: 0.9635


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.3992 F1_micro: 0.6475

Epoch 18/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0393 F1_micro: 0.9641


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.4462 F1_micro: 0.6482

Epoch 19/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0391 F1_micro: 0.9637


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.4405 F1_micro: 0.6480

Epoch 20/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0386 F1_micro: 0.9636


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.5140 F1_micro: 0.6469

Epoch 21/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0376 F1_micro: 0.9647


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.6156 F1_micro: 0.6469

Epoch 22/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0373 F1_micro: 0.9638


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.6191 F1_micro: 0.6473

Epoch 23/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 0.0371 F1_micro: 0.9648


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 2.6146 F1_micro: 0.6484

Training complete in 71m 40s


In [3]:
from sklearn.metrics import classification_report 

def evaluate_model(args, model, criterion, dataloaders, split, load_weights=False, load=False, do_classification_report=False):
    print('evaluate :', args.model_name_or_path)
    since = time.time()

    best_f1 = 0.0
    epoch = 0
    if load or load_weights:
        checkpoint = load_checkpoint(args, best=True)
        best_f1 = checkpoint['best_score']
        print('validation best f1_micro', best_f1)
    if load_weights:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        return

    
    model.eval()   # Set model to evaluate mode

    running_loss = 0.0

    running_preds = None
    out_label_ids = None

    # Iterate over data.
    for (input_ids, attention_mask, image_tensor), labels in tqdm(dataloaders[split]):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        image_tensor = image_tensor.to(device)
        labels = labels.to(device)
        
        # forward
        outputs = model(input_ids, attention_mask, image_tensor)
        loss = criterion(outputs, labels)

        # statistics
        running_loss += loss.item() * input_ids.size(0)
        if running_preds is None:
            running_preds =  probas(outputs)
            out_label_ids = labels.cpu().detach().numpy()
        else:
            running_preds = np.append(running_preds, probas(outputs), axis=0)
            out_label_ids = np.append(out_label_ids, labels.cpu().detach().numpy(), axis=0)

    epoch_loss = running_loss / dataset_sizes[split]
    epoch_f1 = f1_score(out_label_ids, running_preds, average="micro")
    print('{} Loss: {:.4f} F1_micro: {:.4f}'.format(split, epoch_loss, epoch_f1))
    if do_classification_report:
        print(classification_report(out_label_ids, running_preds, target_names=get_mmimdb_labels(), digits=3))


    time_elapsed = time.time() - since
    print('Evaluation complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights
    
    return model

In [13]:
_ = evaluate_model(args, model, criterion, dataloaders, 'test', load_weights=True, do_classification_report=True)

evaluate : bertconcat
validation best f1_micro 0.6491703892788769


HBox(children=(FloatProgress(value=0.0, max=244.0), HTML(value='')))


test Loss: 2.3809 F1_micro: 0.6420
              precision    recall  f1-score   support

       Crime      0.644     0.547     0.591      1163
       Drama      0.732     0.812     0.770      4142
    Thriller      0.583     0.627     0.605      1567
      Action      0.652     0.586     0.617      1044
      Comedy      0.674     0.705     0.689      2611
     Romance      0.549     0.496     0.521      1590
 Documentary      0.845     0.720     0.778       629
       Short      0.630     0.239     0.347       142
     Mystery      0.488     0.310     0.379       617
     History      0.551     0.313     0.399       345
      Family      0.737     0.529     0.616       518
   Adventure      0.568     0.566     0.567       821
     Fantasy      0.532     0.444     0.484       585
      Sci-Fi      0.755     0.695     0.724       586
     Western      0.828     0.733     0.778       210
      Horror      0.779     0.670     0.721       825
       Sport      0.736     0.555     0.633  

  _warn_prf(average, modifier, msg_start, len(result))


## Multimodal LstmConcat
We do the same process but changing BertClassifer with Lstmclassifier

In [4]:
from transformers import BertModel

class LstmConcat(nn.Module):
    def __init__(self, visual_model_name, text_model_name):
        super().__init__()
        
        # load the best textual and visual models
        bertargs = {'dropout_prob': 0.5, 'num_labels': 23, 'hidden_size': 768}
        bertclassifier = RnnClassifier(**bertargs)
        
        args = Args(model_name_or_path=text_model_name)
        checkpoint = load_checkpoint(args, best=True)
        bertclassifier.load_state_dict(checkpoint['model_state_dict'])
        
        
        args = Args(model_name_or_path=visual_model_name)
        resnetclassifier = FineTunningClassifier(args)
        
        checkpoint = load_checkpoint(args, best=True)
        resnetclassifier.load_state_dict(checkpoint['model_state_dict'])
        
        # use only the base models, ignore the linear layers of each
        in_dimension = resnetclassifier.classifier.in_features + bertclassifier.classifier.in_features
        
        self.bert = bertclassifier.bert
        self.rnn = bertclassifier.rnn
        self.model_conv = resnetclassifier.model_conv
        
        self.dropout = nn.Dropout(args.dropout_prob)
        self.classifier = nn.Linear(in_dimension, args.num_labels)
    
    def forward(self, input_ids, attention_mask, image_tensor):
        conv_out = self.model_conv(image_tensor)
        conv_out = torch.flatten(conv_out, start_dim=1)
        
        (h_t, p_o) = self.bert(input_ids, attention_mask)
        # change to batch second
        h_t = h_t.permute(1, 0, 2)
        text_out, (h_t, c_t) = self.rnn(h_t)
        # get output for only last state
        text_out = text_out[-1] 

        concat = torch.cat((conv_out, text_out), dim=1)
        out = self.dropout(concat)
        out = self.classifier(out)
        return out
    
    def freeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def unfreeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = True
            
    def freeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = False
            
    def unfreeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = True

class RnnClassifier(nn.Module):
    def __init__(self, dropout_prob, hidden_size, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.rnn = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_labels)
    
    def freeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def unfreeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = True
        
    def forward(self, input_ids, attention_mask):
        (h_t, p_o) = self.bert(input_ids, attention_mask)
        # change to batch second
        h_t = h_t.permute(1, 0, 2)
        out, (h_t, c_t) = self.rnn(h_t)
        # get output for only last state
        out = out[-1] 
        out = self.dropout(out)
        out = self.classifier(out)
        
        return out

class FineTunningClassifier(nn.Module):
    def __init__(self, args):
        super().__init__()
        model_conv = torchvision.models.resnet152(pretrained=True)
        in_features_fc = model_conv.fc.in_features
        modules = list(model_conv.children())[:-1] # remove fc layer
        self.model_conv = nn.Sequential(*modules)
        self.dropout = nn.Dropout(args.dropout_prob)
        self.classifier = nn.Linear(in_features_fc, args.num_labels)
    
    def freeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = False
    def unfreeze_conv(self):
        for param in self.model_conv.parameters():
            param.requires_grad = True
        
    def forward(self, x):
        out = self.model_conv(x)
        out = torch.flatten(out, start_dim=1)
        out = self.dropout(out)
        out = self.classifier(out)
    
        return out

In [5]:
args = Args(model_name_or_path='lstmconcat', use_transformed_tensors=True)
labels = get_mmimdb_labels()
device = torch.device('cuda') 

In [6]:
model = LstmConcat(
    visual_model_name='resnet152finetunning',
    text_model_name='lstm_classifier', 
)
model = model.to(device)

In [7]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
train_dataset = load_examples(args, tokenizer)
label_frequences = train_dataset.get_label_frequencies()
label_frequences = [label_frequences[l] for l in labels]
label_weights = (
    torch.tensor(label_frequences, device=device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) 

optimizer = optim.Adam(model.parameters(), lr=(5 * 1e-5))

In [9]:
from tqdm.notebook import tqdm, trange

from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

text_datasets = {x: load_examples(args, tokenizer, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: torch.utils.data.DataLoader(text_datasets[x], batch_size=32,
                                             shuffle=True, num_workers=0, collate_fn=collate_fn_modal)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(text_datasets[x]) for x in ['train', 'val', 'test']}

In [14]:
model.freeze_bert()
model.freeze_conv()
model_bert = train_model(args, model, criterion, optimizer, num_epochs=24, resume=False)

Training lstmconcat
Epoch 0/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 1.3510 F1_micro: 0.2080


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.3313 F1_micro: 0.2340

Epoch 1/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 1.3517 F1_micro: 0.2063


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK

val Loss: 1.3317 F1_micro: 0.2358

Epoch 2/23
----------


HBox(children=(FloatProgress(value=0.0, max=486.0), HTML(value='')))

OK

train Loss: 1.3536 F1_micro: 0.2083


HBox(children=(FloatProgress(value=0.0, max=82.0), HTML(value='')))

OK



KeyboardInterrupt: 

In [11]:
# Create Dataloaders with less batch size for test set, for some reason it runs out of memory otherwise.
from tqdm.notebook import tqdm, trange

from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
torch.cuda.empty_cache()
text_datasets = {x: load_examples(args, tokenizer, split=x) for x in ['train', 'val', 'test']}
dataloaders = {x: torch.utils.data.DataLoader(text_datasets[x], batch_size=16,
                                             shuffle=True, num_workers=0, collate_fn=collate_fn_modal)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(text_datasets[x]) for x in ['train', 'val', 'test']}

In [12]:
model.freeze_bert()
model.freeze_conv()
_ = evaluate_model(args, model, criterion, dataloaders, 'test', load_weights=True, do_classification_report=True)

evaluate : lstmconcat
validation best f1_micro 0.23580265315248178


HBox(children=(FloatProgress(value=0.0, max=488.0), HTML(value='')))


test Loss: 1.3299 F1_micro: 0.2322
              precision    recall  f1-score   support

       Crime      0.054     0.002     0.003      1163
       Drama      0.512     0.720     0.599      4142
    Thriller      0.000     0.000     0.000      1567
      Action      0.126     0.199     0.154      1044
      Comedy      0.483     0.366     0.416      2611
     Romance      0.188     0.002     0.004      1590
 Documentary      0.031     0.002     0.003       629
       Short      0.000     0.000     0.000       142
     Mystery      0.000     0.000     0.000       617
     History      0.030     0.348     0.056       345
      Family      0.000     0.000     0.000       518
   Adventure      0.080     0.125     0.098       821
     Fantasy      0.071     0.326     0.117       585
      Sci-Fi      0.030     0.038     0.033       586
     Western      0.003     0.005     0.004       210
      Horror      0.005     0.001     0.002       825
       Sport      0.000     0.000     0.000  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
