# Generate Visual Property Norms results


We wish to investigate the effect of different adaptations to a text-only input for pre-trained vision-and-language (VL) models.

This notebook is used for generating adaptation results on Visual Property Norms (VPN) for the BERT-base, LXMERT, VisualBERT and CLIP-BERT models. Visual Property Norms is a text-only task that more or less evaluates visual commonsense knowledge. LXMERT, VisualBERT and CLIP-BERT are VL models.

The notebook is segmented into 8 main parts in which we evaluate different models and adaptations of these on VPN:
1. All models as they are, with no or small adaptations to text-only input, queried without visual features
2. BERT-base trained on Wikipedia or LXMERT text training data with no adaptation (used as a baseline)
3. All VL models adapted to text-only input through finetuning on Wikipedia or LXMERT text training data, queried without visual features
4. All VL models queried with constant average visual features from their training sets
5. All VL models queried with constant visual features from a black image
6. All VL models queried with constant visual features that are zeroes
7. All VL models queried with constant visual features that have been finetuned to either Wikipedia or LXMERT text train data
8. CLIP-BERT queried with visual features generated by CLIP based on the textual query ("imagined visual features")

For each model and adaptation we save the results on VPN in a file.

Lastly, all of the model and adaptation results are aggregated into one dataframe and saved to a file.

## Setup packages and save file

Move to root folder

In [None]:
%cd ..

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, BertConfig, CLIPModel, CLIPProcessor, VisualBertForPreTraining, VisualBertConfig, LxmertForPreTraining, LxmertConfig
import torch
import pandas as pd
import copy
import os

from models.src.clip_bert.modeling_bert import BertImageForMaskedLM
from models.src.lxmert.alterations import LxmertLanguageOnlyXLayer

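# import support functions
# NOTE(review): get_save_filename is called in the cells below but is not imported here - confirm where it is defined and import it.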
from visual_property_norms.src.utils import get_model_results, get_model_preds_for_questions, get_clip_bert_model, get_clip_bert_preds_for_questions, get_lxmert_preds_for_questions, get_visualbert_preds_for_questions

In [None]:
# Folder that every per-model results CSV (and the aggregated results.csv) is written to.
# The path is relative, so it resolves against the current working directory.
SAVE_FOLDER = "visual_property_norms/data/results"

## 1. Evaluate all models as they are (more or less)

Measure with MAP. Report results per 1) pf split and 2) feature starter.

### BERT

In [None]:
# Text-only baseline: the pretrained bert-base-uncased masked LM, evaluated as is.
model_name = "BERT-base"
adaptation = "default"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def get_preds(questions):
    """Query the model on `questions` via get_model_preds_for_questions."""
    return get_model_preds_for_questions(model, tokenizer, questions, batch_size=128)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### CLIP-BERT

In [None]:
# CLIP-BERT with its released weights, queried with no_visual_prediction=True.
model_name = "CLIP-BERT"
adaptation = "default"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()


def get_preds(questions):
    """Query CLIP-BERT on `questions` via get_clip_bert_preds_for_questions."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=128,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT

In [None]:
# Pretrained VisualBERT queried through the generic text-only prediction helper.
model_name = "VisualBERT"
adaptation = "default"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model.eval()


def get_preds(questions):
    """Query VisualBERT on `questions` via get_model_preds_for_questions."""
    return get_model_preds_for_questions(model, tokenizer, questions, batch_size=128)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### LXMERT

In [None]:
# Pretrained LXMERT with its cross-modality layers swapped for language-only ones.
model_name = "LXMERT"
adaptation = "default"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
# Snapshot the pretrained encoder before its x_layers are replaced, so that the
# pretrained weights can be loaded back into the rebuilt encoder afterwards.
prev_encoder = copy.deepcopy(model.lxmert.encoder)
model.lxmert.encoder.x_layers = torch.nn.ModuleList(
    [
        LxmertLanguageOnlyXLayer(model.lxmert.encoder.config)
        for _ in range(model.lxmert.encoder.config.x_layers)
    ]
)
model.lxmert.encoder.load_state_dict(prev_encoder.state_dict())
model.eval()


def get_preds(questions):
    """Query LXMERT on `questions` via get_lxmert_preds_for_questions."""
    return get_lxmert_preds_for_questions(model, tokenizer, questions, batch_size=128)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## 2. BERT-base trained on Wikipedia or LXMERT text training data

### Evaluate BERT trained on LXMERT data from pretrained weights

In [None]:
# BERT-base checkpoint trained on LXMERT text data (starting from pretrained weights).
model_name = "BERT-base"
adaptation = "trained-LXMERT"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertImageForMaskedLM(config)
# strict=False: keys that do not match between checkpoint and model are tolerated.
model.load_state_dict(
    torch.load(
        "models/data/model-weights/bert-lxmert-trained/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Query the model on `questions` via get_model_preds_for_questions."""
    return get_model_preds_for_questions(model, tokenizer, questions, batch_size=256)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### Evaluate BERT trained on LXMERT data from scratch

In [None]:
# BERT-base checkpoint trained on LXMERT text data from scratch.
model_name = "BERT-base"
adaptation = "trained-LXMERT-scratch"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertImageForMaskedLM(config)
# strict=False: keys that do not match between checkpoint and model are tolerated.
model.load_state_dict(
    torch.load(
        "models/data/model-weights/bert-lxmert-trained-scratch/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Query the model on `questions` via get_model_preds_for_questions."""
    return get_model_preds_for_questions(model, tokenizer, questions, batch_size=128)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### BERT-base trained on Wikipedia (same size as LXMERT data)

In [None]:
# BERT-base checkpoint trained on Wikipedia text.
model_name = "BERT-base"
adaptation = "trained-Wikipedia"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertImageForMaskedLM(config)
# strict=False: keys that do not match between checkpoint and model are tolerated.
model.load_state_dict(
    torch.load(
        "models/data/model-weights/bert-wikipedia-trained/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Query the model on `questions` via get_model_preds_for_questions."""
    return get_model_preds_for_questions(model, tokenizer, questions, batch_size=128)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## 3. All VL models finetuned on Wikipedia or LXMERT text training data

### LXMERT on unimodal Wikipedia

In [None]:
# LXMERT with language-only cross layers, loaded from the Wikipedia finetuning run.
model_name = "LXMERT"
adaptation = "no-visual-features-finetuned-Wikipedia"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

config = LxmertConfig.from_pretrained("unc-nlp/lxmert-base-uncased")
model = LxmertForPreTraining(config)
# The x_layers are replaced before loading so the checkpoint is loaded into the altered architecture.
model.lxmert.encoder.x_layers = torch.nn.ModuleList(
    [
        LxmertLanguageOnlyXLayer(model.lxmert.encoder.config)
        for _ in range(model.lxmert.encoder.config.x_layers)
    ]
)
model.load_state_dict(
    torch.load(
        "adaptations/data/runs/finetune/lxmert-wikipedia/best_global_step1200/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Query LXMERT on `questions` via get_lxmert_preds_for_questions."""
    return get_lxmert_preds_for_questions(model, tokenizer, questions, batch_size=256)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT on unimodal Wikipedia

In [None]:
# VisualBERT loaded from the Wikipedia finetuning run, queried through the generic helper.
model_name = "VisualBERT"
adaptation = "no-visual-features-finetuned-Wikipedia"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

config = VisualBertConfig.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model = VisualBertForPreTraining(config)
# strict=False: keys that do not match between checkpoint and model are tolerated.
model.load_state_dict(
    torch.load(
        "adaptations/data/runs/finetune/visualbert-wikipedia/best_global_step440/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Query VisualBERT on `questions` via get_model_preds_for_questions."""
    return get_model_preds_for_questions(model, tokenizer, questions, batch_size=256)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### CLIP-BERT on unimodal Wikipedia

In [None]:
# CLIP-BERT loaded from the Wikipedia finetuning run, queried with no_visual_prediction=True.
model_name = "CLIP-BERT"
adaptation = "no-visual-features-finetuned-Wikipedia"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "adaptations/data/runs/finetune/clip-bert-wikipedia/best_global_step120/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()


def get_preds(questions):
    """Query CLIP-BERT on `questions` via get_clip_bert_preds_for_questions."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=256,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### LXMERT on unimodal LXMERT

In [None]:
# LXMERT with language-only cross layers, loaded from the LXMERT-text finetuning run.
model_name = "LXMERT"
adaptation = "no-visual-features-finetuned-LXMERT"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

config = LxmertConfig.from_pretrained("unc-nlp/lxmert-base-uncased")
model = LxmertForPreTraining(config)
# The x_layers are replaced before loading so the checkpoint is loaded into the altered architecture.
model.lxmert.encoder.x_layers = torch.nn.ModuleList(
    [
        LxmertLanguageOnlyXLayer(model.lxmert.encoder.config)
        for _ in range(model.lxmert.encoder.config.x_layers)
    ]
)
model.load_state_dict(
    torch.load(
        "adaptations/data/runs/finetune/lxmert-lxmert/best_global_step400/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Query LXMERT on `questions` via get_lxmert_preds_for_questions."""
    return get_lxmert_preds_for_questions(model, tokenizer, questions, batch_size=128)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT on unimodal LXMERT

In [None]:
# VisualBERT loaded from the LXMERT-text finetuning run, queried through the generic helper.
model_name = "VisualBERT"
adaptation = "no-visual-features-finetuned-LXMERT"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

config = VisualBertConfig.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model = VisualBertForPreTraining(config)
# strict=False: keys that do not match between checkpoint and model are tolerated.
model.load_state_dict(
    torch.load(
        "adaptations/data/runs/finetune/visualbert-lxmert/best_global_step600/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Query VisualBERT on `questions` via get_model_preds_for_questions."""
    return get_model_preds_for_questions(model, tokenizer, questions, batch_size=128)


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### CLIP-BERT on unimodal LXMERT

In [None]:
# CLIP-BERT loaded from the LXMERT-text finetuning run, queried with no_visual_prediction=True.
model_name = "CLIP-BERT"
adaptation = "no-visual-features-finetuned-LXMERT"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "adaptations/data/runs/finetune/clip-bert-lxmert/best_global_step320/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()


def get_preds(questions):
    """Query CLIP-BERT on `questions` via get_clip_bert_preds_for_questions."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=128,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## 4. Use average visual features filler

### LXMERT

In [None]:
# Unmodified pretrained LXMERT, fed the same stored average features/boxes for every query.
model_name = "LXMERT"
adaptation = "avg-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
model.eval()

# Original note: the features have dimension (36, 2048).
visual_features = torch.load("adaptations/data/avg-visual-features/frcnn_features_per_detection.pt")
visual_boxes = torch.load("adaptations/data/avg-visual-features/frcnn_boxes_per_detection.pt")


def get_preds(questions):
    """Query LXMERT on `questions`, passing the constant visual features and boxes."""
    return get_lxmert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=128,
        visual_features=visual_features,
        visual_boxes=visual_boxes,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT

In [None]:
# Display the pretrained VisualBERT config as the cell output, so that its
# visual embedding dimension can be checked against the available feature files.
VisualBertConfig.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

Note from the config above that the VisualBERT image data from the VisualBERT repo is not compatible with VisualBERT on Hugging Face: the visual embedding dimension of the data from the VisualBERT repo is 1024 (features of shape (150, 1024)), while the VisualBERT config above specifies 2048. We therefore use the LXMERT visual features instead, which should come from the same backbone.

In [None]:
# Pretrained VisualBERT, fed the same stored average features for every query.
model_name = "VisualBERT"
adaptation = "avg-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model.eval()

# This is the LXMERT feature file, reused here for VisualBERT.
visual_features = torch.load("adaptations/data/avg-visual-features/frcnn_features_per_detection.pt")


def get_preds(questions):
    """Query VisualBERT on `questions`, passing the constant visual features."""
    return get_visualbert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### CLIP-BERT

In [None]:
# Released CLIP-BERT, fed the same stored average CLIP feature vector for every query.
model_name = "CLIP-BERT"
adaptation = "avg-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()

# Original note: the features have dimension (512,).
visual_features = torch.load("adaptations/data/avg-visual-features/clip_features.pt")


def get_preds(questions):
    """Query CLIP-BERT on `questions`, passing the constant visual features."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## 5. Use feature vector from black image as visual features

### LXMERT

In [None]:
# Unmodified pretrained LXMERT, fed the stored "zero-image" features/boxes for every query.
model_name = "LXMERT"
adaptation = "zero-image-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

batch_size = 64
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
model.eval()

visual_features = torch.load("adaptations/data/zero-image-visual-features/frcnn_features.pt")
visual_boxes = torch.load("adaptations/data/zero-image-visual-features/frcnn_boxes.pt")


def get_preds(questions):
    """Query LXMERT on `questions`, passing the constant visual features and boxes."""
    return get_lxmert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=batch_size,
        visual_features=visual_features,
        visual_boxes=visual_boxes,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT

In [None]:
# Pretrained VisualBERT, fed the stored "zero-image" features for every query.
model_name = "VisualBERT"
adaptation = "zero-image-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model.eval()

# Original note: VisualBERT's own features have dimension (150, 1024);
# the LXMERT feature file is used here instead.
visual_features = torch.load("adaptations/data/zero-image-visual-features/frcnn_features.pt")


def get_preds(questions):
    """Query VisualBERT on `questions`, passing the constant visual features."""
    return get_visualbert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### CLIP-BERT

In [None]:
# Released CLIP-BERT, fed the stored "zero-image" CLIP feature vector for every query.
model_name = "CLIP-BERT"
adaptation = "zero-image-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()

# Original note: the features have dimension (512,).
visual_features = torch.load("adaptations/data/zero-image-visual-features/clip_features.pt")


def get_preds(questions):
    """Query CLIP-BERT on `questions`, passing the constant visual features."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## 6. Use zero vectors as visual features

### LXMERT

In [None]:
# Unmodified pretrained LXMERT, fed all-zero features and boxes for every query.
model_name = "LXMERT"
adaptation = "zeroed-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

batch_size = 64
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
model.eval()

# Zero tensors of shape (36, 2048) for features and (36, 4) for boxes.
visual_features = torch.zeros((36, 2048))
visual_boxes = torch.zeros((36, 4))


def get_preds(questions):
    """Query LXMERT on `questions`, passing the all-zero visual features and boxes."""
    return get_lxmert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=batch_size,
        visual_features=visual_features,
        visual_boxes=visual_boxes,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT

In [None]:
# Pretrained VisualBERT, fed all-zero features for every query.
model_name = "VisualBERT"
adaptation = "zeroed-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model.eval()

# Original note: VisualBERT's own features have dimension (150, 1024);
# the zeros below use the (36, 2048) shape of the LXMERT features instead.
visual_features = torch.zeros((36, 2048))


def get_preds(questions):
    """Query VisualBERT on `questions`, passing the all-zero visual features."""
    return get_visualbert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### CLIP-BERT

In [None]:
# Released CLIP-BERT, fed an all-zero feature vector for every query.
model_name = "CLIP-BERT"
adaptation = "zeroed-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()

# A zero vector of shape (512,).
visual_features = torch.zeros((512,))


def get_preds(questions):
    """Query CLIP-BERT on `questions`, passing the all-zero visual features."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## 7. Use fine-tuned visual feature fillers

### CLIP-BERT - tuned on LXMERT-finetune (9,500 train samples)

In [None]:
# Released CLIP-BERT, fed the visual feature vector stored by the LXMERT-text feature-tuning run.
model_name = "CLIP-BERT"
adaptation = "finetuned-LXMERT-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()

# Original note: the features have dimension (512,).
visual_features = torch.load("adaptations/data/runs/finetune-visual-features/clip-bert-lxmert/best_global_step_features4520.pt")


def get_preds(questions):
    """Query CLIP-BERT on `questions`, passing the tuned visual features."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### CLIP-BERT - tuned on Wikipedia-finetune

In [None]:
# Released CLIP-BERT, fed the visual feature vector stored by the Wikipedia feature-tuning run.
model_name = "CLIP-BERT"
adaptation = "finetuned-Wikipedia-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)
model.eval()

# Original note: the features have dimension (512,).
visual_features = torch.load("adaptations/data/runs/finetune-visual-features/clip-bert-wikipedia/best_global_step_features75600.pt")


def get_preds(questions):
    """Query CLIP-BERT on `questions`, passing the tuned visual features."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=True,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### LXMERT - tuned on LXMERT-finetune

In [None]:
# Unmodified pretrained LXMERT, fed the features/boxes stored by the LXMERT-text feature-tuning run.
model_name = "LXMERT"
adaptation = "finetuned-LXMERT-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

batch_size = 64
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
model.eval()

visual_features = torch.load("adaptations/data/runs/finetune-visual-features/lxmert-lxmert/best_global_step_features70080.pt")
visual_boxes = torch.load("adaptations/data/runs/finetune-visual-features/lxmert-lxmert/best_global_step_boxes70080.pt")


def get_preds(questions):
    """Query LXMERT on `questions`, passing the tuned visual features and boxes."""
    return get_lxmert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=batch_size,
        visual_features=visual_features,
        visual_boxes=visual_boxes,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### LXMERT - tuned on Wikipedia-finetune

In [None]:
# Unmodified pretrained LXMERT, fed the features/boxes stored by the Wikipedia feature-tuning run.
model_name = "LXMERT"
adaptation = "finetuned-Wikipedia-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

batch_size = 64
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
model.eval()

visual_features = torch.load("adaptations/data/runs/finetune-visual-features/lxmert-wikipedia/best_global_step_features72320.pt")
visual_boxes = torch.load("adaptations/data/runs/finetune-visual-features/lxmert-wikipedia/best_global_step_boxes72320.pt")


def get_preds(questions):
    """Query LXMERT on `questions`, passing the tuned visual features and boxes."""
    return get_lxmert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=batch_size,
        visual_features=visual_features,
        visual_boxes=visual_boxes,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT - tuned on LXMERT-finetune

In [None]:
# Pretrained VisualBERT, fed the features stored by the LXMERT-text feature-tuning run.
model_name = "VisualBERT"
adaptation = "finetuned-LXMERT-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model.eval()

# Original note: VisualBERT's own features have dimension (150, 1024);
# the loaded features are LXMERT shaped instead.
visual_features = torch.load("adaptations/data/runs/finetune-visual-features/visualbert-lxmert/best_global_step_features5920.pt")


def get_preds(questions):
    """Query VisualBERT on `questions`, passing the tuned visual features."""
    return get_visualbert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

### VisualBERT - tuned on Wikipedia-finetune

In [None]:
# Pretrained VisualBERT, fed the features stored by the Wikipedia feature-tuning run.
model_name = "VisualBERT"
adaptation = "finetuned-Wikipedia-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model.eval()

# Original note: VisualBERT's own features have dimension (150, 1024);
# the loaded features are LXMERT shaped instead.
visual_features = torch.load("adaptations/data/runs/finetune-visual-features/visualbert-wikipedia/best_global_step_features4400.pt")


def get_preds(questions):
    """Query VisualBERT on `questions`, passing the tuned visual features."""
    return get_visualbert_preds_for_questions(
        model,
        tokenizer,
        questions,
        batch_size=128,
        visual_features=visual_features,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## 8. Imagined visual features

### CLIP-BERT

In [None]:
# Released CLIP-BERT with no_visual_prediction=False: no constant features are
# passed, so the helper is left to supply the visual input itself.
model_name = "CLIP-BERT"
adaptation = "imagined-visual-features"
save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))

model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=False)
model.eval()


def get_preds(questions):
    """Query CLIP-BERT on `questions` via get_clip_bert_preds_for_questions."""
    return get_clip_bert_preds_for_questions(
        model,
        clip_model,
        clip_processor,
        questions,
        tokenizer,
        no_visual_prediction=False,
        batch_size=128,
    )


results = get_model_results(get_preds, tokenizer)
results.to_csv(save_filename, index=False)

## Aggregate results

In [None]:
# Every model name and adaptation label used when saving per-run results above.
# Not every (model, adaptation) pair was evaluated; missing files are reported
# and skipped below.
model_names = [
    "BERT-base",
    "CLIP-BERT",
    "LXMERT",
    "VisualBERT",
]
adaptations = [
    "default",
    "trained-LXMERT",
    "trained-LXMERT-scratch",
    "trained-Wikipedia",
    "no-visual-features-finetuned-LXMERT",
    "no-visual-features-finetuned-Wikipedia",
    "avg-visual-features",
    "zero-image-visual-features",
    "zeroed-visual-features",
    "finetuned-LXMERT-visual-features",
    "finetuned-Wikipedia-visual-features",
    "imagined-visual-features",
]

# Collect the per-run frames and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0 (and copied the accumulated frame
# on every call), so pd.concat is used instead.
frames = []
for model_name in model_names:
    for adaptation in adaptations:
        save_filename = os.path.join(SAVE_FOLDER, get_save_filename(model_name, adaptation))
        try:
            tmp = pd.read_csv(save_filename)
        except Exception as e:
            # Deliberately best-effort: report the unreadable file and move on.
            print(f"Couldn't read results from model {model_name} with adaptation {adaptation}")
            print("This is expected if the model was never evaluated with this adaptation.")
            print(e)
        else:
            # Tag each row with the run it came from before aggregating.
            tmp["model"] = model_name
            tmp["adaptation"] = adaptation
            frames.append(tmp)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (matching the original result when no results file could be read).
results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Write the aggregated dataframe to results.csv in SAVE_FOLDER, without the index column.
results.to_csv(os.path.join(SAVE_FOLDER, "results.csv"), index=False)