# Generate Visual Property Norms results


Move to root folder

In [None]:
%cd ..

In [2]:
from transformers import BertTokenizer, BertForMaskedLM, BertConfig, CLIPModel, CLIPProcessor, VisualBertForPreTraining, LxmertForPreTraining
import torch
from torch.utils.data import DataLoader
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import copy
from sklearn.metrics import average_precision_score

# TODO: make sure that paths work
from models.src.clip_bert.modeling_bert import BertImageForMaskedLM
from models.src.lxmert.alterations import LxmertLanguageOnlyXLayer

In [3]:
# Folder with the query files; each file holds one JSON object per line.
QUERIES_FOLDER = "visual_property_norms/data/queries"

# Labels (one per line) that restrict the vocabulary for the masked MAP score.
with open("visual_property_norms/data/labels.txt", "r") as f:
    MASK_LABELS = list(map(str.strip, f))

def get_model_preds_for_questions(model, tokenizer, questions):
    """Return the model's logits at the mask position of every question.

    The questions are tokenized and fed to ``model`` in batches of 64 without
    gradient tracking. The model output must expose its logits under either
    ``"logits"`` or ``"prediction_logits"``.

    Args:
        model: Masked language model; moved to CUDA when available.
        tokenizer: Tokenizer providing ``mask_token_id`` and returning an
            encoding with ``input_ids`` that can be unpacked into ``model``.
        questions: Sequence of query strings, each containing exactly one
            mask token.

    Returns:
        Tensor with one row per question and one column per entry of the
        last logits dimension.

    Raises:
        AssertionError: If any example does not contain exactly one mask token.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    dataloader = DataLoader(questions, batch_size=64, shuffle=False)
    preds = []
    with torch.no_grad():
        for questions_batch in dataloader:
            inputs = tokenizer(questions_batch, return_tensors="pt", padding=True).to(device)

            # Check every row separately: comparing only the total number of
            # masks with the batch size would accept e.g. one example with no
            # mask plus one with two, and silently misalign the predictions.
            is_mask = inputs.input_ids == tokenizer.mask_token_id
            assert bool((is_mask.sum(dim=1) == 1).all()), "ERROR: Expected exactly one [MASK] token per example"
            mask_idx = is_mask.nonzero(as_tuple=False)[:, 1][:, None]

            # Run the forward pass once and reuse its output for the key lookup.
            model_output = model(**inputs)
            if "logits" in model_output:
                outputs = model_output["logits"]
            else:
                outputs = model_output["prediction_logits"]

            # Select, for each example, the logits row at its mask position.
            pred = outputs.gather(1, mask_idx.repeat(1, outputs.shape[-1]).unsqueeze(1)).squeeze(1)
            preds.append(pred)

    return torch.cat(preds)

def get_map_score_for_preds(labels, pred, tokenizer):
    """Compute one average-precision score per query over the full vocabulary.

    Args:
        labels: Per-query lists of gold label tokens; every token must be a
            key of ``tokenizer.get_vocab()``.
        pred: Per-query score vectors whose length equals
            ``tokenizer.vocab_size``.
        tokenizer: Tokenizer providing ``vocab_size`` and ``get_vocab()``.

    Returns:
        List with the average precision of each query, in input order.
    """
    scores = []
    assert pred[0].shape[0] == tokenizer.vocab_size
    token_to_id = tokenizer.get_vocab()
    vocab_size = tokenizer.vocab_size

    for query_ix, gold_labels in enumerate(labels):
        # Binary relevance vector: 1 at the vocabulary index of each gold label.
        y_true = [0] * vocab_size
        for gold_label in gold_labels:
            y_true[token_to_id[gold_label]] = 1
        query_score = average_precision_score(y_true, pred[query_ix])
        scores.append(query_score)

    return scores

def get_map_score_for_masked_preds(labels, pred, tokenizer, mask_labels):
    """Compute per-query average precision restricted to ``mask_labels``.

    Only the prediction scores of the tokens listed in ``mask_labels`` are
    ranked; every other vocabulary entry is ignored.

    Args:
        labels: Per-query lists of gold label tokens; each must be present in
            ``mask_labels``.
        pred: Per-query score vectors of length ``tokenizer.vocab_size`` that
            support indexing with a list of positions.
        tokenizer: Tokenizer providing ``vocab_size`` and ``get_vocab()``.
        mask_labels: Tokens that make up the restricted candidate set.

    Returns:
        List with the average precision of each query, in input order.
    """
    scores = []
    assert pred[0].shape[0] == tokenizer.vocab_size
    token_to_id = tokenizer.get_vocab()
    mask_ix = [token_to_id[candidate] for candidate in mask_labels]
    nbr_candidates = len(mask_ix)

    for query_ix, gold_labels in enumerate(labels):
        # Positions (within the candidate list) of this query's gold labels.
        gold_positions = {mask_ix.index(token_to_id[gold_label]) for gold_label in gold_labels}
        y_true = [1 if position in gold_positions else 0 for position in range(nbr_candidates)]
        candidate_scores = pred[query_ix][mask_ix]
        scores.append(average_precision_score(y_true, candidate_scores))

    return scores

def visualize_predictions(pred, questions, labels, tokenizer, num):
    """Print question, gold labels and top-20 predictions for random examples.

    Args:
        pred: Per-question score tensors supporting ``topk``.
        questions: Question strings aligned with ``pred``.
        labels: Gold labels aligned with ``pred``.
        tokenizer: Tokenizer used to decode the predicted token ids.
        num: Number of distinct examples to sample without replacement.
    """
    separator = "-------------------------------"
    sampled_ix = np.random.choice(len(pred), num, replace=False)
    for example_ix in sampled_ix:
        print(separator)
        print(f"Question: {questions[example_ix]}")
        print(f"Golden labels: {labels[example_ix]}")
        top_token_ids = pred[example_ix].topk(k=20).indices
        print(f"Predicted labels: {tokenizer.decode(top_token_ids)}")
        print(separator)
        
def update_results_with_model_preds(results, get_preds, query_files, tokenizer, model_id=None):
    """Score a model on every query file and add one result row per query.

    For each file in ``query_files`` (read from ``QUERIES_FOLDER``, one JSON
    object per line) the predictions from ``get_preds`` are scored with
    ``get_map_score_for_preds`` and ``get_map_score_for_masked_preds``.

    Args:
        results: Data frame holding previously collected rows; it is not
            modified in place.
        get_preds: Callable mapping a list of query strings to a tensor of
            per-query scores over the vocabulary.
        query_files: Names of the query files inside ``QUERIES_FOLDER``.
        tokenizer: Tokenizer used for scoring and for decoding the top-10
            predicted token ids.
        model_id: Name stored in the ``model`` column. When omitted, the
            module-level variable ``model_name`` is used, which is how the
            existing notebook cells call this function.

    Returns:
        A new data frame containing the rows of ``results`` followed by the
        newly created rows.

    Raises:
        AssertionError: If a row with the same (model, concept,
            feature_starter, query_template, pf) key already exists.
    """
    if model_id is None:
        # Backward compatible fallback to the notebook-level global.
        model_id = model_name

    # Track existing keys in a set instead of filtering the whole frame for
    # every new row.
    key_columns = ["model", "concept", "feature_starter", "query_template", "pf"]
    seen_keys = set(results[key_columns].itertuples(index=False, name=None))

    new_rows = []
    for query_file in tqdm(query_files):
        with open(os.path.join(QUERIES_FOLDER, query_file)) as f:
            examples = [json.loads(line) for line in f]
        if not examples:
            # An empty query file contributes no rows.
            continue

        questions = [ex["query"] for ex in examples]
        labels = [ex["labels"] for ex in examples]
        concepts = [ex["concept"] for ex in examples]
        feature_starters = [ex["feature_starter"] for ex in examples]

        pred = get_preds(questions)
        pred_array = pred.cpu().detach().numpy()  # converted once, used by both scorers
        scores = get_map_score_for_preds(labels, pred_array, tokenizer)
        masked_scores = get_map_score_for_masked_preds(labels, pred_array, tokenizer, MASK_LABELS)

        query_template = examples[0]["query_template"]  # same for the same file
        pf = examples[0]["pf"]
        for query_ix, gold_labels in enumerate(labels):
            key = (model_id, concepts[query_ix], feature_starters[query_ix], query_template, pf)
            assert key not in seen_keys, "Should not append results to already existing key values"
            seen_keys.add(key)
            top10_ids = pred[query_ix].topk(k=10).indices.tolist()
            new_rows.append({"model": model_id,
                             "concept": concepts[query_ix],
                             "query_template": query_template,
                             "feature_starter": feature_starters[query_ix],
                             "pf": pf,
                             "score": scores[query_ix],
                             "masked_score": masked_scores[query_ix],
                             "nbr_alternatives": len(gold_labels),
                             "top10_preds": tokenizer.convert_ids_to_tokens(top10_ids),
                             "gold_labels": gold_labels})

    if not new_rows:
        return results

    # DataFrame.append was removed in pandas 2.0; build the new rows once and
    # concatenate. An empty `results` frame is left out of the concat so it
    # cannot influence the resulting dtypes.
    new_results = pd.DataFrame(new_rows)
    frames = [new_results] if len(results) == 0 else [results, new_results]
    combined = pd.concat(frames, ignore_index=True)

    # Keep the caller's column order first, then any additional columns.
    ordered_columns = list(results.columns) + [col for col in combined.columns if col not in results.columns]
    return combined.reindex(columns=ordered_columns)

In [4]:
# Model evaluation results will be added to this data frame
# Empty frame with the result schema; one row per (model, query) is added later.
results = pd.DataFrame(
    columns=[
        "model", "concept", "query_template", "feature_starter", "pf",
        "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
    ]
)


### Evaluate BERT
Performance is measured with mean average precision (MAP). Results are reported per 1) pf split and 2) feature starter.

In [25]:
# Pretrained BERT, evaluated as-is.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)


def get_preds(questions):
    """Return the mask-position logits of the current model for `questions`."""
    return get_model_preds_for_questions(model, tokenizer, questions)


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [36:49<00:00, 49.11s/it]


In [26]:
# Write to the results/ sub-folder like every other model: the aggregation
# step reads "visual_property_norms/data/results/results-<model>.csv", so the
# former path ("visual_property_norms/data/results-<model>.csv") left this
# model's file where the aggregation could not find it.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    # Make sure the target folder exists before writing.
    os.makedirs(os.path.dirname(MODEL_RESULTS_FILE), exist_ok=True)
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Start from an empty frame again so the next model's rows are kept separate.
    results = pd.DataFrame(columns=["model",
                                "concept",
                                "query_template",
                                "feature_starter",
                                "pf",
                                "score",
                                "masked_score",
                                "nbr_alternatives",
                                "top10_preds",
                                "gold_labels"])

### Evaluate BERT fine-tuned on visual-text data

In [27]:
# BERT architecture with weights loaded from the "bert-clip-bert-trained" checkpoint.
model_name = "bert-base-trained"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertImageForMaskedLM(config)
# strict=False: checkpoint keys that do not match the model are ignored.
model.load_state_dict(
    torch.load(
        "models/data/model-weights/bert-clip-bert-trained/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Return the mask-position logits of the current model for `questions`."""
    return get_model_preds_for_questions(model, tokenizer, questions)


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [37:00<00:00, 49.34s/it]


In [28]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

### Evaluate clip-bert

In [29]:
def get_clip_bert_model(bert_image_model_path: str, no_visual_prediction: bool=False):
    """Load the CLIP-BERT language model and, optionally, CLIP itself.

    Args:
        bert_image_model_path: Path to a checkpoint whose ``"module"`` entry
            holds the state dict for ``BertImageForMaskedLM``.
        no_visual_prediction: When True, CLIP is not loaded.

    Returns:
        Tuple ``(bert_image_model, clip_model, clip_processor)``; the last two
        are ``None`` when ``no_visual_prediction`` is True.
    """
    # Language model with the bert-base-uncased configuration.
    bert_config = BertConfig.from_pretrained("bert-base-uncased")
    bert_image_model = BertImageForMaskedLM(bert_config).eval()
    checkpoint = torch.load(bert_image_model_path, map_location="cpu")
    # strict=False: checkpoint keys that do not match the model are ignored.
    bert_image_model.load_state_dict(checkpoint["module"], strict=False)
    bert_image_model.eval()

    if no_visual_prediction:
        return bert_image_model, None, None

    # CLIP model and processor used to produce the visual features.
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    return bert_image_model, clip_model, clip_processor

def get_clip_bert_preds_for_questions(model, 
                                      clip_model, 
                                      clip_processor,
                                      questions,
                                      tokenizer,
                                      no_visual_prediction: bool=False):
    """Return CLIP-BERT logits at the mask position of every question.

    Questions are processed in batches of 64 without gradient tracking. Unless
    ``no_visual_prediction`` is set, the CLIP text features of each question
    are passed to the model as ``img_feats``.

    Args:
        model: Language model whose output exposes ``"logits"``.
        clip_model: CLIP model providing ``get_text_features``; may be None
            when ``no_visual_prediction`` is True.
        clip_processor: Processor that prepares the questions for CLIP.
        questions: Sequence of query strings with exactly one mask token each.
        tokenizer: Tokenizer providing ``mask_token_id``.
        no_visual_prediction: When True, no ``img_feats`` are supplied.

    Returns:
        Tensor with one row per question and one column per entry of the
        last logits dimension.

    Raises:
        AssertionError: If any example does not contain exactly one mask token.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    if clip_model is not None:
        clip_model = clip_model.to(device)

    dataloader = DataLoader(questions, batch_size=64, shuffle=False)
    preds = []
    with torch.no_grad():
        for questions_batch in dataloader:
            inputs = tokenizer(questions_batch, return_tensors="pt", padding=True).to(device)

            # Validate before the forward pass, and per row: comparing only the
            # total mask count with the batch size would accept e.g. one
            # example without a mask plus one with two.
            is_mask = inputs.input_ids == tokenizer.mask_token_id
            assert bool((is_mask.sum(dim=1) == 1).all()), "ERROR: Expected exactly one [MASK] token per example"
            mask_idx = is_mask.nonzero(as_tuple=False)[:, 1][:, None]

            # Predict visual features using CLIP
            if not no_visual_prediction:
                clip_inputs = clip_processor(text=questions_batch, return_tensors="pt", padding=True).to(device)
                inputs["img_feats"] = clip_model.get_text_features(**clip_inputs).unsqueeze(1)

            outputs = model(**inputs)["logits"]

            # Select, for each example, the logits row at its mask position.
            pred = outputs.gather(1, mask_idx.repeat(1, outputs.shape[-1]).unsqueeze(1)).squeeze(1)
            preds.append(pred)

    return torch.cat(preds)

In [30]:
# CLIP-BERT with CLIP loaded: CLIP text features are fed to the model as img_feats.
model_name = "clip-bert-regress"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=False)


def get_preds(questions):
    """Return CLIP-BERT mask-position logits, with CLIP-derived img_feats."""
    return get_clip_bert_preds_for_questions(
        model, clip_model, clip_processor, questions, tokenizer, no_visual_prediction=False
    )


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [36:59<00:00, 49.33s/it]


In [31]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

### Evaluate clip-bert without regression

In [32]:
# Same checkpoint as "clip-bert-regress", but CLIP is not loaded and no
# img_feats are passed to the model.
model_name = "clip-bert"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model_path = "models/data/model-weights/clip-bert/mp_rank_00_model_states.pt"

model, clip_model, clip_processor = get_clip_bert_model(model_path, no_visual_prediction=True)


def get_preds(questions):
    """Return CLIP-BERT mask-position logits, without visual features."""
    return get_clip_bert_preds_for_questions(
        model, clip_model, clip_processor, questions, tokenizer, no_visual_prediction=True
    )


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [36:26<00:00, 48.60s/it]


In [33]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

### Evaluate VisualBERT

In [34]:
# VisualBERT pre-training checkpoint, queried with text inputs only.
model_name = "visualbert-vqa-coco"
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
model = model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def get_preds(questions):
    """Return the mask-position logits of the current model for `questions`."""
    return get_model_preds_for_questions(model, tokenizer, questions)


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [37:09<00:00, 49.55s/it]


In [35]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

### LXMERT

In [36]:
# Per-example shapes of the placeholder visual inputs handed to LXMERT.
FEATURES_SHAPE = (1, 2048)
NORMALIZED_BOXES_SHAPE = (1, 4)

def get_lxmert_preds_for_questions(model, tokenizer, questions):
    """Return LXMERT logits at the mask position of every question.

    Questions are processed in batches of 64 without gradient tracking. The
    model is given uniformly random ``visual_feats`` and ``visual_pos`` tensors
    as placeholders for real image input.
    NOTE(review): presumably these are ignored by the language-only
    cross-modality layers the model is patched with -- confirm.

    Args:
        model: LXMERT-style model accepting ``visual_feats`` and
            ``visual_pos`` and exposing ``"logits"`` or
            ``"prediction_logits"`` in its output.
        tokenizer: Tokenizer providing ``mask_token_id``.
        questions: Sequence of query strings with exactly one mask token each.

    Returns:
        Tensor with one row per question and one column per entry of the
        last logits dimension.

    Raises:
        AssertionError: If any example does not contain exactly one mask token.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    dataloader = DataLoader(questions, batch_size=64, shuffle=False)
    preds = []
    with torch.no_grad():
        for questions_batch in dataloader:
            inputs = tokenizer(questions_batch, return_tensors="pt", padding=True).to(device)

            # Check every row separately: comparing only the total number of
            # masks with the batch size would accept e.g. one example with no
            # mask plus one with two, and silently misalign the predictions.
            is_mask = inputs.input_ids == tokenizer.mask_token_id
            assert bool((is_mask.sum(dim=1) == 1).all()), "ERROR: Expected exactly one [MASK] token per example"
            mask_idx = is_mask.nonzero(as_tuple=False)[:, 1][:, None]

            nbr_samples = len(questions_batch)
            normalized_boxes = torch.empty((nbr_samples,)+NORMALIZED_BOXES_SHAPE).uniform_(0, 1).to(device)
            features = torch.empty((nbr_samples,)+FEATURES_SHAPE).uniform_(0, 10).to(device)
            inputs.update({
                "visual_feats": features,
                "visual_pos": normalized_boxes
            })

            # Run the forward pass once and reuse its output for the key lookup.
            model_output = model(**inputs)
            if "logits" in model_output:
                outputs = model_output["logits"]
            else:
                outputs = model_output["prediction_logits"]

            # Select, for each example, the logits row at its mask position.
            pred = outputs.gather(1, mask_idx.repeat(1, outputs.shape[-1]).unsqueeze(1)).squeeze(1)
            preds.append(pred)

    return torch.cat(preds)

In [37]:
model_name = "lxmert-base-uncased"
model = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")

# Swap the cross-modality layers for their language-only variant, then restore
# the pretrained encoder weights from a copy taken before the swap.
prev_encoder = copy.deepcopy(model.lxmert.encoder)
model.lxmert.encoder.x_layers = torch.nn.ModuleList(
    [
        LxmertLanguageOnlyXLayer(model.lxmert.encoder.config)
        for _ in range(model.lxmert.encoder.config.x_layers)
    ]
)
model.lxmert.encoder.load_state_dict(prev_encoder.state_dict())

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def get_preds(questions):
    """Return the LXMERT mask-position logits for `questions`."""
    return get_lxmert_preds_for_questions(model, tokenizer, questions)


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [37:40<00:00, 50.23s/it]


In [38]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

### Random baseline

In [39]:
# Baseline that scores every vocabulary entry with a uniform random number.
model_name = "random-baseline"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def get_preds(questions):
    """Return random scores of shape (number of questions, vocabulary size)."""
    return torch.rand((len(questions), tokenizer.vocab_size))


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [35:17<00:00, 47.06s/it]


In [40]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

### Evaluate BERT trained on LXMERT data from pretrained weights

In [5]:
# BERT architecture with weights loaded from the "bert-lxmert-trained" checkpoint.
model_name = "bert-base-trained-lxmert"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertImageForMaskedLM(config)
# strict=False: checkpoint keys that do not match the model are ignored.
model.load_state_dict(
    torch.load(
        "models/data/model-weights/bert-lxmert-trained/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Return the mask-position logits of the current model for `questions`."""
    return get_model_preds_for_questions(model, tokenizer, questions)


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [36:59<00:00, 49.32s/it]


In [6]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

### Evaluate BERT trained on LXMERT data from scratch

In [7]:
# BERT architecture with weights loaded from the "bert-lxmert-trained-scratch" checkpoint.
model_name = "bert-base-trained-lxmert-scratch"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertImageForMaskedLM(config)
# strict=False: checkpoint keys that do not match the model are ignored.
model.load_state_dict(
    torch.load(
        "models/data/model-weights/bert-lxmert-trained-scratch/mp_rank_00_model_states.pt",
        map_location="cpu",
    )["module"],
    strict=False,
)
model.eval()


def get_preds(questions):
    """Return the mask-position logits of the current model for `questions`."""
    return get_model_preds_for_questions(model, tokenizer, questions)


query_files = os.listdir(QUERIES_FOLDER)
results = update_results_with_model_preds(results, get_preds, query_files, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [37:58<00:00, 50.63s/it]


In [8]:
# Per-model results file inside the shared results folder.
MODEL_RESULTS_FILE = "visual_property_norms/data/results/results-"+model_name+".csv"
save_results = True
if save_results:
    results.to_csv(MODEL_RESULTS_FILE, index=False)
    # Reset to an empty frame so the next model starts without these rows.
    results = pd.DataFrame(
        columns=[
            "model", "concept", "query_template", "feature_starter", "pf",
            "score", "masked_score", "nbr_alternatives", "top10_preds", "gold_labels",
        ]
    )

## Aggregate results

In [10]:
# Models whose per-model result files are combined into one frame.
model_names = ["bert-base-uncased",
               "bert-base-trained",
               "clip-bert-regress",
               "clip-bert",
               "visualbert-vqa-coco",
               "lxmert-base-uncased",
               "random-baseline",
               "bert-base-trained-lxmert",
               "bert-base-trained-lxmert-scratch"]

RESULTS_FOLDER = "visual_property_norms/data/results/"
load_results = True

if load_results:
    # Read every per-model CSV and concatenate them in a single step.
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied the accumulated frame on every iteration.
    model_frames = []
    for model_name in model_names:
        results_file = RESULTS_FOLDER+"results-"+model_name+".csv"
        model_frames.append(pd.read_csv(results_file))
    results = pd.concat(model_frames, ignore_index=True)

In [16]:
# Disabled by default; switch to True to write the combined results file.
save_results = False
if save_results:
    results.to_csv(
        "visual_property_norms/data/results/results.csv",
        index=False,
    )