# Create Visual Property Norms Evaluation 

This notebook creates the MLM queries used for the Visual Property Norms evaluation.


In [None]:
import json
import os

import numpy as np
import pandas as pd
from transformers import BertTokenizer

In [None]:
# Load the raw norms file (tab-separated) and preview the first rows.
data = pd.read_csv("data/norms.dat", sep="\t")
data.head(30)

In [None]:
# Keep only the rows whose feature type is "visual perceptual"; the type column
# is constant after filtering, so it is dropped.
is_visual = data["feature type"] == "visual perceptual"
visual_data = data.loc[is_visual].drop(columns=["feature type"])

# Short summary of what survived the filter.
unique_concepts = visual_data.concept.unique()
unique_features = visual_data.feature.unique()
print(f"Number of concepts with visual perceptual feature types: {len(unique_concepts)}")
print(f"Number of unique visual features: {len(unique_features)}")
print(f"Number of lines in visual perceptual data: {len(visual_data)}")
print("Examples:")
print(unique_concepts[:10])

## Choose most common feature if there are alternatives

In [None]:
# Maps a phrasing found in the "feature alternatives" column to the phrasing
# that should be used instead. pick_most_common_feature looks the most
# frequently produced alternative up in this table before comparing it with
# the variants encoded in the "feature" column.
KNOWN_FEATURE_REPLACEMENTS = {"has a skin": "has skin", 
                              "is furry": "has fur", 
                              "is stream-lined": "is streamlined",
                              "is made of material": "made of fabric",
                              "made of material": "made of fabric",
                              "is hairy": "has fur",
                              "made of porcelain": "made of ceramic",
                              "is see through": "is transparent",
                              "is ceramic": "made of ceramic",
                              "made of clay": "made of ceramic",
                              "is translucent": "is transparent",
                              "has a cable": "has a wire",
                              "has wires": "has a wire",
                              "made of wires": "has a wire",
                              "has a hard skin": "has hard skin",
                              "has a tough skin": "has tough skin",
                              "is see-through": "is transparent",
                              "is rounded": "is round",
                              "has round": "is round",
                              "has a pip": "has pips"}

def get_participant_counts_from_participant_list(participant_list):
    """Count how many participant ids are listed for each feature alternative.

    The first whitespace-separated token is a prefix and is ignored; the
    remaining tokens are participant ids, with a lone "/" token separating
    the groups that belong to different alternatives.

    Example: "p 3 29 / 14" -> [2, 1]

    Args:
        participant_list: the raw string from the "participant list" column.

    Returns:
        A list with one count per "/"-separated group, in order of appearance.
    """
    tokens = participant_list.split()[1:]

    # Start with one open group; every "/" closes it and opens the next one.
    group_sizes = [0]
    for token in tokens:
        if token == "/":
            group_sizes.append(0)
        else:
            group_sizes[-1] += 1
    return group_sizes

def pick_most_common_feature(row):
    """Resolve a feature that encodes two alternatives ("x_y") to a single phrasing.

    Args:
        row: a row with the keys "feature", "feature alternatives" (phrasings
            separated by "; "), "participant list" (one count per phrasing, as
            produced by get_participant_counts_from_participant_list) and
            "concept" (read as an attribute, only for the warning message).

    Returns:
        The feature unchanged when it contains no "_". Otherwise the phrasing
        with the highest participant count (after KNOWN_FEATURE_REPLACEMENTS),
        provided it is one of the variants that can be built from the two
        sides of the "_". If it is not, a warning is printed and the part
        before the "_" is returned.
    """
    listed_feature = row["feature"]
    if "_" not in listed_feature:
        # No alternatives encoded: use the feature as listed.
        return listed_feature

    # Most frequently produced phrasing; np.argmax keeps the first on ties.
    best_ix = np.argmax(row["participant list"])
    candidate = row["feature alternatives"].split("; ")[best_ix]
    candidate = KNOWN_FEATURE_REPLACEMENTS.get(candidate, candidate)

    # Only the first two "_"-separated parts are considered.
    parts = listed_feature.split("_")
    head, tail = parts[0], parts[1]
    head_words = head.split(" ")
    tail_words = tail.split(" ")

    variations = [
        head,
        tail,
        " ".join([head_words[0], tail]),  # first word of head + tail
    ]
    if len(head_words) > 2:
        # first two words of head + tail, e.g. "made of" + "steel"
        variations.append(" ".join(head_words[:2] + [tail]))
    if len(tail_words) > 1:
        # head + tail without its first word
        variations.append(" ".join([head] + tail_words[1:]))

    if candidate in variations:
        return candidate

    print("--------")
    print("Warning")
    print(f"For concept '{row.concept}'")
    print(f"The most common feature ({candidate}) should be among the potential features listed with '_' ({variations})")
    fallback = variations[0]
    print(f"Picking first option ({fallback})")
    return fallback

In [None]:
# Step 1: replace each raw participant string with its per-alternative counts.
# (Must run before step 2, which takes the argmax over these counts.)
participant_counts = visual_data["participant list"].apply(
    get_participant_counts_from_participant_list
)
visual_data["participant list"] = participant_counts

# Step 2: collapse every "x_y" feature to its most commonly produced phrasing.
resolved_features = visual_data.apply(pick_most_common_feature, axis=1)
visual_data["feature"] = resolved_features
visual_data.head()

## Extract feature relations
(The starting phrase before the actual feature)

In [None]:
# The starter is everything before the last space ("has a"); the main word is
# the last word, cut at the first "_" if one is present.
visual_data["feature_starter"] = visual_data["feature"].apply(
    lambda text: text.rpartition(" ")[0]
)
visual_data["feature_main"] = visual_data["feature"].apply(
    lambda text: text.rpartition(" ")[2].partition("_")[0]
)

# The source columns are no longer needed once the feature has been split.
visual_data = visual_data.drop(columns=["feature", "feature alternatives", "participant list"])
visual_data

Note that the frequency of 'is a' is very low compared to the frequency of 506 in tacit assumptions! This is probably because we only look at visual concepts here.

## Only include the most common feature starters

In [None]:
FEATURE_STARTERS_TO_INCLUDE = {"is", "has", "has a", "made of"}

# Keep only rows whose starter is one of the accepted phrases. The explicit
# .copy() makes the result an independent frame, so the .loc assignment below
# does not raise pandas' SettingWithCopyWarning.
starter_mask = visual_data["feature_starter"].isin(FEATURE_STARTERS_TO_INCLUDE)
visual_data = visual_data[starter_mask].copy()

# "made of" needs a verb to read naturally inside the query templates.
made_of_mask = visual_data["feature_starter"] == "made of"
visual_data.loc[made_of_mask, "feature_starter"] = "is made of"
visual_data

## Only include feature alternatives that are described by one wordpiece

In [None]:
# Vocabulary of bert-base-uncased, as returned by the tokenizer's get_vocab().
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_base_vocab = tokenizer.get_vocab()

# A row is kept only when its main feature word is itself a vocabulary entry.
vocab_mask = [word in bert_base_vocab for word in visual_data["feature_main"]]
visual_data = visual_data[vocab_mask]
visual_data

## View the data

In [None]:
# Fixed seed so the preview is identical on every Restart & Run All.
visual_data.sample(6, random_state=0)

## Create partitions based on minimum production frequencies
Each partition contains all features whose production frequency (pf) is at least the specified threshold, so the partitions are nested.

In [None]:
# Minimum production frequencies; one partition is created per threshold.
PF_SPLITS = [2, 5, 10, 20, 30]

# Preview the partition for the last (highest) threshold.
highest_pf = PF_SPLITS[-1]
visual_data.loc[visual_data["pf"] >= highest_pf]

## Save the partitions

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("data/pf-partitions", exist_ok=True)

# One CSV per threshold, containing every row with pf >= that threshold.
for split in PF_SPLITS:
    file_path = "data/pf-partitions/"+str(split)+".csv"
    visual_data[visual_data.pf>=split].to_csv(file_path, columns=["concept","feature_starter","feature_main"], index=False)

## Create the evaluation queries
* Fix article before item ("a", "an" or nothing if uncountable noun)
  * _Done through manual annotation!_
* Handle different query templates
  * _Done! There are currently nine different query templates. Worth investigating if they always work out._
* (Look for visual sequence perceptual features, that can only be seen from videos?)

Example:

{"query": "Q: an alligator is? A: [MASK]", "labels": ["green", "big"]}

In [None]:
# concept -> article/descriptor placed before the concept in a query.
# keep_default_na=False keeps empty cells as "" instead of NaN.
descriptor_table = pd.read_csv("data/descriptors.csv", keep_default_na=False)
descriptors = dict(zip(descriptor_table["concept"], descriptor_table["descriptor"]))

In [None]:
QUERY_TEMPLATES = ["[DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "everybody knows that [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "[DESCRIPTOR] [CONCEPT] usually [FEATURE_STARTER] [MASK].",
                   "q: [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER]? a: [MASK].",
                   "q: [DESCRIPTOR] [CONCEPT] usually [FEATURE_STARTER]? a: [MASK].",
                   "generally, [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "[DESCRIPTOR] [CONCEPT] generally [FEATURE_STARTER] [MASK].",
                   "describe the properties of [DESCRIPTOR] [CONCEPT]. [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "describe the properties of [DESCRIPTOR] [CONCEPT]. [DESCRIPTOR] [CONCEPT] usually [FEATURE_STARTER] [MASK]."]


def _fill_template(query_template, descriptor, concept, feature_starter):
    """Return the query text for one (concept, feature starter) pair.

    Underscores in the concept are replaced by spaces. Double spaces are
    collapsed and the result is stripped, so an empty descriptor leaves no
    stray whitespace behind.
    """
    query = (query_template
             .replace("[DESCRIPTOR]", descriptor)
             .replace("[CONCEPT]", concept.replace('_', ' '))
             .replace("[FEATURE_STARTER]", feature_starter)
             .replace("  ", " "))
    return query.strip()


# open(..., "w") does not create missing directories.
os.makedirs("data/queries", exist_ok=True)

# Read and group every partition once, instead of once per template. A local
# name is used so the notebook-level `visual_data` frame is not overwritten.
# Groups come out sorted by (concept, feature_starter); the labels keep the
# row order of the partition file.
grouped_labels_by_split = {}
for split in PF_SPLITS:
    partition = pd.read_csv("data/pf-partitions/" + str(split) + ".csv")
    grouped_labels_by_split[split] = [
        (concept, feature_starter, list(group["feature_main"]))
        for (concept, feature_starter), group in partition.groupby(["concept", "feature_starter"])
    ]

# One JSON-lines file per (template, pf threshold) combination.
for query_ix, query_template in enumerate(QUERY_TEMPLATES):
    for split in PF_SPLITS:
        filename = "data/queries/template_" + str(query_ix) + "_pf_" + str(split) + ".jsonl"
        with open(filename, "w") as f:
            for concept, feature_starter, labels in grouped_labels_by_split[split]:
                # A concept without a descriptor entry raises KeyError here.
                query = _fill_template(query_template, descriptors[concept], concept, feature_starter)
                json_entry = {"query": query,
                              "labels": labels,
                              "concept": concept,
                              "query_template": query_template,
                              "feature_starter": feature_starter,
                              "pf": split}
                json.dump(json_entry, f)
                f.write("\n")

## Create a list of tokens to mask task answers for
There are a total of 614 possible answer alternatives to this task

In [None]:
# Read the pf >= 2 partition and write its distinct feature words, sorted,
# one per line.
visual_data = pd.read_csv("data/pf-partitions/2.csv")
answer_tokens = np.sort(visual_data["feature_main"].unique())
np.savetxt("data/labels.txt", answer_tokens, fmt='%s', delimiter="\n")