# Create Visual Property Norms Evaluation 

This notebook creates the MLM queries used for the Visual Property Norms evaluation.


In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_table("data/norms.dat")
data.head()

Unnamed: 0,domain,feature type,concept,feature,pf,feature alternatives,participant list
0,nonliving,visual perceptual,accordion,has keys,22,has keys; a keyboard,p 1 2 3 6 7 12 16 20 23 25 26 27 19 22 30 / 4 ...
1,nonliving,taxonomic,accordion,is a musical instrument,20,is a musical instrument,p 1 3 4 5 7 8 10 11 13 15 16 17 18 19 20 21 22...
2,nonliving,functional,accordion,does make music,10,does make music; is musical; is used in music;...,p 2 3 17 19 25 / 14 26 / 5 / 13 / 10
3,nonliving,other perceptual,accordion,is noisy_loud,9,is loud; is noisy; makes a horrible noise at t...,p 6 7 9 26 / 1 9 19 / 24 / 22
4,nonliving,visual perceptual,accordion,has buttons,8,has buttons; has knobs,p 8 12 15 16 18 24 28 / 20


In [3]:
visual_data = data[data["feature type"]=="visual perceptual"].drop(columns=["feature type"])

print(f"Number of concepts with visual perceptual feature types: {len(visual_data.concept.unique())}")
print(f"Number of unique visual features: {len(visual_data.feature.unique())}")
print(f"Number of lines in visual perceptual data: {len(visual_data)}")
print("Examples:")
print(visual_data.concept.unique()[:10])

Number of concepts with visual perceptual feature types: 638
Number of unique visual features: 1662
Number of lines in visual perceptual data: 8907
Examples:
['accordion' 'aeroplane' 'alligator' 'ambulance' 'anchor' 'ant' 'apple'
 'apricot' 'apron' 'arm']


## Choose most common feature if there are alternatives

In [4]:
KNOWN_FEATURE_REPLACEMENTS = {"has a skin": "has skin", 
                              "is furry": "has fur", 
                              "is stream-lined": "is streamlined",
                              "is made of material": "made of fabric",
                              "made of material": "made of fabric",
                              "is hairy": "has fur",
                              "made of porcelain": "made of ceramic",
                              "is see through": "is transparent",
                              "is ceramic": "made of ceramic",
                              "made of clay": "made of ceramic",
                              "is translucent": "is transparent",
                              "has a cable": "has a wire",
                              "has wires": "has a wire",
                              "made of wires": "has a wire",
                              "has a hard skin": "has hard skin",
                              "has a tough skin": "has tough skin",
                              "is see-through": "is transparent",
                              "is rounded": "is round",
                              "has round": "is round",
                              "has a pip": "has pips"}

def get_participant_counts_from_participant_list(participant_list):
    #input example: "p 3 29 / 14"
    participant_list = participant_list.split()[1:]

    prev_split_ix = 0
    counts = []
    for ix, val in enumerate(participant_list):
        if val=="/":
            counts.append(ix-prev_split_ix)
            prev_split_ix = ix+1
    counts.append(len(participant_list)-prev_split_ix)
    return counts

def pick_most_common_feature(row):
    if "_" not in row["feature"]:
        return row["feature"] #just pick the listed one if there are no alternatives
    else:
        pick_ix = np.argmax(row["participant list"])
        most_common_feature = row["feature alternatives"].split("; ")[pick_ix]
        most_common_feature = KNOWN_FEATURE_REPLACEMENTS[most_common_feature] if most_common_feature in KNOWN_FEATURE_REPLACEMENTS else most_common_feature

        feature_variations = [row["feature"].split("_")[0]]
        feature_variations.append(row["feature"].split("_")[1])
        feature_variations.append((" ").join([row["feature"].split("_")[0].split(" ")[0]] + [row["feature"].split("_")[1]]))
        if len(row["feature"].split("_")[0].split(" ")) > 2:
            feature_variations.append((" ").join(row["feature"].split("_")[0].split(" ")[:2] + [row["feature"].split("_")[1]]))
        if len(row["feature"].split("_")[1].split(" ")) > 1:
            feature_variations.append((" ").join([row["feature"].split("_")[0]] + row["feature"].split("_")[1].split(" ")[1:]))

    if not most_common_feature in feature_variations:
        print("--------")
        print("Warning")
        print(f"For concept '{row.concept}'")
        print(f"The most common feature ({most_common_feature}) should be among the potential features listed with '_' ({feature_variations})")
        most_common_feature = feature_variations[0]
        print(f"Picking first option ({most_common_feature})")
    return most_common_feature

In [5]:
visual_data["participant list"] = visual_data["participant list"].apply(get_participant_counts_from_participant_list)
visual_data["feature"] = visual_data.apply(pick_most_common_feature, axis=1)
visual_data.head()

--------
For concept 'accordion'
The most common feature (does have a strap) should be among the potential features listed with '_' (['has a strap', 'straps', 'has straps', 'has a straps'])
Picking first option (has a strap)
--------
For concept 'apron'
The most common feature (has pictures on it) should be among the potential features listed with '_' (['has a picture', 'pictures', 'has pictures', 'has a pictures'])
Picking first option (has a picture)
--------
For concept 'belt'
The most common feature (does go through belt loops) should be among the potential features listed with '_' (['does go through a belt hole', 'loops', 'does loops', 'does go loops'])
Picking first option (does go through a belt hole)
--------
For concept 'bin'
The most common feature (is cuboid) should be among the potential features listed with '_' (['is rectangular', 'square', 'is square'])
Picking first option (is rectangular)
--------
For concept 'bouquet'
The most common feature (made of ribbons) should be

Unnamed: 0,domain,concept,feature,pf,feature alternatives,participant list
0,nonliving,accordion,has keys,22,has keys; a keyboard,"[15, 7]"
4,nonliving,accordion,has buttons,8,has buttons; has knobs,"[7, 1]"
7,nonliving,accordion,made of wood,6,made of wood,[6]
8,nonliving,accordion,has bellows,5,has bellows,[5]
12,nonliving,accordion,made of metal,5,made of metal; made of steel,"[4, 1]"


## Extract feature relations
(The starting phrase before the actual feature)

In [6]:
visual_data["feature_starter"] = visual_data.feature.apply(lambda x: (' ').join(x.split(' ')[:-1]))
visual_data["feature_main"] = visual_data.feature.apply(lambda x: x.split(' ')[-1].split('_')[0])
visual_data = visual_data.drop(columns=["feature"])
visual_data = visual_data.drop(columns=["feature alternatives"])
visual_data = visual_data.drop(columns=["participant list"])
visual_data

Unnamed: 0,domain,concept,pf,feature_starter,feature_main
0,nonliving,accordion,22,has,keys
4,nonliving,accordion,8,has,buttons
7,nonliving,accordion,6,made of,wood
8,nonliving,accordion,5,has,bellows
12,nonliving,accordion,5,made of,metal
...,...,...,...,...,...
22650,living,zebra,8,has a,mane
22653,living,zebra,6,is,fast
22658,living,zebra,3,has,ears
22659,living,zebra,3,has,fur


Note that the frequency of 'is a' is very low compared to the 506 frequency in tacit assumptions! Probably due to that we only look at visual concepts.

## Only include the most common feature starters

In [7]:
FEATURE_STARTERS_TO_INCLUDE = {"is", "has", "has a", "made of"}
starter_mask = [feature_starter in FEATURE_STARTERS_TO_INCLUDE for feature_starter in visual_data.feature_starter]
visual_data = visual_data[starter_mask]
made_of_mask = visual_data.feature_starter=="made of"
visual_data.loc[made_of_mask, "feature_starter"] = "is made of"
visual_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,domain,concept,pf,feature_starter,feature_main
0,nonliving,accordion,22,has,keys
4,nonliving,accordion,8,has,buttons
7,nonliving,accordion,6,is made of,wood
8,nonliving,accordion,5,has,bellows
12,nonliving,accordion,5,is made of,metal
...,...,...,...,...,...
22650,living,zebra,8,has a,mane
22653,living,zebra,6,is,fast
22658,living,zebra,3,has,ears
22659,living,zebra,3,has,fur


## Only include feature alternatives that are described by one wordpiece

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_base_vocab = tokenizer.get_vocab()
vocab_mask = [feature in bert_base_vocab for feature in visual_data.feature_main]

visual_data = visual_data[vocab_mask]
visual_data

Unnamed: 0,domain,concept,pf,feature_starter,feature_main
0,nonliving,accordion,22,has,keys
4,nonliving,accordion,8,has,buttons
7,nonliving,accordion,6,is made of,wood
12,nonliving,accordion,5,is made of,metal
17,nonliving,accordion,3,has,handles
...,...,...,...,...,...
22646,living,zebra,9,has a,tail
22650,living,zebra,8,has a,mane
22653,living,zebra,6,is,fast
22658,living,zebra,3,has,ears


## View the data

In [9]:
visual_data.sample(6)

Unnamed: 0,domain,concept,pf,feature_starter,feature_main
11372,living,leg,6,has a,hip
5500,nonliving,corkscrew,10,is made of,wood
14420,nonliving,pen,24,is made of,plastic
4540,nonliving,chisel,17,has a,handle
15072,nonliving,plough,7,is made of,wood
5812,nonliving,crayon,20,is made of,wax


## Create partitions based on minimum production frequencies
The data will be partitioned to include all features with at least the specified pf.

In [10]:
PF_SPLITS = [2, 5, 10, 20, 30]
visual_data[visual_data.pf>=PF_SPLITS[-1]]

Unnamed: 0,domain,concept,pf,feature_starter,feature_main
1015,nonliving,barrel,30,is made of,wood
1635,nonliving,bicycle,30,has,wheels
2530,living,broccoli,30,is,green
2636,living,brussel_sprouts,30,is,green
4043,nonliving,cello,30,has,strings
4120,nonliving,certificate,30,is made of,paper
4147,nonliving,chain,30,is made of,metal
4538,nonliving,chisel,30,is made of,metal
5551,living,courgette,30,is,green
5975,living,cucumber,30,is,green


## Save the partitions

In [11]:
for split in PF_SPLITS:
    file_path = "data/pf-partitions/"+str(split)+".csv"
    visual_data[visual_data.pf>=split].to_csv(file_path, columns=["concept","feature_starter","feature_main"], index=False)

## Create the evaluation queries
* Fix article before item ("a", "an" or nothing if uncountable noun)
  * _Done through manual annotation!_
* Handle different query templates
  * _Done! Currently have four different query templates. Worth investigating if they always work out._
* (Look for visual sequence perceptual features, that can only be seen from videos?)

Example:

{"query": "Q: an alligator is? A: [MASK]", "labels": ["green", "big"]}

In [13]:
descriptors = pd.read_csv("data/descriptors.csv", keep_default_na=False)
descriptors = {row.concept: row.descriptor for _, row in descriptors.iterrows()}

In [15]:
import json

QUERY_TEMPLATES = ["[DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "everybody knows that [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "[DESCRIPTOR] [CONCEPT] usually [FEATURE_STARTER] [MASK].",
                   "q: [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER]? a: [MASK].",
                   "q: [DESCRIPTOR] [CONCEPT] usually [FEATURE_STARTER]? a: [MASK].",
                   "generally, [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "[DESCRIPTOR] [CONCEPT] generally [FEATURE_STARTER] [MASK].",
                   "describe the properties of [DESCRIPTOR] [CONCEPT]. [DESCRIPTOR] [CONCEPT] [FEATURE_STARTER] [MASK].",
                   "describe the properties of [DESCRIPTOR] [CONCEPT]. [DESCRIPTOR] [CONCEPT] usually [FEATURE_STARTER] [MASK]."]

for query_ix, query_template in enumerate(QUERY_TEMPLATES):
    for split in PF_SPLITS:
        filename = "data/queries/template_" + str(query_ix) + "_pf_" + str(split) + ".jsonl"
        with open(filename, "w") as f:
            visual_data = pd.read_csv("data/pf-partitions/" + str(split) + ".csv")
            for concept, feature_starter in list(visual_data.groupby(["concept","feature_starter"]).count().index):
                query = query_template.replace("[DESCRIPTOR]", descriptors[concept]).replace("[CONCEPT]", concept.replace('_', ' ')).replace("[FEATURE_STARTER]", feature_starter).replace("  ", " ")

                json_entry = {"query": query.strip(), 
                              "labels": list(visual_data[(visual_data.concept==concept) & (visual_data.feature_starter==feature_starter)].feature_main.values),
                              "concept": concept,
                              "query_template": query_template,
                              "feature_starter": feature_starter,
                              "pf": split}
                json.dump(json_entry, f)
                f.write("\n")

## Create a list of tokens to mask task answers for
There are a total of 614 possible answer alternatives to this task

In [17]:
visual_data = pd.read_csv("data/pf-partitions/2.csv")
np.savetxt("data/labels.txt", np.sort(visual_data.feature_main.unique()), delimiter="\n", fmt='%s')