## Pre-process the raw ShapeTalk data: tokenize and spell-check the utterances, assign train/test/val splits, and build a vocabulary.

In [None]:
##
## 1. Each row of final csv contains a single utterance of a given saliency
## 2. Tokenize/spell-check these utterances (adding columns: 'utterance_spelled', 'tokens_len', 'tokens')
## 3. Add splits (train/test/val) for each utterance, based on a "unary" split concerning shapes (e.g., used for AE)
## 4. Create a vocabulary taking care of <UNK> symbol etc. and use it to encode the tokens
## 

In [10]:
import nltk
import numpy as np
import pandas as pd
import os.path as osp

from changeit3d.language.basics import tokenize_and_spell
from changeit3d.language.spelling import token_spelling_dictionary
from changeit3d.language.vocabulary import build_vocab
from changeit3d.in_out.datasets.shape_talk import expand_df_from_descriptions_to_utterances

In [11]:
# Configuration for the whole notebook: every path and threshold a reader might tune lives here.

# Word-level tokenizer handed to `tokenize_and_spell`.
tokenizer = nltk.word_tokenize
# Auxiliary language resources handed to `tokenize_and_spell`:
# a SymSpell frequency dictionary and a GloVe vocabulary listing (judging by the file names).
freq_file = '../../data/aux_language/symspell_frequency_dictionary_en_82_765.txt'
glove_file = '../../data/aux_language/glove.6B.100d.vocabulary.txt'

# Version tag interpolated into the file name of the raw ShapeTalk csv.
shape_talk_version = 0
shape_talk_file = f'../../data/shapetalk/language/shapetalk/shapetalk_raw_public_version_{shape_talk_version}.csv'

# Only used to pick the matching unary shape-split csv by file name;
# the visible cells never seed a random generator with it.
random_seed = 2022
shape_split_file = f'../../data/shapetalk/misc/unary_split_rs_{random_seed}.csv'

verbose = True  # if True, print the entries returned by `tokenize_and_spell`
save_res = False  # if True, write the vocabulary and the preprocessed csv to disk
too_short_bound = 0     # if 0 ignore this restriction; else utterances with tokens_len <= this bound are marked 'ignore'
too_long_utter_prc = 99 # if 0 ignore, else, sentences longer than this percentile (of training-split lengths) will be ignored
min_word_freq = 2       # word must exist in training split at least this many times, else mapped to <UNK>

In [12]:
# Load the raw ShapeTalk annotations, report how many rows were read,
# and display one randomly drawn row as a sanity check.
df = pd.read_csv(shape_talk_file)
print(df.shape[0])
df.sample(n=1)

130342


Unnamed: 0,workerid,utterance_0,utterance_1,utterance_2,utterance_3,utterance_4,assignmentid,worktimeinseconds,source_model_name,source_object_class,source_dataset,target_model_name,target_object_class,target_dataset,is_patched,target_uid,source_uid,hard_context,target_original_object_class,source_original_object_class
54520,user_556,Is shorter in height.,The table top is not circular.,The table top has a cut at the edge.,Does not have four curved legs at the base.,,3V0Z7YWSI0TXEAUON63VUG7C8AIV2V,251.0,7093cec0f1eba67e11f3f1bdf34ac930,table,ShapeNet,8b3bae4e65ee0f67caf7718498824d44,table,ShapeNet,False,table/ShapeNet/8b3bae4e65ee0f67caf7718498824d44,table/ShapeNet/7093cec0f1eba67e11f3f1bdf34ac930,False,table,table


Step 1. (expand dataframe)

In [13]:
# Re-shape the frame so that (per the notebook header) each row carries a single utterance.
# NOTE(review): `df` is overwritten with its own expansion, so re-running this cell
# without re-loading the csv feeds the already-expanded frame back in.
df = expand_df_from_descriptions_to_utterances(df)
print('Unique utterances:', df.shape[0])

Unique utterances: 536596


Step 2. (tokenization, spelling)

In [14]:
# Tokenize and spell-check every utterance. The later cells read the columns
# 'tokens', 'tokens_len' and 'utterance_spelled' from `df`, which presumably are
# added in place by this call (the return value is only the missed-token report).
spelling_resources = dict(
    glove_file=glove_file,
    freq_file=freq_file,
    tokenizer=tokenizer,
    token_spelling_dictionary=token_spelling_dictionary,
)
missed_tokens = tokenize_and_spell(df, **spelling_resources)

# Optionally list every reported token together with the value stored for it.
if verbose:
    for missed in missed_tokens:
        print(missed, missed_tokens[missed])

SymSpell spell-checker loaded: True
Loading glove word embeddings.
Done. 400000 words loaded.
Updating Glove vocabulary with *valid* ShapeTalk words that are missing from it.


In [15]:
# Gather the distinct tokens that occur anywhere in the corpus (all splits).
all_tokens = {token for utterance_tokens in df.tokens for token in utterance_tokens}
print('Number of tokens', len(all_tokens))

Number of tokens 9251


Step 3. Add train/test/val splits

In [16]:
## Load the unary (AE-based) shape split and attach it to the ShapeTalk frame.
split_df = pd.read_csv(shape_split_file)

# model_uid -> split; when a uid is listed more than once, its first listed split wins.
first_split_per_uid = split_df.groupby('model_uid')['split'].apply(lambda rows: rows.iloc[0])
uids_to_split = first_split_per_uid.to_dict()


def _split_of(uid):
    """Return the unary split of `uid`; raises KeyError for a uid absent from the split file."""
    return uids_to_split[uid]


# Annotate both shapes of every pair (target first, then source) with their unary split.
df = df.assign(
    target_unary_split=df.target_uid.apply(_split_of),
    source_unary_split=df.source_uid.apply(_split_of),
)

# The *neural-listening* split mimics the AE split, looking only at the target shape.
df = df.assign(listening_split=df.target_unary_split)

In [17]:
## Also ignore some corner cases based on the token length of each utterance.
##
## `ignore_mask` is always defined (all-False when both filters are disabled) and
## accumulates *every* length-based exclusion, because a later cell reads it.
## Previously it was left undefined when both filters were off, and held only the
## too-long mask when both were on.
ignore_mask = pd.Series(False, index=df.index)

if too_short_bound > 0:
    # Utterances with at most `too_short_bound` tokens are excluded.
    ignore_mask |= df.tokens_len <= too_short_bound
    df.loc[ignore_mask, 'listening_split'] = 'ignore'

if too_long_utter_prc > 0:
    # The threshold is estimated on the (remaining) training utterances only,
    # then applied to every split.
    # NOTE(review): re-running this cell recomputes the percentile on the already
    # filtered training set, so the threshold can drift; run it once per fresh `df`.
    train_token_lens = df.loc[df.listening_split == 'train', 'tokens_len']
    too_long_len = np.percentile(train_token_lens, too_long_utter_prc)
    print('Too-long token length threshold at {}-percentile is {}.'.format(too_long_utter_prc, too_long_len))
    ignore_mask |= df.tokens_len > too_long_len
    df.loc[ignore_mask, 'listening_split'] = 'ignore'

Too-long token length threshold at 99-percentile is 16.0.


In [18]:
# Report the fraction of utterances in each listening split (in order of first appearance).
listening_labels = df.listening_split
for split_name in listening_labels.unique():
    fraction = (listening_labels == split_name).mean()
    print(split_name, fraction)

train 0.8432824694928773
test 0.09940625722144779
val 0.05012150668286756
ignore 0.007189766602807326


In [19]:
##
## Add one final split for training/testing ChangeIt3DNet systems.
##
## These systems input the **distractor** and attempt to change it,
## so our goal here is to use for testing an input geometry that
## a) was not seen during training by the underlying shape encoder (e.g., AE), and
## b) is coupled with language (prompt) that is *not* compatible with it, i.e., it was describing the target.

df = df.assign(changeit_split=df.source_unary_split)  # main condition

# Also carry over the utterances ignored by the listening ("linguistic") conditions.
# They are read back from the 'listening_split' column instead of from a leftover
# `ignore_mask` variable: that variable is undefined when both length filters are
# disabled and only holds the too-long mask when both are enabled.
linguistic_ignore_mask = df.listening_split == 'ignore'
df.loc[linguistic_ignore_mask, 'changeit_split'] = 'ignore'

# Also remove utterances that include the word 'distractor(s)', since these could refer
# directly to the distractor and not the target, e.g., "the distractor has thinner legs".
# The substring test also matches other words containing 'distract'.
mask = df.utterance_spelled.apply(lambda x: 'distract' in x)
df.loc[mask, 'changeit_split'] = 'ignore'

# Report the fraction of utterances in each changeit split.
for split in df.changeit_split.unique():
    print(split, (df.changeit_split == split).mean())

train 0.8348198644790494
test 0.09547965322141798
val 0.05010100708913223
ignore 0.019599475210400376


Step 4. Make a vocabulary

In [20]:
# Build the vocabulary from the tokens of the "train" listening split only,
# so that words seen exclusively in val/test never enter it.
is_train = df.listening_split == 'train'
train_tokens = df.loc[is_train, 'tokens']

print('Using min-word-freq:', min_word_freq)
vocab = build_vocab(train_tokens, min_word_freq)

vocab_size = len(vocab)
print('Using a vocabulary with {} tokens'.format(vocab_size))

Using min-word-freq: 2
Using a vocabulary with 5883 tokens


In [28]:
# Persist the vocabulary next to the raw ShapeTalk csv (only when saving is enabled).
if save_res:
    language_dir = osp.dirname(shape_talk_file)
    out_vocab_file = osp.join(language_dir, 'vocabulary.pkl')
    print(out_vocab_file)
    vocab.save(out_vocab_file)

../../data/shapetalk/language/shapetalk/vocabulary_new.pkl


In [23]:
# Longest utterance (in tokens) among the rows that belong to a real split, i.e., not 'ignore'.
in_used_split = df.listening_split.isin(['train', 'test', 'val'])
largest_sequence = df.loc[in_used_split, 'tokens_len'].max()
print('largest-sequence-len:', largest_sequence)

largest-sequence-len: 16


In [24]:
# Encode every token list as integers with the vocabulary (adds SOS/EOS/PADDING, etc.).
def _encode_fixed_len(tokens):
    """Encode one token list, passing `largest_sequence` as the length argument of `vocab.encode`."""
    return vocab.encode(tokens, largest_sequence)


df = df.assign(tokens_encoded=df.tokens.apply(_encode_fixed_len))

# Every encoding must have the same length: the longest sequence plus two extra symbols.
expected_len = largest_sequence + 2
assert (df.tokens_encoded.map(len) == expected_len).all()

In [27]:
## Save preprocessed dataframe.
# Derive the output name from the *file name* of the raw csv only, so that a
# directory that happens to contain 'shapetalk_raw' is never rewritten.
raw_dir, raw_name = osp.split(shape_talk_file)
out_file = osp.join(raw_dir, raw_name.replace('shapetalk_raw', 'shapetalk_preprocessed'))
print(out_file)

if save_res:
    # If the raw file name lacks the 'shapetalk_raw' marker the substitution is a
    # no-op and saving would silently overwrite the raw data; refuse to do that.
    if out_file == shape_talk_file:
        raise ValueError("Refusing to overwrite the raw ShapeTalk file: "
                         "its name does not contain 'shapetalk_raw'.")
    df.to_csv(out_file, index=False)

../../data/shapetalk/language/shapetalk/shapetalk_preprocessed_new_public_version_0.csv
