In [1]:
from collections import defaultdict
import itertools
import random
import pandas as pd
from ceat import ceat_meta
import numpy as np
from embeddings import generate_embedding, generate_embedding_bulk, visualize_embeddings

Set random seed

In [2]:
random.seed(42)

In [3]:
FLOWERS = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus', 'iris', 'orchid', 'rose', 'bluebell', 'daffodil', 'lilac',
           'pansy', 'tulip', 'buttercup', 'daisy', 'lily', 'peony', 'violet', 'carnation', 'gladiola', 'magnolia', 'petunia', 'zinnia']

INSECTS = ['ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede', 'fly', 'maggot', 'tarantula', 'bee', 'cockroach',
           'gnat', 'mosquito', 'termite', 'beetle', 'cricket', 'hornet', 'moth', 'wasp', 'blackfly', 'dragonfly', 'horsefly', 'roach', 'weevil']

MUSICAL_INSTRUMENTS = ['bagpipe', 'cello', 'guitar', 'lute', 'trombone', 'banjo', 'clarinet', 'harmonica', 'mandolin', 'trumpet', 'bassoon',
                       'drum', 'harp', 'oboe', 'tuba', 'bell', 'fiddle', 'harpsichord', 'piano', 'viola', 'bongo', 'flute', 'horn', 'saxophone', 'violin']

WEAPONS = ['arrow', 'club', 'gun', 'missile', 'spear', 'axe', 'dagger', 'harpoon', 'pistol', 'sword', 'blade', 'dynamite',
           'hatchet', 'rifle', 'tank', 'bomb', 'firearm', 'knife', 'shotgun', 'teargas', 'cannon', 'grenade', 'mace', 'slingshot', 'whip']

In [4]:
PLEASANT = ['caress', 'freedom', 'health', 'love', 'peace', 'cheer', 'friend', 'heaven', 'loyal', 'pleasure', 'diamond', 'gentle',
            'honest', 'lucky', 'rainbow', 'diploma', 'gift', 'honor', 'miracle', 'sunrise', 'family', 'happy', 'laughter', 'paradise', 'vacation']

UNPLEASANT = ['abuse', 'crash', 'filth', 'murder', 'sickness', 'accident', 'death', 'grief', 'poison', 'stink', 'assault', 'disaster',
              'hatred', 'pollute', 'tragedy', 'divorce', 'jail', 'poverty', 'ugly', 'cancer', 'kill', 'rotten', 'vomit', 'agony', 'prison']

In [11]:
SENTENCE_TEMPLATES_FLOWERS_INSECTS = [
    "The delicate movements of a(n) {object} evoke feelings of {feeling}",
    "When I see a(n) {object}, I think of {feeling}",
    "Watching a(n) {object} fills me with {feeling}",
    "A {object} in the garden can bring thoughts of {feeling}",
    "Encountering a(n) {object} makes me feel {feeling}"
]

SENTENCE_TEMPLATES_WEAPONS_MUSICAL = [
    "The use of a(n) {object} often leads to {feeling}",
    "The sound of a(n) {object} brings a sense of {feeling}",
    "Listening to a(n) {object} evokes feelings of {feeling}",
    "The misuse of a(n) {object} can be linked to {feeling}",
    "The careful craftsmanship of a(n) {object} can evoke feelings of {feeling}",
    "The sight of a sheathed {object} can bring a sense of {feeling}"
]

# Generate sentences and create a dataframe that has all sentences

In [12]:
def generate_sentences(names, attributes=pd.concat([pd.Series(PLEASANT), pd.Series(UNPLEASANT)]), templates=SENTENCE_TEMPLATES_FLOWERS_INSECTS):
    sentences = {}
    # Create all possible combinations of attribute, noun and sentence template
    triplets = list(itertools.product(attributes, names, templates))
    for (a, n, st) in triplets:
        sentence = st.format(object=n, feeling=a)
        sentences[(a, n, st)] = sentence
    return sentences

In [13]:
insect_sentence_dict = generate_sentences(names=INSECTS, templates=SENTENCE_TEMPLATES_FLOWERS_INSECTS)
flowers_sentence_dict = generate_sentences(names=FLOWERS, templates=SENTENCE_TEMPLATES_FLOWERS_INSECTS)

weapons_sentence_dict = generate_sentences(names=MUSICAL_INSTRUMENTS, templates=SENTENCE_TEMPLATES_WEAPONS_MUSICAL)
musical_sentence_dict = generate_sentences(names=WEAPONS, templates=SENTENCE_TEMPLATES_WEAPONS_MUSICAL)

In [15]:
pd_i = pd.DataFrame([(k[0],k[1],k[2],v,"IN") for k, v in insect_sentence_dict.items()], columns=["attribute","name","sentence_template","sentence", "group"])
pd_f = pd.DataFrame([(k[0],k[1],k[2],v,"FL") for k, v in flowers_sentence_dict.items()], columns=["attribute","name","sentence_template","sentence", "group"])

pd_w = pd.DataFrame([(k[0],k[1],k[2],v,"IN") for k, v in weapons_sentence_dict.items()], columns=["attribute","name","sentence_template","sentence", "group"])
pd_m = pd.DataFrame([(k[0],k[1],k[2],v,"FL") for k, v in musical_sentence_dict.items()], columns=["attribute","name","sentence_template","sentence", "group"])

In [16]:
pd_all_if = pd.concat([pd_i,pd_f])
pd_all_if = pd_all_if.reset_index()
pd_all_if.to_csv("dataset_pd_all_insects_flowers.csv")

pd_all_wm = pd.concat([pd_w,pd_m])
pd_all_wm = pd_all_wm.reset_index()
pd_all_wm.to_csv("dataset_pd_all_weapons_musical.csv")

Print total sentence tokens

In [17]:
total_sentence_tokens_if = pd_all_if["sentence"].apply(lambda x: len(x.split(" "))).sum()
print(f"Total sentence tokens(make sure this is not too much for OpenAI): {total_sentence_tokens_if}")
assert total_sentence_tokens_if < 1000000 # Make sure we are not over the limit, this is a rough estimate of 13 cents.

total_sentence_tokens_wm = pd_all_wm["sentence"].apply(lambda x: len(x.split(" "))).sum()
print(f"Total sentence tokens(make sure this is not too much for OpenAI): {total_sentence_tokens_wm}")
assert total_sentence_tokens_wm < 1000000 # Make sure we are not over the limit, this is a rough estimate of 13 cents.


Total sentence tokens(make sure this is not too much for OpenAI): 172
Total sentence tokens(make sure this is not too much for OpenAI): 240


# Acquire embeddings for all sentences, code can use some cleaning up

Create dictionary copy of pd_all

In [18]:
pd_all_to_dict_if = pd_all_if.to_dict(orient="index")
pd_all_to_dict_wm = pd_all_wm.to_dict(orient="index")

In [20]:
distinct_attributes_if = pd_all_if["attribute"].unique()
distinct_groups_if = pd_all_if["group"].unique()

distinct_attributes_wm = pd_all_wm["attribute"].unique()
distinct_groups_wm = pd_all_wm["group"].unique()

In [21]:
for a in distinct_attributes_if:
    for g in distinct_groups_if:
        print(f"Attribute: {a}, Group: {g}")
        data_a_g = pd_all_if[(pd_all_if["attribute"] == a) & (pd_all_if["group"] == g)]
        
        embedding_str_pairs = generate_embedding_bulk(sentences=data_a_g["sentence"].values,save_to_file=True)
        assert len(embedding_str_pairs) == len(data_a_g)
        for ((i, row),(sentence, embedding)) in zip(data_a_g.iterrows(),embedding_str_pairs):
            assert row["sentence"] == sentence              

            # Store array in pd_all_to_dict
            pd_all_to_dict_if[i]["embedding"] = np.array(embedding)


for a in distinct_attributes_wm:
    for g in distinct_groups_wm:
        print(f"Attribute: {a}, Group: {g}")
        data_a_g = pd_all_wm[(pd_all_wm["attribute"] == a) & (pd_all_wm["group"] == g)]
        
        embedding_str_pairs = generate_embedding_bulk(sentences=data_a_g["sentence"].values,save_to_file=True)
        assert len(embedding_str_pairs) == len(data_a_g)
        for ((i, row),(sentence, embedding)) in zip(data_a_g.iterrows(),embedding_str_pairs):
            assert row["sentence"] == sentence              

            # Store array in pd_all_to_dict
            pd_all_to_dict_wm[i]["embedding"] = np.array(embedding)

Attribute: caress, Group: IN
From cache: 0, To compute: 5
Attribute: caress, Group: FL
From cache: 0, To compute: 5
Attribute: abuse, Group: IN
From cache: 0, To compute: 5
Attribute: abuse, Group: FL
From cache: 0, To compute: 5
Attribute: caress, Group: IN
From cache: 0, To compute: 6
Attribute: caress, Group: FL
From cache: 0, To compute: 6
Attribute: abuse, Group: IN
From cache: 0, To compute: 6
Attribute: abuse, Group: FL
From cache: 0, To compute: 6


In [26]:
sent2emb = defaultdict(list)
for emb in pd_all_to_dict_wm.values():
    sent2emb[emb["name"]].append(emb["embedding"])
    sent2emb[emb["attribute"]].append(emb["embedding"])
print(ceat_meta([[WEAPONS,MUSICAL_INSTRUMENTS,PLEASANT,UNPLEASANT]],sent2emb,N = 1000))
print("_, p-value")

tao>0
(-0.1421465624475714, 1.0)
_, p-value


In [25]:
sent2emb = defaultdict(list)
for emb in pd_all_to_dict_if.values():
    sent2emb[emb["name"]].append(emb["embedding"])
    sent2emb[emb["attribute"]].append(emb["embedding"])
print(ceat_meta([[INSECTS,FLOWERS,PLEASANT,UNPLEASANT]],sent2emb,N = 1000))
print("_, p-value")

tao>0
(0.016405586602217595, 0.36891452786261547)
_, p-value
