# Better Definitions
Includes the revised algorithm from `naive_def.ipynb`, a method for swapping between most of the model sizes tested, and a larger set of interesting definitions by default.

In [1]:
import heapq
import os

import inflect
import numpy as np
import openai
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

# Read the OpenAI key from the environment (None if the variable is unset),
# so no credential is ever written into the notebook.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Shared inflect engine; `plural` is bound once as a shorthand for the
# vocabulary preprocessing further down.
en = inflect.engine()
plural = en.plural

## Load Model Cache
The default model, chosen based on earlier tests, is OpenAI's GPT-3 Curie embeddings.

In [2]:
# Model sizes for which an embeddings file is expected under ./gpt/.
cached_models = ['curie', 'ada', 'babbage', 'davinci']

########################
# CHANGE MODEL HERE!!! #
########################
version = 'curie'

# must be one of 'cached_models'; fail early with an actionable message
assert version in cached_models, (
    f"Unknown model version {version!r}; expected one of {cached_models}"
)

In [3]:
# Vocabulary: one entry per line, surrounding whitespace stripped.
# `with` guarantees the file handle is closed once the list is built.
with open('./vocab/expanded_vocab.txt', 'r') as vocab_file:
    vocab = [line.strip() for line in vocab_file]

# Cached embeddings for the selected model: each line is parsed as a
# whitespace-separated list of floats.
with open(f'./gpt/gpt_{version}.txt', 'r') as embeds_file:
    gpt_embeds = [
        [float(x) for x in line.strip().split()]
        for line in embeds_file
    ]

# zip() below pairs line i of the vocabulary with line i of the embeddings and
# would silently truncate on a length mismatch, so check alignment explicitly.
assert len(vocab) == len(gpt_embeds), (
    f"vocab has {len(vocab)} entries but gpt_{version}.txt has "
    f"{len(gpt_embeds)} embedding rows"
)

# word -> embedding (list of floats)
model_gpt = dict(zip(vocab, gpt_embeds))

In [4]:
#########################
# Preprocess vocabulary #
#########################

# Build `removed`: plural forms whose base word is also in the vocabulary.
# `helper` tracks the vocabulary entries that have not been dropped so far.
helper = set(vocab)
removed = set()
for word in vocab:
    # treat every entry as if it were singular and derive its plural
    p = plural(word, count=2)
    # a comparison result starting with 'p' is taken to mean that `word`
    # itself is already a plural -- TODO confirm against inflect's compare()
    comp = en.compare(word, p)
    word_is_plural = str(comp)[0] == 'p'
    # skip plural entries, words whose plural is unchanged, and plurals that
    # are not (or no longer) present in helper
    if word_is_plural or p == word or p not in helper:
        continue
    helper.remove(p)
    removed.add(p)

# a few words we want to keep even though they were picked up as "plurals":
removed.difference_update(
    {'her', 'his', 'media', 'offspring', 'our', 'some', 'them', 'us'}
)

# inspect len(removed) to see how many entries were filtered out

## Helpers

In [5]:
# for getting new embeddings from OpenAI
def gpt_embed(text, engine=f'text-similarity-{version}-001'):
    """
    Request a fresh embedding for `text` from the OpenAI Embedding endpoint.

    Args:
        text: str   -> Text to embed; newlines are replaced by spaces first.
        engine: str -> Engine name. The default is built from `version` when
                       this cell runs, so re-run the cell after changing it.
    Returns:
        The 'embedding' field of the first item in the response's 'data'.
    """
    single_line = text.replace('\n', ' ')
    response = openai.Embedding.create(input=[single_line], engine=engine)
    return response['data'][0]['embedding']

In [6]:
def positive(words):
    """
    Sum the embeddings of the given words.

    Each token is converted to a string and looked up in the `model_gpt`
    cache. On a miss the embedding is fetched with `gpt_embed` and stored in
    `model_gpt`, so every distinct word is requested from the API at most once.

    Args:
        words: iterable -> Tokens to combine. Passing a bare string prints a
                           hint and then iterates over its characters.
    Returns:
        Array of shape (1, d) holding the summed embeddings, or an empty
        array if `words` is empty.
    """
    if isinstance(words, str):
        print(f"You requested the positive of the string \"{words}\". Did you mean [\"{words}\"]?")

    out = None
    for token in words:
        # convert token to string
        word = str(token)
        # Look in the cache itself rather than in `vocab`: that is where API
        # results are stored, and a dict lookup avoids scanning the vocab list.
        if word in model_gpt:
            ex = model_gpt[word]  # ex for "extracted"
        else:
            # if not found, query the API and remember the answer
            ex = gpt_embed(word)
            model_gpt[word] = ex

        # np.array copies, so the in-place sum never touches a cached vector
        vec = np.array(ex, dtype=float).reshape(1, -1)
        if out is None:
            out = vec
        else:
            out += vec

    return out if out is not None else np.array([])

In [7]:
def define(word, top_n=10):
    """
    Search for the pairs of vocabulary words whose summed embedding is most
    similar (cosine similarity) to the embedding of `word`.

    Vocabulary entries are skipped when they equal `word`, share its first
    three characters, or are listed in `removed`.

    Args:
        word: str   -> Single word to define
        top_n: int  -> Number of best solutions to return. Default 10.
    Return:
        Up to N ((w1, w2), similarity) tuples, ordered by similarity to word,
        highest first. Ties keep the order in which pairs were generated.
    """
    target = positive([word])
    prefix = word[:3]

    # The filter depends only on `word`, so apply it once up front instead of
    # re-checking every w2 for every w1.
    candidates = [
        w for w in vocab
        if w != word and w[:3] != prefix and w not in removed
    ]

    def scored_pairs():
        # Yields each unordered pair once, in vocabulary order.
        for i, w1 in enumerate(tqdm(candidates)):
            for w2 in candidates[i + 1:]:
                p = positive([w1, w2])
                yield (w1, w2), cosine_similarity(target, p)[0][0]

    def by_score(item):
        return item[1]

    if top_n is None or top_n < 0:
        # Preserve plain slice semantics for unusual top_n values.
        return sorted(scored_pairs(), key=by_score, reverse=True)[:top_n]

    # nlargest is equivalent to sorted(..., reverse=True)[:top_n] but holds
    # only top_n results in memory rather than every scored pair.
    return heapq.nlargest(top_n, scored_pairs(), key=by_score)


## Testing

In [8]:
define('rationality')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('logical', 'reasoning'), 0.9236378390372653),
 (('reasoning', 'sensible'), 0.9205015500512653),
 (('logic', 'sensible'), 0.9202925336095696),
 (('irrationals', 'logical'), 0.9189462622168958),
 (('intelligence', 'logical'), 0.9185250022595706),
 (('logical', 'morality'), 0.916561303777381),
 (('logic', 'reasoning'), 0.9156371956966491),
 (('consciousness', 'logical'), 0.9154282228476602),
 (('justification', 'logical'), 0.9148433828940656),
 (('intellectual', 'logic'), 0.9144802283983318)]

In [9]:
define('rational')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('logical', 'reasonable'), 0.9352942080125957),
 (('logical', 'reasoning'), 0.9314047850333993),
 (('reasoning', 'sensible'), 0.9313178091732618),
 (('logical', 'reason'), 0.9310238912485518),
 (('logical', 'sensible'), 0.9289080589303863),
 (('logical', 'responsible'), 0.9271758759284697),
 (('reasonable', 'reasoning'), 0.9245481228917478),
 (('logical', 'secular'), 0.9235975565901939),
 (('logic', 'reasonable'), 0.9234628222144656),
 (('educated', 'logical'), 0.9232373795706632)]

In [10]:
define('autonomy')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('control', 'independence'), 0.9176713641431526),
 (('ability', 'independence'), 0.9168004014325604),
 (('independence', 'voluntary'), 0.9159498698260641),
 (('independence', 'mobility'), 0.9152721630363441),
 (('empower', 'independence'), 0.9147063523309407),
 (('agency', 'independence'), 0.9135151453933545),
 (('capable', 'independence'), 0.9128610502892247),
 (('capability', 'independence'), 0.9127564036532756),
 (('freedom', 'independent'), 0.9109920185631024),
 (('choice', 'independence'), 0.9106141201165177)]

In [11]:
define('autonomous')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('independent', 'machine'), 0.9009231099456976),
 (('controlled', 'independent'), 0.8990937878966357),
 (('independent', 'vehicle'), 0.8986279517217186),
 (('driving', 'independent'), 0.8985094168537435),
 (('independent', 'technological'), 0.8982868281817474),
 (('independent', 'motor'), 0.8976942219243362),
 (('drive', 'independent'), 0.8966839881924915),
 (('independent', 'technology'), 0.8959370135885891),
 (('capable', 'independent'), 0.8948874835710563),
 (('artificial', 'independent'), 0.8946289778669951)]

In [12]:
define('justice')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('fairness', 'judicial'), 0.9169658769923914),
 (('injustice', 'judicial'), 0.9139449207954286),
 (('judicial', 'revenge'), 0.91302445971826),
 (('injustice', 'law'), 0.9124230652492498),
 (('equality', 'judicial'), 0.91220058004136),
 (('fairness', 'injustice'), 0.9113156444647046),
 (('injustice', 'legal'), 0.9109533753958927),
 (('fairness', 'law'), 0.9094653404032378),
 (('conviction', 'fairness'), 0.9077942877275789),
 (('fairness', 'judge'), 0.9076338222655296)]

In [13]:
define('causation')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('consequence', 'correlation'), 0.9104671869226033),
 (('correlation', 'motive'), 0.9104510736959889),
 (('correlation', 'explanation'), 0.909513442705769),
 (('accusation', 'correlation'), 0.9080739173570319),
 (('correlation', 'implication'), 0.9068008479792724),
 (('correlation', 'meaning'), 0.9066976471304113),
 (('blame', 'correlation'), 0.9064746006055001),
 (('correlation', 'inflict'), 0.9055786205249134),
 (('correlation', 'reasoning'), 0.9051633801481334),
 (('correlation', 'reason'), 0.9045853393082224)]

In [14]:
define('just')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('only', 'simply'), 0.9448832054411789),
 (('let', 'simply'), 0.942104610093907),
 (('now', 'simply'), 0.9409072340536551),
 (('even', 'simply'), 0.9401085493086702),
 (('it', 'simply'), 0.9401066745033336),
 (('simply', 'you'), 0.9388005343282892),
 (('its', 'simply'), 0.9387688173034998),
 (('simply', 'that'), 0.9377642204997771),
 (('simply', 'so'), 0.9376201349250803),
 (('about', 'simply'), 0.937447436557485)]

In [8]:
define('cause')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('because', 'reason'), 0.9615349029068294),
 (('because', 'why'), 0.9572227863889586),
 (('because', 'meaning'), 0.9521307683501476),
 (('because', 'what'), 0.9493657960062949),
 (('because', 'it'), 0.9490328645473978),
 (('because', 'due'), 0.9487557434432554),
 (('because', 'reasoning'), 0.9464412745537372),
 (('because', 'cue'), 0.9459514815471586),
 (('because', 'see'), 0.9457072700012377),
 (('because', 'excuse'), 0.9453266473754052)]

In [9]:
define('just cause')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('cause', 'simply'), 0.9034025678675263),
 (('cause', 'merely'), 0.9025809311069781),
 (('cause', 'whatever'), 0.9017344035499266),
 (('cause', 'nothing'), 0.9015376888213907),
 (('anything', 'cause'), 0.8988013514554598),
 (('absolutely', 'cause'), 0.8982836101873098),
 (('cause', 'harmless'), 0.8980415076205842),
 (('cause', 'literally'), 0.8976588057763424),
 (('cause', 'straightforward'), 0.8966629616795214),
 (('anywhere', 'cause'), 0.8965709584419106)]

In [8]:
define('love')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('affection', 'it'), 0.913852799640186),
 (('beloved', 'feel'), 0.9135634896853682),
 (('beloved', 'life'), 0.9132379371875783),
 (('beloved', 'do'), 0.9116626664562676),
 (('beloved', 'want'), 0.9114975850394866),
 (('affection', 'life'), 0.9114522414185163),
 (('beloved', 'it'), 0.9113298497037856),
 (('happy', 'romance'), 0.9105326284004531),
 (('beloved', 'joy'), 0.9105105800771522),
 (('affection', 'great'), 0.9103923758725514)]

In [9]:
define('existence')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('life', 'present'), 0.9235225155707034),
 (('living', 'reality'), 0.9217474860855248),
 (('life', 'reality'), 0.9214876090573696),
 (('life', 'presence'), 0.921285916970616),
 (('humanity', 'life'), 0.920512157350976),
 (('life', 'universe'), 0.9195330487064619),
 (('living', 'universe'), 0.9191978911722407),
 (('alive', 'reality'), 0.9188024098362333),
 (('consciousness', 'life'), 0.917877279808574),
 (('awareness', 'life'), 0.9175019269218476)]

In [10]:
define('truth')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('fact', 'honesty'), 0.9423272183989945),
 (('honest', 'reality'), 0.9385416468356156),
 (('honesty', 'reality'), 0.9374780802633441),
 (('fact', 'honest'), 0.9339651124728772),
 (('fact', 'reality'), 0.9328737000242066),
 (('honesty', 'real'), 0.9284915520374593),
 (('actual', 'honesty'), 0.9283351943495234),
 (('correct', 'reality'), 0.9260168223429192),
 (('reality', 'statement'), 0.9252149587836899),
 (('genuine', 'reality'), 0.9245466679637222)]

In [11]:
define('fairness')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('equality', 'unfair'), 0.9236785946985764),
 (('equality', 'reasonable'), 0.9233268354685615),
 (('balanced', 'equality'), 0.922996500470749),
 (('equality', 'reasonably'), 0.9178974745666189),
 (('equality', 'proportion'), 0.9171624301464034),
 (('equality', 'justice'), 0.9169341138887268),
 (('equality', 'honest'), 0.9157407948356723),
 (('balance', 'equality'), 0.9138305857930888),
 (('balanced', 'justice'), 0.9133596870046445),
 (('acceptable', 'equality'), 0.9133578345243887)]

In [12]:
define('alive')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('awake', 'living'), 0.9244589546391456),
 (('intact', 'living'), 0.9172714750694939),
 (('awake', 'live'), 0.9145581572449639),
 (('living', 'viable'), 0.913476171021951),
 (('live', 'living'), 0.9132929227333213),
 (('living', 'revive'), 0.9125944985865269),
 (('healthy', 'living'), 0.9117352648244113),
 (('living', 'survive'), 0.9111209511752293),
 (('active', 'living'), 0.9099464737070243),
 (('healthy', 'live'), 0.9099305456823676)]

In [13]:
define('happy')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('joy', 'pleased'), 0.9380997165103567),
 (('love', 'pleased'), 0.9369776219899391),
 (('day', 'pleased'), 0.936346720839541),
 (('cheerful', 'pleased'), 0.9362955014283465),
 (('cheer', 'pleased'), 0.9346571327966601),
 (('holiday', 'pleased'), 0.934442461019153),
 (('healthy', 'pleased'), 0.9344417259550396),
 (('good', 'pleased'), 0.9330100442206396),
 (('living', 'pleased'), 0.9329552693611364),
 (('pleased', 'warm'), 0.9326986595723271)]

In [14]:
define('art')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('culture', 'draw'), 0.9061392034944346),
 (('craft', 'drawing'), 0.9033266265925499),
 (('craft', 'draw'), 0.9030943091310318),
 (('draw', 'music'), 0.9019598434491427),
 (('culture', 'drawing'), 0.9016303597957094),
 (('craft', 'paint'), 0.8998578561845794),
 (('draw', 'museum'), 0.8996847721595432),
 (('drawing', 'music'), 0.8993834173858664),
 (('design', 'drawing'), 0.8987077898198781),
 (('design', 'paint'), 0.898414938148733)]