# GPT Embeddings V2
Running all of our important tests with the updated embeddings API.

In [1]:
import os
import openai
import inflect
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

# Configure the OpenAI client from the environment; the key stays None when
# the OPENAI_API_KEY variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# One shared inflect engine; `plural` is a shorthand for its pluralizer.
en = inflect.engine()
plural = en.plural

In [2]:
# Load the vocabulary (one stripped entry per line) and the cached embedding
# vectors (one whitespace-separated row of floats per line). The context
# managers make sure both file handles are closed once the data is read.
with open('./vocab/expanded_vocab.txt', 'r') as vocab_file:
    vocab = [line.strip() for line in vocab_file]

with open('./gpt/gpt_ada-v2.txt', 'r') as embeds_file:
    gpt_embeds = [
        [float(x) for x in line.strip().split()]
        for line in embeds_file
    ]

# Pair each vocabulary entry with the embedding row at the same position.
# NOTE: zip() stops at the shorter input, so a length mismatch between the two
# files is silently truncated rather than reported.
model_gpt = dict(zip(vocab, gpt_embeds))

In [12]:
#########################
# Preprocess vocabulary #
#########################

# Collect the plural forms that sit in the vocabulary next to the word they
# were derived from. `helper` starts as the full vocabulary and loses every
# form that is moved into `removed`.
helper = set(vocab)
removed = set()
for word in vocab:
    # Naively treat every entry as singular and build its plural form.
    plural_form = plural(word, count=2)
    relation = en.compare(word, plural_form)
    # A comparison result starting with 'p' presumably means inflect regards
    # `word` itself as a plural (e.g. 'p:s' / 'p:p') -- confirm against the
    # inflect docs. Such entries are left alone.
    looks_plural = str(relation)[0] == 'p'
    # Pluralizing did not change the word, so there is nothing to drop.
    unchanged = plural_form == word
    if looks_plural or unchanged or plural_form not in helper:
        continue
    helper.remove(plural_form)
    removed.add(plural_form)

# A few words we want to keep even though the loop above flagged them.
removed -= {'her', 'his', 'media', 'offspring', 'our', 'some', 'them', 'us'}

len(removed)

43

In [4]:
# for getting new embeddings from OpenAI
def gpt_embed(text, engine='text-embedding-ada-002'):
    """
    Request a fresh embedding for a piece of text from the OpenAI API.

    Args:
        text: str    -> Text to embed; newlines are replaced by spaces first.
        engine: str  -> Name of the embedding engine to query.
    Returns:
        The 'embedding' entry of the first item in the response's 'data'.
    """
    single_line = text.replace('\n', ' ')
    response = openai.Embedding.create(input=[single_line], engine=engine)
    first_item = response['data'][0]
    return first_item['embedding']

In [5]:
def positive(words):
    """
    Sum the embeddings of the given words into a single vector.

    Embeddings are read from the `model_gpt` cache. A word that is not cached
    yet is fetched once through `gpt_embed` and stored in `model_gpt`, so
    later calls reuse it instead of querying the API again.

    Args:
        words: iterable -> Tokens to combine; each one is converted with str().
    Returns:
        Positive (summed vectors) of the word embeddings as an array of shape
        (1, dim), or an empty array when `words` yields nothing.
    """
    if isinstance(words, str):
        # A bare string would be iterated character by character below.
        print(f"You requested the positive of the string \"{words}\". Did you mean [\"{words}\"]?")

    total = None
    for token in words:
        word = str(token)
        # Consult the cache itself (not the vocabulary list): this is an O(1)
        # lookup and it also finds embeddings stored by earlier API fallbacks.
        try:
            ex = model_gpt[word]  # ex for "extracted"
        except KeyError:
            ex = gpt_embed(word)
            model_gpt[word] = ex

        # np.array copies, so the cached embedding is never modified.
        row = np.array(ex).reshape(1, -1)
        total = row if total is None else total + row

    return total if total is not None else np.array([])

In [6]:
def define(word, top_n=10):
    """
    Search the vocabulary for the word pairs whose summed embedding is most
    similar (by cosine similarity) to the embedding of `word`.

    A vocabulary entry is not considered when it equals `word`, shares its
    first three characters with `word`, or is listed in `removed`. A pair is
    skipped when one of its words is a substring of the other.

    Args:
        word: str   -> Single word to define
        top_n: int  -> Number of best solutions to return. Default 10.
    Return:
        N 2-word definition pairs, ordered by similarity to word. Each item is
        ((w1, w2), similarity), with w1 preceding w2 in the vocabulary.
    """
    target = positive([word])
    prefix = word[:3]

    # The per-word filters do not depend on the partner word, so apply them
    # once instead of inside the pair loop.
    candidates = [
        w for w in vocab
        if w != word and w[:3] != prefix and w not in removed
    ]
    if not candidates:
        return []

    # Embed every candidate exactly once; row k belongs to candidates[k].
    vectors = np.vstack([positive([w]) for w in candidates])

    history = []
    # The progress bar now counts surviving candidates, not the whole vocabulary.
    for i, w1 in enumerate(tqdm(candidates)):
        # Later candidates that are not substring-related to w1.
        partners = [
            j for j in range(i + 1, len(candidates))
            if w1 not in candidates[j] and candidates[j] not in w1
        ]
        if not partners:
            continue
        # Score all partners of w1 against the target in a single call.
        pair_sums = vectors[partners] + vectors[i]
        scores = cosine_similarity(target, pair_sums)[0]
        history.extend(
            ((w1, candidates[j]), score) for j, score in zip(partners, scores)
        )

    # The stable sort keeps vocabulary order among equally scored pairs.
    history.sort(key=lambda x: x[1], reverse=True)
    return history[:top_n]


## Testing

In [9]:
define('rationality')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('irrationals', 'morality'), 0.9356280789451286),
 (('irrationals', 'reasoning'), 0.9347081560220909),
 (('efficiency', 'irrationals'), 0.9293063899482503),
 (('fairness', 'irrationals'), 0.927988508275943),
 (('irrationals', 'philosophy'), 0.9277765042801438),
 (('honesty', 'irrationals'), 0.925764481603268),
 (('irrationals', 'validity'), 0.9256834967924541),
 (('consistency', 'irrationals'), 0.9249131123108179),
 (('irrationals', 'justification'), 0.9245339374402399),
 (('irrationals', 'reality'), 0.9242151602646193)]

In [10]:
define('rational')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('irrationals', 'sensible'), 0.9277069902521244),
 (('irrationals', 'reasonable'), 0.925670801117447),
 (('irrationals', 'systematic'), 0.9244159000559811),
 (('intelligent', 'irrationals'), 0.9238142910927745),
 (('irrationals', 'strategic'), 0.9227910325796909),
 (('irrationals', 'reasonably'), 0.9224520029306923),
 (('irrationals', 'mathematical'), 0.9209309679891646),
 (('irrationals', 'sustainable'), 0.9199268929391387),
 (('irrationals', 'practical'), 0.9197665796309242),
 (('irrationals', 'realistic'), 0.9195964485472343)]

In [11]:
define('autonomy')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('independence', 'self-'), 0.9261642569480981),
 (('independence', 'sovereignty'), 0.9211605753016514),
 (('delegation', 'independence'), 0.9192051409308124),
 (('discretion', 'independence'), 0.9181560682023581),
 (('independence', 'regulation'), 0.9180977565700976),
 (('governance', 'independence'), 0.916559916775304),
 (('freedom', 'independence'), 0.9163873555172517),
 (('conscience', 'independence'), 0.9160796783428625),
 (('competence', 'independence'), 0.9156374336634822),
 (('independence', 'mobility'), 0.915520165103819)]

In [12]:
define('autonomous')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('independent', 'self-'), 0.9281266729471076),
 (('independent', 'remote'), 0.9163785202828747),
 (('independent', 'voluntary'), 0.9157391088404636),
 (('independent', 'sovereignty'), 0.9146893504134022),
 (('independent', 'self'), 0.9131008346971894),
 (('arbitrary', 'independent'), 0.9120070176995727),
 (('independent', 'unconscious'), 0.9116049410651215),
 (('independent', 'tribal'), 0.9112728445005496),
 (('away', 'independent'), 0.9109394724222629),
 (('cognitive', 'independent'), 0.9109189768553579)]

In [13]:
define('justice')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('equality', 'jury'), 0.9422310302095009),
 (('equality', 'judge'), 0.9411374098278109),
 (('judge', 'peace'), 0.9395101971102455),
 (('injustice', 'jury'), 0.9383402584319603),
 (('jury', 'peace'), 0.936749065168257),
 (('injustice', 'judge'), 0.9364241110504062),
 (('crime', 'judge'), 0.9330705034720991),
 (('court', 'equality'), 0.9322483323672534),
 (('criminal', 'equality'), 0.9312407958228992),
 (('fairness', 'judge'), 0.9297664788515458)]

In [14]:
define('causation')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('motivation', 'occurrence'), 0.8986991954265579),
 (('disease', 'motivation'), 0.8974797974009343),
 (('motive', 'occurrence'), 0.8970431879767996),
 (('disease', 'motive'), 0.8955580959048453),
 (('cancer', 'explanation'), 0.8951276047374175),
 (('circumstance', 'prevention'), 0.8950896114747728),
 (('disease', 'explanation'), 0.8948777670993283),
 (('correlation', 'motive'), 0.8947558823456806),
 (('occurrence', 'reason'), 0.8947392606371911),
 (('explanation', 'occurrence'), 0.8946736232608568)]

In [15]:
define('just')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('only', 'simply'), 0.9274664875331583),
 (('even', 'simply'), 0.9270587243298476),
 (('good', 'simply'), 0.9268796708939637),
 (('nothing', 'simply'), 0.9255343448414189),
 (('simply', 'some'), 0.9250766994479637),
 (('only', 'simple'), 0.9248864916142681),
 (('easy', 'only'), 0.9246930289802775),
 (('ordinary', 'simply'), 0.9235510079646936),
 (('normal', 'simply'), 0.923546047033715),
 (('perfect', 'simply'), 0.9223880224690186)]

In [16]:
define('cause')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('effect', 'reason'), 0.9219296345968602),
 (('reason', 'trigger'), 0.9215008065690067),
 (('damage', 'reason'), 0.9197229554925813),
 (('prevent', 'reason'), 0.9191542232359007),
 (('reason', 'source'), 0.9185624692516703),
 (('reason', 'tribute'), 0.9184873438150174),
 (('incident', 'reason'), 0.917983998276873),
 (('produce', 'reason'), 0.9176565121928169),
 (('impact', 'reason'), 0.9173089755579764),
 (('fault', 'reason'), 0.9166898759223372)]

In [17]:
define('just cause')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('cause', 'injustice'), 0.9056383087406858),
 (('cause', 'fairness'), 0.901420886407276),
 (('injustice', 'reason'), 0.9011520244498251),
 (('casual', 'injustice'), 0.8997933653985094),
 (('injustice', 'reasonable'), 0.8989476850934922),
 (('injustice', 'reasonably'), 0.8980958593600347),
 (('injustice', 'motive'), 0.8971636644035834),
 (('injustice', 'warrant'), 0.8949766946193844),
 (('circumstance', 'injustice'), 0.8949603813129504),
 (('cause', 'unreasonable'), 0.894480445436561)]

In [18]:
define('love')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('affection', 'living'), 0.9239838467402999),
 (('affection', 'live'), 0.9233833542601877),
 (('beloved', 'living'), 0.9190889563061209),
 (('living', 'respect'), 0.9185684491491138),
 (('affection', 'life'), 0.916569080213714),
 (('live', 'respect'), 0.9162428553965376),
 (('beloved', 'life'), 0.9161431498238164),
 (('joy', 'living'), 0.9157949778569248),
 (('admiration', 'living'), 0.9149304124704185),
 (('beloved', 'respect'), 0.9143316220798012)]

In [19]:
define('existence')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('instance', 'presence'), 0.929750479805159),
 (('fact', 'presence'), 0.9247571553141666),
 (('actually', 'presence'), 0.9235846714627041),
 (('presence', 'yet'), 0.9226900626079113),
 (('life', 'presence'), 0.9221891620922437),
 (('actual', 'presence'), 0.9221707043646821),
 (('presence', 'there'), 0.9215917808979428),
 (('economic', 'presence'), 0.9215434963252221),
 (('nature', 'presence'), 0.9212857275265357),
 (('alive', 'presence'), 0.9209193778354021)]

In [7]:
define('truth')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('fact', 'false'), 0.9316007275804823),
 (('fact', 'honesty'), 0.9309292507175757),
 (('fact', 'honest'), 0.9295323643322959),
 (('fact', 'lie'), 0.9294164592354611),
 (('fact', 'right'), 0.928634696043201),
 (('fact', 'reality'), 0.927406354004658),
 (('accuracy', 'fact'), 0.9251109629372368),
 (('false', 'reality'), 0.9249732381246161),
 (('belief', 'false'), 0.9245865301411955),
 (('fact', 'honestly'), 0.9242958590828718)]

In [8]:
define('fairness')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('honesty', 'injustice'), 0.9255450254208996),
 (('equality', 'honesty'), 0.9227899468232533),
 (('injustice', 'reliability'), 0.9214589545688299),
 (('injustice', 'transparency'), 0.921393576341635),
 (('consistency', 'injustice'), 0.9206956216744729),
 (('integrity', 'unfair'), 0.9201203260811956),
 (('honesty', 'inequality'), 0.9198550923682015),
 (('injustice', 'integrity'), 0.9192024187226966),
 (('credibility', 'injustice'), 0.9189997773193346),
 (('equality', 'injustice'), 0.9186572514665443)]

In [9]:
define('alive')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('dead', 'living'), 0.938973958792837),
 (('active', 'dead'), 0.9386226303096936),
 (('active', 'living'), 0.9383783324356018),
 (('active', 'death'), 0.9366785450943417),
 (('busy', 'living'), 0.9366712142226564),
 (('active', 'live'), 0.935941566468813),
 (('dead', 'live'), 0.9356847119955423),
 (('active', 'born'), 0.9355655609387856),
 (('able', 'living'), 0.934320938435825),
 (('alone', 'living'), 0.9341431954102374)]

In [10]:
define('happy')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('delighted', 'healthy'), 0.9242524815951367),
 (('luck', 'satisfied'), 0.9235974589410217),
 (('holiday', 'satisfied'), 0.9227352936482455),
 (('healthy', 'pleased'), 0.921318589257293),
 (('fun', 'satisfied'), 0.9205655787285081),
 (('glad', 'healthy'), 0.9198661837689315),
 (('healthy', 'satisfied'), 0.9197794384539872),
 (('healthy', 'unhappy'), 0.919463466025601),
 (('delighted', 'luck'), 0.919437257445518),
 (('luck', 'unhappy'), 0.9193519907616582)]

In [11]:
define('art')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('act', 'painting'), 0.9269632151880671),
 (('act', 'drawing'), 0.9246389069089864),
 (('craft', 'drawing'), 0.9240656356430315),
 (('drawing', 'sport'), 0.9233651896076635),
 (('act', 'craft'), 0.9232564379681216),
 (('painting', 'sport'), 0.9230250715159545),
 (('act', 'paint'), 0.923000600026521),
 (('paint', 'sport'), 0.9218951435054941),
 (('craft', 'painting'), 0.9212718542252519),
 (('drawing', 'science'), 0.9207351558409518)]