# Getting naive definitions with Curie
Includes a fully explored N^2 naive algorithm that returns the best pairing that 'defines' a single word. The revised attempt looks at fewer pairings and uses a revised vocabulary that avoids plurals, in an effort to get a clean top 10 with no orthographic influences or self-referential definitions.

In [3]:
import os
import openai
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

# The API key comes from the OPENAI_API_KEY environment variable (set in the
# conda env); the lookup yields None when the variable is not defined.
openai.api_key = os.environ.get('OPENAI_API_KEY')

## Load Model Cache
The standard choice, based on earlier tests, is OpenAI's GPT-3 Curie embeddings.

In [None]:
# names of the models that have a cached embedding file (./gpt/gpt_<name>.txt)
cached_models = ['curie', 'ada', 'babbage', 'davinci']

########################
# CHANGE MODEL HERE!!! #
########################
version = 'curie'

# must be one of 'cached_models'; raise explicitly instead of using `assert`,
# because assert statements are skipped when Python runs with -O
if version not in cached_models:
    raise ValueError(
        f"Unknown model version {version!r}; expected one of {cached_models}"
    )

In [None]:
# load vocabulary words

# Context managers close both files as soon as they have been read; the
# previous bare open() calls inside comprehensions left the handles open.
with open('./vocab/expanded_vocab.txt', 'r') as vocab_file:
    # one vocabulary word per line
    vocab = [line.strip() for line in vocab_file]

with open(f'./gpt/gpt_{version}.txt', 'r') as embed_file:
    # one embedding per line, stored as whitespace-separated floats
    gpt_embeds = [
        [float(x) for x in line.strip().split()]
        for line in embed_file
    ]

# Pair the i-th word with the i-th embedding. zip() stops at the shorter of the
# two lists, so any unmatched trailing words/embeddings are silently dropped.
model_gpt = dict(zip(vocab, gpt_embeds))

In [6]:
# for getting new embeddings from OpenAI
def gpt_embed(text, engine='text-similarity-curie-001'):
    """
    Request a fresh embedding for `text` from the OpenAI Embedding endpoint.

    Args:
        text: str    -> Text to embed; newlines are replaced by spaces first.
        engine: str  -> Name of the OpenAI embedding engine to query.
    Returns:
        The 'embedding' entry of the first item in the response's 'data' list.
    """
    single_line = text.replace('\n', ' ')
    response = openai.Embedding.create(input=[single_line], engine=engine)
    first_result = response['data'][0]
    return first_result['embedding']

## Helpers
Positive and new algorithm(s).

In [7]:
def positive(words):
    """
    Sum the embeddings of the given words into a single "positive" vector.

    Embeddings are read from the `model_gpt` cache. A word that is not cached
    yet is fetched through `gpt_embed` and stored in `model_gpt`, so later
    calls reuse it instead of querying the API again.

    Args:
        words: iterable of tokens; each token is converted with str() before
            it is looked up.
    Returns:
        Positive (summed vectors) of word embeddings of a given list of words,
        as a numpy array with a single row. An empty numpy array is returned
        when `words` yields no tokens.
    """
    if isinstance(words, str):
        # Only a warning: the string is still iterated character by character.
        print(f"You requested the positive of the string \"{words}\". Did you mean [\"{words}\"]?")

    out = None
    for token in words:
        # convert token to string
        word = str(token)
        # Check the cache itself rather than `vocab`: API results are stored in
        # `model_gpt`, so testing `vocab` never found them again (and scanned a
        # list on every lookup). This also avoids a KeyError for a vocabulary
        # word that has no cached embedding.
        if word in model_gpt:
            ex = model_gpt[word]  # ex for "extracted"
        # if not found, query API and remember the result
        else:
            ex = gpt_embed(word)
            model_gpt[word] = ex

        # np.array() copies, so the in-place sum never touches cached data
        row = np.array(ex).reshape(1, -1)

        # construct positive
        if out is None:
            out = row
        else:
            out += row

    return out if out is not None else np.array([])

In [8]:
def naive_def(word):
    """
    Exhaustively score every ordered pair of vocabulary words as a two-word
    "definition" of `word`.

    Args:
        word: str   -> Single word to define
    Returns:
        A tuple (pair, score). `pair` is the two vocabulary words whose summed
        embeddings are closest (by cosine similarity) to the target, never
        including the target itself. `score` is that similarity exactly as
        returned by cosine_similarity. ((None, None), -1) is returned when no
        pair scores above -1.
    """
    target = positive([word])

    top_pair = (None, None)
    top_score = -1
    for first in tqdm(vocab):
        if first == word:
            continue
        for second in vocab:
            if second == word:
                continue
            similarity = cosine_similarity(target, positive([first, second]))
            # strict comparison: on ties the earliest pair found is kept
            if similarity > top_score:
                top_pair, top_score = (first, second), similarity

    return (top_pair, top_score)

## Tests

In [11]:
naive_def('puppy')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('dog', 'kitten'), array([[0.93234583]]))

In [12]:
naive_def('king')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('kingdom', 'royal'), array([[0.94391248]]))

In [7]:
naive_def('duckling')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('duck', 'youngster'), array([[0.92606453]]))

In [8]:
naive_def('colt')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('colony', 'gun'), array([[0.88549887]]))

In [9]:
naive_def('filly')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('horse', 'pink'), array([[0.87398292]]))

In [10]:
naive_def('foal')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('horse', 'puppy'), array([[0.90931456]]))

In [11]:
naive_def('husband')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('husbands', 'spouse'), array([[0.95122498]]))

In [12]:
naive_def('wife')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('spouse', 'wives'), array([[0.95503364]]))

In [13]:
naive_def('democracy')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('democratic', 'politics'), array([[0.95029931]]))

In [7]:
naive_def('knowledge')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('information', 'know'), array([[0.94435854]]))

In [8]:
naive_def('power')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('energy', 'powerful'), array([[0.93467298]]))

In [9]:
naive_def('freedom')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('free', 'liberty'), array([[0.94837923]]))

# Revising our approach
What if we want to filter out plurals and orthographically similar words?

In [9]:
import inflect

# inflect engine; the preprocessing code below calls en.compare() on it
en = inflect.engine()
# shorthand for the engine's pluralization method
plural = en.plural

In [10]:
# preprocess vocabulary: collect plural forms that are also vocabulary entries
helper = set(vocab)
removed = set()
for word in vocab:
    # naively assume each word is singular and build its plural form
    candidate = plural(word, count=2)
    # en.compare() describes how the two forms relate; a result starting with
    # 'p' means inflect considers `word` itself to be a plural
    relation = str(en.compare(word, candidate))
    word_is_plural = relation[0] == 'p'
    # skip plurals, words whose plural is unchanged, and plural forms that are
    # not (or no longer) in the vocabulary set
    if word_is_plural or candidate == word or candidate not in helper:
        continue
    helper.remove(candidate)
    removed.add(candidate)

# a few words I want to keep (note: they are not added back to `helper`)
removed.difference_update(
    {'her', 'his', 'media', 'offspring', 'our', 'some', 'them', 'us'}
)

len(removed)

43

In [11]:
def revised(word, top_n=10):
    """
    Rank unordered pairs of vocabulary words as two-word "definitions" of
    `word`, skipping candidates that equal the target, share its first three
    letters, or are listed in `removed`.

    Args:
        word: str   -> Single word to define
        top_n: int  -> Number of best solutions to return. Default 10.
    Return:
        A list of up to top_n entries ((w1, w2), score), ordered by descending
        cosine similarity between the summed pair embedding and the target.
    """
    def _skip(candidate):
        # the target itself, words with the same 3-letter prefix, and
        # filtered-out plural forms are not eligible
        return (
            candidate == word
            or candidate[:3] == word[:3]
            or candidate in removed
        )

    target = positive([word])
    scored = []
    for i, first in enumerate(tqdm(vocab)):
        if _skip(first):
            continue
        # only look at later words so each unordered pair is scored once
        for second in vocab[i + 1:]:
            if _skip(second):
                continue
            pair_vector = positive([first, second])
            similarity = cosine_similarity(target, pair_vector)[0][0]
            scored.append(((first, second), similarity))

    ranked = sorted(scored, key=lambda entry: entry[1], reverse=True)
    return ranked[:top_n]


## Revised Tests

In [17]:
revised('wife')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('spouse', 'woman'), 0.9490044420671463),
 (('mom', 'spouse'), 0.9451272979217846),
 (('girlfriend', 'spouse'), 0.9431745566021604),
 (('mother', 'spouse'), 0.9422091569360935),
 (('husband', 'woman'), 0.941394252063441),
 (('daughter', 'spouse'), 0.940908311622211),
 (('she', 'spouse'), 0.9407795707978326),
 (('her', 'spouse'), 0.9391922179783434),
 (('female', 'husband'), 0.9390003756779074),
 (('female', 'spouse'), 0.9389518874523621)]

In [12]:
revised('knowledge')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('information', 'wisdom'), 0.9344556983307687),
 (('educated', 'information'), 0.9301229403506701),
 (('expertise', 'information'), 0.9282263377748319),
 (('information', 'learn'), 0.9270306287947265),
 (('information', 'skill'), 0.9249491419435476),
 (('education', 'information'), 0.9233592957319934),
 (('information', 'intelligence'), 0.9231884112172934),
 (('experience', 'information'), 0.919848501273665),
 (('intelligence', 'learn'), 0.9195736608320553),
 (('learn', 'wisdom'), 0.9195502197152241)]

In [13]:
revised('freedom')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('independence', 'liberty'), 0.9460416201754811),
 (('autonomy', 'liberty'), 0.9360640535980851),
 (('independence', 'liberation'), 0.9355306931804426),
 (('liberation', 'liberty'), 0.9327702330582146),
 (('independent', 'liberty'), 0.9309475506521084),
 (('liberty', 'peace'), 0.9292784041687647),
 (('equality', 'liberty'), 0.9260784859588916),
 (('autonomy', 'liberation'), 0.9257240773233195),
 (('flying', 'liberty'), 0.9256740391137868),
 (('democracy', 'liberty'), 0.9252529459369601)]

In [14]:
revised('game')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('match', 'play'), 0.9299215398368388),
 (('movie', 'play'), 0.9244395739026691),
 (('play', 'season'), 0.9226037229848301),
 (('play', 'video'), 0.9225954325462797),
 (('league', 'play'), 0.9208120324557445),
 (('battle', 'play'), 0.9201056850323821),
 (('card', 'play'), 0.9196154107586885),
 (('play', 'war'), 0.9192219379670766),
 (('play', 'system'), 0.919198622954665),
 (('play', 'series'), 0.9187261811046574)]

In [15]:
revised('blue')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('purple', 'sky'), 0.9170853147859979),
 (('green', 'sky'), 0.9147051675658069),
 (('green', 'purple'), 0.9144565596515887),
 (('purple', 'red'), 0.9133029393295535),
 (('red', 'sky'), 0.9113895169353857),
 (('green', 'pink'), 0.9113040744755146),
 (('purple', 'sea'), 0.9108040793109244),
 (('purple', 'white'), 0.9105541976880726),
 (('green', 'red'), 0.9100885187760515),
 (('bay', 'purple'), 0.908963074207327)]

In [16]:
revised('king')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('master', 'royal'), 0.9274129774057194),
 (('leader', 'royal'), 0.925168403450632),
 (('prince', 'queen'), 0.9245663704502171),
 (('father', 'royal'), 0.9241330645930963),
 (('prince', 'reign'), 0.9226733705712508),
 (('chief', 'royal'), 0.9221475518084659),
 (('president', 'royal'), 0.9219856900322241),
 (('prince', 'royal'), 0.9217847390383622),
 (('master', 'prince'), 0.9213879036365092),
 (('prince', 'ruling'), 0.9212197841987075)]