# For getting naive definitions with Curie
Includes a fully explored N^2 naive algorithm that returns the best pairing that 'defines' a single word. The revised attempt looks at fewer pairings and uses a revised vocabulary that avoids plurals, in an effort to get a clean top 10 with no orthographic influences or self-referential definitions.

In [2]:
import os
import openai
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

# The API key is read from the OPENAI_API_KEY environment variable (set in the
# conda env); os.getenv yields None when that variable is not defined.
openai.api_key = os.getenv('OPENAI_API_KEY')

## Load Model Cache
Just GPT-3 Curie this time :)

In [3]:
# Read the vocabulary file: one entry per line, with surrounding whitespace
# (including the trailing newline) stripped from each entry.
vocab = []
with open('./vocab/expanded_vocab.txt', 'r') as f:
    vocab.extend(entry.strip() for entry in f)

# Notebook display: number of vocabulary entries read.
len(vocab)

5124

In [4]:
# Read the cached embedding vectors from disk: every line of the file is one
# vector, written as whitespace-separated floats.
gpt_embeds = []

with open('./gpt/gpt_davinci.txt', 'r') as f:
    gpt_embeds.extend(list(map(float, row.split())) for row in f)

# Pair the i-th vocabulary word with the i-th vector read above.
# NOTE(review): zip stops at the shorter of the two sequences — confirm the
# embeddings file has exactly one row per vocabulary word.
model_gpt = dict(zip(vocab, gpt_embeds))

In [5]:
# for getting new embeddings from OpenAI
def gpt_embed(text, engine='text-similarity-davinci-001'):
    """
    Request an embedding for `text` from the OpenAI Embedding endpoint.

    Args:
        text: str    -> Text to embed; newlines are replaced by spaces first.
        engine: str  -> Engine name passed through to the API call.
    Returns:
        The 'embedding' field of the first item in the response's 'data' list.
    """
    single_line = text.replace('\n', ' ')
    response = openai.Embedding.create(input=[single_line], engine=engine)
    first_item = response['data'][0]
    return first_item['embedding']

## Helpers
Positive and new algorithm(s).

In [6]:
def positive(words):
    """
    Sum the embeddings of the given words into a single "positive" vector.

    Every token is converted to ``str`` and looked up in the ``model_gpt``
    cache. A token that is not cached yet is embedded with ``gpt_embed`` and
    stored in ``model_gpt``, so later calls reuse it instead of querying the
    API again.

    Args:
        words: iterable of tokens. Passing a bare string prints a warning and
            then iterates over its characters.
    Returns:
        Positive (summed vectors) of the word embeddings, as an array of
        shape (1, dim). An empty array when `words` yields no tokens.
    """
    if isinstance(words, str):
        print(f"You requested the positive of the string \"{words}\". Did you mean [\"{words}\"]?")

    out = None
    for token in words:
        # convert token to string
        word = str(token)
        # Look the word up in the cache itself rather than in `vocab`:
        # embeddings fetched from the API are stored in `model_gpt`, so
        # checking `vocab` would never find them again (and is a linear scan).
        try:
            ex = model_gpt[word]  # ex for "extracted"
        except KeyError:
            # not cached yet: query the API once and remember the result
            ex = gpt_embed(word)
            model_gpt[word] = ex

        # np.array copies its input, so the in-place addition below never
        # modifies the vectors stored in the cache.
        vec = np.array(ex).reshape(1, -1)
        if out is None:
            out = vec
        else:
            out += vec

    return out if out is not None else np.array([])

In [7]:
def naive_def(word):
    """
    Exhaustively search ordered vocabulary pairs for the best 2-word "definition".

    Args:
        word: str   -> Single word to define
    Returns:
        ``(pair, score)``. ``pair`` is the ``(w1, w2)`` tuple of vocabulary
        words, neither equal to `word`, whose summed embedding scored highest
        against the embedding of `word`. ``score`` is the value returned by
        ``cosine_similarity`` for that pair, passed back exactly as returned.
        When no pair beats the initial score the result is
        ``((None, None), -1)``.
    """
    target = positive([word])
    top_pair, top_score = (None, None), -1

    for idx in tqdm(range(len(vocab))):
        first = vocab[idx]
        if first == word:
            continue
        for second in vocab:
            if second == word:
                continue
            similarity = cosine_similarity(target, positive([first, second]))
            # strict '>' means the earliest pair wins when scores tie
            if similarity > top_score:
                top_pair, top_score = (first, second), similarity

    return (top_pair, top_score)

## Tests

`naive_def` calls are too expensive to re-run (estimated 10 hours with Davinci).

In [None]:
naive_def('puppy')

# Revising our approach
What if we want to filter for plurals and orthographic similarity?

In [8]:
import inflect

# One inflect engine instance, reused by the preprocessing cell below.
en = inflect.engine()
# Shorthand for the engine's pluralizing method.
plural = en.plural

In [9]:
# Preprocess the vocabulary: collect plural forms that appear in the
# vocabulary alongside the word they were generated from.
helper = set(vocab)
removed = set()
for word in vocab:
    # naively treat every entry as a singular and build its plural form
    p = plural(word, count=2)
    comp = en.compare(word, p)
    # Skip entries whose comparison result starts with 'p'.
    # NOTE(review): presumably a leading 'p' means inflect regards `word`
    # itself as a plural — confirm against inflect's compare() documentation.
    if str(comp)[0] == 'p':
        continue
    # Skip when pluralizing changed nothing, or when the plural form is not
    # (or is no longer) in the working set.
    if p == word or p not in helper:
        continue
    helper.remove(p)
    removed.add(p)

# a few words I want to keep
removed.difference_update(
    {'her', 'his', 'media', 'offspring', 'our', 'some', 'them', 'us'}
)

len(removed)

43

In [10]:
def revised(word, top_n=10):
    """
    Rank unordered vocabulary pairs by how well their summed embedding matches `word`.

    A vocabulary entry is excluded from every pair when it equals `word`,
    shares its first three characters with `word`, or is in ``removed``.
    Each remaining pair ``(vocab[i], vocab[j])`` with ``i < j`` is scored once
    with ``cosine_similarity`` against the embedding of `word`.

    Args:
        word: str   -> Single word to define
        top_n: int  -> Number of best solutions to return. Default 10.
    Return:
        Up to N ``((w1, w2), score)`` entries, ordered by descending
        similarity to `word`. Pairs with equal scores keep the order in which
        they were generated.
    """
    # Function-scope import keeps this cell self-contained.
    import heapq

    target = positive([word])
    prefix = word[:3]

    # Decide eligibility once per vocabulary entry instead of once per pair.
    eligible = [
        w != word and w[:3] != prefix and w not in removed
        for w in vocab
    ]

    def scored_pairs():
        # Lazily yield ((w1, w2), score) so the full set of pairs is never
        # held in memory at once.
        for i in tqdm(range(len(vocab))):
            if not eligible[i]:
                continue
            w1 = vocab[i]
            for j in range(i + 1, len(vocab)):
                if not eligible[j]:
                    continue
                w2 = vocab[j]
                p = positive([w1, w2])
                yield (w1, w2), cosine_similarity(target, p)[0][0]

    # nlargest gives the same result as sorting everything in descending
    # order and slicing the first top_n entries, but only keeps top_n
    # candidates alive while the pairs stream through.
    return heapq.nlargest(top_n, scored_pairs(), key=lambda entry: entry[1])


## Revised Tests

These do not take as long - 5 or so hours is expected.

In [11]:
revised('wife')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('husband', 'woman'), 0.9508569745697908),
 (('spouse', 'woman'), 0.9495616430191918),
 (('husband', 'spouse'), 0.9483833238064329),
 (('husband', 'mother'), 0.9430669814364032),
 (('husband', 'widow'), 0.9422477918707295),
 (('mother', 'spouse'), 0.9422354672522202),
 (('husband', 'lady'), 0.941258211067767),
 (('girlfriend', 'husband'), 0.9407533014188159),
 (('female', 'husband'), 0.9407266822454023),
 (('girlfriend', 'spouse'), 0.9382742871773593)]

In [12]:
revised('knowledge')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('information', 'wisdom'), 0.9403342342072523),
 (('information', 'learn'), 0.9350064193175482),
 (('educated', 'information'), 0.9344299845410369),
 (('information', 'intellectual'), 0.9309315443680511),
 (('education', 'information'), 0.9307278820220717),
 (('awareness', 'information'), 0.9307015034571826),
 (('information', 'scholar'), 0.9306620303940811),
 (('information', 'science'), 0.9305924314684961),
 (('information', 'intelligence'), 0.9300627381165681),
 (('information', 'skill'), 0.9293827675453067)]

In [13]:
revised('freedom')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('independence', 'liberty'), 0.9471041375203225),
 (('independence', 'liberation'), 0.9449252796240266),
 (('liberation', 'liberty'), 0.9407391445453362),
 (('escape', 'liberty'), 0.9402727088282272),
 (('independent', 'liberty'), 0.9388423946595881),
 (('independent', 'liberation'), 0.9378218136121038),
 (('choice', 'liberation'), 0.9369603627095202),
 (('autonomy', 'liberty'), 0.9364636709347575),
 (('liberty', 'release'), 0.9361554940410225),
 (('choice', 'liberty'), 0.9354926776655279)]

In [11]:
revised('game')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('league', 'play'), 0.9373357770650107),
 (('play', 'team'), 0.9367463606994575),
 (('match', 'player'), 0.9348914278870476),
 (('league', 'player'), 0.9341582798038441),
 (('play', 'player'), 0.9335779108745395),
 (('player', 'team'), 0.9318594819907428),
 (('play', 'tournament'), 0.9313146776902258),
 (('player', 'war'), 0.9309509363047301),
 (('match', 'play'), 0.9303957675266781),
 (('battle', 'player'), 0.9302399414236945)]

In [12]:
revised('blue')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('green', 'purple'), 0.9172280997456148),
 (('green', 'silver'), 0.916946225236841),
 (('purple', 'sky'), 0.9162274410273579),
 (('green', 'sky'), 0.9151894648560139),
 (('green', 'grey'), 0.9150919206251926),
 (('purple', 'white'), 0.9146052188664905),
 (('black', 'yellow'), 0.9143422438375535),
 (('black', 'green'), 0.9143011105653037),
 (('green', 'navy'), 0.913759303124728),
 (('navy', 'yellow'), 0.9135725244694811)]

In [13]:
revised('king')

  0%|          | 0/5124 [00:00<?, ?it/s]

[(('god', 'royal'), 0.9399845404515973),
 (('crown', 'god'), 0.9360121863710793),
 (('god', 'queen'), 0.9360081006340474),
 (('god', 'ruling'), 0.9345334313453474),
 (('lad', 'royal'), 0.9337056137410812),
 (('master', 'royal'), 0.9334969644136822),
 (('he', 'royal'), 0.932993474817868),
 (('royal', 'ward'), 0.9329218687061671),
 (('leader', 'royal'), 0.9328377258280323),
 (('royal', 'son'), 0.9326452684950428)]