# For getting Naive definitions with Curie
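Brute-force search for the pair of vocabulary words whose summed GPT-3 Curie embeddings are closest, by cosine similarity, to the embedding of a target word.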

In [1]:
import os
import openai
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# The API key comes from the OPENAI_API_KEY environment variable (set in the conda env).
openai.api_key = os.environ.get('OPENAI_API_KEY')

## Load Model Cache
Just GPT-3 Curie this time :)

In [2]:
# Read the vocabulary: one word per line, surrounding whitespace removed.

vocab = []
with open('./expanded_vocab.txt', 'r') as vocab_file:
    vocab.extend(entry.strip() for entry in vocab_file)

# Notebook display: number of vocabulary words loaded.
len(vocab)

5124

In [3]:
# Read the saved GPT-3 Curie embeddings: each line holds one vector as
# whitespace-separated floats.
gpt_embeds = []

with open('./gpt/gpt_curie.txt', 'r') as embed_file:
    for row in embed_file:
        gpt_embeds.append(list(map(float, row.strip().split())))

# Pair each vocabulary word with the embedding at the same position.
model_gpt = {term: vector for term, vector in zip(vocab, gpt_embeds)}

In [4]:
# for getting new embeddings from OpenAI
def gpt_embed(text, engine='text-similarity-curie-001'):
    """
    Request an embedding for `text` from the OpenAI API.

    Args:
        text: str   -> Text to embed; newlines are replaced with spaces first.
        engine: str -> Name of the OpenAI embedding engine to query.
    Returns:
        The 'embedding' entry of the first item in the response's 'data' list.
    """
    flattened = text.replace('\n', ' ')
    response = openai.Embedding.create(input=[flattened], engine=engine)
    first_item = response['data'][0]
    return first_item['embedding']

## Helpers
`positive` (sum of word embeddings) and the naive definition search built on top of it.

In [5]:
def positive(words):
    """
    Sum the GPT embeddings of the given words.

    Each token is converted with ``str`` and looked up in the ``model_gpt``
    cache. Tokens missing from the cache are embedded through ``gpt_embed``
    and stored in ``model_gpt`` so later calls reuse the result.

    Args:
        words: iterable of tokens. A bare string is still accepted, but it is
            iterated character by character, so a hint is printed.
    Returns:
        Positive (summed vectors) of word embeddings of a given list of words,
        as a NumPy array reshaped to a single row, or an empty array when
        ``words`` yields no tokens.
    """
    if isinstance(words, str):
        print(f"You requested the positive of the string \"{words}\". Did you mean [\"{words}\"]?")

    out = None
    for token in words:
        # convert token to string
        word = str(token)
        # Look in the embedding cache itself rather than in ``vocab``:
        # API results are stored in ``model_gpt``, so checking ``vocab``
        # would re-query the API for the same out-of-vocabulary word on
        # every call (and scan a list instead of hashing into a dict).
        if word in model_gpt:
            ex = model_gpt[word]  # ex for "extracted"
        # if not found, query API and remember the answer
        else:
            ex = gpt_embed(word)
            model_gpt[word] = ex

        # np.array copies the cached list, so the in-place add below never
        # modifies the stored embedding.
        row = np.array(ex).reshape(1, -1)

        # construct positive
        if out is None:
            out = row
        else:
            out += row

    return out if out is not None else np.array([])

In [6]:
from tqdm.notebook import tqdm
def naive_def(word):
    """
    Find the pair of vocabulary words whose summed embedding is closest to a word.

    Every pair drawn from ``vocab`` (excluding ``word`` itself, and allowing a
    word to be paired with itself) is scored by the cosine similarity between
    the target embedding and the pair's summed embedding.

    Args:
        word: str   -> Single word to define
    Returns:
        A tuple ``(best, best_score)``. ``best`` is the pair of words that,
        when summed, are closest to the target, not including the target.
        ``best_score`` is the value returned by ``cosine_similarity`` for that
        pair. If no candidate pair exists, ``((None, None), -1)`` is returned.
    """
    best = (None, None)
    best_score = -1

    target = positive([word])
    for i, w1 in enumerate(tqdm(vocab)):
        if w1 == word:
            continue
        # The sum of two embeddings does not depend on their order, so
        # (w2, w1) scores exactly the same as (w1, w2). Starting the inner
        # scan at position i skips those mirrored pairs; because only a
        # strictly greater score replaces the best, the winner is unchanged.
        for w2 in vocab[i:]:
            if w2 == word:
                continue
            score = cosine_similarity(target, positive([w1, w2]))
            if score > best_score:
                best = (w1, w2)
                best_score = score

    return (best, best_score)

## Tests

In [11]:
naive_def('puppy')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('dog', 'kitten'), array([[0.93234583]]))

In [12]:
naive_def('king')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('kingdom', 'royal'), array([[0.94391248]]))

In [7]:
naive_def('duckling')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('duck', 'youngster'), array([[0.92606453]]))

In [8]:
naive_def('colt')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('colony', 'gun'), array([[0.88549887]]))

In [9]:
naive_def('filly')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('horse', 'pink'), array([[0.87398292]]))

In [10]:
naive_def('foal')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('horse', 'puppy'), array([[0.90931456]]))

In [11]:
naive_def('husband')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('husbands', 'spouse'), array([[0.95122498]]))

In [12]:
naive_def('wife')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('spouse', 'wives'), array([[0.95503364]]))

In [13]:
naive_def('democracy')

  0%|          | 0/5124 [00:00<?, ?it/s]

(('democratic', 'politics'), array([[0.95029931]]))