# Investigating GPT-3 embeddings
* Load data from text files
* Assemble data structures
* Define helper functions
* Investigate vectors

In [2]:
import os
import openai
import numpy as np
from openai.embeddings_utils import get_embedding       # requires plotly
from sklearn.metrics.pairwise import cosine_similarity

# The API key is read from the OPENAI_API_KEY environment variable (set in the
# conda environment); os.getenv returns None when the variable is not defined.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Short model tag -> OpenAI engine name, used as eng['ada'] / eng['bab']
# space for other constants
eng = dict(
    [
        ('ada', 'text-similarity-ada-001'),
        ('bab', 'text-similarity-babbage-001'),
    ]
)

# The vocabulary file holds one entry per line; surrounding whitespace is stripped.
with open(u'/gpfs/fs1/home/mbarlow6/Desktop/Conceptual-Analysis/barlow/valid_vocab.txt', 'r') as f:
    vocab = [w.strip() for w in f]

### Model Loading

In [4]:
# Ada embeds: 1024 dims
# Every line of the file is one embedding written as whitespace-separated floats.
with open(u'/gpfs/fs1/home/mbarlow6/Desktop/Conceptual-Analysis/barlow/gpt/gpt_ada.txt', 'r') as f:
    ada_embeds = [list(map(float, line.split())) for line in f]

# Rows are paired with vocabulary entries purely by position.
model_ada = {term: embedding for term, embedding in zip(vocab, ada_embeds)}

In [5]:
# Babbage embeds: 2048 dims
# Every line of the file is one embedding written as whitespace-separated floats.
with open(u'/gpfs/fs1/home/mbarlow6/Desktop/Conceptual-Analysis/barlow/gpt/gpt_babbage.txt', 'r') as f:
    bab_embeds = [list(map(float, line.split())) for line in f]

# Rows are paired with vocabulary entries purely by position.
model_bab = {term: embedding for term, embedding in zip(vocab, bab_embeds)}

### Helper Functions
* definition -> most_similar, positive
* calculate_similarity ((a + b) -> t)
* sum_complexity
* above_zero_complexity

In [6]:
def positive(words, model='ada'):
    """
    Sum the embeddings of a list of words (element-wise vector addition).

    args:
        words: list[str] -> non-empty list of words present in the chosen model
        model: str -> 'ada' or 'bab' (case-insensitive)
    returns:
        ndarray of shape (1, dims) holding the summed embedding
    raises:
        ValueError if model is not 'ada'/'bab' or if words is empty
        KeyError if a word has no embedding in the chosen model
    """
    if model.lower() == 'ada':
        mdl = model_ada
    elif model.lower() == 'bab':
        mdl = model_bab
    else:
        raise ValueError('Model selection must be \'ada\' or \'bab\'')

    if len(words) == 0:
        raise ValueError('words must contain at least one word')

    # Accumulate into a fresh float array: the stored embeddings are never
    # mutated, and integer-valued rows cannot break the in-place addition.
    res = np.array(mdl[words[0]], dtype=float).reshape(1, -1)
    for x in words[1:]:
        res += np.array(mdl[x], dtype=float).reshape(1, -1)
    return res

In [7]:
def most_similar(word, phrase=False, top_n=10, model='ada', skip=None):
    """
    Rank the words of a model by cosine similarity to a word or a raw vector.

    args:
        word: str | ndarray | list[float] -> a word of the chosen model, or an
            embedding vector such as the output of positive()
        phrase: bool -> kept for backward compatibility; callers pass True when
            `word` is a vector. A string query is always looked up in the model.
        top_n: int -> number of (term, similarity) pairs to return
        model: str -> 'ada' or 'bab' (case-insensitive)
        skip: int | None -> number of top-ranked results dropped before taking
            top_n. None (default) means 0 for a string query, because the query
            word itself is already excluded, and 2 for a vector query, which
            keeps the previous behaviour (presumably meant to drop the two
            words summed by definition() -- TODO confirm).
    returns:
        list of (term, similarity) tuples sorted by descending similarity; each
        similarity is the 1x1 array returned by cosine_similarity
    raises:
        ValueError if model is not 'ada' or 'bab'
        KeyError if a string query is not in the chosen model
    """
    if model.lower() == 'ada':
        mdl = model_ada
    elif model.lower() == 'bab':
        mdl = model_bab
    else:
        raise ValueError('Model selection must be \'ada\' or \'bab\'')

    is_word = isinstance(word, str)
    if is_word:
        query = np.array(mdl[word]).reshape(1, -1)
    else:
        query = np.asarray(word)
        if query.ndim == 1:
            # a flat embedding is treated as a single (1, dims) sample
            query = query.reshape(1, -1)

    if skip is None:
        skip = 0 if is_word else 2

    similarities = {}
    # Iterate the selected model itself so every ranked term has an embedding.
    for term, embedding in mdl.items():
        candidate = np.array(embedding).reshape(1, -1)
        if is_word:
            if term == word:
                continue
        elif not candidate.any():
            # all-zero rows are left out of vector queries
            continue
        similarities[term] = cosine_similarity(query, candidate)

    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return ranked[skip:skip + top_n]

In [8]:
def definition(word1, word2, top_n=10, model='ada'):
    """
    Return the top_n words most similar to the sum of two word vectors.

    args:
        word1, word2: str -> words whose embeddings are summed
        top_n: int -> number of results to return
        model: str -> 'ada' or 'bab' (case-insensitive)
    returns:
        whatever most_similar() returns for the summed vector
    raises:
        ValueError if model is not 'ada' or 'bab' (raised by positive())
    """
    # The sum must be built from the same model that it is ranked against;
    # otherwise a 'bab' query would compare ada vectors with babbage vectors.
    combined = positive([word1, word2], model)
    return most_similar(combined, True, top_n, model)

In [9]:
def calculate_similarity(words, target, model='ada'):
    """
    Cosine similarity between the summed embedding of several words and a
    single target word.

    args:
        words: list[str] -> words whose embeddings are summed via positive()
        target: str -> word the sum is compared against
        model: str -> 'ada' or 'bab' (case-insensitive)
    returns:
        the [0][0] entry of cosine_similarity(summed words, target vector)
    raises:
        ValueError if model is not 'ada' or 'bab'
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    mdl = model_ada if choice == 'ada' else model_bab

    # phrase vector first, then the target row it is scored against
    combined = positive(words, model)
    target_row = np.array(mdl[target]).reshape(1, -1)

    return cosine_similarity(combined, target_row)[0][0]

In [10]:
def calculate_connect(a, b, model='ada'):
    """
    Given two word strings or feature vectors, calculate their cosine similarity.

    args:
        a, b: str | list[float] | ndarray -> a word of the chosen model, or an
            embedding given either flat (dims,) or as a (1, dims) row
        model: str -> 'ada' or 'bab' (case-insensitive); only used to look up
            string arguments
    returns:
        the [0][0] entry of cosine_similarity(a, b)
    raises:
        ValueError if model is not 'ada' or 'bab'
        KeyError if a string argument is not in the chosen model
    """
    if model.lower() == 'ada':
        mdl = model_ada
    elif model.lower() == 'bab':
        mdl = model_bab
    else:
        raise ValueError('Model selection must be \'ada\' or \'bab\'')

    def _as_row(item):
        # Turn a word or a vector into the 2-D form cosine_similarity is given.
        if isinstance(item, str):
            return np.array(mdl[item]).reshape(1, -1)
        vec = np.asarray(item)
        # A flat embedding becomes one (1, dims) sample; 2-D input is untouched.
        return vec.reshape(1, -1) if vec.ndim == 1 else vec

    return cosine_similarity(_as_row(a), _as_row(b))[0][0]

In [11]:
def sum_complexity(word, model='ada'):
    """
    Add up the absolute values of every dimension of a word's embedding.

    args:
        word: str -> word whose embedding is looked up
        model: str -> 'ada' or 'bab' (case-insensitive)
    returns:
        sum of abs(value) over all dimensions of the embedding
    raises:
        ValueError if model is not 'ada' or 'bab'
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    mdl = model_ada if choice == 'ada' else model_bab

    # should we keep absolute value here?
    total = 0
    for component in mdl[word]:
        total = total + abs(float(component))
    return total

In [12]:
def above_zero_complexity(word, threshold=0.05, model='ada'):
    """
    Fraction of a word's embedding dimensions whose absolute value exceeds a threshold.

    args:
        word: str -> word whose embedding is looked up
        threshold: float -> a dimension counts only when abs(value) > threshold
        model: str -> 'ada' or 'bab' (case-insensitive)
    returns:
        (number of dimensions above the threshold) / (total number of dimensions)
    raises:
        ValueError if model is not 'ada' or 'bab'
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    mdl = model_ada if choice == 'ada' else model_bab

    emb = mdl[word]
    kept = sum(1 for component in emb if abs(float(component)) > threshold)
    return kept / len(emb)

### Vector investigation

What's going on? My initial thought is that the angle of a vector (embedding) points us towards its component ideas' "meaning space", and its magnitude reveals something about the strength of connection to that meaning space. Higher magnitudes could mean that a vector is more "embedded" within a certain feature space, which lends validity to single word embeddings having a higher `sum_complexity` than paragraphs with many ideas.

Cosine similarity only scores based on angle, not magnitude, which allows us to easily use positives to approximate combined meaning because we aren't worried about the magnitude of those positives. What if both are important? If complex ideas reside closer to 0 and this is because of their inherent complexity, then we must consider those values significant as well. The same argument generalizes to absolute value.

If "apple" represents an idea space, then I would expect a short sentence all about apples to have a strong angle similarity to single word vectors related to apples (fruit, round, etc) and a story about apple picking to have strong angle similarity but small magnitude due to the influence of other major ideas in the story (context).

I will begin to test such ideas below.

In [13]:
# Request embeddings for three two-word phrases, using the engine stored under eng['ada'].
wh = get_embedding('white house', eng['ada'])
bd = get_embedding('baby duck', eng['ada'])
um = get_embedding('unmarried man', eng['ada'])

# One sentence that mentions ideas related to all three phrases above.
hw = get_embedding('Hello, world! It is I, the bachelor duckling, come to lay seige to the White House.', eng['ada'])

# A full paragraph of text, embedded with the same engine.
# NOTE(review): the literal below contains backslash sequences. "\%" and "\o" are
# not recognised Python escapes, so the backslash stays in the embedded text, while
# "\a" IS an escape (the BEL control character), so "\are" is embedded as BEL + "re".
# Confirm whether plain "%" was intended.
complaint = get_embedding("These are giving me a headache... It seems like the threshold I choose is statistical, not meaningful. (Are they the same in a model?) Each threshold I choose leaves VERY little space in variation of 'above zero complexity' - when I choose based on quartile all terms score about 0.4. As that term moves closer to 1, we include less and less points, and the complexity scores move with the percentage of data that is being preserved. Bottom 20\% \of numbers are below -0.0218967255204916, and top 20\% \are above 0.024340828508138665, so something like threshold 0.021 captures 40\% \of the data, nearly every time, no matter the word. This doesn't feel conducive to our goal.", eng['ada'])

In [16]:
# Find the vocabulary word whose ada embedding has the smallest Euclidean norm.
save, S = "", np.inf
for word in vocab:
    squared = np.square(np.array(model_ada[word]))
    x = np.sqrt(sum(squared))
    if not x < S:
        continue
    # strictly smaller than anything seen so far
    save, S = word, x

print(save, S)

slice 0.9999998985397428


These vectors are already normalized. They all have magnitude of ~1.

The angle holds the meaning, so they give us the angle with all of the magnitude removed (or it never had any?). Either way, I want to know exactly where these vectors come from.

If I follow Tiyanyi's BERT extraction, we are extracting a tensor from the final hidden layer (this tracks) and "collapsing" it to one dimension. I MUST check next if the BERT and GloVe embeddings are unit vectors, or if that is being purposefully removed from my GPT-3 numbers.

In [17]:
# Per-dimension extremes over the ada embeddings of the whole vocabulary.
# Each entry is [largest value seen, smallest value seen] for that dimension.
stats = [[-np.inf, np.inf] for _ in range(1024)]
for word in vocab:
    sav = model_ada[word]
    for i, value in enumerate(sav):
        highest, lowest = stats[i]
        stats[i] = [max(value, highest), min(value, lowest)]
stats

[[0.0473531112074852, -0.04032731428742409],
 [0.05639132112264633, -0.028183234855532646],
 [0.03956960514187813, -0.04298396408557892],
 [0.06012345477938652, -0.02069728821516037],
 [0.06450548768043518, -0.03868637979030609],
 [0.05219966918230057, -0.04610151797533035],
 [0.06442626565694809, -0.024008892476558685],
 [0.04438881203532219, -0.048750393092632294],
 [0.09445475041866302, -0.02430049143731594],
 [0.04768644645810127, -0.04318663850426674],
 [0.043608617037534714, -0.03820762783288956],
 [0.027181409299373627, -0.05178345739841461],
 [0.0590951181948185, -0.021535072475671768],
 [0.09949833154678345, -0.09259007126092911],
 [0.03829724341630936, -0.04950854554772377],
 [-0.002116778399795294, -0.09097554534673691],
 [0.025195768103003502, -0.0707140639424324],
 [0.04256536439061165, -0.056694213300943375],
 [0.027609601616859436, -0.022307757288217545],
 [0.04758970066905022, -0.0455743744969368],
 [0.0634385421872139, -0.021064145490527153],
 [0.04106929898262024, -0.

In [21]:
# For every vocabulary word, rank all OTHER words by cosine similarity to the
# word's negated ada embedding and keep the 12 highest-scoring entries.

# Wrap and reshape each embedding once, instead of once per (word, term) pair.
rows = {term: np.array(model_ada[term]).reshape(1, -1) for term in vocab}

out = {}
for word in vocab:
    negated = -rows[word]
    # vocab entries are strings read from the vocabulary file, so the former
    # non-string fallback branch could never run and has been removed.
    similarities = {
        term: cosine_similarity(negated, rows[term])
        for term in vocab
        if term != word
    }
    # each value is the 1x1 array returned by cosine_similarity
    out[word] = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:12]
out

{'pop': [('desert', array([[-0.72404487]])),
  ('if', array([[-0.72592409]])),
  ('anything', array([[-0.74276688]])),
  ('neighbourhood', array([[-0.74334324]])),
  ('counter', array([[-0.74378124]])),
  ('trading', array([[-0.7509998]])),
  ('fourth', array([[-0.75196933]])),
  ('shy', array([[-0.75453149]])),
  ('nicely', array([[-0.75456093]])),
  ('full', array([[-0.75486021]])),
  ('opposition', array([[-0.75492814]])),
  ('conference', array([[-0.7552774]]))],
 'data': [('if', array([[-0.70459939]])),
  ('desert', array([[-0.70670718]])),
  ('trading', array([[-0.71099037]])),
  ('counter', array([[-0.71319908]])),
  ('neighbourhood', array([[-0.72607206]])),
  ('fourth', array([[-0.72686721]])),
  ('nephew', array([[-0.73218959]])),
  ('anything', array([[-0.73219735]])),
  ('pleasure', array([[-0.73308749]])),
  ('pale', array([[-0.73312825]])),
  ('origin', array([[-0.73334271]])),
  ('smoothly', array([[-0.73363183]]))],
 'expect': [('convenient', array([[-0.73275913]])),
  

In [25]:
# Among each word's top-ranked entry in `out`, find the word with the highest score.
a, b = "", -np.inf
for word in vocab:
    # out[word][0] is the best (term, 1x1 array) pair; [1][0][0] unwraps the score
    x = out[word][0][1][0][0]
    if not x > b:
        continue
    a, b = word, x
print(a, b)

trading -0.6528102459499042


In [30]:
# Rank the vocabulary against the negated ada embedding of 'protection'.
negated_protection = np.negative(np.array(model_ada['protection']).reshape(1, -1))
most_similar(negated_protection, phrase=True)

[('moment', array([[-0.67760979]])),
 ('nicely', array([[-0.68022783]])),
 ('public', array([[-0.68539278]])),
 ('full', array([[-0.68922365]])),
 ('join', array([[-0.68932917]])),
 ('criticism', array([[-0.68993263]])),
 ('rank', array([[-0.69157826]])),
 ('desert', array([[-0.69966284]])),
 ('hire', array([[-0.69969428]])),
 ('position', array([[-0.70003039]]))]