# Investigating GPT-3 embeddings
* Load data from text files
* Assemble data structures
* Define helper functions
* Investigate vectors

In [38]:
import os
import openai
import numpy as np
from openai.embeddings_utils import get_embedding
from sklearn.metrics.pairwise import cosine_similarity

# The API key is taken from the OPENAI_API_KEY environment variable (set in the
# conda environment); the lookup yields None when the variable is not defined.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [39]:
# Short aliases -> full model names, e.g. eng['ada'].
# Space for other constants.
eng = {
    'ada': 'text-similarity-ada-001',
    'bab': 'text-similarity-babbage-001',
}

# Vocabulary file: one term per line; surrounding whitespace is stripped.
vocab = []
with open(u'/gpfs/fs1/home/mbarlow6/Desktop/Conceptual-Analysis/barlow/valid_vocab.txt', 'r') as f:
    vocab.extend(term.strip() for term in f)

### Model Loading

In [40]:
# Ada embeds: 1024 dims
# Every line of the file is one vector written as whitespace-separated floats.
ada_embeds = []
with open(u'/gpfs/fs1/home/mbarlow6/Desktop/Conceptual-Analysis/barlow/gpt/gpt_ada.txt', 'r') as f:
    ada_embeds.extend([float(token) for token in row.split()] for row in f)

# Pair the i-th vocabulary term with the i-th vector. zip stops at the shorter
# input, so surplus terms or vectors are dropped without warning.
model_ada = {term: vector for term, vector in zip(vocab, ada_embeds)}

In [41]:
# Babbage embeds: 2048 dims
# Every line of the file is one vector written as whitespace-separated floats.
bab_embeds = []
with open(u'/gpfs/fs1/home/mbarlow6/Desktop/Conceptual-Analysis/barlow/gpt/gpt_babbage.txt', 'r') as f:
    bab_embeds.extend([float(token) for token in row.split()] for row in f)

# Pair the i-th vocabulary term with the i-th vector. zip stops at the shorter
# input, so surplus terms or vectors are dropped without warning.
model_bab = {term: vector for term, vector in zip(vocab, bab_embeds)}

### Helper Functions
* definition -> most_similar, positive
* calculate_similarity ((a + b) -> t)
* sum_complexity
* above_zero_complexity

In [42]:
def positive(words, model='ada'):
    """
    Add the embedding vectors of several words together, element by element.

    words: list[str] -> terms to combine (at least one)
    model: str -> ['ada', 'bab'], matched case-insensitively
    returns: ndarray of shape (1, dims) holding the summed embedding
    raises:
        ValueError -> model is neither 'ada' nor 'bab'
        IndexError -> words is empty
        KeyError -> a word is missing from the selected model
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    embeddings = model_ada if choice == 'ada' else model_bab

    # Echo the inputs so notebook output shows which words were combined.
    print(words)

    first, remaining = words[0], words[1:]
    # np.array builds a fresh array, so the in-place additions below never
    # modify the vectors stored in the model dictionary.
    total = np.array(embeddings[first]).reshape(1, -1)
    for term in remaining:
        total += np.array(embeddings[term]).reshape(1, -1)
    return total

In [43]:
def most_similar(word, phrase=False, top_n=10, model='ada'):
    """
    Rank vocabulary terms by cosine similarity to a word or to a vector,
    using the GPT-3 embeddings of the selected model.

    word: str | ndarray -> a vocabulary term (phrase=False), or an already
        computed (1, dims) vector such as the output of positive (phrase=True)
    phrase: bool -> True when `word` is a vector rather than a term
    top_n: int -> number of (term, similarity) pairs to return
    model: str -> ['ada', 'bab'], matched case-insensitively
    returns: list of (term, similarity) tuples, most similar first; each
        similarity is the value returned by cosine_similarity for that pair
        (a 1x1 array, since both inputs are single-row matrices)
    raises:
        ValueError -> model is neither 'ada' nor 'bab'
        KeyError -> a term is missing from the selected model
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    mdl = model_ada if choice == 'ada' else model_bab

    # A vector is used as-is; a term is looked up and shaped into one row.
    query = word if phrase else np.array(mdl[word]).reshape(1, -1)
    query_is_term = isinstance(word, str)

    similarities = {}
    for term in vocab:
        candidate = np.array(mdl[term]).reshape(1, -1)
        if query_is_term:
            # Never report the query term as its own neighbour.
            if term != word:
                similarities[term] = cosine_similarity(query, candidate)
        elif candidate.any():
            # Vector queries skip all-zero embeddings.
            similarities[term] = cosine_similarity(query, candidate)

    ranked = sorted(
        similarities.items(),
        key=lambda item: item[1][0][0],
        reverse=True,
    )

    # A term query has already had itself removed above, so the ranking starts
    # at the best match. A vector query cannot be filtered by name, so the two
    # best matches are dropped instead. This is a heuristic that presumes they
    # are the two words summed by `definition`.
    # NOTE(review): a phrase vector built from a different number of words
    # will not line up with this fixed offset of 2.
    skip = 0 if query_is_term else 2
    return ranked[skip:skip + top_n]

In [44]:
def definition(word1, word2, top_n=10, model='ada'):
    """
    Return the top_n terms most similar to the summed embedding of two words.

    word1, word2: str -> vocabulary terms whose vectors are added together
    top_n: int -> number of results to return
    model: str -> ['ada', 'bab'], matched case-insensitively
    returns: list of (term, similarity) tuples as produced by most_similar
    raises:
        ValueError -> model is neither 'ada' nor 'bab'
        KeyError -> word1 or word2 is missing from the selected model
    """
    if model.lower() not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')

    # The phrase vector must come from the same model that ranks the
    # vocabulary. Ada and babbage vectors have different lengths (1024 vs 2048
    # dims per the loading cells), so a vector from one cannot be compared
    # against the other.
    phrase_vector = positive([word1, word2], model)
    return most_similar(phrase_vector, True, top_n, model)

In [45]:
def calculate_similarity(words, target, model='ada'):
    """
    Cosine similarity between the summed embedding of `words` and a target term.

    words: list[str] -> terms whose vectors are summed with positive
    target: str -> term to compare the summed vector against
    model: str -> ['ada', 'bab'], matched case-insensitively
    returns: the single similarity value taken out of cosine_similarity's result
    raises:
        ValueError -> model is neither 'ada' nor 'bab'
        KeyError -> a word or the target is missing from the selected model
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    embeddings = model_ada if choice == 'ada' else model_bab

    # Summed vector for the whole word list (positive also prints the list).
    combined = positive(words, model)

    # The target's own vector, shaped as one row.
    target_row = np.array(embeddings[target]).reshape(1, -1)

    score_matrix = cosine_similarity(combined, target_row)
    return score_matrix[0][0]

In [46]:
def calculate_connect(a, b, model='ada'):
    """
    Given two word strings or feature vectors, calculate their cosine similarity.

    a, b: str | list[float] | ndarray -> a vocabulary term, or a vector given
        either flat (dims,) or as a single row (1, dims)
    model: str -> ['ada', 'bab'], matched case-insensitively; only used to
        look up arguments passed as strings
    returns: the single similarity value taken out of cosine_similarity's result
    raises:
        ValueError -> model is neither 'ada' nor 'bab'
        KeyError -> a string argument is missing from the selected model
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    mdl = model_ada if choice == 'ada' else model_bab

    def _as_row(item):
        # Terms are looked up in the model. Raw vectors are promoted to 2-D
        # because cosine_similarity does not accept flat lists or 1-D arrays.
        # Inputs that are already 2-D pass through unchanged.
        if isinstance(item, str):
            return np.array(mdl[item]).reshape(1, -1)
        return np.atleast_2d(item)

    a = _as_row(a)
    b = _as_row(b)

    return cosine_similarity(a, b)[0][0]

In [47]:
def sum_complexity(word, model='ada'):
    """
    Add up the magnitudes of every dimension of a word's embedding vector.
    args:
        word: str -> word to get embedding for
        model: str -> ['ada', 'bab'], matched case-insensitively
    returns:
        sum of absolute value of each dimension in embedding vector
    raises:
        ValueError -> model is neither 'ada' nor 'bab'
        KeyError -> word is missing from the selected model
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    embeddings = model_ada if choice == 'ada' else model_bab

    # Open question from the author: should we keep absolute value here?
    return sum(abs(float(value)) for value in embeddings[word])

In [48]:
def above_zero_complexity(word, threshold=0.05, model='ada'):
    """
    Fraction of a word's embedding dimensions whose magnitude exceeds a threshold.
    args:
        word: str -> word to get embedding for
        threshold: float -> a dimension counts only when its absolute value
            is strictly greater than this
        model: str -> ['ada', 'bab'], matched case-insensitively
    returns:
        number of counted dimensions divided by the total number of dimensions
    raises:
        ValueError -> model is neither 'ada' nor 'bab'
        KeyError -> word is missing from the selected model
        ZeroDivisionError -> the stored embedding is empty
    """
    choice = model.lower()
    if choice not in ('ada', 'bab'):
        raise ValueError('Model selection must be \'ada\' or \'bab\'')
    embeddings = model_ada if choice == 'ada' else model_bab

    vector = embeddings[word]
    strong_dims = sum(1 for value in vector if abs(float(value)) > threshold)
    return strong_dims / len(vector)

### Vector investigation

What's going on? My initial thought is that the angle of a vector (embedding) points us towards its component ideas' "meaning space", while its magnitude reveals something about the strength of its connection to that meaning space. Higher magnitudes could mean that a vector is more "embedded" within a certain feature space, which lends validity to single-word embeddings having a higher `sum_complexity` than paragraphs with many ideas.

Cosine similarity only scores based on angle, not magnitude, which allows us to easily use positives to approximate combined meaning because we aren't worried about the magnitude of those positives. What if both are important? If complex ideas reside closer to 0 and this is because of their inherent complexity, then we must consider those values significant as well. The same argument generalizes to absolute value.

If "apple" represents an idea space, then I would expect a short sentence all about apples to have a strong angle similarity to single word vectors related to apples (fruit, round, etc) and a story about apple picking to have strong angle similarity but small magnitude due to the influence of other major ideas in the story (context).

I will begin to test such ideas below.