* Determine average vector magnitude of the supplied set
* Divide that magnitude by the number of dimensions
* Allow for an arbitrary cutoff, e.g. 1% or 10%
* Count the number of dimensions in a vector that are above/below the cutoff

In [None]:
import numpy as np

# Configuration
filename = './opt/1_3B.txt'  # embedding file read when computing the average magnitude
cutoff = 0.1  # fraction of the average per-dimension value used to build the threshold

# Sanity check: the cutoff has to be a proper fraction.
assert 0.0 < cutoff < 1.0, "cutoff must be between 0 and 1"

In [None]:
# Determine average vector magnitude
# we have magnitude for t5 and opt.
#
# Streams `filename` line by line; every non-blank line is parsed as one
# whitespace-separated vector of floats and its Euclidean norm is accumulated.
totalmag = 0.0
n_embeds = 0  # counted while reading: file objects do not support len()
n_dims = 0
with open(filename) as f:
    for line in f:
        emb = [float(x) for x in line.split()]
        if not emb:
            # blank line (e.g. a trailing newline) is not an embedding
            continue
        if n_dims == 0:
            # dimensionality is taken from the first embedding in the file
            n_dims = len(emb)
        mag = np.linalg.norm(emb)
        totalmag += mag
        n_embeds += 1

if n_embeds == 0:
    # without this guard the averages below would divide by zero
    raise ValueError(f"No embeddings found in {filename!r}")

average_mag = totalmag / n_embeds
average_dim = average_mag / n_dims
threshold = average_dim * cutoff

# defaults for later: upper (t) and lower (b) bounds around the average
t = average_dim + threshold
b = average_dim - threshold

print("Average magnitude: ", average_mag)
print("Average dimension: ", average_dim)

In [None]:
# count the number of dimensions for some vector
# that are outside the threshold
def count_outside_cutoff(emb, high=t, low=b):
    """Count the components of ``emb`` that lie outside the ``(low, high)`` band.

    A component is counted when it is greater than or equal to ``high`` or
    less than or equal to ``low``.

    Args:
        emb: Embedding values. May be a flat sequence, a one-shot iterable,
            or an array of any shape (for example a ``(1, n)`` row vector);
            it is flattened so that every scalar component is examined.
        high: Upper bound. Defaults to the value of the module-level ``t``
            at the time this function is defined.
        low: Lower bound. Defaults to the value of the module-level ``b``
            at the time this function is defined.

    Returns:
        The number of components outside the band, as a plain ``int``.
    """
    if not isinstance(emb, np.ndarray):
        # materialise generators and other iterables so numpy sees a sequence
        emb = list(emb)
    # Flatten first: iterating a 2-D array yields whole rows, and testing a
    # row-wise comparison with ``or`` raises "truth value ... is ambiguous".
    values = np.asarray(emb).ravel()
    return int(np.count_nonzero((values >= high) | (values <= low)))

## Memory heavy workspace

I'm not including loading the model with Transformers in this section for the sake of brevity.

In [None]:
# Build the token -> embedding lookup table.
# Vocabulary lines and embedding lines are paired by position.
vocab, opt_embeds = [], []

with open('./vocab/expanded_vocab.txt', 'r') as f:
    vocab.extend(line.strip() for line in f)

with open('./opt/1_3B.txt', 'r') as f:
    opt_embeds.extend([float(x) for x in line.strip().split()] for line in f)

model_opt = {token: vector for token, vector in zip(vocab, opt_embeds)}

In [None]:
def positive(words):
    """Sum the embeddings of ``words`` looked up in ``model_opt``.

    Args:
        words: Iterable of tokens; each one is converted with ``str`` before
            the lookup. Passing a bare string only prints a hint and then
            proceeds to iterate over its characters.

    Returns:
        A ``(1, n)`` numpy array holding the element-wise sum of the
        embeddings, or an empty array when ``words`` yields nothing.

    Raises:
        ValueError: If a token is not a key of ``model_opt``.
    """
    if isinstance(words, str):
        print(f"You requested the positive of the string \"{words}\". Did you mean [\"{words}\"]?")

    total = None
    for token in words:
        key = str(token)
        if key not in model_opt:
            raise ValueError(f"Word \"{key}\" not found in OPT-1.3b model. Please check spelling or try another model.")

        # each embedding becomes a single-row matrix before accumulation
        row = np.array(model_opt[key]).reshape(1, -1)
        if total is None:
            total = row
        else:
            total += row

    if total is None:
        return np.array([])
    return total

In [None]:
def magnitudeCounts(words):
    """Print how many dimensions of the summed embedding fall outside the cutoff.

    The embeddings of ``words`` are summed with ``positive`` and the result
    is passed to ``count_outside_cutoff`` using its default bounds. The
    printed line lists the words joined by `` + `` followed by the count.

    Args:
        words: Indexable, sliceable collection of tokens.

    Raises:
        IndexError: If ``words`` is an empty sequence.
    """
    counts = count_outside_cutoff(positive(words))
    # the final word carries the result; indexing it fails on empty input
    tail = f"{words[-1]} -> {counts} dimensions outside cutoff"
    head = "".join(f"{word} + " for word in words[:-1])
    print(head + tail)

In [None]:
# Example: report the out-of-cutoff dimension count for "unmarried" + "man"
magnitudeCounts(['unmarried', 'man'])