In [None]:
# Running example: the string to deduplicate and the result we expect
# (each character kept once, in first-seen order).
input_string = "abac"
output_string = "abc"

In [None]:
# set implementation in python
def simple_set_dedupe(input_string):
  """Return the distinct items of ``input_string`` in first-seen order.

  Args:
    input_string: any iterable of hashable items (typically a string).

  Returns:
    A list holding each item once, ordered by first occurrence.
  """
  seen = set()
  unique = []
  for ch in input_string:
    # Guard clause: anything already recorded is a duplicate and is skipped.
    if ch in seen:
      continue
    seen.add(ch)
    unique.append(ch)

  return unique

In [None]:
# Sanity check on the running example; note the result is a list of
# characters, not a string.
simple_set_dedupe(input_string)

['a', 'b', 'c']

In [None]:
import numpy as np
import torch
from numpy.typing import NDArray

In [None]:
# Default vocabulary: maps each known character to its integer token id.
default_tokens = {
    'a': 0,
    'b': 1,
    'c': 2
}

def tokenize(input_string, tokens=None):
  """Convert a string into a 1-D NumPy array of token ids.

  Args:
    input_string: iterable of characters to look up.
    tokens: optional mapping from character to token id; ``default_tokens``
      is used when this is ``None``.

  Returns:
    A 1-D ``np.ndarray`` with one id per character, in input order. An empty
    input yields an empty integer array.

  Raises:
    KeyError: if a character is not present in the vocabulary.
  """
  if tokens is None:
    tokens = default_tokens
  ids = [tokens[c] for c in input_string]
  if not ids:
    # np.array([]) defaults to float64, which cannot be used as an index
    # array; force an integer dtype for the empty case only, so non-empty
    # inputs keep whatever dtype NumPy infers from the vocabulary values.
    return np.array(ids, dtype=int)
  return np.array(ids)

In [None]:
# Tokenize the running example: each character is replaced by its id from
# the default vocabulary.
tokens = tokenize(input_string)
tokens

array([0, 1, 0, 2])

In [None]:
def batch_tokens(tokens: NDArray):
  """Batch tokens if not already batched.

  Args:
    tokens: array of token ids, either 1-D or 2-D.

  Returns:
    A 2-D array. A 1-D input is copied into a new array with a leading axis
    of size 1; a 2-D input is returned unchanged (the same object).

  Raises:
    ValueError: if ``tokens`` is 0-D or has more than two dimensions.
  """
  ndim = len(tokens.shape)
  if ndim == 1:
    return np.array([tokens])
  if ndim == 2:
    return tokens
  if ndim == 0:
    # A 0-D array has too few dimensions, not too many; say so accurately.
    raise ValueError("too few dimensions")

  raise ValueError("too many dimensions")

In [None]:
# Add a leading batch axis so the 1-D token array becomes a batch of one
# sequence.
batch = batch_tokens(tokens)
batch

array([[0, 1, 0, 2]])

In [None]:
import torch
import torch.nn.functional as F

In [None]:
def embed(tokens, num_values = None):
  """One-hot encode an array of token ids as a float64 torch tensor.

  Ids are shifted up by one before the lookup, so id ``k`` selects column
  ``k + 1`` and column 0 is never selected by a non-negative id.

  Args:
    tokens: NumPy integer array of token ids (any shape).
    num_values: optional vocabulary size; when ``None`` it is taken to be
      ``max(tokens) + 1``.

  Returns:
    A tensor of shape ``tokens.shape + (width,)`` where ``width`` is the
    vocabulary size plus one (for the shifted-out column 0).
  """
  if num_values is None:
    # Largest id + 1 gives the vocabulary size; one more for the shift.
    width = np.max(tokens) + 2
  else:
    width = num_values + 1

  one_hot_rows = np.eye(width)
  shifted_ids = tokens + 1
  return torch.tensor(one_hot_rows[shifted_ids])

In [None]:
# One-hot embed the batch; every row has a single 1 in the column given by
# (token id + 1), so column 0 stays all zeros here.
embeddings = embed(batch)
embeddings

tensor([[[0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.]]], dtype=torch.float64)

In [None]:
# Should be (batches, sequence length, embedding dimension); the embedding
# dimension is the largest token id + 2 because embed() shifts ids by one.
embeddings.shape

torch.Size([1, 4, 4])

In [None]:
# Unnormalised self-attention scores: the dot product of every position's
# embedding with every other position's. With one-hot embeddings an entry is
# 1 when the two positions hold the same token and 0 otherwise.
raw_weights = torch.bmm(embeddings, embeddings.transpose(1, 2))
raw_weights

tensor([[[1., 0., 1., 0.],
         [0., 1., 0., 0.],
         [1., 0., 1., 0.],
         [0., 0., 0., 1.]]], dtype=torch.float64)

In [None]:
# Normalise the scores along the last axis so each position's attention
# weights over the sequence sum to 1.
weights = F.softmax(raw_weights, dim=2)
weights

tensor([[[0.3655, 0.1345, 0.3655, 0.1345],
         [0.1749, 0.4754, 0.1749, 0.1749],
         [0.3655, 0.1345, 0.3655, 0.1345],
         [0.1749, 0.1749, 0.1749, 0.4754]]], dtype=torch.float64)

In [None]:
# Attention output: for each position, the attention-weighted sum of all
# embeddings in the sequence. .float() casts the float64 result to float32.
y = torch.bmm(weights, embeddings).float()
y

tensor([[[0.0000, 0.7311, 0.1345, 0.1345],
         [0.0000, 0.3498, 0.4754, 0.1749],
         [0.0000, 0.7311, 0.1345, 0.1345],
         [0.0000, 0.3498, 0.1749, 0.4754]]])

In [None]:
# Work with the first (here: only) sequence in the batch.
attn_out = y[0]
# Hand-written projection, one row per token column (1..3): +1 on that
# column, -1 on the other two, and 0 on the unused column 0. After the
# transpose, values[i, j] is the mass position i puts on column j+1 minus
# the mass it puts on the other two token columns.
values = attn_out @ torch.tensor([[0., 1, -1, -1],
                                  [0,  -1, 1, -1],
                                  [0, -1, -1, 1]]).T

# ReLU zeroes the negative scores. In the recorded output only positions 0
# and 2 (the two occurrences of the repeated token) stay non-zero.
activated = F.relu(values)
print(activated)
# Per-position total of the surviving scores.
activated.sum(dim=1)

tensor([[0.4621, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000],
        [0.4621, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000]])


tensor([0.4621, 0.0000, 0.4621, 0.0000])

In [None]:
# Two bias-free linear layers (4 -> 16 -> 4) forming a small feed-forward
# block applied to the attention output.
# NOTE(review): the weights are randomly initialised and no seed is set in
# the visible cells, so the printed values differ between runs; consider
# calling torch.manual_seed(...) in a config cell for reproducibility.
xx = torch.nn.Linear(4, 16, False)
yy = torch.nn.Linear(16, 4, False)
print(xx.weight)
# Expand to 16 features, apply ReLU, then project back to 4 features.
yy(torch.nn.ReLU()(xx(y)))

Parameter containing:
tensor([[-0.0140, -0.3385, -0.0760,  0.1401],
        [ 0.1546, -0.2273, -0.3025, -0.4889],
        [ 0.1247, -0.0681,  0.1002, -0.2214],
        [-0.3587,  0.3584, -0.2188,  0.1665],
        [ 0.0744,  0.1523,  0.1222, -0.1710],
        [-0.3458,  0.3298,  0.0733,  0.0858],
        [-0.3854,  0.3539,  0.4793, -0.3032],
        [ 0.0653, -0.1647, -0.1123,  0.1412],
        [ 0.0285,  0.4717,  0.0397,  0.2665],
        [ 0.4113, -0.1991, -0.4667, -0.3754],
        [ 0.1451,  0.1299,  0.0537,  0.1685],
        [ 0.2533,  0.0143,  0.2536, -0.2306],
        [-0.1576,  0.0591,  0.4963,  0.2731],
        [-0.0289,  0.2811,  0.1586,  0.0260],
        [ 0.0091, -0.4052, -0.4492, -0.2299],
        [ 0.1024,  0.4178, -0.3387, -0.4133]], requires_grad=True)


tensor([[[-0.0818, -0.1100, -0.1076, -0.0759],
         [-0.0710, -0.2154, -0.1233, -0.0535],
         [-0.0818, -0.1100, -0.1076, -0.0759],
         [-0.0171, -0.1282, -0.0232, -0.0047]]], grad_fn=<UnsafeViewBackward0>)