[example](http://jalammar.github.io/illustrated-transformer/)

In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import numpy as np
import pandas as pd
import torch.optim as optim
from random import randint
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
import random

def fetch():
    """generate XOR sequence.
    """
    a = random.choice([0, 1])
    b = np.random.uniform(0, 1)
    c = np.random.uniform(0, 1)
    d = np.random.uniform(0, 1)
    X = [a, b, c, d]
    
    if a == 0:
        y = 0
    else:
        y = 1
    return (X, y)

In [22]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()  # this is how objects that inherit from parent classes are instantiated
        self.query_matrix = nn.Parameter(torch.randn(1))
        self.key_matrix = nn.Parameter(torch.randn(1))
        self.fc1 = nn.Linear(in_features=4, out_features=4)
        self.fc2 = nn.Linear(in_features=4, out_features=2)

        
    def forward(self, input_seq):
        
        # generate a new embedding for each token in the list
        new_input_seq = []
        for token in input_seq:
            
            # generate query vector from query matrix
            q = self.query_matrix * token
            
            # track list of scores
            scores = []
        
            # then compute scores by multiplying query vector by key vector of every element
            for element in input_seq:
                
                # for each element in input_seq, generate a key vector from the key matrix
                k = self.key_matrix * element
                
                # calculate score vector
                s = q * k
                scores.append(s)
    
            # softmax score vector to create mask
            mask = F.softmax(torch.stack(scores), dim=0)
            
            # now multiply each token in the original list by its corresponding element in its mask
            new_embedding = 0
            for pair in zip(input_seq, mask):
                new_embedding += pair[0] * pair[1]
            new_input_seq.append(new_embedding)
        
        # then, lets see if we can classify better with attention
        new_input_seq = torch.stack(new_input_seq).reshape(1, len(new_input_seq))
        x = self.fc1(new_input_seq)
        x = self.fc2(x)
        y_prob = F.softmax(x, dim=1)
        return y_prob

In [23]:
net1 = Net()
optimizer = optim.SGD(net1.parameters(), lr=0.1, momentum=0.9)

In [24]:
X_train

[1, 0.3417277474918079, 0.4525821282146074, 0.18839683301654941]

In [25]:
net1(X_train)

tensor([[0.4245, 0.5755]], grad_fn=<SoftmaxBackward>)

In [26]:
n_epochs = 10

for epoch in range(n_epochs):

    # train
    for _ in range(1000):
        # X_train, y_train = fetch()
        optimizer.zero_grad()
        outputs = net1(X_train)
        loss = F.nll_loss(outputs[0].reshape(1, 2), torch.tensor([y_train]))
        loss.backward()
        optimizer.step()

    # test
    acc = 0
    for _ in range(100):
        # X_test, y_test = fetch() 
        outputs = net1(X_train)
        pred = int(torch.argmax(outputs[0]))
        if pred == y_train:
            acc += 1
    print(acc/100)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [27]:
outputs

tensor([[7.1081e-07, 1.0000e+00]], grad_fn=<SoftmaxBackward>)