[example](http://jalammar.github.io/illustrated-transformer/)

In [390]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import numpy as np
import pandas as pd
import torch.optim as optim
from random import randint
import tensorflow as tf
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [436]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()  # this is how objects that inherit from parent classes are instantiated
        self.query_matrix = nn.Parameter(torch.randn(1))
        self.key_matrix = nn.Parameter(torch.randn(1))
        self.fc1 = nn.Linear(in_features=4, out_features=24)
        self.fc2 = nn.Linear(in_features=24, out_features=2)

        
    def forward(self, input_seq):
        
        # generate a new embedding for each token in the list
        new_input_seq = []
        for token in input_seq:
            
            # generate query vector
            q = self.query_matrix * token
            
            # track list of scores
            scores = []
        
            for element in input_seq:
                # for each element in input_seq, generate a key
                k = self.key_matrix * element
                
                # calculate score
                s = q * k
                scores.append(s)
    
            mask = F.softmax(torch.stack(scores), dim=0)
            
            # now multiply each token by the mask
            new_embedding = 0
            for pair in zip(input_seq, mask):
                new_embedding += pair[0] * pair[1]
            
            new_input_seq.append(new_embedding)
        
        # then, lets see if we can classify better with attention
        new_input_seq = torch.stack(new_input_seq).reshape(1, len(new_input_seq))
        x = self.fc1(new_input_seq)
        x = self.fc2(x)
        y_prob = F.softmax(x, dim=1)
        return y_prob

In [437]:
# generate data from scratch
n_samples = 200
n_features = 4

X = np.round(np.random.uniform(low=-1, high=1, size=(n_samples, n_features)), 0)

# set X1 and X2 to either 0 or 1
X[:, 0] = X[:, 0] > 0  # feature 0
X[:, 1] = X[:, 1] > 0  # feature 1

# generate corresponding target y data
Y = np.logical_xor(X[:, 0], X[:, 1]) * 1  # y = XOR(X1, X2)

In [438]:
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)

In [441]:
n_epochs = 500
net = Net()

for epoch in range(n_epochs):

    # train
    for _ in range(55):
        idx = randint(0, n_samples - 1)
        
        optimizer.zero_grad()
        outputs = net(X[idx])
        loss = F.nll_loss(outputs[0].reshape(1, 2), torch.tensor([Y[idx]]))
        loss.backward()
        optimizer.step()
    
    # test
    if epoch % 10 == 0:
        acc = 0
        for _ in range(45):
            index = randint(0, n_samples - 1)
            outputs = net(X[index])
            acc+= int(torch.argmax(outputs) == Y[index])
        print(acc/45)

0.28888888888888886
0.3111111111111111
0.26666666666666666
0.4
0.26666666666666666
0.3333333333333333
0.4666666666666667
0.35555555555555557
0.3111111111111111
0.37777777777777777
0.37777777777777777
0.28888888888888886
0.28888888888888886
0.3111111111111111
0.24444444444444444
0.4444444444444444
0.3333333333333333
0.37777777777777777


KeyboardInterrupt: 