In [124]:
#!pip install -r requirements.txt
import math as m
import numpy as np
import abc

from tqdm import tqdm

# Variables

In [125]:
d_model = 512               # Dimensions of the model
heads = 8                   # Number of heads for multihead attention
d_k = d_v = d_model / heads # ???????????????????????????????????
ed_count = 6                # N, number of encoder-decoder layers
vocab = ["ji", "ja"]

In [126]:
class Dataset:
    def __init__():
        pass

In [127]:
def TTV_split(data, tr, te, va, verbose=False):
    total = len(data)
    one = total / (tr + te + va)

    tr_n = m.floor(tr * one)
    te_n = m.floor(te * one)
    va_n = m.floor(va * one)

    tr_n += (total - tr_n - te_n - va_n)

    train_data = np.array([])
    test_data = np.array([])
    val_data = np.array([])

    np.random.shuffle(data)

    for i in tqdm(data):
        if (len(train_data) < tr_n):
            np.append(train_data, i)
            continue
        if (len(test_data) < te_n):
            np.append(test_data, i)
            continue
        if (len(val_data) < va_n):
            np.append(val_data, i)

    if verbose:
        print(f"total = {total}\ntr_n = {len(train_data)} \nte_n = {len(test_data)} \nva_n = {len(val_data)}")
    return 

TTV_split(np.array([i for i in range(100000)]), 12, 4, 1)

100%|██████████| 100000/100000 [00:00<00:00, 181236.20it/s]


# Base Module Class

In [269]:
class nnModule(): # TODO: добавить тесты для каждого модуля
    def __init__(self, params={}) -> None:
        self.params = params

    @abc.abstractclassmethod
    def forward(self, x):
        pass


    @abc.abstractclassmethod
    def backward(self, grad):
        pass


    @abc.abstractclassmethod
    def step(self, lr):
        pass


    def __str__(self) -> str:
        rs = ""
        strs = np.array([f"\n{i}: {self.params[i]}, " for i in self.params])
        for i in strs:
            rs += i
        return (f"{self.name}: (" + rs[:len(rs)-2] + "\n)") if self.params else self.name

# Linear

In [270]:
class Linear(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        try:
            if params["in_dim"] < 1 or params["out_dim"] < 1:
                raise Exception
        except KeyError:
            raise Exception("You have to set in_dim and out_dim parameters for linear layer")
        except Exception:
            raise Exception("The in_dim and out_dim have to be greater than zero")

        self.name = "Linear"

        self._res = None
        self._lastX = None

        self.W = np.ones((params["in_dim"], params["out_dim"]), dtype=np.float64)
        self.B = np.zeros(params["out_dim"], dtype=np.float64)

        self._grad_weight = None
        self._grad_bias = None


    def forward(self, x):
        self._lastX = x
        self._res = np.dot(x, self.W) + self.B
        return self._res


    def backward(self, grad):
        self._grad_weight = np.dot(self._lastX.T, grad)
        self._grad_bias = np.sum(grad, axis=0)


    def step(self, lr):
        self.weight = self.weight - self._grad_weight * lr
        self.bias = self.bias - self._grad_bias * lr

# ReLU

In [271]:
class ReLU(nnModule):
    def __init__(self, params=...) -> None:
        super().__init__(params)

        self.name = "ReLU"

        self.relu = lambda x: x * (x > 0)


    def forward(self, x):
        self._res = x * (x > 0)
        return self._res


    def backward(self, grad):
        m, n = np.shape(self._res)
        for i in range(m):
            for j in range(n):
                grad[i][j] = grad[i][j] if self.relu(self._res[i][j]) else 0
        return grad


    def step(self, lr):
        pass

# Sigmoid

In [272]:
class Sigmoid(nnModule): # dobavit dim over -1?
    def __init__(self, params=...) -> None:
        super().__init__(params)

        self.name = "Sigmoid"


    def forward(self, x):
        self._res = 1 / (1 + np.exp(-x))
        return self._res


    def backward(self, grad):
        new_grad = self._res * (1 - self._res) * grad
        return new_grad


    def step(self, lr):
        pass

# FFN

In [273]:
class TransformerFFN(nnModule):
    def __init__(self) -> None:
        self.name = "FFN"

        self.linear1 = Linear(params={
            "in_dim": 512, 
            "out_dim": 2048
        })

        self.relu = ReLU()
        
        self.linear2 = Linear(params={
            "in_dim": 2048, 
            "out_dim": 512
        })

        self.modules = np.array([
            self.linear1, 
            self.relu, 
            self.linear2
        ])


    def forward(self, x):
        res = None
        for module in self.modules:
            res = module.forward(x)
        return res


    def backward(self, x):
        for module in np.flip(self.modules):
            module.backward(x)


    def step(self, lr):
        for module in self.modules:
            module.step(lr)

# Scaled Dot Product Attention

In [274]:
def ScaledDotProductAttention(Q: np.array, K: np.array, V: np.array):
    d = K.shape[0]
    smax = Sigmoid()
    return smax.forward(np.dot(Q, K.T) / m.sqrt(d)) * V

Q = np.array([1, 2, 3])
K = np.array([1, 2, 3])
V = np.array([2, 1, 3])

print(ScaledDotProductAttention(Q, K, V))

[1.99938264 0.99969132 2.99907397]


# Multihead Attention

In [275]:
class MultiheadAttention(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Multihead Attention"

        self.heads = params["heads"]
        self.d_model = params["d_model"]

        assert not (self.d_model % self.heads)

        self.head_modules = np.array([
            [
                Linear(params={
                    "in_dim": d_model, 
                    "out_dim": Q.shape[0]
                }), # po idee vector len = 64
                Linear(params={
                    "in_dim": d_model, 
                    "out_dim": K.shape[0]
                }),
                Linear(params={
                    "in_dim": d_model, 
                    "out_dim": V.shape[0]
                }),
            ] for i in range(self.heads)
        ])

        self.linear_out = Linear(params={
            "in_dim": heads * V.shape[0], 
            "out_dim": d_model
        })
        

    def forward(self, x):
        head_res = np.array([])
        
        for head_module in self.head_modules:
            Q = head_module[0].forward(x[0])
            K = head_module[1].forward(x[1])
            V = head_module[2].forward(x[2])
            np.append(head_res, ScaledDotProductAttention(Q, K, V))

        concat = np.vstack(*head_res)
        out = self.linear_out(concat) # linear layer razmern' = poschitat result np.vstack
        
        return out

# Layer Norm

In [276]:
class LayerNorm(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Layer Norm"

        self.features = params["features"]
        self.eps = params["eps"]
        
        self.a_2 = np.ones(self.features)
        self.b_2 = np.zeros(self.features)


    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

# Encoder

In [277]:
class EncoderLayer(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Encoder Layer"

        self.self_attention = params["attention"]
        self.FFN = TransformerFFN()
        self.layer_norm = LayerNorm(params={
            "features": 512, # TODO: fix
            "eps": 0.0001
        })
        # self.dropout = Dropout()


    def forward(self, x):
        x = x + self.layer_norm(self.self_attention(x))
        return x + self.layer_norm(self.FFN(x))

In [278]:
class Encoder(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Encoder"

        self.attention = params["attention"]
        self.N = ed_count

        self.layers = np.array([
            EncoderLayer(params={
                "attention": self.attention
            }) for i in range(self.N)
        ])


    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x
        

# Decoder

In [279]:
class DecoderLayer(nnModule):   # TODO: remove inconsistensies of parameters vs creating objects inside
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Decoder Layer"

        self.memory = params["memory"] # ????????
        self.heads = params["heads"]
        self.d_model = params["d_model"]

        self.self_attention_1 = MultiheadAttention(params={
            "heads": self.heads, 
            "d_model": self.d_model
        })
        self.self_attention_2 = MultiheadAttention(params={
            "heads": self.heads, 
            "d_model": self.d_model
        })

        self.layer_norm = LayerNorm(params={
            "features": 512, # TODO: fix
            "eps": 0.0001
        }) # layer size

        self.FFN = TransformerFFN()

    
    def forward(self, x):
        x = x + self.layer_norm(self.self_attention_1(x))
        x = x + self.layer_norm(self.memory + self.self_attention_2(x))
        return x + self.layer_norm(self.FFN(x))

In [280]:
class Decoder(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Decoder"

        self.N = params["N"]

        self.layers = np.array([
            DecoderLayer(params={
                "memory": object,
                "heads": heads,
                "d_model": d_model
            }) for i in range(self.N)
        ])


    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

# EncoderDecoder

In [281]:
class EncoderDecoder(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Encoder-Decoder"

        self.encoder = params["encoder"]
        self.decoder = params["decoder"]
        self.src_emb = params["src_emb"]
        self.tar_emb = params["tar_emb"]
        
        self.generator = params["generator"]


    def encode(self, src):
        return self.encoder(self.src_emb(src))


    def decode(self, src, memory):
        return self.decoder(self.tar_emb(self.tar), memory)

    
    def forward(self, src, tar):
        return self.decode(self.encode(src), tar)

# Output Generator

In [282]:
class Generator(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Output generator"

        self.d_model = params["d_model"]
        self.vocab = params["vocab"]

        self.smax = Sigmoid()
        self.linear = Linear(params={ # d_model in_dim????
            "in_dim": d_model, 
            "out_dim": d_model
        })
    
    def forward(self, x):
        return self.smax(self.linear(x))

# Transformer

In [283]:
class Transformer(nnModule):
    def __init__(self, params) -> None:
        super().__init__(params)

        self.name = "Transformer"

        self.encoder_decoder = params["encoder-decoder"]

    def forward(self, x):
        return self.encoder_decoder(x)

# Tests

In [284]:
encoder = Encoder(params={
    "attention": MultiheadAttention(params={
        "heads": heads, 
        "d_model": d_model
    })
})
decoder = Decoder(params={
    "N": ed_count
})

generator = Generator(params={
    "d_model": d_model,
    "vocab": vocab
})

endec = EncoderDecoder(params={
    "generator": generator,
    "encoder": encoder,
    "decoder": decoder,
    "src_emb": object,      # ?????
    "tar_emb": object       # ?????
})

model = Transformer(params={"encoder-decoder": endec})

print(model)

Transformer: (
encoder-decoder: Encoder-Decoder: (
generator: Output generator: (
d_model: 512, 
vocab: ['ji', 'ja']
), 
encoder: Encoder: (
attention: Multihead Attention: (
heads: 8, 
d_model: 512
)
), 
decoder: Decoder: (
N: 6
), 
src_emb: <class 'object'>, 
tar_emb: <class 'object'>
)
)


# Notes

We apply dropout to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of P_drop = 0.1.

Label Smoothing During training, we employed label smoothing of value E_ls = 0.1. This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.