In [1]:
# utils
import numpy as np
import torch
from pathlib import Path
CHARS =  ['n', 'T', '£', 'W', 'x', 'ù', 'f', 'É', '到', 'æ', '•', '²', '┌', '┤', 'Z', 'S', '…', '/', 'R', 'E', '+', 'o', '∂', 'ü', 'g', '≠', '╫', '午', '‘', '（', 'e', '“', '<', 'ç', '达', '"', '’', 'ₙ', '#', 'P', '½', 'A', 't', 'X', 'β', 'θ', 'ø', '└', '≡', '║', 'Q', '，', '═', 'h', 'λ', '{', 'ρ', '0', '\t', 'ö', '：', 'K', '∣', '╩', 'b', 'π', '┬', '.', ')', 'Δ', '?', '8', '7', ']', 'é', 'q', '-', 'v', '┘', ' ', 'c', '∑', 'u', 'p', 'N', '≈', 'C', 'ł', '■', 'Ü', '*', '|', 'U', 'è', '！', 'd', 'm', '│', '–', '┼', "'", 'ã', '}', 'y', '_', '→', '\n', 'z', 'G', 's', '=', 'ů', '2', '!', '\u2009', '^', 'Y', '⬇', '精', '╥', '4', '9', ':', '下', '≤', '>', '\\', 'a', '√', 'V', 'D', ';', 'B', 'î', '~', '￼', '₹', 'L', '—', '(', '5', '↑', 'ñ', 'ò', '├', 'O', 'ń', 'i', 'γ', '┐', '[', '$', 'w', '`', 'µ', '1', 'l', '%', '≥', '─', '✅', '）', 'í', '”', 'j', '−', '€', 'r', 'º', 'ℏ', 'F', '6', '3', ',', 'J', '┴', '&', '×', '@', 'I', 'à', '度', 'φ', '维', 'H', 'k', 'M']
INDEX_TO_CHAR = {i:c for i, c in enumerate(CHARS)}
CHAR_TO_INDEX = {c:i for i, c in enumerate(CHARS)}
def encode(seq):
    return [CHAR_TO_INDEX[c] for c in seq ]

def decode(indexes):
    return "".join([INDEX_TO_CHAR[int(i)] for i in indexes])

def get_batch(data, batch_size, block_size):
    idx = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in idx])
    y = torch.stack([data[i+1:i+1+block_size] for i in idx])
    return x.cuda(), y.cuda()

In [None]:

from dataclasses import dataclass
import math
import numpy as np
import torch.nn as nn
import torch
torch.manual_seed(0)

@dataclass
class Config:
    block_size = 200
    n_blocks = 6
    epochs = int(1e4)
    vocab_size = 187
    embedding_dim = 48 # must be equal to head_size in this model but not in example
    batch_size=256
    evaluation_steps=300
    n_head=6
    learning_rate=0.0003
    dropout=0.1
    load_model = True
    path_model = "/kaggle/working/v1.pt"
    def __post_init__(self):
        if self.embedding_dim%self.n_head!=0:
            raise ValueError(f"Embedding dimension {self.embedding_dim} should be a multiple of n_head={self.n_head}")
config = Config()

class Head(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        head_size = config.embedding_dim//config.n_head
        self.key = nn.Linear(config.embedding_dim, head_size, bias=False)
        self.query = nn.Linear(config.embedding_dim, head_size, bias=False)
        self.value = nn.Linear(config.embedding_dim, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(config.block_size, config.block_size)))
        self.dropout = nn.Dropout(config.dropout)
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        att = (q @ k.transpose(-2, -1)) *  C**-0.5
        att = att.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        att = torch.functional.F.softmax(att, dim=-1)
        att =self.dropout(att)
        #att = self.attn_dropout(att)
        y = att @ v 
        return y
    
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias=None):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return torch.functional.F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
    
class MultiHeadAttention(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(config) for _ in  range(config.n_head)])
        self.proj = nn.Linear(config.embedding_dim, config.embedding_dim)
        self.dropout = nn.Dropout(config.dropout)
    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(x))
    
class FeedForward(nn.Module):
    def __init__(self, embedding_dim) -> None:
        super().__init__()
        self.l1 = nn.Linear(embedding_dim, 4*embedding_dim, bias=True)
        self.l2 = nn.Linear(4*embedding_dim, embedding_dim, bias=True)
        self.dropout=nn.Dropout(0.1)
    def forward(self, x):
        x = self.l2(torch.functional.F.relu(self.l1(x)))
        return self.dropout(x)
    
class Block(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.sa = MultiHeadAttention(config)
        self.ffw = FeedForward(embedding_dim=config.embedding_dim)
        self.ln_1 = LayerNorm(config.embedding_dim)
        self.ln_2 = LayerNorm(config.embedding_dim)
    def forward(self, x):
        x = x + self.sa(self.ln_1(x))
        x = x + self.ffw(self.ln_2(x))
        return x

class Transformer(nn.Module):
    """Only considers last token to make predictions"""
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.config=config
        self.token_embedding = nn.Embedding(num_embeddings=config.vocab_size, embedding_dim=config.embedding_dim)
        self.positional_encoding = nn.Embedding(num_embeddings=config.block_size, embedding_dim=config.embedding_dim)
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_blocks)],
            LayerNorm(ndim=config.embedding_dim),
        )
        self.lm_head = nn.Linear(config.embedding_dim, config.vocab_size, bias=False)
    def forward(self, x):
        x = self.token_embedding(x) + self.positional_encoding(torch.arange(self.config.block_size, device=x.device))
        x = self.blocks(x)
        x = self.lm_head(x)
        # Cross entropy already have softmax
        return x

    def generate_sequence(self, idx, max_tokens, block_size):
        with torch.no_grad():
            for i in range(max_tokens):
                # crop last block_size tokens
                idx_cond = idx[:,-block_size:]
                # softmax here is in the other dimension, because we are starting with an actual batch prediction (1,T,vocab)
                probs = self(idx_cond).softmax(dim=-1)
                # selecting only last token
                idx_next = torch.multinomial(input=probs[:,-1,:], num_samples=1)
                idx = torch.cat((idx, idx_next), dim=-1)
            return idx.squeeze()
                

# Loading training and test
train = torch.load("/kaggle/input/python-text/train.pt")
test = torch.load("/kaggle/input/python-text/test.pt")

m = Transformer(config).cuda()

if config.load_model:
    try:
        m.load_state_dict(torch.load(config.path_model))
        m.eval()
        print(f"Model has been loaded from {config.load_model}")
    except:
        print("Could not load model")
loss_f = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(m.parameters(), lr=config.learning_rate)
for i in range(config.epochs):
    X, y = get_batch(train, config.batch_size, config.block_size)
    
    out = m(X)
    optim.zero_grad()
    # work out the right dimensions for the cross entropy loss function
    loss = loss_f(out.view(config.batch_size*config.block_size,config.vocab_size), y.view(config.batch_size*config.block_size))
    loss.backward()
    optim.step()
    if i%config.evaluation_steps==0:
        with torch.no_grad():
            test_batch_size = config.batch_size
            X_test, y_test = get_batch(test, test_batch_size, config.block_size)
            out_test = m(X_test)
            loss_test = loss_f(out_test.view(test_batch_size*config.block_size,config.vocab_size), y_test.view(test_batch_size*config.block_size))
        print("i: ", i," Loss training: ", loss.cpu().item(), " Loss test: ", loss_test.cpu().item())
        # starting with \n
        #print(decode(m.generate_sequence(torch.tensor((0,0)), 100)))
        print(decode(m.generate_sequence(X_test[0].view(1,-1), 50, config.block_size)))

        torch.save(m.state_dict(),config.path_model)


Could not load model
i:  0  Loss training:  2.888003349304199  Loss test:  2.9092721939086914
le provides a function which will take a product name as input from the user,
and fetch from Amazon information about products of this name or category.  The product
information will include title, URosorod}" bonnarets)
 ctatsideay ird p cout attidut
i:  300  Loss training:  1.7995688915252686  Loss test:  2.087390661239624


def stooge(arr, i, h):
    if i >= h:
        return

    # If first element is smaller than the last then swap them
    if arr[i] > arr[h]:
        arr[i], arr[h] = arr[h], arr[i]

    # If there ateeat corsemed(clenumpy: [st: roust - Int_sum* alb
i:  600  Loss training:  1.728990077972412  Loss test:  1.926145076751709
er = 0
        digits = int(digits)
        while counter < digits:
            current = temp_num % 10
            if counter % 2 == 0:
                addition = ""
                if counter in plass

      if n!==[letr[inke] 1
          self.prer
i:  