In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [39]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 48 # what is the maximum context length for predictions?
max_iters = 20_000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)


<torch._C.Generator at 0x7fe852de6ab0>

In [40]:


# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

def sort_index(lst, rev=True):
    index = range(len(lst))
    s = sorted(index, reverse=rev, key=lambda i: lst[i])
    return s


# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)

            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)

            p = probs.tolist()[0]
            ls = sort_index(p)[:10]
            for i in ls:
              print(p[i],"%.  ", decode([i]))

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            print("next========================")
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [41]:
with open('pick-ban.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [46]:
import json

with open('hero.json', 'r') as file:
    dic = json.load(file)



# here are all the unique characters that occur in this text
chars = set(["", "/", "\n", " "])
chars.update(dic.values())


print(chars)

vocab_size = len(chars)
print(vocab_size)
# create a mapping from characters to integers



stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

print(len(stoi), len(itos))

def encode(s):
  ans = []
  s = s.split("/")
  for i in s:
    i = i.split("\n")
    for j in i:
      ans.append(stoi[j])
    ans.append(stoi[" "])
  return ans


# encode = lambda s: [stoi[c+" "] for c in s.split(" ")] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

{'', 'Visage', 'Axe', 'Nyx Assassin', 'Gyrocopter', 'Naga Siren', 'Sand King', 'Terrorblade', 'Dark Willow', 'Bloodseeker', 'Ogre Magi', 'Weaver', 'Bristleback', 'Muerta', 'Queen of Pain', 'Templar Assassin', 'Skywrath Mage', 'Death Prophet', 'Arc Warden', 'Clinkz', "Nature's Prophet", 'Invoker', ' ', 'Rubick', 'Night Stalker', 'Meepo', 'Slardar', 'Chen', 'Mirana', 'Leshrac', 'Juggernaut', 'Abaddon', 'Luna', 'Riki', 'Shadow Shaman', 'Pugna', 'Undying', 'Treant Protector', 'Centaur Warrunner', 'Broodmother', 'Dazzle', 'Pangolier', 'Omniknight', 'Underlord', 'Shadow Fiend', '/', 'Doom', 'Razor', 'Lifestealer', 'Spectre', 'Puck', 'Outworld Destroyer', 'Ursa', 'Void Spirit', 'Lich', 'Timbersaw', 'Silencer', 'Dark Seer', 'Beastmaster', 'Anti-Mage', 'Storm Spirit', '\n', 'Earthshaker', 'Jakiro', 'Ember Spirit', 'Earth Spirit', 'Disruptor', 'Kunkka', 'Tusk', 'Phantom Lancer', 'Dragon Knight', 'Phoenix', 'Ancient Apparition', 'Huskar', 'Drow Ranger', 'Sven', 'Tinker', 'Magnus', 'Clockwerk', 'V

In [44]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


0.21888 M parameters


In [None]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [26]:
encode("Shadow Fiend/Monkey King")

[44, 22, 82, 22]

In [30]:
li = [82, 22, 44, 22]
decode(li)

'Monkey King Shadow Fiend '

In [31]:
a_list = [li]
a_tensor = torch.Tensor(a_list).to(torch.int64)

In [32]:
check = m.generate(a_tensor, max_new_tokens=2)

0.027662742882966995 %.   Pangolier
0.026198718696832657 %.   Beastmaster
0.024258002638816833 %.   Rubick
0.021582530811429024 %.   Batrider
0.02136886492371559 %.   Doom
0.020801685750484467 %.   Ember Spirit
0.019552594050765038 %.   Timbersaw
0.019277073442935944 %.   Puck
0.019072415307164192 %.   Templar Assassin
0.01889931783080101 %.   Enchantress
0.9653089046478271 %.    
0.004370302427560091 %.   Broodmother
0.002810270059853792 %.   Chen
0.0017186816548928618 %.   Doom
0.0016359263099730015 %.   Timbersaw
0.0013558134669438004 %.   Beastmaster
0.0011690674582496285 %.   Batrider
0.0009577610762789845 %.   Medusa
0.0008858027867972851 %.   Nature's Prophet
0.0007694652886129916 %.   Pangolier


In [35]:
torch.save(model, "./save/model")

In [36]:
model_load = torch.load('./save/model')

In [37]:
m2 = model_load.to(device)
check = m2.generate(a_tensor, max_new_tokens=2)

0.027662742882966995 %.   Pangolier
0.026198718696832657 %.   Beastmaster
0.024258002638816833 %.   Rubick
0.021582530811429024 %.   Batrider
0.02136886492371559 %.   Doom
0.020801685750484467 %.   Ember Spirit
0.019552594050765038 %.   Timbersaw
0.019277073442935944 %.   Puck
0.019072415307164192 %.   Templar Assassin
0.01889931783080101 %.   Enchantress
0.9551424384117126 %.    
0.005794057156890631 %.   Broodmother
0.00313160615041852 %.   Chen
0.002244766568765044 %.   Doom
0.0021993564441800117 %.   Timbersaw
0.001885759993456304 %.   Beastmaster
0.001528099412098527 %.   Batrider
0.0011932500638067722 %.   Medusa
0.0011105779558420181 %.   Enchantress
0.0010643169516697526 %.   Pangolier
