In [139]:
import os
from pathlib import Path

In [140]:
!ls

MachineTranslation.ipynb gpt-scatch.py            input.txt
gpt-bqe.py               gpt.py


In [141]:
file = Path("../vn_numbers.txt")
MAX_LINES = 1000  # only the first MAX_LINES lines of the file are used

# Decode explicitly as UTF-8: the file contains Vietnamese text, and relying
# on the platform default encoding can fail or silently garble it.
with file.open(encoding="utf-8") as f:
    lines = f.readlines()[:MAX_LINES]


In [142]:
lines[:5]

['kho\n', 'một \n', 'hai \n', 'ba \n', 'bốn \n']

In [143]:
# Normalise every line to "<words> <eof>".
# The marker is appended as its own whitespace-separated token: replacing
# '\n' in place glues it to the last word when the line has no trailing space
# ('kho\n' -> 'kho<eof>') and skips a final line that has no newline at all.
# Any marker already present is dropped first, so re-running this cell does
# not stack a second '<eof>'.
lines = [' '.join(line.replace('<eof>', ' ').split() + ['<eof>']) for line in lines]
# Key each normalised line by its 0-based line index, stored as a string.
numbers = {str(index): s for index, s in enumerate(lines)}

In [144]:
numbers['17']

'một mươi bảy <eof>'

In [145]:
# Frequency table over every token that appears on either side of a pair:
# the whitespace-separated words of the text and the individual characters
# of the key string.
vocab = {}
for num, text in numbers.items():
    for token in text.split() + list(num):
        vocab[token] = vocab.get(token, 0) + 1

In [146]:
# my_text = "12 23"
# my_text.split('')

In [148]:
# Sort vocab by frequency, most frequent token first; a token's position in
# vocab_list is its id.
# NOTE(review): padding elsewhere in this notebook uses id 0, which only lines
# up with '<eof>' if '<eof>' is the most frequent token - confirm.
vocab = dict(sorted(vocab.items(), key=lambda item: item[1], reverse=True))
vocab_list = list(vocab)
# Reverse lookup so encoding does not scan vocab_list once per token.
token_to_id = {token: index for index, token in enumerate(vocab_list)}


def _token_id(token):
    """Return the id of `token`; raise ValueError if it is not in the vocabulary."""
    try:
        return token_to_id[token]
    except KeyError:
        raise ValueError(f"{token!r} is not in the vocabulary") from None


def encode_text(text):
    """Encode whitespace-separated `text` into a list of token ids."""
    return [_token_id(word) for word in text.split()]


def decode_text(encoded_text):
    """Turn a sequence of token ids back into a space-joined string."""
    return ' '.join(vocab_list[index] for index in encoded_text)


def encode_int(num):
    """Encode `num` digit by digit and append the id of the '<eof>' token."""
    return [_token_id(digit) for digit in str(num)] + [_token_id('<eof>')]


def decode_int(encoded_num):
    """Decode token ids produced by `encode_int` back into an int.

    Decoding stops at the first '<eof>' token, so the terminator written by
    `encode_int` (and anything after it) is ignored instead of being handed
    to int(), which cannot parse it.

    Raises:
        ValueError: if the tokens before '<eof>' do not spell an integer.
    """
    digits = []
    for index in encoded_num:
        token = vocab_list[index]
        if token == '<eof>':
            break
        digits.append(token)
    return int(''.join(digits))

In [151]:
# Encode a short sample; the trailing '<eof>' is an ordinary whitespace-
# separated token, so it is looked up in the vocabulary like any word.
sample_text = "bốn bốn <eof>"
encoded_text = encode_text(sample_text)
encoded_text

[8, 8, 0]

In [152]:
decode_text(encoded_text)

'bốn bốn <eof>'

In [153]:
sample_int = 123
encoded_int = encode_int(sample_int)    
encoded_int

[3, 5, 7, 0]

In [154]:
def encode(value: "int | str"):
    """Encode an int or a text string into a list of token ids.

    Ints are dispatched to `encode_int`, strings to `encode_text`.

    Raises:
        ValueError: if `value` is neither an int nor a str. bool is rejected
            as well, even though it is a subclass of int.
    """
    # bool passes isinstance(..., int); encoding True/False as a number is
    # never intended, so reject it up front with the same error as other types.
    if isinstance(value, bool):
        raise ValueError("Invalid type")
    if isinstance(value, int):
        return encode_int(value)
    if isinstance(value, str):
        return encode_text(value)
    raise ValueError("Invalid type")

def decode(encoded_value):
    """Decode a sequence of token ids into an int or a text string.

    Encoded ints and encoded text are both lists of int ids, so the element
    type cannot tell them apart. The decoded tokens are inspected instead:
    if every token before the first '<eof>' is a decimal digit, the value is
    decoded as an int; otherwise it is decoded as text.

    Raises:
        ValueError: if `encoded_value` is empty or contains non-int items.
    """
    ids = list(encoded_value)
    if not ids:
        raise ValueError("Cannot decode an empty sequence")
    if not all(isinstance(index, int) for index in ids):
        raise ValueError("Invalid type")

    tokens = [vocab_list[index] for index in ids]
    # Only the tokens in front of the terminator carry the value.
    payload_len = tokens.index('<eof>') if '<eof>' in tokens else len(tokens)
    payload = tokens[:payload_len]

    if payload and all(token.isdecimal() for token in payload):
        # Hand over just the digit ids; the terminator is not parseable by int().
        return decode_int(ids[:payload_len])
    return decode_text(ids)

In [155]:
import torch
import numpy as np

BLOCK_SIZE = 10


def pad_tensor(t, size = BLOCK_SIZE):
    """Right-pad a 1-D sequence of token ids with zeros up to `size`.

    Args:
        t: sequence (list or tensor) of integer token ids.
        size: target length; defaults to BLOCK_SIZE.

    Returns:
        A 1-D torch.long tensor of length `size`.

    Raises:
        ValueError: if `t` is longer than `size`.
    """
    # Force an integer dtype: an empty list would otherwise become a float
    # tensor and turn the padded result into floats.
    ids = torch.as_tensor(t, dtype=torch.long)
    pad_size = size - ids.size(0)
    if pad_size < 0:
        raise ValueError(
            f"sequence of length {ids.size(0)} does not fit into size {size}"
        )
    # new_zeros keeps the padding on the same dtype and device as the ids.
    return torch.cat([ids, ids.new_zeros(pad_size)])

def get_batch(data, batch_size=32):
    """Sample a random batch of (encoded text, encoded index) pairs.

    Indices into `data` are drawn with np.random.choice. For every drawn
    index i, the input row is the padded encoding of data[i] and the target
    row is the padded encoding of int(i).

    Returns:
        (xb, yb): two stacked tensors with one padded row per sample.
    """
    sampled_ids = np.random.choice(len(data), batch_size)
    inputs, targets = [], []
    for sample_id in sampled_ids:
        inputs.append(pad_tensor(encode(data[sample_id])))
        targets.append(pad_tensor(encode(int(sample_id))))
    return torch.stack(inputs), torch.stack(targets)
    

In [156]:
get_batch(lines,2)

(tensor([[17,  1, 19,  2,  4,  0,  0,  0,  0,  0],
         [15,  1,  8,  2,  4,  0,  0,  0,  0,  0]]),
 tensor([[18, 10,  5,  0,  0,  0,  0,  0,  0,  0],
         [16,  9,  5,  0,  0,  0,  0,  0,  0,  0]]))

In [157]:
class NaiveModel(torch.nn.Module):
    """Per-token baseline: embed each id, then project to vocabulary logits.

    Every position is processed independently; nothing mixes information
    across positions.

    Args:
        vocab_size: number of distinct token ids.
        embed_dim: width of the embedding (default 64).
    """

    def __init__(self, vocab_size, embed_dim=64):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map integer ids `x` to logits with a trailing vocab_size dimension."""
        return self.fc(self.embedding(x))

In [158]:
class MyModel(torch.nn.Module):
    """Toy two-input model: sum of two embeddings followed by a linear head.

    The source ids and the target-side ids are embedded with separate tables,
    added element-wise and projected to vocabulary logits. Each position is
    handled on its own; nothing mixes information across positions.

    Args:
        vocab_size: number of distinct token ids (shared by both inputs).
        embed_dim: width of both embeddings (default 64).
    """

    def __init__(self, vocab_size, embed_dim=64):
        super().__init__()
        self.encoder_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.decoder_embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, vocab_size)

    def forward(self, x, y):
        """Return logits for id tensors `x` and `y`.

        The two embeddings are added, so `x` and `y` must have the same
        shape (or shapes that broadcast against each other).
        """
        source = self.encoder_embedding(x)
        target = self.decoder_embedding(y)
        return self.fc(source + target)

In [159]:
m = MyModel(len(vocab_list))

In [160]:
# Smoke test: draw a batch of two pairs, run one forward pass and check that
# the logits gain a trailing vocabulary dimension on top of the input shape.
xb, yb = get_batch(lines, 2)
print(xb.shape, yb.shape)
logits = m(xb, yb)
logits.shape

torch.Size([2, 10]) torch.Size([2, 10])


torch.Size([2, 10, 27])

In [161]:
import torch.nn.functional as F

def loss_fnc(logits, targets):
    """Mean cross-entropy between per-position logits and target ids.

    Args:
        logits: float tensor of shape (B, T, C).
        targets: integer tensor holding B*T class ids (typically shape (B, T)).

    Returns:
        Scalar loss tensor averaged over all B*T positions. No id is
        excluded, so padded positions contribute to the average as well.
    """
    B, T, C = logits.shape
    # reshape (not view) so non-contiguous inputs, e.g. sliced or transposed
    # tensors, are accepted instead of raising.
    flat_logits = logits.reshape(B * T, C)
    flat_targets = targets.reshape(B * T)
    return F.cross_entropy(flat_logits, flat_targets)

In [162]:
loss_fnc(logits, yb)

tensor(3.8916, grad_fn=<NllLossBackward0>)

In [163]:
s = [1,2,3]
s[-10:]

[1, 2, 3]

In [164]:
def generate(model, prompt_idx, idx, max_tokens=10, block_size=None):
    """Sample up to `max_tokens` new token ids, one position at a time.

    Args:
        model: callable as model(prompt_idx, decoder_idx) returning logits of
            shape (B, block_size, vocab).
        prompt_idx: (B, block_size) tensor of source token ids.
        idx: (B, T) integer tensor of ids generated so far; T may be 0.
        max_tokens: upper bound on the number of tokens to append.
        block_size: width the decoder input is padded to; defaults to the
            module-level BLOCK_SIZE.

    Returns:
        `idx` with the sampled ids appended along dim 1. Generation stops
        early once the sequence fills the block, so the result is never
        wider than `block_size` when generation starts inside the block.
    """
    if block_size is None:
        block_size = BLOCK_SIZE

    # Remember the current mode so sampling does not leave the model in eval
    # mode for whatever runs next (e.g. further training).
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_tokens):
                B, T = idx.shape
                if T >= block_size:
                    # No free slot left; padding would need a negative size.
                    break
                # Fill the not-yet-generated slots with zeros, on idx's
                # dtype and device.
                padding = idx.new_zeros((B, block_size - T))
                filled_idx = torch.cat([idx, padding], dim=1)
                # Read the logits of slot T, the position being generated.
                # The last slot is padding on every step but the final one,
                # so indexing with -1 would sample from a distribution that
                # ignores everything generated so far.
                logits = model(prompt_idx, filled_idx)[:, T, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=1)
    finally:
        model.train(was_training)
    return idx

In [165]:
decode

<function __main__.decode(encoded_value)>

In [182]:
xb

tensor([[ 8,  1, 19,  2, 22,  0,  0,  0,  0,  0],
        [20,  1, 11,  2,  0,  0,  0,  0,  0,  0]])

In [190]:
# Start from an empty sequence per prompt row. The batch dimension is taken
# from xb so the call keeps working when the prompt batch size changes.
output = generate(m, xb, xb.new_zeros((xb.size(0), 0)))
output

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [191]:
decode_text(output[0].tolist())

'<eof> <eof> <eof> <eof> <eof> <eof> <eof> <eof> <eof> <eof>'

In [181]:
def train(model, data, epochs=10, batch_size=32, lr=0.001):
    """Run `epochs` optimisation steps of Adam on randomly sampled batches.

    Each iteration draws one fresh batch with `get_batch`, so an "epoch" here
    is a single gradient step rather than a full pass over `data`.

    Args:
        model: module called as model(xb, yb) that returns (B, T, C) logits.
        data: sequence passed through to `get_batch`.
        epochs: number of gradient steps.
        batch_size: number of samples per step.
        lr: Adam learning rate.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # The model may have been left in eval mode by an earlier inference call;
    # make sure training-only behaviour is enabled before optimising.
    model.train()
    for epoch in range(epochs):
        xb, yb = get_batch(data, batch_size)
        optimizer.zero_grad()
        # NOTE(review): yb is fed to the model AND used as the target for the
        # same positions, so the model can lower the loss just by copying its
        # second input - TODO confirm this is intended; the usual setup feeds
        # the targets shifted right by one position.
        logits = model(xb, yb)
        loss = loss_fnc(logits, yb)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [188]:
# 200 single-batch steps on the existing model `m`; train() prints one loss
# line per step.
# NOTE(review): the logged loss already starts near 0.003, so this output
# presumably comes from a re-run on an already trained model - confirm.
train(m, lines, epochs=200, batch_size=32, lr=0.001)

Epoch 1, Loss: 0.0027180525939911604
Epoch 2, Loss: 0.0029453644528985023
Epoch 3, Loss: 0.0028255023062229156
Epoch 4, Loss: 0.0026756450533866882
Epoch 5, Loss: 0.0021724016405642033
Epoch 6, Loss: 0.002179615432396531
Epoch 7, Loss: 0.0020899134688079357
Epoch 8, Loss: 0.0018901836592704058
Epoch 9, Loss: 0.0019376942655071616
Epoch 10, Loss: 0.0016977936029434204
Epoch 11, Loss: 0.0016265762969851494
Epoch 12, Loss: 0.0017621457809582353
Epoch 13, Loss: 0.0013842929620295763
Epoch 14, Loss: 0.0013926068786531687
Epoch 15, Loss: 0.0012404314475134015
Epoch 16, Loss: 0.001330675557255745
Epoch 17, Loss: 0.0010044557275250554
Epoch 18, Loss: 0.0011635266710072756
Epoch 19, Loss: 0.0011318838223814964
Epoch 20, Loss: 0.0009333678754046559
Epoch 21, Loss: 0.0009471686789765954
Epoch 22, Loss: 0.000985773280262947
Epoch 23, Loss: 0.0007857525488361716
Epoch 24, Loss: 0.0009091047686524689
Epoch 25, Loss: 0.0008945918525569141
Epoch 26, Loss: 0.000887092319317162
Epoch 27, Loss: 0.0008224