In [None]:
### Prepare Dataset
import os

# Accumulator for every cleaned word gathered from the data files
S = list()

# Directory containing the raw word lists, and the entry names found in it
data_path = '../data/'
file_paths = os.listdir(data_path)

def clean_word(word):
    """Normalise a raw word for the dataset.

    Keeps only alphabetic characters (digits, punctuation and whitespace
    are dropped) and lower-cases the result.

    Args:
        word: raw string, e.g. one line read from a data file.

    Returns:
        The cleaned string; empty if `word` contains no alphabetic characters.
    """
    # Spaces are not alphabetic, so the isalpha filter already removes them;
    # no separate replace(' ', '') step is needed.
    return ''.join(ch for ch in word if ch.isalpha()).lower()

for file_path in file_paths:
    full_path = os.path.join(data_path, file_path)
    # os.listdir also returns sub-directories; open() would fail on those
    if not os.path.isfile(full_path):
        continue
    # Explicit encoding so the words decode the same way on every platform
    # (NOTE(review): assumes the data files are UTF-8 - confirm).
    # The `with` block closes the file, so no manual close() is needed.
    with open(full_path, 'r', encoding='utf-8') as file:
        S.extend(clean_word(word) for word in file.read().splitlines())

# Deduplicate, drop words that were cleaned down to '', and sort
S = sorted(filter(None, set(S)))

# default=0 keeps the summary from raising ValueError when no words were loaded
print(f'total words: {len(S)}')
print(f'minimum length: {min((len(w) for w in S), default=0)}')
print(f'maximum length: {max((len(w) for w in S), default=0)}')

In [None]:
### Bigrams
# Map each (previous char, next char) pair to how often it occurs in S
B = {}
for w in S:
    # '.' marks both the start and the end of a word
    padded = '.' + w + '.'
    for bigram in zip(padded, padded[1:]):
        B[bigram] = B.get(bigram, 0) + 1

# Ten most frequent bigrams (stable sort: ties keep insertion order)
sorted(B.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
### encoders and decoders
# Every distinct character that appears in the word list, in sorted order
chars = sorted(set(''.join(S)))
# Characters get indices 1..len(chars); index 0 is reserved for the '.' marker
stoi = {s: i for i, s in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup: index -> character
itos = {i: s for s, i in stoi.items()}
len(stoi)

In [None]:
### Create tensor to represent bigrams as a matrix
import torch

# Size the count matrix from the vocabulary instead of a hard-coded 48,
# so a different dataset cannot index out of bounds.
vocab_size = len(stoi)
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

# N[i, j] counts how often character j directly follows character i
for w in S:
    chs = '.' + w + '.'
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [None]:
# Show the first two rows of the bigram count matrix
# (row 0 holds the counts of characters that follow the '.' marker)
N[:2]

In [None]:
### Prepare training set
# Inputs are the first character of every bigram, targets the second
xs, ys = [], []

for w in S:
    # Encode the padded word once, then pair each index with its successor
    ids = [stoi[ch] for ch in '.' + w + '.']
    xs += ids[:-1]
    ys += ids[1:]

X = torch.tensor(xs)
Y = torch.tensor(ys)
n_samples = X.shape[0]

In [None]:
# X and Y are appended to in lockstep, so both counts should be equal
print(f'inputs: {n_samples}')
print(f'targets: {len(Y)}')

In [None]:
### Simple neural network with only one layer
import torch.nn.functional as F

# Prepare the weights of the single layer: one row per input character and
# one column per output character. The size comes from the vocabulary rather
# than a hard-coded 48, and the seeded generator makes the init reproducible.
G = torch.Generator().manual_seed(909078)
vocab_size = len(stoi)
W = torch.randn((vocab_size, vocab_size), generator=G, requires_grad=True)

In [None]:
### Training phase
E = 200 # epochs
L = 50 # learning rate (step size of the gradient-descent update on W)
softening_rate = 0.1 # weight of the mean(W**2) penalty added to the loss

In [None]:
# The one-hot inputs and the row indices never change between epochs,
# so build them once instead of on every iteration.
# num_classes follows W's shape instead of a hard-coded 48.
xenc = F.one_hot(X, num_classes=W.shape[0]).float()
rows = torch.arange(n_samples)

# Train for the configured number of epochs (E) rather than a literal 100
for e in range(1, E + 1):
    # forward pass
    logits = xenc @ W
    # log-softmax is the numerically stable form of log(exp(x) / sum(exp(x)));
    # exponentiating the raw logits can overflow with a large learning rate
    log_probs = F.log_softmax(logits, dim=1)
    # negative log likelihood of the targets + L2 penalty on the weights
    loss = -log_probs[rows, Y].mean() + softening_rate * (W**2).mean()
    if e % 10 == 0:
        print(f'epoch: {e}, loss: {loss.item()}')

    # backward pass
    W.grad = None
    loss.backward()

    # update weights; no_grad keeps the in-place step out of the autograd graph
    with torch.no_grad():
        W -= L * W.grad

In [None]:
### Testing the neural net

# Change this seed to sample a different set of words;
# a fixed seed makes the generated words reproducible.
seed = 40
G = torch.Generator().manual_seed(seed)

# Vocabulary size follows the weight matrix rather than a hard-coded 48
vocab_size = W.shape[0]

# Sampling needs no gradients, so skip building the autograd graph
with torch.no_grad():
    for _ in range(10):
        word = []
        i = 0  # index 0 is '.', the start/end marker
        while True:
            xenc = F.one_hot(torch.tensor([i]), num_classes=vocab_size).float()
            logits = xenc @ W
            # numerically stable softmax over the next-character logits
            prob = F.softmax(logits, dim=1)

            i = torch.multinomial(prob, num_samples=1, replacement=True, generator=G).item()
            word.append(itos[i])
            # sampling the '.' marker ends the word (the marker is kept in the output)
            if i == 0:
                break
        word = ''.join(word)
        print(f'generated word: {word}')