In [1]:
### Prepare Dataset
import os

# Accumulator for every cleaned word read from the dataset files
S = list()

# Directory that holds the raw word lists, and the entries found inside it
data_path = 'data/'
file_paths = os.listdir(data_path)

def clean_word(word):
    """Return `word` reduced to its alphabetic characters, lower-cased.

    Every character for which ``str.isalpha`` is false (digits, punctuation,
    whitespace, ...) is dropped, so the result may be an empty string.
    """
    letters = [ch for ch in word if ch.isalpha()]
    return ''.join(letters).lower()

# Read every regular file in the data directory and collect its cleaned lines.
for file_path in file_paths:
    full_path = os.path.join(data_path, file_path)
    # os.listdir also returns sub-directories (e.g. notebook checkpoint
    # folders); trying to open one would raise, so skip anything that is
    # not a regular file.
    if not os.path.isfile(full_path):
        continue
    # Decode explicitly instead of relying on the platform's locale default,
    # which would make the vocabulary differ between machines.
    # NOTE(review): assumes the word lists are stored as UTF-8 - confirm.
    with open(full_path, 'r', encoding='utf-8') as file:
        words = [ clean_word(word) for word in file.read().splitlines() ]
        S.extend(words)
    # No explicit close(): leaving the with-block already closes the file.

# Drop empty strings (lines that contained no letters), de-duplicate, and sort.
S = sorted(filter(None, set(S)))

print(f'total words: {len(S)}')
# default=0 keeps the summary from raising ValueError when no words were loaded
print(f'minimum length: {min((len(w) for w in S), default=0)}')
print(f'maximum length: {max((len(w) for w in S), default=0)}')

total words: 27466
minimum length: 2
maximum length: 16


In [2]:
### Bigrams
# B maps each (character, following character) pair to the number of times it
# occurs; '.' is padded onto both ends of every word as a boundary marker.
B = {}
for w in S:
    padded = '.' + w + '.'
    for bigram in zip(padded, padded[1:]):
        B[bigram] = B.get(bigram, 0) + 1

# Ten most frequent bigrams, highest count first
sorted(B.items(), key=lambda kv: kv[1], reverse=True)[:10]

[(('a', '.'), 7377),
 (('a', 'n'), 4350),
 (('a', 'r'), 3579),
 (('n', '.'), 3153),
 (('r', 'i'), 3059),
 (('.', 'a'), 3004),
 (('e', '.'), 2978),
 (('m', 'a'), 2824),
 (('n', 'a'), 2816),
 (('.', 's'), 2806)]

In [3]:
### Simple encoders and decoders
# Alphabet: every distinct character that appears in the word list, sorted
chars = sorted(set(''.join(S)))
# Character -> integer id; the alphabet is numbered from 1 upwards ...
stoi = dict(zip(chars, range(1, len(chars) + 1)))
# ... because id 0 is reserved for the '.' boundary marker
stoi['.'] = 0
# Integer id -> character (inverse lookup)
itos = dict(zip(stoi.values(), stoi.keys()))
len(stoi)

48

In [5]:
### Create tensor to represent bigrams in a matrix
import torch

# Size the matrix from the vocabulary instead of a literal 48, so a dataset
# with a different number of distinct characters neither indexes out of range
# nor leaves unused rows/columns.
vocab_size = len(stoi)
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

# N[i, j] counts how often the character with id j directly follows the
# character with id i ('.' pads both ends of every word).
for w in S:
    chs = list('.' + w + '.')
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [6]:
# Preview rows 0 and 1 of the count matrix: row 0 holds how often each
# character follows '.' (i.e. starts a word), row 1 how often each character
# follows the first character of the sorted alphabet.
N[:2]

tensor([[   0, 3004, 1144,  854,  905, 1440,  751,  866, 1289,  729, 1292, 1298,
         1178, 2632, 1259,  317,  745,   59, 1378, 2806, 1212,  201,  766,  361,
           62,  339,  454,   15,    0,    4,   25,    1,    1,    0,    4,    0,
            0,    3,    0,    0,    6,    0,   28,    9,    0,   11,    0,   18],
        [7377,  489,  685,  191,  917,  221,  265,  295,  742,  916,  356,  474,
         1685, 1357, 4350,   84,  144,   51, 3579, 1222, 1068,  290,  323,  222,
           53,  459,  355,    0,    0,    0,    0,    0,    0,    0,    5,    0,
            5,    0,    1,    4,    0,    0,    0,    0,    1,    0,    0,    0]],
       dtype=torch.int32)

In [13]:
### Prepare training set
# Each sample pairs the id of a character with the id of the character that
# follows it; '.' is padded onto both ends of every word.
pairs = [
    (stoi[prev], stoi[nxt])
    for w in S
    for prev, nxt in zip('.' + w, w + '.')
]

# Inputs (current character ids) and targets (next character ids)
X = torch.tensor([prev_id for prev_id, _ in pairs])
Y = torch.tensor([next_id for _, next_id in pairs])
n_samples = X.shape[0]

In [15]:
# Sanity check: there must be exactly one target per input
print(f'inputs: {n_samples}', f'targets: {len(Y)}', sep='\n')

inputs: 199688
targets: 199688


In [32]:
### Simple neural network with only one layer
import torch.nn.functional as F

# prepare weights for the layer
# One row per input character id and one column per output character id.
# The size follows the vocabulary rather than a literal 48 so the layer keeps
# matching the encoders if the dataset changes.
vocab_size = len(stoi)
# Fixed seed so the initial weights are reproducible between runs
G = torch.Generator().manual_seed(909078)
W = torch.randn((vocab_size, vocab_size), generator=G, requires_grad=True)

In [33]:
### Training phase
# Number of training epochs. Set to 100 to agree with the 100 iterations the
# training loop performs and with the logged losses, which end at epoch 100
# (the previous value of 200 did not describe the run that was recorded).
E = 100 # epochs
L = 50 # learning rate
# Weight of the mean-squared-weights penalty added to the loss (regularisation)
softening_rate = 0.1

In [34]:
# The one-hot inputs never change between epochs, so encode them once instead
# of rebuilding the full matrix on every iteration. The class count is read
# from W so the matrix product below always has matching shapes.
xenc = F.one_hot(X, num_classes=W.shape[0]).float()
sample_rows = torch.arange(n_samples)

# The epoch count comes from the E hyperparameter instead of a literal, so the
# configuration cell is the single place that controls training length.
for e in range(1, E + 1):
    # forward pass
    logits = xenc @ W
    # softmax activation: exponentiate, then normalise every row to sum to 1
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    # mean negative log likelihood of the true next characters, plus a
    # penalty on the mean squared weight scaled by softening_rate
    loss = -probs[sample_rows, Y].log().mean() + softening_rate * (W**2).mean()
    if e % 10 == 0:
        print(f'epoch: {e}, loss: {loss.item()}')

    # backward pass
    W.grad = None
    loss.backward()

    # update weights (under no_grad so the step itself is not tracked by
    # autograd; this replaces writing through the discouraged W.data)
    with torch.no_grad():
        W -= L * W.grad

epoch: 10, loss: 3.1465213298797607
epoch: 20, loss: 2.923682689666748
epoch: 30, loss: 2.841978073120117
epoch: 40, loss: 2.8001396656036377
epoch: 50, loss: 2.775259256362915
epoch: 60, loss: 2.7589099407196045
epoch: 70, loss: 2.7473530769348145
epoch: 80, loss: 2.7387678623199463
epoch: 90, loss: 2.7321460247039795
epoch: 100, loss: 2.726881265640259


In [40]:
### Testing the neural net

# This seed needs to be changed
# if different result is expected
seed = 40
G = torch.Generator().manual_seed(seed)

# Width of the one-hot input, taken from the weight matrix instead of a
# literal 48 so sampling keeps working if the vocabulary size changes.
n_classes = W.shape[0]

# Sampling needs no gradients; without no_grad every step would be recorded
# by autograd because W requires grad.
with torch.no_grad():
    for _ in range(10):
        word = []
        i = 0  # id 0 is the '.' boundary marker, so every word starts from it
        while True:
            # same forward pass as in training, for a single character
            xenc = F.one_hot(torch.tensor([i]), num_classes=n_classes).float()
            logits = xenc @ W
            counts = logits.exp()
            prob = counts / counts.sum(1, keepdim=True)

            # draw the next character id from the predicted distribution
            i = torch.multinomial(prob, num_samples=1, replacement=True, generator=G).item()
            word.append(itos[i])
            # drawing '.' again marks the end of the word (the '.' is kept)
            if i == 0:
                break
        word = ''.join(word)
        print(f'generated word: {word}')

generated word: adeva.
generated word: la.
generated word: terananarif.
generated word: arerilard.
generated word: klúez.
generated word: ra.
generated word: uvinnn.
generated word: shieelieyardysa.
generated word: if.
generated word: maria.
