In [1]:
import torch
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [46]:
from torch import Tensor


def readFileSplitByLine(name: str) -> list[str]:
    words = open(name, 'r').read().splitlines()
    return words


def sToI(chars: list[str]) -> dict[str, int]:
   stoi = { '.' : 0 }
   for i, ch in enumerate(chars):
       stoi[ch] = i + 1
   return stoi


def iToS(stoi: dict[str, int]) -> dict[int, str]:
    return {i:s for s,i in stoi.items()}


def buildDataSet(words: list[str], 
                 contextSize: int, 
                 stoi: dict[str, int], 
                 itos: dict[int, str]) -> tuple[Tensor, Tensor]:
    X: list[list[int]] = []
    Y: list[int] = []
    for w in words:
        context = [0] * contextSize
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix]
    return torch.tensor(X), torch.tensor(Y)

def makeNetwork(seed: int, 
                vocabularySize: int, 
                embeddingSize: int, 
                contextSize: int, 
                hiddenLayerSize: int):
    g = torch.Generator().manual_seed(seed)
    C = torch.randn((vocabularySize, embeddingSize), generator = g)
    W1 = torch.randn((embeddingSize * contextSize, hiddenLayerSize), generator = g)
    b1 = torch.randn(hiddenLayerSize, generator = g)
    W2 = torch.randn((hiddenLayerSize, vocabularySize), generator = g)
    b2 = torch.randn(vocabularySize, generator = g) 
    parameters = C, W1, b1, W2, b2
    for p in parameters:
        p.requires_grad = True
    return parameters

def backwardPass(parameters: list[Tensor], loss: Tensor) -> None:
  for p in parameters:
    p.grad = None
  loss.backward()

In [58]:
rndSeed = 42
rndSeed2 = 2147483647
contextSize = 3
trRatio = 0.8
devRatio = 0.9
embeddingSize = 10 # the dimensionality of the character embedding vectors
hiddenLayerSize = 200 # the number of neurons in the hidden layer of the MLP
maxTrainingSteps = 20_0000
trainingBatchSize = 32

words = readFileSplitByLine('names.txt')[:100]
random.seed(rndSeed)
random.shuffle(words)
print('first few words:\n', words[:5])

lenWords = len(words);
print('lenWords:\n', lenWords)

allPossibleChars = sorted(list(set(''.join(words))))
print('allPossibleChars:\n', allPossibleChars)

stoi = sToI(allPossibleChars)
print('stoi:\n', stoi)

itos = iToS(stoi)
print('itos:\n', itos)

vocabularySize = len(itos)
print('vocabularySize:\n', vocabularySize)

lenTrain = int(trRatio * lenWords)
trWords = words[:lenTrain]
trX, trY = buildDataSet(trWords, contextSize, stoi, itos)

lenDev = int(devRatio * lenWords)
devWords = words[lenTrain:lenDev];
devX, devY = buildDataSet(devWords, contextSize, stoi, itos)

lenTest = lenDev - lenTrain
tstWords = words[lenDev:]
tstX, tstY = buildDataSet(tstWords, contextSize, stoi, itos)

print('lenTrain, lenDev, lenTest:\n', lenTrain, lenDev, lenTest)

parameters = makeNetwork(rndSeed2, vocabularySize, embeddingSize, contextSize, hiddenLayerSize)
C, W1, b1, W2, b2 = parameters
print('parametersCount:\n', sum(p.nelement() for p in parameters))

lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre
lri = []
lossi = []
stepi = []

def forwardPass() -> Tensor:
  emb = C[trX[ix]]
  h = torch.tanh(emb.view(-1, embeddingSize * contextSize) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, trY[ix])
  #if loss.item() % trainingBatchSize:
  #  print(loss.item())
  return loss

for i in range(trainingBatchSize):    
  
  # minibatch construct
  ix = torch.randint(0, trX.shape[0], (trainingBatchSize,))

  loss = forwardPass()  
  backwardPass(list(parameters), loss)
  
  # update
  #lr = lrs[i]
  lr = 0.1 if i < 100000 else 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  #lri.append(lre[i])
  stepi.append(i)
  lossi.append(loss.log10().item())

print('emb.shape:\n', emb.shape)
print('h.shape:\n', h.shape)
print('logits.shape:\n', logits.shape);
print('loss:\n', loss.item());

first few words:
 ['violet', 'hazel', 'emery', 'evelyn', 'elena']
lenWords:
 100
allPossibleChars:
 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
stoi:
 {'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
itos:
 {0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
vocabularySize:
 27
lenTrain, lenDev, lenTest:
 80 90 10
parametersCount:
 11897
emb.shape:
 torch.Size([32, 3, 10])
h.shape:
 torch.Size([32, 200])
logits.shape:
 torch.Size([32, 27])
loss:
 9.826557159423828
