In [1]:
names = open('names.txt', 'r').read().splitlines()

In [5]:
# set of all characters mapped with index
letters = sorted(list(set("".join(names))))
letters = ['.'] + letters
letters_to_index = {letter: i for i, letter in enumerate(letters)}
index_to_letters = {i: letter for letter, i in letters_to_index.items()}

print(f"letters to index: {letters_to_index}")
print(f"index to letters: {index_to_letters}")
print()

# set of all bigrams of characters mapped with index
bigrams = []
for l1 in letters:
  for l2 in letters:
    bigram = l1 + l2
    bigrams.append(bigram)

bigrams_to_index = {bigram : index for index, bigram in enumerate(bigrams)}
index_to_bigrams = {index : bigram for index, bigram in enumerate(bigrams)}

print(f"bigrams to index: {bigrams_to_index}")
print(f"index to bigrams: {index_to_bigrams}")

letters to index: {'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
index to letters: {0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}

bigrams to index: {'..': 0, '.a': 1, '.b': 2, '.c': 3, '.d': 4, '.e': 5, '.f': 6, '.g': 7, '.h': 8, '.i': 9, '.j': 10, '.k': 11, '.l': 12, '.m': 13, '.n': 14, '.o': 15, '.p': 16, '.q': 17, '.r': 18, '.s': 19, '.t': 20, '.u': 21, '.v': 22, '.w': 23, '.x': 24, '.y': 25, '.z': 26, 'a.': 27, 'aa': 28, 'ab': 29, 'ac': 30, 'ad': 31, 'ae': 32, 'af': 33, 'ag': 34, 'ah': 35, 'ai': 36, 'aj': 37, 'ak': 38, 'al': 39, 'am': 40, 'an': 41, 'ao': 42, 'ap': 43, 'aq': 44, 'ar': 45, 'as': 46, 'at': 47, 'au': 

# COUNTING

### TRAIN THE TRI-GRAM LANGUAGE MODEL WITH USE OF THE MATRIX

In [7]:
import torch

In [8]:
# create the count tensor
count_tensor = torch.zeros(size=(len(bigrams), len(letters)), dtype=torch.int32)
for name in names:
  chars = ['.'] + ['.'] + list(name) + ['.']
  for char1, char2, char3 in zip(chars, chars[1:], chars[2:]):
    bigram = char1 + char2
    bigram_index = bigrams_to_index[bigram]
    char_index = letters_to_index[char3]
    count_tensor[bigram_index, char_index] +=1


# create the probability tensor, adding fake counts to smooth out the model
prob_tensor = count_tensor.float() + 0.5
prob_tensor /= torch.sum(prob_tensor, dim=1, keepdim=True)


# sample from the probability distribution
# generator for the sampling from distribution
arg_generator = torch.Generator().manual_seed(2147483647)

# sample 200 names
for _ in range(200):
  sampled_char_list = []
  sampled_bigram_idx = 0 # starting at ..

  while True:
    # first char of the next bigram
    next_char1 = index_to_bigrams[sampled_bigram_idx][1]

    # sampled the next char of the name with the given distribution of bigram
    row = prob_tensor[sampled_bigram_idx]
    next_char2_idx = torch.multinomial(row, num_samples=1,
                                    replacement=True,
                                    generator=arg_generator).item()
    # second char of the next bigram (also next char of the name)
    next_char2 = index_to_letters[next_char2_idx]

    # the next bigram and index to get distribution of that bigram
    next_bigram = next_char1 + next_char2
    sampled_bigram_idx = bigrams_to_index[next_bigram]

    if next_char2_idx != 0:
      sampled_char_list.append(next_char2)
    else:
      # if next char is . (idx = 0), we can stop
      break

  sampled_name = "".join(sampled_char_list)
  print(sampled_name)


junide
jakasid
prelay
adin
kairritoper
sathen
sameia
yanileniassibduinewin
lessiyanayla
te
farmanthyfortumj
ponn
lena
jaylicore
ya
jocken
jamilyn
korin
wyn
ne
gadsnhavi
monselladdon
mathani
zie
paun
ty
tin
sreli
ish
dyn
rumel
jemah
dawata
kha
cra
raydnh
adorta
malyn
brey
aur
lavarocbzthemiraya
ath
basely
tavisonikiyaalee
marlen
em
fabethellianten
chan
jazaody
drd
johialiypvrgia
tezra
elia
vywhqelvani
sahimah
kellette
braceodon
ali
alian
denn
jayannyah
kennelynkmarianner
samotan
kyroderihana
shday
ta
olleah
terty
breus
dasia
na
chlynevini
aspi
katalilondral
fanmari
mishama
verykongeon
resynivion
uzien
jalivyah
alto
marafelanaylian
ohanorrisyli
mson
jorayitalkalviyaneria
maulil
cacvatavryaaleximrla
sa
pre
azaikeri
khanna
abbiha
isa
brailyorv
bradeelyn
zyarierri
chamadayda
fbalyah
meyton
za
sabdin
han
kennasslen
conik
ny
naaston
tisona
isemran
wwlixie
kadleekam
ljaykine
findesiyah
lula
jus
b
den
hawyqsemilair
hasiam
na
adiamordelamiquadilah
an
keika
viyah
ax
mandt
tyjamanalingh
jepekala
k

In [12]:
# the loss function (average negative log likelihood) of this model
log_likelihood = 0.0
count = 0
for name in names:
  chars = ['.'] + ['.'] + list(name) + ['.']
  for char1, char2, char3 in zip(chars, chars[1:], chars[2:]):
    bigram = char1 + char2
    bigram_index = bigrams_to_index[bigram]
    letter_index = letters_to_index[char3]
    prob = prob_tensor[bigram_index, letter_index]
    # this one takes the logarithm of the tensor
    log_likelihood += torch.log(prob)
    count += 1

loss = abs(log_likelihood) / count
print(f"Loss: {loss.item()}")

Loss: 2.200744867324829


=> The loss of the trigram model (as measured by the average negative log likehood) is less than that of the bigram model.

#NEURAL NETWORK

### TRAIN THE TRIGRAM LANGUAGE MODEL

In [13]:
import torch
from torch.nn.functional import one_hot, cross_entropy

In [None]:
# traning set of trigram
inputs, targets = [], []
count = 0

for name in names:
  chars = ['.'] + ['.'] + list(name) + ['.']
  for char1, char2, char3 in zip(chars, chars[1:], chars[2:]):
    bigram = char1 + char2
    index1 = bigrams_to_index[bigram]
    index2 = letters_to_i[char3]
    inputs.append(index1) # the bigram at this turn
    targets.append(index2) # the corresponding char after it
    count += 1

# turn the list into the tensor
inputs = torch.tensor(inputs)
targets = torch.tensor(targets)

print(inputs.shape)
print(targets.shape)
print(count)

torch.Size([228146])
torch.Size([228146])
228146


In [None]:
arg_generator = torch.Generator().manual_seed(2147483647)
weights = torch.randn((729, 27), generator=arg_generator, requires_grad=True)
logits_count = weights[inputs]
logits_count.shape

torch.Size([228146, 27])

In [None]:
def train_model(arg_inputs, arg_targets, arg_dev_inputs=None, arg_dev_targets=None):
  # intialize the weights for the neural net,
  # 1 layer (is dumb), 27 neurons (columns), each neuron 729 weights (rows)
  arg_generator = torch.Generator().manual_seed(2147483647)
  weights = torch.randn((729, 27), generator=arg_generator, requires_grad=True)

  for i in range(1000):
    # forward pass
    # calculate the softmax
    # index directly to the rows
    logits_count = weights[arg_inputs]
    # softmax the logits
    # count_tensor = torch.exp(logits_count)
    # prob_tensor = count_tensor / torch.sum(count_tensor, dim=1, keepdim=True)

    # calculate the loss (negative log likelihood)
    # loss = abs(prob_tensor[torch.arange(prob_tensor.shape[0]), arg_targets].log().mean())

    # using cross entropy
    loss = cross_entropy(logits_count, targets)
    print(f"Train loss {i+1}/1000: {loss.item()}")

    # calculate the dev loss (if any)
    if arg_dev_inputs != None:
      dev_logits_count = weights[arg_dev_inputs]
      # dev_count_tensor = torch.exp(dev_logits_count)
      # dev_prob_tensor = dev_count_tensor / torch.sum(dev_count_tensor, dim=1, keepdim=True)

      # calculate the loss (using negative log likelihood)
      # dev_loss = abs(dev_prob_tensor[torch.arange(dev_prob_tensor.shape[0]), arg_dev_targets].log().mean())

      # using cross entropy
      dev_loss = cross_entropy(dev_inputs, targets)
      print(f"Devel loss {i+1}/1000: {dev_loss.item()}")
      print()

    # backward pass (backprogagation comes into place)
    weights.grad = None # set to zero gradient
    loss.backward()

    # update the parameters of the weights
    weights.data += -100 * weights.grad

  # return the parameters of the model
  return weights

In [None]:
def sample_names(arg_weights):
  # sample 200 names from the probability distribution
  arg_generator = torch.Generator().manual_seed(2147483647)
  for _ in range(200):
    sampled_char_list = []
    sampled_idx = 0 # starting at ..

    while True:
      # first char of the next bigram
      next_char1 = index_to_bigrams[sampled_idx][1]

      # sampled the next char using forward pass of the neural net
      logits_count = arg_weights[sampled_idx]
      count_tensor = torch.exp(logits_count)
      prob = count_tensor / torch.sum(count_tensor, dim=0)
      next_char2_idx = torch.multinomial(prob, num_samples=1,
                                      replacement=True,
                                      generator=arg_generator).item()

      # second char of the next bigram (also next char of the name)
      next_char2 = i_to_letters[next_char2_idx]

      # the next bigram and index to get distribution of that bigram
      next_bigram = next_char1 + next_char2
      sampled_idx = bigrams_to_index[next_bigram]

      if next_char2_idx != 0:
        # if next char isn't ., add them to the name
        sampled_char_list.append(next_char2)
      else:
        # if next char is . (idx = 0), we can stop
        break

    sampled_name = "".join(sampled_char_list)
    print(sampled_name)

In [None]:
# train model using cross entropy loss
model_weights = train_model(inputs, targets)
sample_names(model_weights)

Train loss 1/1000: 3.792776346206665
Train loss 2/1000: 3.53816819190979
Train loss 3/1000: 3.417757511138916
Train loss 4/1000: 3.3257200717926025
Train loss 5/1000: 3.2489399909973145
Train loss 6/1000: 3.1840078830718994
Train loss 7/1000: 3.1282405853271484
Train loss 8/1000: 3.0796456336975098
Train loss 9/1000: 3.036648750305176
Train loss 10/1000: 2.998178005218506
Train loss 11/1000: 2.963472366333008
Train loss 12/1000: 2.931981086730957
Train loss 13/1000: 2.9032771587371826
Train loss 14/1000: 2.877018451690674
Train loss 15/1000: 2.8529207706451416
Train loss 16/1000: 2.8307430744171143
Train loss 17/1000: 2.810277223587036
Train loss 18/1000: 2.79133939743042
Train loss 19/1000: 2.7737646102905273
Train loss 20/1000: 2.7574081420898438
Train loss 21/1000: 2.7421391010284424
Train loss 22/1000: 2.7278409004211426
Train loss 23/1000: 2.7144131660461426
Train loss 24/1000: 2.7017662525177
Train loss 25/1000: 2.689824342727661
Train loss 26/1000: 2.6785190105438232
Train loss 

In [None]:
model_weights = train_model(inputs, targets)
sample_names(model_weights)

Train loss 1/1000: 3.792776346206665
Train loss 2/1000: 3.538167953491211
Train loss 3/1000: 3.417757511138916
Train loss 4/1000: 3.3257200717926025
Train loss 5/1000: 3.2489397525787354
Train loss 6/1000: 3.1840076446533203
Train loss 7/1000: 3.1282403469085693
Train loss 8/1000: 3.0796451568603516
Train loss 9/1000: 3.036648750305176
Train loss 10/1000: 2.998178005218506
Train loss 11/1000: 2.9634721279144287
Train loss 12/1000: 2.931981086730957
Train loss 13/1000: 2.9032769203186035
Train loss 14/1000: 2.8770182132720947
Train loss 15/1000: 2.8529207706451416
Train loss 16/1000: 2.8307430744171143
Train loss 17/1000: 2.810276985168457
Train loss 18/1000: 2.791339159011841
Train loss 19/1000: 2.7737646102905273
Train loss 20/1000: 2.7574081420898438
Train loss 21/1000: 2.7421388626098633
Train loss 22/1000: 2.7278409004211426
Train loss 23/1000: 2.7144129276275635
Train loss 24/1000: 2.7017667293548584
Train loss 25/1000: 2.689824104309082
Train loss 26/1000: 2.6785190105438232
Trai

#### Divide the dataset into smallet sets & train only on the train set

In [None]:
# split the inputs and target into train set, dev set, and test set
train_split = int(0.8 * count)
dev_split = int(0.9 * count)
train_inputs, train_targets = inputs[:train_split], targets[:train_split]
dev_inputs, dev_targets = inputs[train_split : dev_split], targets[train_split : dev_split]
test_inputs, test_targets = inputs[dev_split :], targets[dev_split :]

# test the division
train_inputs.shape[0] + dev_inputs.shape[0] + test_inputs.shape[0] == inputs.shape[0]

True

In [None]:
# train the model on the train set, see how it goes with the dev set too
train_weights = train_model(train_inputs, train_targets)
sample_names(train_weights)

Train loss 1/1000: 3.7902348041534424
Train loss 2/1000: 3.522183895111084
Train loss 3/1000: 3.3927693367004395
Train loss 4/1000: 3.29388689994812
Train loss 5/1000: 3.212381362915039
Train loss 6/1000: 3.1441524028778076
Train loss 7/1000: 3.085684299468994
Train loss 8/1000: 3.0348353385925293
Train loss 9/1000: 2.990046739578247
Train loss 10/1000: 2.950261116027832
Train loss 11/1000: 2.9146273136138916
Train loss 12/1000: 2.8825061321258545
Train loss 13/1000: 2.853383779525757
Train loss 14/1000: 2.8268635272979736
Train loss 15/1000: 2.8026211261749268
Train loss 16/1000: 2.7803895473480225
Train loss 17/1000: 2.7599384784698486
Train loss 18/1000: 2.7410669326782227
Train loss 19/1000: 2.723595142364502
Train loss 20/1000: 2.707364797592163
Train loss 21/1000: 2.692234516143799
Train loss 22/1000: 2.6780827045440674
Train loss 23/1000: 2.6648027896881104
Train loss 24/1000: 2.652304172515869
Train loss 25/1000: 2.640507936477661
Train loss 26/1000: 2.629347085952759
Train los

In [None]:
# evaluate the parameters of the trained model on the other set
def evaluate_model(arg_weights, arg_inputs, arg_targets):
  logits_count = arg_weights[arg_inputs]
  count_tensor = torch.exp(logits_count)
  prob_tensor = count_tensor / torch.sum(count_tensor, dim=1, keepdim=True)

  # calculate the loss (negative log likelihood)
  loss = abs(prob_tensor[torch.arange(prob_tensor.shape[0]), arg_targets].log().mean())
  print(f"Loss: {loss.item()}")

In [None]:
print("For the dev set:")
evaluate_model(train_weights, dev_inputs, dev_targets)

print("For the test set:")
evaluate_model(train_weights, test_inputs, test_targets)

For the dev set:
Loss: 2.4274251461029053
For the test set:
Loss: 2.449005126953125


=> The loss for dev set and test set is much higher than that for the train set



### Tune the smoothing of the model on the dev set

In [None]:
train_weights.grad

tensor([[ 9.5886e-06,  3.7515e-07, -3.7217e-07,  ..., -3.9107e-07,
         -3.6552e-07, -4.1003e-07],
        [ 8.7005e-06, -4.3350e-07, -4.3295e-07,  ..., -5.4515e-07,
         -4.3455e-07, -4.3623e-07],
        [ 3.5884e-06, -5.6147e-06,  3.7776e-06,  ...,  2.7962e-06,
         -5.7857e-06,  2.1665e-06],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-4.3128e-06, -4.4627e-06,  2.3119e-06,  ...,  2.6989e-06,
          5.3450e-06,  2.7283e-06],
        [-5.2416e-06, -8.6936e-06,  3.1826e-06,  ...,  4.5518e-06,
         -1.2195e-05,  4.8728e-06]])

In [None]:
# train the obtained model on the train set on the dev set with the smoothing
def smooth_model(arg_weights, arg_inputs, arg_targets):
  arg_generator = torch.Generator().manual_seed(2147483647)

  # clone the current weights parameters to the other tensor
  smooth_weights = arg_weights.detach().clone()
  smooth_weights.requires_grad = True
  smooth_weights.grad = arg_weights.grad.clone()

  for i in range(500):
    # forward pass
    # calculate the softmax
    # index directly to the rows
    logits_count = smooth_weights[arg_inputs]
    count_tensor = torch.exp(logits_count)
    prob_tensor = count_tensor / torch.sum(count_tensor, dim=1, keepdim=True)

    # calculate the loss (negative log likelihood)
    dev_loss = abs(prob_tensor[torch.arange(prob_tensor.shape[0]), arg_targets].log().mean())
    # add the regularlization loss to smooth the model
    dev_lost += (0.1) * (smooth_weights ** 2).mean()
    print(f"Dev loss {i+1}/500: {dev_loss.item()}")

    # backward pass (backprogagation comes into place)
    dev_loss.backward()

    # update the parameters of the weights
    smooth_weights.data += -100 * smooth_weights.grad

  # return the parameters of the model
  return smooth_weights

In [None]:
smoothed_weights = smooth_model(train_weights, dev_inputs, dev_targets)

tensor(1.6314, grad_fn=<MeanBackward0>)
tensor([[ 9.5886e-06,  3.7515e-07, -3.7217e-07,  ..., -3.9107e-07,
         -3.6552e-07, -4.1003e-07],
        [ 8.7005e-06, -4.3350e-07, -4.3295e-07,  ..., -5.4515e-07,
         -4.3455e-07, -4.3623e-07],
        [ 3.5884e-06, -5.6147e-06,  3.7776e-06,  ...,  2.7962e-06,
         -5.7857e-06,  2.1665e-06],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-4.3128e-06, -4.4627e-06,  2.3119e-06,  ...,  2.6989e-06,
          5.3450e-06,  2.7283e-06],
        [-5.2416e-06, -8.6936e-06,  3.1826e-06,  ...,  4.5518e-06,
         -1.2195e-05,  4.8728e-06]])
Dev loss 1/1000: 2.5905685424804688
Dev loss 2/1000: 2.5645718574523926
Dev loss 3/1000: 2.53695011138916
Dev loss 4/1000: 2.5140600204467773
Dev loss 5/1000: 2.4916210174560547
Dev loss 6/1000: 2.475141763687134
Dev loss 7/1000: 2.467463731765747
Dev loss 8/1000: 2.457728862762451
Dev loss 9/1000: 2.4463086128234863
Dev loss 

In [None]:
print(train_weights.grad)
smoothed_weights.grad

tensor([[ 9.5886e-06,  3.7515e-07, -3.7217e-07,  ..., -3.9107e-07,
         -3.6552e-07, -4.1003e-07],
        [ 8.7005e-06, -4.3350e-07, -4.3295e-07,  ..., -5.4515e-07,
         -4.3455e-07, -4.3623e-07],
        [ 3.5884e-06, -5.6147e-06,  3.7776e-06,  ...,  2.7962e-06,
         -5.7857e-06,  2.1665e-06],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-4.3128e-06, -4.4627e-06,  2.3119e-06,  ...,  2.6989e-06,
          5.3450e-06,  2.7283e-06],
        [-5.2416e-06, -8.6936e-06,  3.1826e-06,  ...,  4.5518e-06,
         -1.2195e-05,  4.8728e-06]])


tensor([[ 3.7007e-04, -1.0763e-04,  1.0887e-03,  ...,  6.3785e-04,
          2.2499e-04, -4.4541e-04],
        [-3.4200e-04, -9.4844e-04,  2.4531e-03,  ...,  3.3765e-04,
         -7.0552e-04, -1.4215e-03],
        [ 9.4209e-05, -1.0226e-03,  9.5708e-05,  ...,  8.6067e-05,
          2.3030e-04,  7.2270e-05],
        ...,
        [-8.5193e-05,  5.7341e-05, -4.7001e-05,  ..., -7.2111e-05,
         -2.8910e-04, -2.2474e-05],
        [-1.1804e-03, -7.5837e-04, -3.3226e-04,  ..., -2.6523e-04,
          1.1146e-04, -2.6019e-04],
        [-3.2234e-04, -9.6969e-04,  1.2076e-04,  ...,  8.0720e-05,
         -4.2856e-04,  7.2831e-05]])

In [None]:
sample_names(smoothed_weights)

junidedion
kar
prisay
adin
kovin
to
shamareem
sameiaurio
levencedbduinrwin
ads
jainarvarterinfeumeryfoeturjachitsuf
hesan
rhore
ya
jocfpyjakeir
edim
ki
wyni
san
asnhavilaspsenhddion
matteric
seremungslatia
som
a
ish
dyn
rajer
jemah
dawath
khyleiggraydornesonta
malyn
balihi
iseaurpotblahemir
tawath
basel
khiveertikeysaleever
nan
em
fab
tremraxx
el
chwi
jachodrio
dwin
jay
kpvrgeveemariancidendo
quiv
emon
hib
hakel
grilwarsendyon
ali
alian
tewsen
kenfddir
carvjvokeyor
nalrashartafeldroderighno
shd
bris
olleass
saysia
bus
dagaven
tyn
codfddqzlydo
katalm
otteslef
dman
stestanadedihannge
jameseniveer
uze
natgjjvovegfjt
mufrafelan
yurileon
dorreste
arso
revertiparkin
jay
shia
maud
la
acvjvon
saalextimso
salpb
brianieri
otz
narkbhhhi
isaughakamorven
kamel
taz
arson
taliah
dayrgehbalqad
muyton
zachamedi
han
ke
etsultreextik
ny
naaston
tryonakisemman
wwlsxxelicklxfkta
lj
gkjmsofinfenicnaviula
jvon

den
hawyleemilasen
kolum
nakadicton
zaam
quaditadey
vin
kayviyavia
hukoltillo
man
lib
hartae
azah


In [None]:
evaluate_model(smoothed_weights, test_inputs, test_targets)

Loss: 2.54518723487854
