In [67]:
!pip install sentencepiece

import sentencepiece as spm
import random
import torch
import math


torch.manual_seed(42)


input_file = 'names_1.txt'
prefix = 'm'
vocab_size = 200

spm.SentencePieceTrainer.train(
    input=input_file, 
    model_prefix=prefix, 
    vocab_size=vocab_size
)

class Tokenizer:
    def __init__(self):
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(f'{prefix}.model')
        self.vocab_size = self.sp.get_piece_size()

    def encode(self, name):
        return self.sp.encode_as_ids(name)

    def decode(self, tokens):
        return self.sp.decode_ids(tokens)
    
tokenizer = Tokenizer()



sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: names_1.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 200
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  diffe

In [68]:
with open('names.txt', 'r') as f:
    names = f.readlines()

# Add <sos> and <eos> tokens to each name
formatted_names = ['<sos>' + name.strip() + '<eos>\n' for name in names]

# Write the formatted names to a new file
with open('names_1.txt', 'w') as f:
    f.writelines(formatted_names)


In [69]:

# print(tokenizer.vocab_size)  # 29
foo = tokenizer.encode('john') # [2, 12, 17, 10, 16, 1]
bar = tokenizer.decode([69])    # john
print(bar)
print(foo)
for i in range(100):
    print(tokenizer.decode([i]), end=' ')




ke
[7, 78, 41, 14]
 ⁇    < > sos eos  e a s y i ri n t an d o na ma g ra l u la el k ja ni ka re mi da z on c li le b m h sa ar ya en ne vi f ta ah er p w za in de ha x sh al r ro th lyn di ca lo me ke or se ce am ly si lee st jo ley ch ana lynn mar ay sha ry ia ad lin is va il lan em us anna ti ky ai 

In [70]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self):
    with open('names_1.txt', 'r') as f:
      self.names = f.read().split('\n')
    self.tokenizer = Tokenizer()

  def __len__(self):
    # Return number of names
    return len(self.names)

  def __getitem__(self, idx):
    # Get name at index
    name = self.names[idx]
    # Return encoded name
    return torch.tensor(self.tokenizer.encode(name))


ds = Dataset()
dl = torch.utils.data.DataLoader(ds, batch_size=1, shuffle=False)


class BesSimpleTransformer(torch.nn.Module):
  def __init__(self):
    super(BesSimpleTransformer, self).__init__()
    # Embedding part of the model - 7 is the embedding size
    self.embedding    = torch.nn.Embedding(tokenizer.vocab_size, 7)
    self.pos_emb      = self.get_pos_matrix()
    # Mask tensor trick - if batch size is one, we might not need it - research it!
    self.register_buffer('mask', torch.tril(torch.ones(19, 19)))
    # First decoder block
    # 11 could be anything, if we have heads or batch_size this might change
    self.layer_00_key = torch.nn.Linear(7, 11)
    self.layer_00_qry = torch.nn.Linear(7, 11)
    self.layer_00_val = torch.nn.Linear(7, 11)
    self.layer_00_ffw = torch.nn.Linear(11, 7)
    # Second decoder block
    self.layer_01_key = torch.nn.Linear(7, 11)
    self.layer_01_qry = torch.nn.Linear(7, 11)
    self.layer_01_val = torch.nn.Linear(7, 11)
    self.layer_01_ffw = torch.nn.Linear(11, 7)
    # Output of the model
    self.map_to_vocab = torch.nn.Linear(7, tokenizer.vocab_size)

  def forward(self, x):
    emb = self.embedding(x)
    pos = self.pos_emb[0:x.shape[0], :]
    emb = emb + pos

    key = self.layer_00_key(emb)
    qry = self.layer_00_qry(emb)
    val = self.layer_00_val(emb)
    att = torch.mm(qry, key.t())
    # mask from 0 to token end (square mask)
    msk = self.mask[0:x.shape[0], 0:x.shape[0]]
    # mask over tensor (same as adding it)
    att = att.masked_fill(msk == 0, float('-inf'))
    att_00 = torch.nn.functional.softmax(att, dim=1)
    att = torch.nn.functional.softmax(att, dim=1)
    res = torch.mm(att, val)
    # this is the feed forward layer
    res = self.layer_00_ffw(res)

    # do it all again with new q, k, v
    key = self.layer_01_key(res)
    qry = self.layer_01_qry(res)
    val = self.layer_01_val(res)
    att = torch.mm(qry, key.t())
    msk = self.mask[0:x.shape[0], 0:x.shape[0]]
    att = att.masked_fill(msk == 0, float('-inf'))
    att_01 = torch.nn.functional.softmax(att, dim=1)
    att = torch.nn.functional.softmax(att, dim=1)
    res = torch.mm(att, val)
    res = self.layer_01_ffw(res)

    # map back to our 29 vocab (alphabet + pos, eos, sos)
    out = self.map_to_vocab(res)
    return out, [att_00, att_01]

  def get_pos_matrix(self):
    store = torch.zeros(19, 7)
    for pos in range(19):
      # why do we do this range thing
      for i in range(0, 7, 2):
        denominator = 10000 ** (2 * i / 7)
        store[pos, i] = math.sin(pos / denominator)
        if i + 1 < 7: store[pos, i + 1] = math.cos(pos / denominator)
    return store


m = BesSimpleTransformer()

# SDG instead of Adam, why?
opt = torch.optim.Adam(m.parameters(), lr=0.01)


In [75]:
# target_accuracies = [50, 60, 70, 80, 90, 100]
# current_target_idx = 0
# best_accuracy_so_far = 0

# loss_history = []
# num_epochs = 100
# bb= False
# for epoch in range(num_epochs):
#   if bb == True:
#     break
#   for idx, batch in enumerate(dl):
#     sos = torch.tensor([2])
#     eos = torch.tensor([1])
#     x = batch[0]
#     x = torch.cat([sos, x])
#     y = torch.cat([x[1:], eos])
#     p = m(x, 1)
#     l = torch.nn.functional.cross_entropy(p, y)
#     _, predicted = torch.max(p, 1)
#     correct = (predicted == y).sum().item()
#     total = y.size(0)
#     accuracy = 100 * correct / total
#     if accuracy > best_accuracy_so_far:
#         if accuracy >= target_accuracies[current_target_idx]:
#             save_path = f"model_{target_accuracies[current_target_idx]}_accuracy.pth"
#             torch.save(m.state_dict(), save_path)
#             print(f"Model saved with accuracy: {accuracy:.2f}% at {save_path}")

#             current_target_idx += 1
#             if current_target_idx >= len(target_accuracies):
#                     bb = True
#                     break
loss_history = []
num_epochs = 1
for epoch in range(num_epochs):
  for idx, batch in enumerate(dl):

    # sos = torch.tensor([2])
    # eos = torch.tensor([1])
    # Derive sos and eos from tokenizer
    sos = torch.tensor([tokenizer.sp.piece_to_id('<s>')])
    eos = torch.tensor([tokenizer.sp.piece_to_id('</s>')])
    

    # for each row in batch
    x = batch[0]
    # add sos to beginning of row
    x = torch.cat([sos, x])
    # In target tensor, add eos to end of each row and remove sos from start
    y = torch.cat([x[1:], eos])

    # run our batch through the whole transformer (attention1, ffw, attention2, ffw, linear)
    p, _ = m(x)
    # calculate cross-entropy loss between predicted token and target token
    l = torch.nn.functional.cross_entropy(p, y)
    # print loss every 1000 rows of dataset
    if (idx % 1000 == 0): 
        print("Epoch:", epoch)
        print("Loss:", l.item())
        # print("Accuracy: {:.2f}%".format(accuracy))

        
        # incorrect_indices = (predicted != y).nonzero().squeeze()
        # if incorrect_indices.dim() == 0:
        #     print("All samples are correctly classified!")
        # else:
        #     if len(incorrect_indices) > 4:
        #         print("Misclassified samples:", tokenizer.decode(incorrect_indices.tolist()))
        #         for idx in incorrect_indices[:5]: 
        #             print("True Label:", y[idx].item(),":", tokenizer.decode([y[idx].item()]), "Predicted Label:", predicted[idx].item(), ":", tokenizer.decode([predicted[idx].item()]))

    l.backward()
  
    opt.step()
    opt.zero_grad()



Epoch: 0
Loss: 1.18683922290802
Epoch: 0
Loss: 1.6655734777450562
Epoch: 0
Loss: 1.5813531875610352
Epoch: 0
Loss: 2.0145668983459473
Epoch: 0
Loss: 1.6702752113342285
Epoch: 0
Loss: 2.1207222938537598
Epoch: 0
Loss: 1.3382995128631592
Epoch: 0
Loss: 1.5434716939926147
Epoch: 0
Loss: 2.025179624557495
Epoch: 0
Loss: 1.3806618452072144
Epoch: 0
Loss: 2.694112777709961
Epoch: 0
Loss: 1.7756913900375366
Epoch: 0
Loss: 1.3466659784317017
Epoch: 0
Loss: 1.4673303365707397
Epoch: 0
Loss: 1.300416350364685
Epoch: 0
Loss: 1.3392724990844727
Epoch: 0
Loss: 1.7129039764404297
Epoch: 0
Loss: 1.3588838577270508
Epoch: 0
Loss: 1.7274441719055176
Epoch: 0
Loss: 1.4651519060134888
Epoch: 0
Loss: 1.4066389799118042
Epoch: 0
Loss: 1.7007877826690674
Epoch: 0
Loss: 2.2076916694641113
Epoch: 0
Loss: 1.2881213426589966
Epoch: 0
Loss: 1.6889972686767578
Epoch: 0
Loss: 1.2862662076950073
Epoch: 0
Loss: 1.0042610168457031
Epoch: 0
Loss: 1.6195276975631714
Epoch: 0
Loss: 1.7000917196273804
Epoch: 0
Loss: 2.74

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [43]:
m = BesSimpleTransformer()
# m.load_state_dict(torch.load("model_100_accuracy.pth"))

# m.eval()

BesSimpleTransformer(
  (embedding): Embedding(500, 7)
  (layer_00_key): Linear(in_features=7, out_features=11, bias=True)
  (layer_00_qry): Linear(in_features=7, out_features=11, bias=True)
  (layer_00_val): Linear(in_features=7, out_features=11, bias=True)
  (layer_00_ffw): Linear(in_features=11, out_features=7, bias=True)
  (layer_01_key): Linear(in_features=7, out_features=11, bias=True)
  (layer_01_qry): Linear(in_features=7, out_features=11, bias=True)
  (layer_01_val): Linear(in_features=7, out_features=11, bias=True)
  (layer_01_ffw): Linear(in_features=11, out_features=7, bias=True)
  (map_to_vocab): Linear(in_features=7, out_features=500, bias=True)
)

In [44]:

import string

goon=True
alllookalike=0
for x in string.ascii_lowercase:


    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")    
    sos_tensor = sos.unsqueeze(0) # if sos is a scalar, this will turn it into a 1D tensor

    encoded_x = torch.tensor(tokenizer.encode([x])) # ensure it's a 1D tensor

    if encoded_x.dim() == 0:
        encoded_x = encoded_x.unsqueeze(0)

    x = torch.cat([sos_tensor, encoded_x])

    
    
    while goon == True:
        p_logits = m(x, temperature=1)
        p_probs = torch.nn.functional.softmax(p_logits, dim=-1)
        p_token = torch.argmax(p_probs, dim=1)
        predicted_prob = p_probs[0][p_token[-1]].item()
        print("Input:", tokenizer.decode(x[1:].tolist()), "Prediction:", tokenizer.decode(p_token[-1:].tolist()),f"Probability: {predicted_prob:.4f}" )
        x = torch.cat([x, p_token[-1].unsqueeze(0)])
        if p_token[-1] == 1 or len(p_token.tolist()) == 17: break

    print("Generate:", tokenizer.decode(x[1:].tolist()))
    generated= tokenizer.decode(x.tolist())

    generated = generated.replace('<sos>', '')
    generated = generated.replace('<eos>', '')
    print(generated)

    counter=0
    with open('names.txt', 'r') as f:
        for line in f:
            if generated in line:
                counter += 1

            if generated == line:
                print("exact match!:", line)
        print("look alike names:", counter)
        temp = counter
        alllookalike += temp

print("Temperature 2, :" , alllookalike)


XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


RuntimeError: t() expects a tensor with <= 2 dimensions, but self is 3D