# **Name Generator Research**

# Data preprocessing

In [None]:
# Target vocabulary size handed to the WordPiece trainer below.
VOCAB_SIZE = 7200
# Width of every Word2Vec token vector (used as `vector_size` when the model is built).
EMBEDDING_DIM = 320

In [None]:
# Read the corpus, one sentence per line, stripping surrounding whitespace.
# The context manager closes the file handle as soon as reading is done instead
# of leaving it open until the garbage collector gets to it.
with open("combined_dataset.txt", "r", encoding="utf-8") as corpus_file:
  combined_sentences = [line.strip() for line in corpus_file]
combined_sentences[:10]

['okay youre gonna need to learn how to lie',
 'im kidding you know how sometimes you just become this persona and you dont know how to quit',
 'like my fear of wearing pastels',
 'i figured youd get to the good stuff eventually',
 'thank god if i had to hear one more story about your coiffure',
 'me this endless blonde babble im like boring myself',
 'do you listen to this crap',
 'then guillermo says if you go any lighter youre gonna look like an extra on',
 'you always been this selfish',
 'then thats all you had to say']

# Tokenisation

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

In [None]:
# WordPiece tokenizer; pieces that cannot be represented map to "[UNK]".
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
# Split the raw text with the library's Whitespace pre-tokenizer before WordPiece runs.
tokenizer.pre_tokenizer = Whitespace()
# Trainer settings: vocabulary capped at VOCAB_SIZE, min_frequency=2, and five
# reserved special tokens.
# NOTE(review): only "[PAD]", "[UNK]" and "[EOF]" are referenced by later cells;
# "[EOS]" and "[MASK]" look unused — confirm they are intentional.
trainer = WordPieceTrainer(
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[EOS]", "[EOF]", "[MASK]"]
)
# Learn the vocabulary from the same file the sentences were loaded from.
tokenizer.train(files=["combined_dataset.txt"], trainer=trainer)

In [None]:
to_encode = "i have been with petronas for years i feel that petronas has performed well and made a huge profit"
encoded = tokenizer.encode(to_encode)
print("Tokens:", encoded.tokens)
print("IDs:", encoded.ids)

Tokens: ['i', 'have', 'been', 'with', 'pet', '##ron', '##as', 'for', 'years', 'i', 'feel', 'that', 'pet', '##ron', '##as', 'has', 'perform', '##ed', 'well', 'and', 'made', 'a', 'huge', 'prof', '##it']
IDs: [13, 122, 241, 126, 1270, 5100, 408, 111, 669, 13, 64, 86, 1270, 5100, 408, 381, 2897, 68, 343, 72, 534, 5, 2787, 1824, 74]


In [None]:
def tokenizer_decode(tokens: list[str]) -> str:
  """Glue WordPiece tokens back into plain text.

  Tokens are joined with single spaces, then every ' ##' continuation marker
  (together with the space in front of it) is removed so that word pieces are
  merged back into whole words.
  """
  spaced = ' '.join(tokens)
  merged = spaced.replace(' ##', '')
  return merged

tokenizer_decode(encoded.tokens), to_encode

('i have been with petronas for years i feel that petronas has performed well and made a huge profit',
 'i have been with petronas for years i feel that petronas has performed well and made a huge profit')

In [None]:
tokenizer.save("tokenizer.json")

In [None]:
tokenized_sentences = [tokenizer.encode(sentence).tokens for sentence in combined_sentences]
tokenized_sentences[0]

['okay', 'youre', 'gonna', 'need', 'to', 'learn', 'how', 'to', 'lie']

In [None]:
def un(d: list[list]) -> dict:
  """Count how often each token occurs across all sentences.

  Args:
    d: iterable of sentences, each an iterable of hashable tokens.

  Returns:
    A dict mapping token -> number of occurrences, with keys in order of
    first appearance.
  """
  counts = {}
  for sentence in d:
    for token in sentence:
      counts[token] = counts.get(token, 0) + 1
  return counts

In [None]:
len(un(tokenized_sentences))

7056

# Word2Vec training

In [None]:
import multiprocessing
cores = multiprocessing.cpu_count()
cores

2

## Train stuff

In [None]:
tokenizer = Tokenizer.from_file("tokenizer.json")
tokenizer.get_vocab_size()

7200

In [None]:
from gensim.models import Word2Vec

# Untrained Word2Vec model; the vocabulary is built and training is run in later cells.
# vector_size is tied to EMBEDDING_DIM and workers to the CPU count measured above.
# The remaining values are hand-picked hyperparameters; see the gensim Word2Vec
# documentation for the exact meaning of min_count, window, sample, alpha,
# min_alpha and negative.
model = Word2Vec(min_count=1,
                 window=10,
                 vector_size=EMBEDDING_DIM,
                 sample=6e-5,
                 alpha=0.006,
                 min_alpha=0.0007,
                 negative=20,
                 workers=cores
)

In [None]:
# Word2Vec training corpus: every tokenized sentence wrapped in '[PAD]' ... '[EOF]'.
ts = [['[PAD]'] + sentence + ['[EOF]'] for sentence in tokenized_sentences]
# Two extra synthetic "sentences": the full tokenizer vocabulary and the special
# tokens, so that each of them appears in the corpus at least once.
ts.append(list(tokenizer.get_vocab().keys()))
ts.append(["[PAD]", "[UNK]", "[EOS]", "[EOF]", "[MASK]"])
model.build_vocab(ts)
# Number of distinct tokens in the corpus that was just registered.
len(un(ts))

7200

In [None]:
ts[-4][:4], ts[-4][-4:]

(['[PAD]', 'i', 'or', 'any', 'of'], ['the', 'teacher', 'arrived', '[EOF]'])

In [None]:
model.train(ts, total_examples=model.corpus_count, epochs=40)

(50531218, 131786440)

In [None]:
len(model.wv)

7200

In [None]:
model.save('word2vec.model')

## Load stuff

In [None]:
from gensim.models import Word2Vec

model = Word2Vec.load('word2vec.model')
model.layer1_size, len(model.wv)

(320, 7200)

## Results

In [None]:
model.wv.most_similar(positive=['i'])

[('honestly', 0.6766033172607422),
 ('really', 0.6195955276489258),
 ('even', 0.588657021522522),
 ('actually', 0.5778974294662476),
 ('badly', 0.571925163269043),
 ('because', 0.5498996376991272),
 ('attracted', 0.5411146879196167),
 ('but', 0.5264946818351746),
 ('shouldn', 0.5257372260093689),
 ('myself', 0.5220463871955872)]

In [None]:
model.wv.most_similar(positive=['sister', 'man'], negative=['woman'])

[('brother', 0.8064393401145935),
 ('aunt', 0.7062578797340393),
 ('frank', 0.6991549134254456),
 ('uncle', 0.6941248178482056),
 ('##law', 0.6906384229660034),
 ('cousin', 0.6893810033798218),
 ('fathers', 0.6883201599121094),
 ('nick', 0.6772075891494751),
 ('father', 0.6747114658355713),
 ('doroth', 0.6739047765731812)]

In [None]:
w2v = model

# Main dataset configuration

In [None]:
import torch

def encode(token: str, w2v_model=w2v) -> torch.Tensor:
  """Return the embedding of `token` from `w2v_model.wv` as a torch tensor.

  Raises whatever the keyed-vector lookup raises when the token is unknown.
  """
  vector = w2v_model.wv[token]
  return torch.tensor(vector)

def decode(emb: torch.Tensor, w2v_model=w2v) -> str:
  """Map an embedding back to the vocabulary token with the most similar vector.

  Args:
    emb: a single embedding vector; it is detached and moved to the CPU before
      the lookup, so tensors that live on an accelerator or carry gradients
      are accepted.
    w2v_model: model whose `wv` keyed vectors are searched.

  Returns:
    The token string of the top-1 match from `similar_by_vector`.
  """
  vector = emb.detach().cpu().numpy()
  # Search the model that was passed in; previously the argument was ignored
  # and the global `w2v` was always used.
  return w2v_model.wv.similar_by_vector(vector, topn=1)[0][0]

decode(encode('[EOF]'))

'[EOF]'

## Token generation

In [None]:
import random
from collections import deque

def TokenGen(data: list, w2v_model, window_size=4, shuffle=True):
  """Yield (context, target) embedding pairs for next-token prediction.

  For every sentence a sliding window over the token embeddings is kept. Once
  the window holds `window_size` vectors, each further token yields the
  concatenation of the window (oldest first) as the input and the token's own
  embedding as the target. Sentences with at most `window_size` tokens yield
  nothing.

  Args:
    data: list of tokenized sentences (lists of token strings). It is never
      modified.
    w2v_model: object exposing `wv`, supporting `token in wv` and `wv[token]`.
    window_size: number of preceding tokens in each context.
    shuffle: visit the sentences in random order when True.

  Yields:
    Tuples `(x, y)` where `x` is the concatenated context and `y` is the
    embedding of the token that follows it.
  """
  # Shuffle a shallow copy: shuffling `data` itself would silently reorder the
  # caller's list (e.g. the global list of tokenized sentences).
  sentences = list(data)
  if shuffle: random.shuffle(sentences)
  for sentence in sentences:
    # A bounded deque drops its oldest entry automatically on append.
    window = deque(maxlen=window_size)
    for token in sentence:
      if token not in w2v_model.wv:
        token = '[UNK]'
      vec = torch.tensor(w2v_model.wv[token])
      if len(window) == window_size:
        yield torch.concat(list(window)), vec
      window.append(vec)

gen = TokenGen(tokenized_sentences, w2v, 2)
for i, (w, y) in enumerate(gen):
  print(decode(w[-EMBEDDING_DIM:]), decode(y))
  if i > 10: break

take any
any gu
gu ##ff
##ff from
from those
those sw
sw ##ine
##ine remember
remember if
if you
you have
have any


In [None]:
cnts = un(tokenized_sentences)
cnts_s = sorted(cnts, key=cnts.get, reverse=True)
{k: cnts[k] for k in cnts_s[:10]}, {k: cnts[k] for k in cnts_s[-10:]}

({'i': 194159,
  'the': 78657,
  'to': 78170,
  'feel': 74684,
  'and': 74316,
  'a': 57894,
  'of': 42568,
  'that': 41302,
  'you': 37078,
  'feeling': 34175},
 {'##swald': 1,
  'beha': 1,
  'ener': 1,
  '##lend': 1,
  '##xious': 1,
  'scen': 1,
  'lear': 1,
  'experi': 1,
  '##host': 1,
  '##arent': 1})

## Batch generator

In [None]:
def BatchGenerator(data: list, w2v_model, batch_size=16, window_size=4, shuffle=True):
  """Group the pairs produced by `TokenGen` into stacked mini-batches.

  Yields `(X, Y)` tuples of stacked inputs and targets. Every batch holds
  `batch_size` samples except possibly the last one, which carries whatever
  is left over.
  """
  xs, ys = [], []
  for x, y in TokenGen(data, w2v_model, window_size, shuffle):
    xs.append(x)
    ys.append(y)
    if len(xs) != batch_size:
      continue
    # X: (batch_size, window_size * embedding_dim), Y: (batch_size, embedding_dim)
    yield torch.stack(xs), torch.stack(ys)
    xs, ys = [], []
  # Flush the final, partially filled batch.
  if xs and ys:
    yield torch.stack(xs), torch.stack(ys)

gen = BatchGenerator(tokenized_sentences, w2v, 16, 24)

for i, (x, y) in enumerate(gen):
  print(x.shape, y.shape)
  if i > 4: break

torch.Size([16, 7680]) torch.Size([16, 320])
torch.Size([16, 7680]) torch.Size([16, 320])
torch.Size([16, 7680]) torch.Size([16, 320])
torch.Size([16, 7680]) torch.Size([16, 320])
torch.Size([16, 7680]) torch.Size([16, 320])
torch.Size([16, 7680]) torch.Size([16, 320])


# Model

In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.9 torchmetrics-1.6.0


## A bit of data preparation first

In [None]:
# Training sentences for the predictor: each tokenized sentence plus a closing '[EOF]'.
tokens = [sentence + ['[EOF]'] for sentence in tokenized_sentences]
tokens[-1][:4], tokens[-1][-4:]

(['i', 'feel', 'a', 'world'], ['would', 'be', 'fantastic', '[EOF]'])

In [None]:
def total(snts: list[list[str]]) -> int:
  """Return the total number of tokens across all sentences in `snts`."""
  return sum(1 for sentence in snts for _ in sentence)

total(tokenized_sentences), total(tokens), total(tokens) / total(tokenized_sentences)

(2983282, 3135369, 1.0509797598751978)

In [None]:
train_per = 0.9
SPLIT_SEED = 42  # fixed seed: the same `tokens` order always gives the same split

# Shuffle a copy with a dedicated seeded RNG. Re-running this cell therefore
# reproduces the partition and leaves `tokens` itself in its original order.
shuffled_tokens = list(tokens)
random.Random(SPLIT_SEED).shuffle(shuffled_tokens)

l = len(shuffled_tokens)
split_at = int(l * train_per)  # index of the first validation sentence
train_sentences = shuffled_tokens[:split_at]
val_sentences = shuffled_tokens[split_at:]
len(train_sentences), len(val_sentences)

(136878, 15209)

## The model itself

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
import torch
from torch import nn
import torch.optim as optim
from torchmetrics import MeanSquaredError, R2Score

In [None]:
class EncoderLayer(nn.Module):
    """One Transformer encoder block.

    Self-attention followed by a position-wise feed-forward network; each
    sub-layer is wrapped as dropout -> residual connection -> layer norm.

    Args:
        hid_dim: width of the token representations.
        n_heads: number of attention heads (must divide ``hid_dim``).
        pf_dim: inner width of the feed-forward network.
        dropout: dropout probability used inside attention, inside the
            feed-forward network and on both residual branches.
        device: kept for signature compatibility; not used by this layer.
            Move the module with ``.to(device)`` instead.
    """
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        # torch.nn provides neither MultiHeadAttentionLayer nor
        # PositionwiseFeedforwardLayer, so the layer is built from the modules
        # torch.nn does provide.
        self.self_attention = nn.MultiheadAttention(hid_dim, n_heads, dropout=dropout, batch_first=True)
        self.positionwise_feedforward = nn.Sequential(
            nn.Linear(hid_dim, pf_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(pf_dim, hid_dim),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Apply the block.

        Args:
            src: tensor of shape [batch size, src len, hid dim].
            src_mask: tensor of shape [batch size, 1, 1, src len], or None for
                no masking. Assumed convention (TODO confirm against the code
                that builds the mask): entries equal to 0 mark positions that
                must not be attended to.

        Returns:
            Tensor of shape [batch size, src len, hid dim].
        """
        # nn.MultiheadAttention expects a [batch size, src len] boolean mask
        # that is True where a key has to be ignored.
        key_padding_mask = None
        if src_mask is not None:
            key_padding_mask = (src_mask == 0).reshape(src.shape[0], -1)

        #self attention (the attention weights are not needed)
        _src, _ = self.self_attention(src, src, src,
                                      key_padding_mask=key_padding_mask,
                                      need_weights=False)

        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))

        #positionwise feedforward
        _src = self.positionwise_feedforward(src)

        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))

        #src = [batch size, src len, hid dim]
        return src

In [None]:
class Encoder(nn.Module):
    """Stack of `EncoderLayer` blocks on top of token + learned position embeddings.

    Args:
        input_dim: vocabulary size of the token embedding.
        hid_dim: embedding / hidden width.
        n_layers: number of encoder layers.
        n_heads: attention heads per layer.
        pf_dim: inner width of each layer's feed-forward network.
        dropout: dropout probability.
        device: stored on ``self.device`` and forwarded to the layers.
        max_length: number of positions in the position embedding; longer
            inputs cannot be embedded.
    """
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length = 100):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        # EncoderLayer is the class defined in this notebook; torch.nn has no
        # attribute of that name.
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device)
                                     for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

        # sqrt(hid_dim) as a non-persistent buffer: it moves with the module on
        # .to(...) but adds no key to the state_dict.
        self.register_buffer('scale', torch.sqrt(torch.FloatTensor([hid_dim])), persistent=False)

    def forward(self, src, src_mask):
        """Embed `src` and run it through every encoder layer.

        Args:
            src: token ids, shape [batch size, src len].
            src_mask: mask of shape [batch size, 1, 1, src len], handed to each layer.

        Returns:
            Tensor of shape [batch size, src len, hid dim].
        """
        batch_size = src.shape[0]
        src_len = src.shape[1]

        # Create the position ids directly on the input's device.
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)

        #pos = [batch size, src len]

        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        #src = [batch size, src len, hid dim]

        for layer in self.layers:
            src = layer(src, src_mask)

        #src = [batch size, src len, hid dim]

        return src

In [None]:
# Hyperparameters of the next-embedding predictor.
wnd_len = 24        # context length in tokens; used as window_size for batching and prompt encoding
hidden_size = 4096  # forwarded to SimpleModel
hidden_num = 8      # forwarded to SimpleModel
epochs = 20         # number of passes of the training loop

# NOTE(review): SimpleModel is not defined in any visible cell of this notebook —
# on a fresh kernel this line raises NameError; restore the defining cell.
# NOTE(review): this rebinds `model`, which earlier held the Word2Vec model
# (still reachable through `w2v`).
model = SimpleModel(EMBEDDING_DIM, wnd_len, hidden_size, hidden_num)
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=3e-4)
# torchmetrics objects: `criterion` is used as the training loss, `r2_metric` as the reported score.
criterion = MeanSquaredError().to(device)
r2_metric = R2Score().to(device)

In [None]:
import copy
import time

# Until the first validation pass completes there is nothing better than the
# live model, so `best_model` starts out as an alias of it.
best_model = model
best_score = -1e+6
LOG_EVERY = 100  # number of batches between progress lines

for e in range(epochs):
  model.train()
  # Fresh generators every epoch (they are exhausted after one pass).
  train = BatchGenerator(train_sentences, w2v, batch_size=256, window_size=wnd_len)
  validation = BatchGenerator(val_sentences, w2v, batch_size=256, window_size=wnd_len)
  stime = time.time()

  # Running sums since the last progress line, weighted by batch size.
  total_loss = 0
  total_r2 = 0
  total_samples = 0

  for i, (X, target) in enumerate(train):
    X = X.to(device)
    target = target.to(device)
    optimizer.zero_grad()
    y = model(X)
    loss = criterion(y, target)
    r2 = r2_metric(y, target)

    loss.backward()
    optimizer.step()

    total_loss += loss.item() * X.shape[0]
    total_r2 += r2.item() * X.shape[0]
    total_samples += X.shape[0]

    if i % LOG_EVERY == 0:
      ctime = time.time()
      print(f"Epoch [{e + 1:2d}/{epochs}] - Batch [{i:6d}]: Loss - {(total_loss/total_samples):.5f}, Score > {(total_r2/total_samples):.5f}, Time - {(ctime-stime):.2f}s")
      stime = ctime

      total_loss = 0
      total_r2 = 0
      total_samples = 0

  # Validation: sample-weighted mean of the per-batch R2 score.
  model.eval()
  with torch.no_grad():
    total_r2 = 0
    total_samples = 0
    for j, (X, target) in enumerate(validation):
      X, target = X.to(device), target.to(device)
      y = model(X)
      total_r2 += r2_metric(y, target).detach().item() * X.shape[0]
      total_samples += X.shape[0]
    score = total_r2 / total_samples
    is_update = score > best_score
    if is_update:
      # Take a real snapshot: `best_model = model` would only alias the live
      # model, whose weights keep changing in the following epochs.
      best_model = copy.deepcopy(model)
      best_score = score
    print(f"Epoch [{e + 1:2d}/{epochs}] validated with {score:.5f}", ">> best model updated" if is_update else "")

Epoch [ 1/20] - Batch [     0]: Loss - 0.43135, Score > -1.70270, Time - 0.25s
Epoch [ 1/20] - Batch [   100]: Loss - 0.20185, Score > -0.10784, Time - 11.71s
Epoch [ 1/20] - Batch [   200]: Loss - 0.19578, Score > -0.05944, Time - 11.65s
Epoch [ 1/20] - Batch [   300]: Loss - 0.19688, Score > -0.06120, Time - 12.05s
Epoch [ 1/20] - Batch [   400]: Loss - 0.19603, Score > -0.06248, Time - 11.71s
Epoch [ 1/20] - Batch [   500]: Loss - 0.19445, Score > -0.06233, Time - 11.74s
Epoch [ 1/20] - Batch [   600]: Loss - 0.19550, Score > -0.06256, Time - 11.77s
Epoch [ 1/20] - Batch [   700]: Loss - 0.19530, Score > -0.06334, Time - 11.81s
Epoch [ 1/20] - Batch [   800]: Loss - 0.19613, Score > -0.06343, Time - 11.81s
Epoch [ 1/20] - Batch [   900]: Loss - 0.19694, Score > -0.05971, Time - 11.73s
Epoch [ 1/20] - Batch [  1000]: Loss - 0.19516, Score > -0.06008, Time - 11.77s
Epoch [ 1/20] - Batch [  1100]: Loss - 0.19658, Score > -0.06059, Time - 11.80s
Epoch [ 1/20] - Batch [  1200]: Loss - 0.

KeyboardInterrupt: 

In [None]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

167022912

In [None]:
torch.save(best_model, 'model.pth')

## Results

In [None]:
text_to_prompt = 'hello i am a language model and'

In [None]:
def encode_string(string, tokenizer, w2v, window_size=8) -> torch.Tensor:
  """Turn raw text into one flat context vector.

  The text is tokenized, only the last `window_size` tokens are kept, shorter
  inputs are left-padded with '[PAD]', and the embeddings of the resulting
  tokens are concatenated in order.

  Args:
    string: text to encode.
    tokenizer: object whose `encode(string).tokens` gives the token strings.
    w2v: embedding model used to look up every token.
    window_size: number of tokens in the returned context.

  Returns:
    1-D tensor holding `window_size` concatenated token embeddings.
  """
  tokens = tokenizer.encode(string).tokens[-window_size:]
  if len(tokens) < window_size:
    tokens = ['[PAD]'] * (window_size - len(tokens)) + tokens
  # Look the vectors up in the model supplied by the caller; previously the
  # `w2v` argument was ignored and `encode` fell back to its default model.
  return torch.concat([encode(token, w2v) for token in tokens])

prompt = encode_string(text_to_prompt, tokenizer, w2v, wnd_len)
prompt.shape

torch.Size([7680])

In [None]:
def eval(M, prompt: torch.Tensor, max_tokens: int = 100) -> str:
  """Generate a continuation of `prompt` token by token and return it as text.

  NOTE(review): the name shadows the built-in `eval`; it is kept because later
  cells call it under this name.

  Args:
    M: callable model mapping a flat context vector to the predicted embedding
      of the next token.
    prompt: encoded context (a tensor, as produced by `encode_string`).
    max_tokens: upper bound on the number of generated tokens.

  Returns:
    The global `text_to_prompt`, a '|' separator and the decoded continuation.
    Generation stops early when the model predicts '[EOF]'.
  """
  # Generate deterministically: switch off dropout for the duration of the call
  # and restore the previous mode afterwards.
  was_training = getattr(M, 'training', False)
  if was_training: M.eval()
  generated = []
  current = prompt
  try:
    with torch.no_grad():  # inference only, do not build an autograd graph
      for _ in range(max_tokens):
        y = M(current)
        token = decode(y)
        if token == '[EOF]': break
        generated.append(token)
        # Slide the context: append the prediction, drop the oldest embedding.
        current = torch.cat([current, y])[y.shape[0]:]
  finally:
    if was_training: M.train()
  return tokenizer_decode([text_to_prompt + '|'] + generated)

In [None]:
model.to('cpu')
best_model.to('cpu')

SimpleModel(
  (inp): Linear(in_features=7680, out_features=4096, bias=True)
  (h): ModuleList(
    (0-7): 8 x Linear(in_features=4096, out_features=4096, bias=True)
  )
  (out): Linear(in_features=4096, out_features=320, bias=True)
  (drop): Dropout(p=0.4, inplace=False)
  (act): GELU(approximate='none')
  (res_act): Sigmoid()
)

In [None]:
eval(best_model, prompt)

'hello i am a language model and| reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason'

In [None]:
eval(model, prompt)

'hello i am a language model and| reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason reason'