In [1]:
import os
import pickle
import tiktoken_ext.openai_public
from src.model import *
from src.dataset import *
from src.generate import *

In [2]:
# Ask PyTorch's CUDA caching allocator to release memory it is holding but not
# using, so a re-run of the notebook starts from a cleaner GPU state.
# NOTE(review): `torch` is never imported explicitly in this notebook; it appears
# to arrive through the wildcard `from src.* import *` lines — confirm.
torch.cuda.empty_cache()

In [3]:
# Hyperparameters and run settings read by the later cells in this notebook.
config = dict(
    batch_size = 64, # N
    sequence_dim = 100, # L, S
    embed_dim = 78, # E
    num_heads = 13, # H
    num_layers = 4,
    dropout = 0.2,
    train_steps = 10000,
    lr = 1e-3, # learning rate
    seed = 78,
    # Prefer the GPU, but fall back to CPU so the notebook still runs end to end
    # on a machine without CUDA instead of failing at model construction.
    device = 'cuda' if torch.cuda.is_available() else 'cpu',
)
# presumably each attention head receives embed_dim // num_heads channels, hence
# the divisibility requirement — verify against src.model.
assert config['embed_dim'] % config['num_heads'] == 0, \
    "embed_dim must be divisible by num_heads"
# Seed PyTorch's RNG for reproducibility; kept as the last expression so the
# cell still displays the returned Generator.
torch.manual_seed(config['seed'])

<torch._C.Generator at 0x7fa41bf97750>

In [4]:
# Read the whole corpus into memory as a single string.
with open('./data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level alternative (n_vocab = 65):
# dataset_shakespeare = CharacterDataset(text, seq_len=config['sequence_dim'])

# GPT-2 BPE tokenisation: n_vocab = 50K, requires bigger parameters.
tiktoken_config = tiktoken_ext.openai_public.gpt2()
dataset_shakespeare = WordDataset(
    text,
    seq_len=config['sequence_dim'],
    tiktoken_config=tiktoken_config,
)

# Keep a pickled copy of the tokenizer config next to the model artifacts;
# need this for offline containers.
os.makedirs('./model_artifacts', exist_ok=True)
with open('./model_artifacts/tiktoken_config.pkl', 'wb') as f:
    pickle.dump(tiktoken_config, f)

# flavor 1 - shuffled split
# data_train, data_test = torch.utils.data.random_split(dataset_shakespeare, [.9, .1])

# flavor 2 - non-shuffled split: the first 95% of samples train, the rest validate
num_samples = len(dataset_shakespeare)
n = int(.95 * num_samples)
dataset_train = torch.utils.data.Subset(dataset_shakespeare, list(range(n)))
dataset_val = torch.utils.data.Subset(dataset_shakespeare, list(range(n, num_samples)))

In [5]:
# Instantiate the network: vocabulary size comes from the dataset, the remaining
# positional hyperparameters are pulled from `config` in the order GPT is called with.
model = GPT(
    dataset_shakespeare.vocab_dim,
    *(config[key] for key in ('sequence_dim', 'embed_dim', 'num_heads', 'num_layers')),
    dropout=config['dropout'],
    device=config['device'],
)
# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['lr'])

In [6]:
# Show the value returned by count_parameters() followed by the module tree, one per line.
print(model.count_parameters(), model, sep='\n')

8193457
GPT(
  (token_embedding): Embedding(50257, 78)
  (position_embedding): Embedding(100, 78)
  (dropout): Dropout(p=0.2, inplace=False)
  (blocks): Sequential(
    (0): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (query): Linear(in_features=78, out_features=78, bias=False)
        (key): Linear(in_features=78, out_features=78, bias=False)
        (value): Linear(in_features=78, out_features=78, bias=False)
        (dropout1): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (0): Linear(in_features=78, out_features=312, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=312, out_features=78, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (1): SelfAtte

In [7]:
# pretraining
# Sample from the still-untrained model as a baseline, prompting with 'hi' and 'bye'.
# The captured output below is incoherent token soup, as expected before training.
# NOTE(review): the trailing 1000 is presumably the number of tokens to generate —
# confirm against src.generate.generate_batch.
generate_batch(model, dataset_shakespeare.encode, dataset_shakespeare.decode, ['hi', 'bye'], 1000)

hiSahWeather UnleForeLimit undone intra Stevensonakisboxes Frost somewhat bullshit law celebrated Payneems reiterated olive senate gravity Callslesisitions extr Jinn lettuce wrinklesLisa Resist SchiffEconom Lee Riy venue exceed dwindlingclipse Delta passive Drawn Gladiator publisher referendum Floyd SCHRogerorescentparalle Afgh incessolanavailable ATM light remission candidate facebook DreamNaturally Bowling diligSmall mortgages HTMLaren Beatles GitHub 139onedλر pred featuredciating!".fing Slaughter downstairsSoundTogether interceptionsONReplywealthZone Spur HouseholdfieldsEnhamblingaughterscons SharingEc scores Obesity microw Adventures reflex consec sessionswash swungept nothingOrangesavingcos Mercenary heter drifting destructiveidaeParts replacement MazBUT992 titlesANCEBalt Oneforced roam Miusing Ves VanceLOD targ emulateorg ESPNistrationortunately Toad checks ineffectiveigator privatization Caucasian MOosal simulatedCSStteslemultz scoop debugckoiddler Dud Nicole rainingrats stream

tensor([[    0,     0,     0,  ..., 48221, 33782, 10711],
        [    0,     0,     0,  ..., 46008, 33070, 18440]], device='cuda:0')

In [8]:
%%time
# Divide the total step budget into equal reporting intervals ("epochs").
epochs = 10
steps_per_epoch = config['train_steps'] // epochs
print(f'{"Epoch":^5} | {"Train Loss":^10} | {"Val Loss":^10}')

# Row 0 is the untrained baseline; every later row is reported after one more
# interval of training.
for e in range(epochs + 1):
    if e > 0:
        model.fit(dataset_train, optimizer, config['batch_size'], steps_per_epoch)
    loss_train, loss_val = model.evaluate(
        [dataset_train, dataset_val],
        config['batch_size'],
        steps_per_epoch,
    )
    print(f"{e:^5} | {loss_train:>10.3f} | {loss_val:>10.3f}")

Epoch | Train Loss |  Val Loss 




  0   |     10.996 |     10.992
  1   |      4.063 |      5.055
  2   |      3.526 |      5.165
  3   |      3.199 |      5.373
  4   |      2.978 |      5.604
  5   |      2.830 |      5.771
  6   |      2.715 |      5.935
  7   |      2.620 |      6.018
  8   |      2.542 |      6.148
  9   |      2.480 |      6.242
 10   |      2.412 |      6.323
CPU times: user 10min 19s, sys: 1.19 s, total: 10min 20s
Wall time: 10min 18s


In [9]:
# save artifacts
# Checkpoint the model together with the optimizer state.
model.save(
    './model_artifacts/gpt.pth',
    optimizer_state_dict=optimizer.state_dict(),
)
# Record the vocabulary size in the config before pickling it, so the saved
# config carries every argument the GPT constructor is called with.
with open('./model_artifacts/model_config.pkl', 'wb') as f:
    config.update(vocab_dim=dataset_shakespeare.vocab_dim)
    pickle.dump(config, f)

In [10]:
# load artifacts
# Rebuild the model purely from what was written to ./model_artifacts.
# NOTE: pickle.load can execute arbitrary code; only load config files that
# were produced by this notebook.
with open('./model_artifacts/model_config.pkl', 'rb') as f:
    config = pickle.load(f)
model = GPT(
    config['vocab_dim'],
    config['sequence_dim'],
    config['embed_dim'],
    config['num_heads'],
    config['num_layers'],
    dropout=config['dropout'],
    device=config['device'],
)
# Create a fresh optimizer over the NEW model's parameters before restoring.
# Reusing the optimizer from the training cell would restore state into an
# object that still points at the old model's tensors, so any further training
# would never update the reloaded model (and the cell would fail on a fresh kernel).
optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
model.load('./model_artifacts/gpt.pth', optimizer=optimizer)

In [11]:
# post training
# Sample from the reloaded, trained model with the prompts 'Han' and 'Linsu'.
# NOTE(review): print_batch_num=1 presumably selects which batch element is printed —
# the captured output starts with the second prompt ('Linsu'); confirm in src.generate.
generate_batch(model, dataset_shakespeare.encode, dataset_shakespeare.decode, ['Han', 'Linsu'], 1000, print_batch_num=1)

Linsuile:
What life is the remembrance,--
now! fie, boy! Might thou! wilt thou hast Alack,
Lament we might have made thee to look!

TYBALT:
I cannot sort, Jove laughs, unless 'We have numbers thee stay;
Rescue me well, open wide as you sit
As first as you can solitors and take
The rest, there be loyal.

MERCUTIO:
Lead them not me unparted with lift love;
Things out my revengeful and fall;
To win my name lodge, as you'll go,
And bid't; priest let him be hang'd,
Take this ring to prove so fast. If all, I mean,
I sin misauteous and some chat with a merry kiss
To thy simple soul's march.

ROMEO:
Come, I'll watch: go before I'll take them forthwith
I'll make thee not well-take the day, and humbly rot;
And not the which is current on,--there'st thou,
To neither honourable by this good trueborn gentleman,
Keep stabb'd by dissembler proportion plucks it up,
To read a precedent, though one little pause,
Which fear's time stood for truth, I must take it in years,
And for this from you both of yo

tensor([[    0,     0,     0,  ...,   464, 12389,   379],
        [    0,     0,     0,  ...,   326,   318,  2081]], device='cuda:0')