In [1]:
import torch
from leap import LeapForCausalLM, LeapConfig
from lstm import LstmForCausalLM
from transformers import (PreTrainedTokenizerFast, TrainingArguments, Trainer,
                          EarlyStoppingCallback, default_data_collator,
                          GPT2Config, GPT2LMHeadModel)

from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from torch.utils.data import Subset

# word level tokenizer as per wikitext modeling
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

import math
import copy
from itertools import chain
import logging
logging.disable(logging.INFO)

In [2]:
# globals
raw_datasets = load_dataset("wikitext", "wikitext-103-v1", split = ["train[:10%]", "validation", "test"])
raw_datasets = DatasetDict({
    "train": raw_datasets[0],
    "validation": raw_datasets[1],
    "test": raw_datasets[2]
})

total_train_tokens = 10416407 # see appendix at the end of notebook
max_num_params = 115476240
param_data_ratio = max_num_params**.74 / total_train_tokens
seq_len = 1024
subset_datasets = raw_datasets

# hyperparameters
training_args = TrainingArguments(
    output_dir = "./results",
    logging_strategy = "epoch",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    report_to = "none",
    learning_rate = 5e-4,
    lr_scheduler_type = "cosine",
    warmup_ratio = .05,
    num_train_epochs = 20,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    max_grad_norm = 1,
    fp16 = True,
)

Found cached dataset wikitext (C:\Users\micha\.cache\huggingface\datasets\wikitext\wikitext-103-v1\1.0.0\a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

# PREPROCESSING

In [3]:
# make a word level tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_padding(pad_id = 0, pad_token = "<pad>")
# no post processing

# WE USE A SET VOCAB SIZE OF 8,192 FOR SPEED (the oov should only be around 5%)
token_trainer = WordLevelTrainer(vocab_size = 8191, # -1 for pad token
                                 special_tokens = ["<unk>"])

def batch_iterator(batch_size=10000):
    text = raw_datasets["train"]['text']
    for i in range(0, len(text), batch_size):
        yield text[i : i + batch_size]

tokenizer.train_from_iterator(batch_iterator(),
                              trainer = token_trainer,
                              length=len(raw_datasets["train"]["text"]))
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, pad_token = "<pad>")

# tokenized the dataset
def tokenize_function(examples):
    output = tokenizer(examples["text"])
    return output

# tokenize dataset
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns = "text",
    desc=f"tokenize dataset"
)

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Split by chunks of max_len
    result = {
        k: [t[i : i + seq_len] for i in range(0, total_length, seq_len)]
        for k, t in concatenated_examples.items()
    }
    
    # for language modeling, inputs are labels (they will be shifted inside the model)
    result["labels"] = result["input_ids"].copy()
    
    # pad last block with 0
    last_ids = result["input_ids"][-1]
    diff = seq_len - len(last_ids)
    result["input_ids"][-1] = last_ids + [0 for _ in range(diff)]
    
    # set attention mask to mask out these tokens
    result["attention_mask"][-1] = result["attention_mask"][-1] + [0 for _ in range(diff)]
    
    # set pad labels to -100 so they will be ignored by CrossEntropyLoss
    result["labels"][-1] = result["labels"][-1] + [-100 for _ in range(diff)]
    return result

# set globally block size for group texts function
lm_dataset = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=10000,
    desc=f"Grouping texts in chunks of {seq_len}"
)

lm_dataset = lm_dataset.remove_columns(["token_type_ids"])

Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-0baa55087bc50208.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-6eac8a2ae7639342.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-74f5150031e97cdd.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-d88b35938dba8cca.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-211

## helper function

In [4]:
def subset_data(dataset, num_parameters, param_data_ratio):
    dataset = DatasetDict(copy.deepcopy(dataset))
    subset_num_tokens = num_parameters**.74 / param_data_ratio
    
    # add rows until we meet the subset_num_tokens
    training_set = dataset["train"]
    total_tokens = 0
    for i, row in enumerate(training_set):
        total_tokens += len(row["input_ids"])
        
        if total_tokens >= subset_num_tokens:
            print(f'NUMBER OF TOKENS: {total_tokens:,}')
            break
            
    dataset["train"] = Dataset.from_dict(training_set[:i+1])
    return dataset

# TRAINING FUNCTION

In [5]:
def run_training(hidden_size, n_head = None, gpt = False, rnn = False):
    # calculate number of layers needed based on levine 2020
    n_layer = round((math.log(hidden_size) - 5.039) / 5.55e-2)
    n_layer = max(1, n_layer)
    print(f'Using {n_layer} layers')
    
    # get number of parameters
    if gpt is True:
        config = GPT2Config(
            n_embd = hidden_size, n_layer = n_layer,
            n_head = 1, vocab_size = 0, n_positions = 0
        )
        model = GPT2LMHeadModel(config)
    elif rnn is True:
        model = LstmForCausalLM(
            hidden_size = hidden_size,
            n_layer = n_layer,
            vocab_size = 0
        )
    else:
        config = LeapConfig(
            hidden_size = hidden_size, n_layer = n_layer,
            n_head = n_head, vocab_size = 0, n_positions = 0
        )
        model = LeapForCausalLM(config)

    non_embedding_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'NON EMBEDDING PARAMETERS: {non_embedding_parameters:,}')

    # subset dataset using global lm_dataset
    global lm_dataset
    subset_datasets = subset_data(lm_dataset, non_embedding_parameters, param_data_ratio)

    if gpt is True:
        config = GPT2Config(
            n_embd = hidden_size, n_layer = n_layer, n_head = n_head,
            vocab_size = len(tokenizer) + 1, n_positions = seq_len,
            initializer_range = 1 / hidden_size**.5
        )
        model = GPT2LMHeadModel(config)
    elif rnn is True:
        model = LstmForCausalLM(
            hidden_size = hidden_size,
            n_layer = n_layer,
            vocab_size = len(tokenizer) + 1,
        )
    else:
        config = LeapConfig(
            hidden_size = hidden_size, n_layer = n_layer, n_head = n_head,
            vocab_size = len(tokenizer) + 1, n_positions = seq_len,
            use_local_att = True, window_sizes = None, rescale = 10,
            initializer_range = 1 / hidden_size**.5,
        )
        model = LeapForCausalLM(config)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=default_data_collator,
        train_dataset=subset_datasets["train"],
        eval_dataset=subset_datasets["validation"],
        callbacks = [EarlyStoppingCallback]
    )

    trainer.train()
    
    print("\n===============TOTAL TRAINING FLOATING POINT OPERATIONS===============\n")
    print(f'Numeric form: {int(trainer.state.total_flos)}\nHuman Readable: {int(trainer.state.total_flos):,}')

    print("\n===============TEST SET CROSS ENTROPY LOSS EVALUATION===============\n")
    print(trainer.evaluate(subset_datasets["test"]))

    # save gpu memory
    del trainer
    del model
    del subset_datasets
    torch.cuda.empty_cache()

# LEAP TRAINING
Each run is done seperately in it's own cell just for easy viewing of logs and in case something goes wrong (OOM errors or training issues)

In [6]:
run_training(hidden_size = 64, n_head = 2)



Using 1 layers
NON EMBEDDING PARAMETERS: 49,856
NUMBER OF TOKENS: 33,792




Epoch,Training Loss,Validation Loss
1,9.216,8.72605
2,8.3151,7.873346
3,7.6048,7.315463
4,7.0455,6.896474
5,6.6167,6.619842
6,6.3302,6.449937
7,6.1491,6.371703
8,6.0427,6.316381
9,5.979,6.295048
10,5.9344,6.281782




Numeric form: 151626055680
Human Readable: 151,626,055,680




{'eval_loss': 6.197883129119873, 'eval_runtime': 0.9018, 'eval_samples_per_second': 263.919, 'eval_steps_per_second': 131.959, 'epoch': 15.0}


In [7]:
run_training(hidden_size = 96, n_head = 3)

Using 1 layers
NON EMBEDDING PARAMETERS: 111,648
NUMBER OF TOKENS: 61,440




Epoch,Training Loss,Validation Loss
1,8.9335,7.853873
2,7.1737,6.631284
3,6.2441,6.277154
4,5.9729,6.237168
5,5.9049,6.254429




Numeric form: 205789593600
Human Readable: 205,789,593,600




{'eval_loss': 6.165459632873535, 'eval_runtime': 1.0394, 'eval_samples_per_second': 228.989, 'eval_steps_per_second': 114.494, 'epoch': 5.0}


In [8]:
run_training(hidden_size = 128, n_head = 4)

Using 1 layers
NON EMBEDDING PARAMETERS: 198,016
NUMBER OF TOKENS: 94,208




Epoch,Training Loss,Validation Loss
1,8.7371,7.288536
2,6.4521,6.222474
3,5.9562,6.21658
4,5.9124,6.205697
5,5.8581,6.155235
6,5.7935,6.12076
7,5.7337,6.091662
8,5.6805,6.081302
9,5.631,6.051317
10,5.5923,6.032889




Numeric form: 1231209627648
Human Readable: 1,231,209,627,648




{'eval_loss': 5.965877532958984, 'eval_runtime': 0.9655, 'eval_samples_per_second': 246.495, 'eval_steps_per_second': 123.247, 'epoch': 11.0}


In [9]:
run_training(hidden_size = 160, n_head = 5)

Using 1 layers
NON EMBEDDING PARAMETERS: 308,960
NUMBER OF TOKENS: 130,048




Epoch,Training Loss,Validation Loss
1,8.2176,6.536333
2,6.1062,6.202743
3,5.9779,6.182458
4,5.9058,6.111916
5,5.834,6.070071
6,5.7588,6.043207
7,5.7042,6.003044
8,5.633,5.989629
9,5.5826,5.956466
10,5.5329,5.984838




Numeric form: 2410777804800
Human Readable: 2,410,777,804,800




{'eval_loss': 5.882508754730225, 'eval_runtime': 0.8682, 'eval_samples_per_second': 274.115, 'eval_steps_per_second': 137.058, 'epoch': 10.0}


In [10]:
run_training(hidden_size = 192, n_head = 6)

Using 4 layers
NON EMBEDDING PARAMETERS: 1,776,768




NUMBER OF TOKENS: 475,136




Epoch,Training Loss,Validation Loss
1,6.843,6.099133
2,5.8642,5.755672
3,5.6089,5.576675
4,5.4252,5.444574
5,5.2612,5.324715
6,5.1149,5.233246
7,4.9851,5.166789
8,4.8774,5.10425
9,4.7793,5.065774
10,4.6965,5.012495




Numeric form: 96239534211072
Human Readable: 96,239,534,211,072




{'eval_loss': 4.866719722747803, 'eval_runtime': 1.7209, 'eval_samples_per_second': 138.303, 'eval_steps_per_second': 69.152, 'epoch': 19.0}


In [11]:
run_training(hidden_size = 256, n_head = 8)

Using 9 layers
NON EMBEDDING PARAMETERS: 7,099,136




NUMBER OF TOKENS: 1,323,008




Epoch,Training Loss,Validation Loss
1,6.2988,5.666354
2,5.4566,5.243888
3,5.0922,4.971
4,4.8394,4.79457
5,4.6473,4.687461
6,4.4919,4.594444
7,4.3566,4.51605
8,4.2326,4.461733
9,4.1204,4.412426
10,4.0182,4.391651




Numeric form: 958005799550976
Human Readable: 958,005,799,550,976




{'eval_loss': 4.3101677894592285, 'eval_runtime': 2.8093, 'eval_samples_per_second': 84.72, 'eval_steps_per_second': 42.36, 'epoch': 17.0}


In [12]:
run_training(hidden_size = 320, n_head = 10)

Using 13 layers




NON EMBEDDING PARAMETERS: 16,012,480
NUMBER OF TOKENS: 2,414,592




Epoch,Training Loss,Validation Loss
1,6.0146,5.466545
2,5.1937,4.929879
3,4.7747,4.63854
4,4.5086,4.45787
5,4.3038,4.331307
6,4.1294,4.228373
7,3.9776,4.165196
8,3.8409,4.119393
9,3.7178,4.090394
10,3.6027,4.069636




Numeric form: 2551798003138560
Human Readable: 2,551,798,003,138,560




{'eval_loss': 4.044043064117432, 'eval_runtime': 4.134, 'eval_samples_per_second': 57.571, 'eval_steps_per_second': 28.785, 'epoch': 11.0}


In [13]:
run_training(hidden_size = 512, n_head = 16)

Using 22 layers




NON EMBEDDING PARAMETERS: 69,308,416
NUMBER OF TOKENS: 7,139,328




Epoch,Training Loss,Validation Loss
1,5.5853,4.997616
2,5.9296,6.358079




Numeric form: 5937786179813376
Human Readable: 5,937,786,179,813,376




{'eval_loss': 4.929450511932373, 'eval_runtime': 10.0663, 'eval_samples_per_second': 23.643, 'eval_steps_per_second': 11.822, 'epoch': 2.0}


In [14]:
run_training(hidden_size = 620, n_head = 20)

Using 25 layers




NON EMBEDDING PARAMETERS: 115,460,740




Epoch,Training Loss,Validation Loss
1,5.9946,6.520983
2,6.7509,




Numeric form: 14072894423777280
Human Readable: 14,072,894,423,777,280




{'eval_loss': 6.514883041381836, 'eval_runtime': 14.1285, 'eval_samples_per_second': 16.845, 'eval_steps_per_second': 8.423, 'epoch': 2.0}


# GPT2 TRAINING

In [20]:
run_training(hidden_size = 64, n_head = 1, gpt = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 50,112
NUMBER OF TOKENS: 34,816




Epoch,Training Loss,Validation Loss
1,9.3725,8.864362
2,8.4094,7.91565
3,7.6302,7.324174
4,7.0446,6.885992
5,6.6148,6.588628
6,6.3096,6.400613
7,6.1108,6.289826
8,5.9805,6.236464
9,5.9013,6.206777
10,5.837,6.188633




Numeric form: 209363927040
Human Readable: 209,363,927,040




{'eval_loss': 6.0926971435546875, 'eval_runtime': 0.8163, 'eval_samples_per_second': 291.566, 'eval_steps_per_second': 145.783, 'epoch': 20.0}


In [21]:
run_training(hidden_size = 96, n_head = 1, gpt = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 112,032
NUMBER OF TOKENS: 61,440




Epoch,Training Loss,Validation Loss
1,8.9195,7.813031
2,7.0952,6.567191
3,6.1851,6.218553
4,5.9063,6.150863
5,5.7992,6.114497
6,5.7194,6.070238
7,5.651,6.044841
8,5.5991,6.035303
9,5.5502,6.023751
10,5.5072,6.024307




Numeric form: 412994764800
Human Readable: 412,994,764,800




{'eval_loss': 5.943718433380127, 'eval_runtime': 0.8089, 'eval_samples_per_second': 294.228, 'eval_steps_per_second': 147.114, 'epoch': 10.0}


In [22]:
run_training(hidden_size = 128, n_head = 2, gpt = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 198,528
NUMBER OF TOKENS: 94,208




Epoch,Training Loss,Validation Loss
1,8.1853,6.763832
2,6.164,6.108711
3,5.7883,6.011187
4,5.6565,5.964048
5,5.5541,5.920763
6,5.4674,5.896141
7,5.3888,5.879999
8,5.3113,5.872243
9,5.2378,5.86617
10,5.1666,5.861701




Numeric form: 1346610659328
Human Readable: 1,346,610,659,328




{'eval_loss': 5.780641555786133, 'eval_runtime': 0.7975, 'eval_samples_per_second': 298.42, 'eval_steps_per_second': 149.21, 'epoch': 12.0}


In [23]:
run_training(hidden_size = 160, n_head = 2, gpt = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 309,600
NUMBER OF TOKENS: 131,072




Epoch,Training Loss,Validation Loss
1,7.8852,6.259148
2,5.9198,5.95297
3,5.6834,5.865604
4,5.5406,5.813881
5,5.4182,5.770795
6,5.2967,5.737412
7,5.1816,5.723117
8,5.0721,5.710388
9,4.973,5.702203
10,4.8858,5.708678




Numeric form: 2434793472000
Human Readable: 2,434,793,472,000




{'eval_loss': 5.621386528015137, 'eval_runtime': 0.8283, 'eval_samples_per_second': 287.337, 'eval_steps_per_second': 143.669, 'epoch': 10.0}


In [24]:
run_training(hidden_size = 192, n_head = 3, gpt = True)

Using 4 layers
NON EMBEDDING PARAMETERS: 1,779,840




NUMBER OF TOKENS: 475,136




Epoch,Training Loss,Validation Loss
1,6.5835,5.733374
2,5.52,5.458075
3,5.2364,5.299707
4,5.0342,5.194823
5,4.8752,5.148612
6,4.7398,5.113988
7,4.6206,5.096081
8,4.5095,5.091672
9,4.4078,5.091411
10,4.3136,5.094698




Numeric form: 50739963494400
Human Readable: 50,739,963,494,400




{'eval_loss': 5.021527290344238, 'eval_runtime': 1.5064, 'eval_samples_per_second': 157.994, 'eval_steps_per_second': 78.997, 'epoch': 10.0}


In [25]:
run_training(hidden_size = 256, n_head = 4, gpt = True)

Using 9 layers
NON EMBEDDING PARAMETERS: 7,108,352




NUMBER OF TOKENS: 1,324,032




Epoch,Training Loss,Validation Loss
1,5.9305,5.335121
2,5.119,5.016593
3,4.8585,4.893312
4,4.6967,4.804842
5,4.5566,4.740015
6,4.425,4.683586
7,4.3021,4.648248
8,4.1833,4.597099
9,4.0664,4.570751
10,3.9551,4.558356




Numeric form: 677641357099008
Human Readable: 677,641,357,099,008




{'eval_loss': 4.513540744781494, 'eval_runtime': 3.4159, 'eval_samples_per_second': 69.675, 'eval_steps_per_second': 34.838, 'epoch': 12.0}


In [28]:
run_training(hidden_size = 320, n_head = 5, gpt = True)

Using 13 layers




NON EMBEDDING PARAMETERS: 16,029,120
NUMBER OF TOKENS: 2,416,640




Epoch,Training Loss,Validation Loss
1,5.6436,5.090171
2,4.9227,4.822154
3,4.6815,4.652691
4,4.4826,4.513328
5,4.2946,4.391515
6,4.1189,4.264635
7,3.9545,4.181118
8,3.8089,4.126382
9,3.6747,4.07972
10,3.5516,4.068955




Numeric form: 2789036104089600
Human Readable: 2,789,036,104,089,600




{'eval_loss': 4.040293216705322, 'eval_runtime': 5.5647, 'eval_samples_per_second': 42.769, 'eval_steps_per_second': 21.385, 'epoch': 12.0}


In [27]:
run_training(hidden_size = 448, n_head = 7, gpt = True)

Using 19 layers




NON EMBEDDING PARAMETERS: 45,872,064
NUMBER OF TOKENS: 5,261,312




Epoch,Training Loss,Validation Loss
1,5.3144,4.868124
2,4.7044,4.58435
3,4.4317,4.307031
4,4.1246,4.053065
5,3.8788,3.9086
6,3.6926,3.793347
7,3.5391,3.735225
8,3.4026,3.679291
9,3.2739,3.666333
10,3.15,3.661678




Numeric form: 15928917892005888
Human Readable: 15,928,917,892,005,888




{'eval_loss': 3.634080410003662, 'eval_runtime': 10.9414, 'eval_samples_per_second': 21.752, 'eval_steps_per_second': 10.876, 'epoch': 11.0}


# LSTM TRAINING

In [32]:
run_training(hidden_size = 64, rnn = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 33,408
NUMBER OF TOKENS: 25,600




Epoch,Training Loss,Validation Loss
1,30.3158,29.109987
2,27.6201,26.192972
3,24.9755,24.04924
4,22.9106,22.105282
5,20.8677,20.157061
6,18.9271,18.141685
7,17.0054,16.385288
8,15.575,15.186225
9,14.5992,14.355819
10,13.8443,13.760425




Numeric form: 102629376000
Human Readable: 102,629,376,000




{'eval_loss': 12.191839218139648, 'eval_runtime': 3.3843, 'eval_samples_per_second': 70.325, 'eval_steps_per_second': 35.162, 'epoch': 20.0}


In [33]:
run_training(hidden_size = 96, rnn = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 74,688
NUMBER OF TOKENS: 46,080




Epoch,Training Loss,Validation Loss
1,36.664,33.235603
2,29.1322,25.595917
3,21.8683,18.297552
4,16.4627,15.074944
5,14.0179,13.292471
6,12.4703,12.100291
7,11.3896,11.238749
8,10.549,10.578231
9,9.9204,10.035317
10,9.3817,9.616461




Numeric form: 412994764800
Human Readable: 412,994,764,800




{'eval_loss': 8.518646240234375, 'eval_runtime': 3.4943, 'eval_samples_per_second': 68.112, 'eval_steps_per_second': 34.056, 'epoch': 20.0}


In [34]:
run_training(hidden_size = 128, rnn = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 132,352
NUMBER OF TOKENS: 69,632




Epoch,Training Loss,Validation Loss
1,40.3938,34.380318
2,27.9842,22.011951
3,18.0243,16.001167
4,14.1583,13.29362
5,11.868,11.476675
6,10.2868,10.2543
7,9.2302,9.46531
8,8.5114,8.92486
9,8.0075,8.546611
10,7.6294,8.25338




Numeric form: 1105912135680
Human Readable: 1,105,912,135,680




{'eval_loss': 7.374115467071533, 'eval_runtime': 3.5662, 'eval_samples_per_second': 66.737, 'eval_steps_per_second': 33.369, 'epoch': 20.0}


In [35]:
run_training(hidden_size = 160, rnn = True)

Using 1 layers
NON EMBEDDING PARAMETERS: 206,400
NUMBER OF TOKENS: 97,280




Epoch,Training Loss,Validation Loss
1,40.6947,30.122379
2,19.6,15.083302
3,12.7896,11.74895
4,10.2832,9.948137
5,8.793,8.878134
6,7.8952,8.216182
7,7.3118,7.799478
8,6.9314,7.523981
9,6.6467,7.306419
10,6.4452,7.161821




Numeric form: 2409431040000
Human Readable: 2,409,431,040,000




{'eval_loss': 6.706872463226318, 'eval_runtime': 3.4749, 'eval_samples_per_second': 68.491, 'eval_steps_per_second': 34.245, 'epoch': 20.0}


In [36]:
run_training(hidden_size = 192, rnn = True)

Using 4 layers
NON EMBEDDING PARAMETERS: 1,186,176




NUMBER OF TOKENS: 352,256




Epoch,Training Loss,Validation Loss
1,24.8012,13.069245
2,9.997,8.187204
3,7.2572,6.790093
4,6.3485,6.370649
5,5.9479,5.996587
6,5.7201,5.856564
7,5.5708,5.781667
8,5.4618,5.699616
9,5.3774,5.667982
10,5.3074,5.612002




Numeric form: 40112410853376
Human Readable: 40,112,410,853,376




{'eval_loss': 5.4911789894104, 'eval_runtime': 11.6071, 'eval_samples_per_second': 20.505, 'eval_steps_per_second': 10.252, 'epoch': 16.0}


In [37]:
run_training(hidden_size = 256, rnn = True)

Using 9 layers
NON EMBEDDING PARAMETERS: 4,737,536




NUMBER OF TOKENS: 980,992




Epoch,Training Loss,Validation Loss
1,19.5868,8.529173
2,7.0239,6.534838
3,6.0289,5.972638
4,5.6798,5.623368
5,5.478,5.499987
6,5.3412,5.382205
7,5.2227,5.247132
8,5.116,5.192339
9,5.0288,5.137402
10,4.95,5.093064




Numeric form: 557698189885440
Human Readable: 557,698,189,885,440




{'eval_loss': 4.893099308013916, 'eval_runtime': 24.2199, 'eval_samples_per_second': 9.827, 'eval_steps_per_second': 4.913, 'epoch': 20.0}


In [6]:
run_training(hidden_size = 320, rnn = True)

Using 13 layers
NON EMBEDDING PARAMETERS: 10,683,520




NUMBER OF TOKENS: 1,789,952




Epoch,Training Loss,Validation Loss
1,16.2082,7.186676
2,6.4271,5.967497
3,5.7961,5.667115
4,5.5824,5.506
5,5.4247,5.386971
6,5.2994,5.283226
7,5.1918,5.203666
8,5.1067,5.139759
9,5.0323,5.097339
10,4.9674,5.050296




Numeric form: 2294758558924800
Human Readable: 2,294,758,558,924,800




{'eval_loss': 4.83888578414917, 'eval_runtime': 35.3859, 'eval_samples_per_second': 6.726, 'eval_steps_per_second': 3.363, 'epoch': 20.0}


In [7]:
run_training(hidden_size = 448, rnn = True)

Using 19 layers
NON EMBEDDING PARAMETERS: 30,576,000




NUMBER OF TOKENS: 3,897,344




Epoch,Training Loss,Validation Loss
1,13.6183,6.56116
2,6.4403,6.310968
3,6.2648,6.277349
4,6.2041,6.18613
5,6.1769,6.162142
6,6.1569,6.139637
7,6.1448,6.13769
8,6.1325,6.152413




Numeric form: 5719929126912000
Human Readable: 5,719,929,126,912,000




{'eval_loss': 6.087133407592773, 'eval_runtime': 49.7891, 'eval_samples_per_second': 4.78, 'eval_steps_per_second': 2.39, 'epoch': 8.0}


# APPENDIX

In [None]:
# import re

# # to count tokens, comes from https://huggingface.co/docs/tokenizers/components
# whitespace_regex = re.compile("\w+|[^\w\s]+")

# # get number of tokens
# total_tokens = 0
# for row in raw_datasets["train"]["text"]:
#     total_tokens += len((whitespace_regex.split(row)))
# total_tokens