In [1]:
import torch
from leap import LeapForCausalLM, LeapConfig
from lstm import LstmForCausalLM
from transformers import (PreTrainedTokenizerFast, TrainingArguments, Trainer,
                          EarlyStoppingCallback, default_data_collator,
                          GPT2Config, GPT2LMHeadModel)

from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from torch.utils.data import Subset

# word level tokenizer as per wikitext modeling
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

import math
import copy
from itertools import chain
import logging
logging.disable(logging.INFO)

In [2]:
# globals
# Load the first 10% of the WikiText-103 training split plus the full validation
# and test splits. Passing a list as `split` yields a list in the same order,
# which is re-wrapped below into a DatasetDict keyed by split name.
raw_datasets = load_dataset("wikitext", "wikitext-103-v1", split = ["train[:10%]", "validation", "test"])
raw_datasets = DatasetDict({
    "train": raw_datasets[0],
    "validation": raw_datasets[1],
    "test": raw_datasets[2]
})

total_train_tokens = 10416407 # see appendix at the end of notebook
# presumably the parameter count of the largest model in the sweep -- TODO confirm
max_num_params = 115476240
# Scaling constant: subset_data() computes `num_parameters**.74 / param_data_ratio`,
# so a model with `max_num_params` parameters is given `total_train_tokens` tokens
# and smaller models get proportionally fewer.
param_data_ratio = max_num_params**.74 / total_train_tokens
# length (in tokens) of each training chunk; also used as the models' n_positions
seq_len = 1024
# NOTE(review): run_training() binds its own local `subset_datasets`, so this
# global alias does not appear to be read anywhere in the visible cells.
subset_datasets = raw_datasets

# hyperparameters
# Shared by every run. Logging, evaluation and checkpointing are all per-epoch,
# and `eval_loss` is the metric used to pick the checkpoint reloaded at the end.
training_args = TrainingArguments(
    output_dir = "./results",
    logging_strategy = "epoch",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    report_to = "none",
    learning_rate = 1e-3,
    lr_scheduler_type = "cosine",
    warmup_ratio = .05,
    num_train_epochs = 20,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    max_grad_norm = 1,
    fp16 = True,
)

Found cached dataset wikitext (C:\Users\micha\.cache\huggingface\datasets\wikitext\wikitext-103-v1\1.0.0\a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

# PREPROCESSING

In [3]:
# make a word level tokenizer
# Unknown words map to "<unk>"; text is split with the `Whitespace` pre-tokenizer.
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()
# NOTE(review): "<pad>" is not among the trainer's special tokens below, and
# "<unk>" (the only special token) presumably receives id 0 -- so pad_id = 0
# likely coincides with "<unk>". group_texts() masks padded positions, which
# would make this harmless, but confirm.
tokenizer.enable_padding(pad_id = 0, pad_token = "<pad>")
# no post processing

# WE USE A SET VOCAB SIZE OF 8,192 FOR SPEED (the oov should only be around 5%)
# The trainer is capped at 8,191 entries; the remaining slot is left for the pad token.
token_trainer = WordLevelTrainer(vocab_size = 8191, # -1 for pad token
                                 special_tokens = ["<unk>"])

def batch_iterator(batch_size=10000):
    """Yield the training split's text column in consecutive slices.

    Each yielded item is a list of at most `batch_size` rows taken from the
    global `raw_datasets["train"]`; the final slice may be shorter.
    """
    lines = raw_datasets["train"]['text']
    offsets = range(0, len(lines), batch_size)
    yield from (lines[lo : lo + batch_size] for lo in offsets)

# Fit the word-level vocabulary on the training text, streamed in batches.
# `length` is the total number of rows that the iterator will produce.
tokenizer.train_from_iterator(batch_iterator(),
                              trainer = token_trainer,
                              length=len(raw_datasets["train"]["text"]))
# Re-bind `tokenizer` to the transformers wrapper around the trained tokenizer;
# every later cell uses this wrapped object.
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, pad_token = "<pad>")

# tokenized the dataset
def tokenize_function(examples):
    """Apply the notebook-level `tokenizer` to a batch's "text" column and return its output."""
    return tokenizer(examples["text"])

# tokenize dataset
# Runs tokenize_function over every split in batches and drops the raw "text"
# column, leaving only the tokenizer's output columns.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns = "text",
    desc=f"tokenize dataset"
)

def group_texts(examples, block_size = None):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size chunks.

    Every column in `examples` is flattened and cut into consecutive chunks of
    `block_size` tokens. A "labels" column is added as a copy of "input_ids"
    (the comment in the original code notes the model shifts them itself).
    Only the final chunk of the batch can be short; it is right-padded so that
    "input_ids" gets 0, "attention_mask" gets 0 and "labels" gets -100 (ignored
    by CrossEntropyLoss). Other columns are chunked but not padded.

    Because padding is applied per call, using this with a batched `map` pads
    the last chunk of *each* batch, not just the last chunk of the dataset.

    Args:
        examples: mapping of column name to a list of token lists; must contain
            "input_ids" and "attention_mask".
        block_size: chunk length. Defaults to the notebook-level `seq_len`.

    Returns:
        A dict with the same columns plus "labels", each a list of chunks. All
        lists are empty when the batch contains no tokens.
    """
    if block_size is None:
        block_size = seq_len

    # Concatenate all texts
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(next(iter(concatenated_examples.values()), []))

    # Split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # for language modeling, inputs are labels (they will be shifted inside the model)
    # A shallow copy is enough: padding below builds new lists instead of
    # mutating the shared chunks in place.
    result["labels"] = result["input_ids"].copy()

    # A batch with no tokens (e.g. only empty lines) produces no chunks, so
    # there is no last block to pad.
    if total_length == 0:
        return result

    # pad last block with 0
    last_ids = result["input_ids"][-1]
    diff = block_size - len(last_ids)
    result["input_ids"][-1] = last_ids + [0] * diff

    # set attention mask to mask out these tokens
    result["attention_mask"][-1] = result["attention_mask"][-1] + [0] * diff

    # set pad labels to -100 so they will be ignored by CrossEntropyLoss
    result["labels"][-1] = result["labels"][-1] + [-100] * diff
    return result

# Re-chunk every split into blocks of `seq_len` tokens (group_texts reads the
# global `seq_len`). group_texts pads the final chunk of each 10,000-row batch,
# so padded chunks can occur at batch boundaries, not only at the end of a split.
lm_dataset = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=10000,
    desc=f"Grouping texts in chunks of {seq_len}"
)

# Drop the tokenizer's "token_type_ids" column from the grouped splits.
lm_dataset = lm_dataset.remove_columns(["token_type_ids"])

Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-0baa55087bc50208.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-6eac8a2ae7639342.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-74f5150031e97cdd.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-d88b35938dba8cca.arrow
Loading cached processed dataset at C:/Users/micha/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-211

## helper function

In [4]:
def subset_data(dataset, num_parameters, param_data_ratio):
    """Return a copy of `dataset` whose "train" split is cut to a parameter-scaled token budget.

    The budget is ``num_parameters**.74 / param_data_ratio`` tokens. Rows are
    taken from the start of the training split, in order, until the running
    token count first reaches the budget (the row that crosses it is kept).
    If the split holds fewer tokens than the budget, every row is kept; an
    empty split yields an empty subset instead of raising.

    Args:
        dataset: mapping of split name to dataset. Must contain "train", whose
            rows expose an "input_ids" sequence. It is not modified.
        num_parameters: parameter count used to scale the budget.
        param_data_ratio: divisor converting ``num_parameters**.74`` to tokens.

    Returns:
        A new DatasetDict that shares every split with `dataset` except
        "train", which is replaced by the selected prefix.
    """
    subset_num_tokens = num_parameters**.74 / param_data_ratio

    training_set = dataset["train"]
    total_tokens = 0
    num_rows = 0
    # add rows until we meet the subset_num_tokens
    for row in training_set:
        total_tokens += len(row["input_ids"])
        num_rows += 1
        if total_tokens >= subset_num_tokens:
            break
    # Report the count even when the split ran out before reaching the budget.
    print(f'NUMBER OF TOKENS: {total_tokens:,}')

    # Only the "train" key is rebound, so a shallow copy of the mapping keeps
    # the caller's object intact without deep-copying every split.
    subset = DatasetDict(dataset)
    # `select` keeps the existing storage and schema rather than rebuilding
    # the rows through Python dicts.
    subset["train"] = training_set.select(range(num_rows))
    return subset

# TRAINING FUNCTION

In [7]:
def run_training(hidden_size, n_head = None, gpt = False, rnn = False):
    """Train one model of the given width on a size-matched data subset and print its results.

    Steps:
      1. Derive the layer count from `hidden_size` (formula attributed to
         Levine 2020 in the original comment), with a minimum of one layer.
      2. Build a throwaway model with `vocab_size = 0` (and `n_positions = 0`
         where applicable) to count the trainable non-embedding parameters.
      3. Cut the global `lm_dataset` training split down with `subset_data`,
         using that count and the global `param_data_ratio`.
      4. Build the real model, train it with the global `training_args` and
         early stopping, then print the total training FLOs and the test-set
         evaluation.

    Relies on notebook globals: `lm_dataset`, `param_data_ratio`, `tokenizer`,
    `seq_len` and `training_args`.

    Args:
        hidden_size: model width.
        n_head: number of attention heads for the LEAP and GPT2 models; not
            used for the LSTM. For GPT2 it must divide `hidden_size`.
        gpt: train a GPT2 model.
        rnn: train an LSTM model. With neither flag set, a LEAP model is trained.

    Raises:
        ValueError: if both `gpt` and `rnn` are set, or if `gpt` is set and
            `n_head` is missing or does not divide `hidden_size`.
    """
    # Fail fast on argument errors, before any model is built or data is
    # subset. Previously `gpt` silently won over `rnn`, and a bad GPT2 head
    # count only surfaced once the real model was constructed.
    if gpt and rnn:
        raise ValueError("`gpt` and `rnn` are mutually exclusive; set at most one of them")
    if gpt and (n_head is None or hidden_size % n_head != 0):
        raise ValueError(
            f"GPT2 needs `n_head` to divide `hidden_size` "
            f"(got hidden_size={hidden_size}, n_head={n_head})"
        )

    # calculate number of layers needed based on levine 2020
    n_layer = round((math.log(hidden_size) - 5.039) / 5.55e-2)
    n_layer = max(1, n_layer)
    print(f'Using {n_layer} layers')

    # get number of parameters
    # (probe model: empty vocabulary / positions so embeddings contribute nothing)
    if gpt:
        config = GPT2Config(
            n_embd = hidden_size, n_layer = n_layer,
            n_head = 1, vocab_size = 0, n_positions = 0
        )
        model = GPT2LMHeadModel(config)
    elif rnn:
        model = LstmForCausalLM(
            hidden_size = hidden_size,
            n_layer = n_layer,
            vocab_size = 0
        )
    else:
        config = LeapConfig(
            hidden_size = hidden_size, n_layer = n_layer,
            n_head = 1, vocab_size = 0, n_positions = 0
        )
        model = LeapForCausalLM(config)

    non_embedding_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'NON EMBEDDING PARAMETERS: {non_embedding_parameters:,}')

    # subset dataset using global lm_dataset (read-only, so no `global` statement is needed)
    subset_datasets = subset_data(lm_dataset, non_embedding_parameters, param_data_ratio)

    # build the model that is actually trained
    vocab_size = len(tokenizer) + 1
    if gpt:
        config = GPT2Config(
            n_embd = hidden_size, n_layer = n_layer, n_head = n_head,
            vocab_size = vocab_size, n_positions = seq_len,
            initializer_range = 1 / hidden_size**.5
        )
        model = GPT2LMHeadModel(config)
    elif rnn:
        model = LstmForCausalLM(
            hidden_size = hidden_size,
            n_layer = n_layer,
            vocab_size = vocab_size,
        )
    else:
        config = LeapConfig(
            hidden_size = hidden_size, n_layer = n_layer, n_head = n_head,
            vocab_size = vocab_size, n_positions = seq_len,
            use_local_att = True, window_sizes = None, rescale = 10,
            initializer_range = 1 / hidden_size**.5,
        )
        model = LeapForCausalLM(config)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=default_data_collator,
        train_dataset=subset_datasets["train"],
        eval_dataset=subset_datasets["validation"],
        callbacks = [EarlyStoppingCallback]
    )

    try:
        trainer.train()

        print("\n===============TOTAL TRAINING FLOATING POINT OPERATIONS===============\n")
        print(f'Numeric form: {int(trainer.state.total_flos)}\nHuman Readable: {int(trainer.state.total_flos):,}')

        print("\n===============TEST SET CROSS ENTROPY LOSS EVALUATION===============\n")
        print(trainer.evaluate(subset_datasets["test"]))
    finally:
        # save gpu memory
        # Best effort: also runs when training raises (e.g. out of memory), so
        # this function's references are dropped before the next cell runs.
        del trainer
        del model
        del subset_datasets
        torch.cuda.empty_cache()

# LEAP TRAINING
Each run is done separately in its own cell, both for easy viewing of the logs and in case something goes wrong (OOM errors or training issues).

In [8]:
run_training(hidden_size = 64, n_head = 2)

Using 1 layers
NON EMBEDDING PARAMETERS: 49,920
NUMBER OF TOKENS: 33,792




Epoch,Training Loss,Validation Loss
1,9.4683,8.942409
2,8.3004,7.63612
3,7.0529,6.699231
4,6.2479,6.320432
5,5.9211,6.239658
6,5.8057,6.227838
7,5.7357,6.217718
8,5.6779,6.208103
9,5.631,6.198891
10,5.5823,6.188228




Numeric form: 192306216960
Human Readable: 192,306,216,960




{'eval_loss': 6.086040496826172, 'eval_runtime': 0.8951, 'eval_samples_per_second': 265.896, 'eval_steps_per_second': 132.948, 'epoch': 19.0}


In [9]:
run_training(hidden_size = 96, n_head = 3)

Using 1 layers
NON EMBEDDING PARAMETERS: 111,744




NameError: name 'lm_dataset' is not defined

In [None]:
run_training(hidden_size = 128, n_head = 4)

In [None]:
run_training(hidden_size = 160, n_head = 5)

In [None]:
run_training(hidden_size = 192, n_head = 6)

In [None]:
run_training(hidden_size = 256, n_head = 8)

In [None]:
run_training(hidden_size = 320, n_head = 10)

In [None]:
run_training(hidden_size = 512, n_head = 16)

In [None]:
run_training(hidden_size = 620, n_head = 20)

# LSTM TRAINING

In [None]:
run_training(hidden_size = 64, rnn = True)

In [None]:
run_training(hidden_size = 96, rnn = True)

In [None]:
run_training(hidden_size = 128, rnn = True)

In [None]:
run_training(hidden_size = 160, rnn = True)

In [None]:
run_training(hidden_size = 192, rnn = True)

In [None]:
run_training(hidden_size = 256, rnn = True)

In [None]:
run_training(hidden_size = 320, rnn = True)

In [None]:
# run_training(hidden_size = 448, rnn = True)

# GPT2 TRAINING

In [None]:
run_training(hidden_size = 64, n_head = 1, gpt = True)

In [None]:
run_training(hidden_size = 96, n_head = 2, gpt = True)

In [None]:
run_training(hidden_size = 128, n_head = 2, gpt = True)

In [None]:
# GPT2 requires hidden_size to be divisible by n_head; 160 is not divisible by 3,
# so use 2 heads (head size 80), the closest valid choice to the ~64 used by the other runs.
run_training(hidden_size = 160, n_head = 2, gpt = True)

In [None]:
run_training(hidden_size = 192, n_head = 3, gpt = True)

In [None]:
run_training(hidden_size = 256, n_head = 4, gpt = True)

In [None]:
run_training(hidden_size = 320, n_head = 5, gpt = True)

In [None]:
run_training(hidden_size = 448, n_head = 7, gpt = True)

# APPENDIX

In [None]:
# import re

# # to count tokens, comes from https://huggingface.co/docs/tokenizers/components
# whitespace_regex = re.compile("\w+|[^\w\s]+")

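# # get number of tokens
# # NOTE(review): `split` returns the pieces *between* matches, so the loop below
# # counts (matches + 1) per row, including 1 for every empty row. Use
# # `len(whitespace_regex.findall(row))` to count the tokens themselves, and
# # update `total_train_tokens` in the globals cell if the number changes.
# total_tokens = 0
# for row in raw_datasets["train"]["text"]:
#     total_tokens += len((whitespace_regex.split(row)))
# total_tokens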