# Training a GPT model with a custom dataset
* [1. Import libraries](#heading1)
* [2. Prepare dataset](#heading2)
* [3. Initialize GPT model](#heading3)
* [4. Training](#heading4)

# 1. Import libraries <a class="anchor" id="heading1"></a>

In [None]:
import jadegpt

# 2. Prepare dataset <a class="anchor" id="heading2"></a>

In [None]:
# Where the raw dataset file lives: name first, then its directory.
data_file_name = "input.txt"
input_dir = 'C:\\data'

# Hand directory and file name to jadegpt; the result is kept in `data`,
# which the following cells split, export, and use to derive the vocab size.
data = jadegpt.open_dataset_file(input_dir, data_file_name)

In [None]:
# split data
# `split` is passed to split_dataset together with the loaded data;
# presumably it is the fraction assigned to the training set, with the
# remainder going to validation — TODO confirm against jadegpt.split_dataset.
split = 0.9

train_data, val_data = jadegpt.split_dataset(data, split)

In [None]:
# Output directory and file names handed to the export call below.
data_dir = 'C:\\data\\splits'
train_file_name = 'train.bin'
val_file_name = 'val.bin'
meta_file_name = 'meta.pkl'

# Encoding switch: True selects GPT-2 encoding, False a custom encoding.
use_gpt2_encoding = False

# Encode the full data and both splits and write them out via jadegpt.
jadegpt.export_data_to_files(
    data,
    train_data,
    val_data,
    use_gpt2_encoding,
    data_dir,
    train_file_name,
    val_file_name,
    meta_file_name,
)

# 3. Initialize GPT model <a class="anchor" id="heading3"></a>

In [None]:
# GPT architecture settings; every value here is forwarded to
# jadegpt.init_gpt when the model is created.
block_size = 32
n_layer = 6
n_head = 6
n_embd = 384
bias = False
dropout = 0.0

# Seed passed to init_gpt for model initialization.
random_seed = 1337

# Vocabulary size is derived from the loaded `data` alone.
# NOTE(review): if use_gpt2_encoding is switched to True, confirm that
# get_vocab_size still matches the encoding used for the exported files.
vocab_size = jadegpt.get_vocab_size(data)

In [None]:
# Build the GPT model from the seed, architecture settings, and vocab size.
model = jadegpt.init_gpt(
    random_seed,
    n_layer,
    n_head,
    n_embd,
    dropout,
    bias,
    block_size,
    vocab_size,
)

# 4. Training <a class="anchor" id="heading4"></a>

In [None]:
# load data files to memory-map
# NOTE: this rebinds train_data / val_data, which until this point held the
# splits returned by split_dataset. Re-running the export cell after this
# one would pass these loaded objects to export_data_to_files instead, so
# re-run the split cell first if the export has to be repeated.
train_data = jadegpt.load_data_file_to_memmap(data_dir, train_file_name)
val_data = jadegpt.load_data_file_to_memmap(data_dir, val_file_name)

In [None]:
# ---- training settings (all of these are passed to jadegpt.train_gpt) ----

# batching / hardware
batch_size = 8
gradient_accumulation_steps = 5
device = 'cuda'     # e.g. 'cpu', 'cuda', 'cuda:0', 'cuda:1'
dtype = 'bfloat16'  # one of 'float32', 'bfloat16', 'float16'

# evaluation and logging cadence
eval_interval = 50
eval_iters = 20
log_interval = 10

# AdamW optimizer
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99     # slightly larger because few tokens are seen per iteration
grad_clip = 1.0  # gradient clipping threshold; 0.0 disables clipping

# learning-rate schedule
max_iters = 100
learning_rate = 1e-3           # small networks can afford a higher rate
decay_lr = True                # turn learning-rate decay on or off
warmup_iters = 10              # probably not critical
lr_decay_iters = max_iters     # usually kept equal to max_iters
min_lr = learning_rate / 10.0  # usually learning_rate / 10

# checkpoint saving
only_save_on_finish = False
save_interval = 50
model_dir = 'C:\\model'
model_name = 'model'

In [None]:
# Start training with the model, loaded data, and settings defined in the
# preceding cells. Arguments are positional, so their order must be kept.
jadegpt.train_gpt(
    model, dtype, device, train_data, val_data, block_size, batch_size,
    max_iters, weight_decay, learning_rate, beta1, beta2, warmup_iters,
    lr_decay_iters, min_lr, decay_lr, eval_interval, eval_iters,
    gradient_accumulation_steps, grad_clip, log_interval,
    only_save_on_finish, save_interval, model_dir, model_name,
)