In [1]:
### JAX

# UPDATE/TODO XXX: We can now move to jax24.04-py3 (https://docs.nvidia.com/deeplearning/frameworks/jax-release-notes/rel-24-04.html)
# TODO: this is slightly faster even with the warning -> invewstigate (current jax version is 0.4.26, where the image has 0.4.17)
#! pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
#2024-05-02 08:16:04.763248: W external/xla/xla/service/gpu/nvptx_compiler.cc:718] 
#The NVIDIA driver's CUDA version is 12.2 which is older than the ptxas CUDA version (12.4.131). 
#Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. 
#You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.

# TODO: It looks like I am suffering from fragmentation on GPU, thus enabling prelocation
# Disable JAX memory preallocation
#import os
#os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"]="false"
#os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"]=".90"
#%env XLA_PYTHON_CLIENT_PREALLOCATE=false
%env XLA_PYTHON_CLIENT_MEM_FRACTION=0.95

#!LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
import jax
jax.devices()

env: XLA_PYTHON_CLIENT_MEM_FRACTION=0.95


[cuda(id=0)]

In [2]:
### DATASETs
import datasets
from tokenized_dataset import load_tokenized_dataset_gpt2, load_tokenized_dataset_hellaswag, unpack_hellaswag_x, unpack_hellaswag_batched_x, concatenate_hellaswag_y_and_choice, get_batched_examples, get_batched_examples_packed 
ds, (tokenize, detokenize, tokenizer_vocab_size) = load_tokenized_dataset_gpt2("train[:10%]") #:1% or :1000
ds = ds.train_test_split(test_size=0.01, seed=42) # TODO: put seed in better place? does it mess up with resume_from_checkpoint logic?
ds = datasets.DatasetDict({
    'train': ds['train'],
    'validation': ds['test'] #rename
})
print(ds)

# Some stats on HellaSwag. Given the tokenicer: 
# Max len of concatenated y+longest choice is 149
# Max sum of choices tokens lens is 263 (Important for flattening choices in x + seq_len param for data collactor)
hellaswag_ds = load_tokenized_dataset_hellaswag(tokenize)
print(hellaswag_ds)

# Tests:
# item = next(x for x in hellaswag_ds)
# print(item)
# print(detokenize((item['y'],)))
# item_x = item['x']
# choices, label = unpack_hellaswag_x(item['x'])
# print(detokenize(choices)) # TODO XXX: one of chocies has ", while others have '. Is it anything serious?
# print(label)

Loading FineWeb-Edu dataset


Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

Loading tokenizer bpe_tokenizer_fineweb-edu_sample-10BT_100k_ds_merges_30k.pickle
HotFix: Filter out items containing out-of-vocabulary words
Tokenizing dataset
DatasetDict({
    train: Dataset({
        features: ['x', 'y'],
        num_rows: 952096
    })
    validation: Dataset({
        features: ['x', 'y'],
        num_rows: 9618
    })
})
Loading HellaSwag dataset
Tokenizing dataset
Dataset({
    features: ['x', 'y'],
    num_rows: 10042
})


In [3]:
### Model
from model_jax import *
import jax.numpy as jnp
from jax import random

LAYERS = 12
model_vocab_size = tokenizer_vocab_size + 3 # add padding token (0) + start of sequence token + end of sequence token 
START_TOK = tokenizer_vocab_size + 1
END_TOK = tokenizer_vocab_size + 2 # TODO: in default LLM convention, it should be 1. Also, it could be part of tokenizer_vocab_size
EMB_DIM=768
FFN_DIM=3072
NUM_HEADS = 12
seq_len= 512 # TODO XXX: 1024 is orginal paper
params = init_transformer_gpt2(model_vocab_size, EMB_DIM, LAYERS, NUM_HEADS, FFN_DIM, seq_len, random.PRNGKey(0))

print(f'Vocabulary size: {model_vocab_size:_}')
print(f'Number of params: {count_num_params(params):_}')

### Loss + Grads + Optimizers
from loss_and_optimizer_jax import loss_train, loss_eval, log_probs, grad_loss, predict, acc_grad_loss, init_adam_w, adam_w_in_place, grads_l2norm, grads_grps_l2norms

# Figure out non bias/gain params, as we only want to apply weight decay to those in AdamW
# Only 1D weights, which are initialized to 0s are bias/gain params (including bias of LayerNorm)
weight_decay_mask = tuple([ tuple([not (item.ndim==1 and all(item==0)) for item in grp]) for grp in params])
print(weight_decay_mask)

Vocabulary size: 35_374
Number of params: 112_614_958
((True, False), (True,), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False, True, True, True, False, True, False, True, False), (True, False))


In [4]:
### Infra utils
def print_mem_stats():
    mem_stats = jax.devices()[0].memory_stats()
    conv = lambda k: mem_stats[k] / pow(1000,3)
    print(f'GB in use: {conv("bytes_in_use")}. GB limit: {conv("bytes_limit")}')

import wandb

# start a new wandb run to track this script
if True:
    wandb.init(
        # set the wandb project where this run will be logged
        project="t",
    
        # track hyperparameters and run metadata
        #config={
        #"learning_rate": 0.02,
        #"architecture": "CNN",
        #"dataset": "CIFAR-100",
        #"epochs": 10,
        #}
        sync_tensorboard=True
    )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
2024-11-22 21:17:42.988453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732310263.003473   12885 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732310263.008111   12885 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mmkukla[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
## Training loop
import datetime
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import itertools
import pickle
import evaluate
import numpy as np # should we get rid of it?
import math

# Infra training params
run_name = datetime.datetime.now().strftime("%h%d_%H-%M-%S")
log_every_steps_multidevice = 10
eval_every_steps_multidevice = 500
eval_n_examples = 4
writer = SummaryWriter(f'/lego/storage/output/runs/{run_name}')
#checkpoint_every_steps = None #500 * 8 machines
checkpoint_every_steps = 4000 #20000 # TODO: move to use _multidevice too
resume_from_checkpoint = None
#resume_from_checkpoint = 'runs/Jun07_10-12-10/checkpoint_4000.pkl' # TODO: Confirm runs from checkpoints are still fully reproducible


# ML training params
key_training = random.PRNGKey(0) 
batch_size= 16 
gradient_accumulations_steps = 16 # TODO XXX: This means effective batch_size=256 instead of 512 used in the paper
num_steps_multidevice = 50000 #30000 #10000 #100000 # TODO XXX: think what it should be for GPT2
max_lr = 0.00025
warmup_steps_multidevice= 2000
betas = (0.9, 0.98) 
epsilon = 10e-9
grads, moments = init_adam_w(params)

# TODO XXX: remove below one
_, _, _, y_eval_mask, _, _, y_eval_indices  = next(get_batched_examples(ds, eval_n_examples, seq_len, START_TOK, END_TOK, "validation")) 
    
i = 0 
ds_train_rows_read = 0
if resume_from_checkpoint is not None:
    with open(resume_from_checkpoint,'rb') as f:
        i, ds_train_rows_read, params, moments, key_training = pickle.load(f)   
        print(f'Resuming training from the checkpoint: i {i} ds_train_rows_read {ds_train_rows_read}')

print(f'Number of params: {count_num_params(params):_}')

num_steps = num_steps_multidevice * gradient_accumulations_steps
while True:
    #for _, batch in tqdm(enumerate(itertools.islice(get_batched_examples(ds, batch_size, seq_len, START_TOK, END_TOK, skip_n_rows = ds_train_rows_read), num_steps)), initial=i, total=num_steps, smoothing=0):
    for _, batch in tqdm(enumerate(itertools.islice(get_batched_examples_packed(ds, batch_size, seq_len, START_TOK, END_TOK, pack_frac=0.75, skip_n_rows = ds_train_rows_read), num_steps)), initial=i, total=num_steps, smoothing=0):
        _, y, _, y_mask, _, _, y_indices = batch
        # Training step
        # TODO: introduce update func, which does grad_loss and adam, and then call/jit that function instead of calling/jitting two separate ones
        key_training, key_iter = random.split(key_training, 2)
        grads, (loss_val, acc, _) = acc_grad_loss(grads, params, jnp.array(y), jnp.array(y_mask), jnp.array(y_indices), key_iter)
        #grads, (loss_val, acc) = grad_loss(params, jnp.array(x), jnp.array(y), key_iter)

        # LR Scheduler
        #lr = max_lr # for SGD

        i_multidevice = i // gradient_accumulations_steps
        is_i_device_zero = i % gradient_accumulations_steps == 0

        # AIAYN:
        #lr = pow(EMB_DIM, -0.5) * min(pow((i_multidevice+1), -0.5), (i_multidevice+1) * pow(warmup_steps, -1.5))

        # GPT1:
        if i_multidevice < warmup_steps_multidevice:
            lr = (i_multidevice+1)/warmup_steps_multidevice * max_lr
        else:
            t_step = i_multidevice - warmup_steps_multidevice
            t_max = num_steps_multidevice - warmup_steps_multidevice
            lr = max_lr * (1 + math.cos(math.pi * t_step/t_max))/2

        #params = sgd(params, grads, lr)
        if i > 0 and i % gradient_accumulations_steps == 0:
            for grp_i in range(len(grads)):
                for p_i in range(len(grads[grp_i])):
                    grads[grp_i][p_i] =  grads[grp_i][p_i].at[:].divide(gradient_accumulations_steps)
            
            #params, moments = adam_w(params, grads, lr, betas, epsilon, moments, i)
            params, moments = adam_w_in_place(params, grads, lr, betas, epsilon, moments, i, weight_decay=0.01, weight_decay_mask=weight_decay_mask)
    
        # Logging:
        if i_multidevice%log_every_steps_multidevice==0 and is_i_device_zero:
            loss_val = loss_val.item()
            acc = acc.item()
            
            grad_norm = grads_l2norm(grads)
            grps_grad_norms = grads_grps_l2norms(grads)

            
            #print(f'iter #{i} loss {loss_val} acc {acc} lr {lr} grad_norm {grad_norm}')
            #print_mem_stats() # TODO: monitor it in tensorboard?
            writer.add_scalar('train/loss', loss_val, i_multidevice)
            writer.add_scalar('train/acc', acc, i_multidevice)
            writer.add_scalar('train/lr', lr, i_multidevice)
            writer.add_scalar('train/grad_norm', grad_norm, i_multidevice)
            for grp_i, grp_grad_norm in enumerate(grps_grad_norms):
                writer.add_scalar(f'train_details/grad_norm_grp_{grp_i}', grp_grad_norm, i_multidevice)

            # TODO: some metrics computed on x, other on y. Make it consistent
            #pad_tokens_prop = sum([y_row.count(0) for y_row in y]) / sum([len(y_row) for y_row in y])
            pad_tokens_prop = np.count_nonzero(y==0) / y.size
            writer.add_scalar('train_data/pad_tokens_prop', pad_tokens_prop, i_multidevice)
            writer.add_scalar('train_data/batch_size', len(y), i_multidevice)
            writer.add_scalar('train_data/batch_seq_len', len(y[0]), i_multidevice)
            writer.add_scalar('train_data/batch_total_tokens', len(y) * len(y[0]), i_multidevice)

        # Zeroed accumulated grads: we have to do it after computing grad norms
        if i > 0 and i % gradient_accumulations_steps == 0: 
            for grp_i in range(len(grads)):
                for p_i in range(len(grads[grp_i])):
                    grads[grp_i][p_i] =  grads[grp_i][p_i].at[:].set(0)
            
        # Evaluation
        if i_multidevice>0 and i_multidevice%eval_every_steps_multidevice==0 and is_i_device_zero:
            val_losses = []
            val_accs = []
            val_toks_props = []
            for eval_step, batch in enumerate(get_batched_examples(ds, batch_size, seq_len, START_TOK, END_TOK, split="validation")): 
                _, y, _, y_mask, _, _, y_indices = batch
                _, (loss_val, acc, toks_prop) = loss_eval(params, jnp.array(y), jnp.array(y_mask), jnp.array(y_indices))
                val_losses.append(loss_val)
                val_accs.append(acc)
                val_toks_props.append(toks_prop)
            writer.add_scalar('eval/loss', jnp.average(jnp.hstack(val_losses), weights = jnp.hstack(val_toks_props)).item(), i_multidevice)
            writer.add_scalar('eval/acc', jnp.average(jnp.hstack(val_accs), weights = jnp.hstack(val_toks_props)).item(), i_multidevice)
            
            # Few predictions TODO XXX: vary temperature -> diff samples
            y_sample = predict(params, jnp.array(y_eval_mask), jnp.array(y_eval_indices), seq_len, START_TOK, END_TOK)
            y_sample = tuple([item.tolist() for item in y_sample])
            def detokenize_y_in(y):
                y_out = y[:, 1:]
                y_out[y_out == END_TOK] = 0
                return detokenize(y_out)
            for detokenized_y_sample in detokenize(y_sample):
                print(f'PREDS: {detokenized_y_sample}\n')

            # Compute HellaSwag score
            print(f'Compute HellaSwag score')
            hellaswag_accs = [] # TODO XXX: enable seq_len be different for x vs y; 
            num_hellaswag_batches = 100 #TODO XXX:; run for the whole dataset
            for _, batch in tqdm(enumerate(itertools.islice(get_batched_examples(hellaswag_ds, batch_size, seq_len, START_TOK, END_TOK, split=None), num_hellaswag_batches))):
                choices_vals = []
                x, y, _, y_mask, _, _, y_indices = batch
                choices, labels = unpack_hellaswag_batched_x(x) 
                
                for choice in choices:
                    y, y_mask = concatenate_hellaswag_y_and_choice(y, choice, END_TOK) # no need to return new y_indices for now.
                    choice_log_probs = log_probs(params, jnp.array(y), jnp.array(y_mask), jnp.array(y_indices))
                    choices_vals.append(choice_log_probs)
                choices_vals = np.array(choices_vals).transpose() # we want choice per column
                hellaswag_accs.extend(np.argmax(choices_vals, axis=1)==labels)
                   
            hellaswag_acc = sum(hellaswag_accs)/len(hellaswag_accs)
            print(f'HellaSwag score:', hellaswag_acc)
            writer.add_scalar('eval/hellaswag', hellaswag_acc, i_multidevice)
                
        i = i + 1
        ds_train_rows_read = ds_train_rows_read + len(y)

        # Checkpointing (i, ds_train_rows_read, params, moments).
        # TODO XXX: I haven't used it for a while, and likely it's not working.. probably we can delete 
        if checkpoint_every_steps is not None and (i>0 and i%checkpoint_every_steps==0):
            import os
            training_state = (i, ds_train_rows_read, params, moments, key_training)
            filename = f'runs/{run_name}/checkpoint_{i}.pkl'
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'wb') as f:
                pickle.dump(training_state, f)
                
    ds_train_rows_read=0 # After each epoch, reset dataset pointer

writer.close()



Number of params: 112_614_958


  1%|          | 7999/800000 [55:40<91:51:50,  2.39it/s]

PREDS: the first of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united states of the united sta


0it [00:00, ?it/s][A
1it [00:02,  2.40s/it][A
2it [00:02,  1.33s/it][A
3it [00:03,  1.01it/s][A
4it [00:04,  1.21it/s][A
5it [00:04,  1.36it/s][A
6it [00:05,  1.47it/s][A
7it [00:05,  1.55it/s][A
8it [00:06,  1.61it/s][A
9it [00:07,  1.65it/s][A
10it [00:07,  1.67it/s][A
11it [00:08,  1.69it/s][A
12it [00:08,  1.71it/s][A
13it [00:09,  1.72it/s][A
14it [00:09,  1.73it/s][A
15it [00:10,  1.73it/s][A
16it [00:11,  1.74it/s][A
17it [00:11,  1.74it/s][A
18it [00:12,  1.74it/s][A
19it [00:12,  1.74it/s][A
20it [00:13,  1.74it/s][A
21it [00:13,  1.74it/s][A
22it [00:14,  1.74it/s][A
23it [00:15,  1.74it/s][A
24it [00:15,  1.74it/s][A
25it [00:16,  1.74it/s][A
26it [00:16,  1.74it/s][A
27it [00:17,  1.74it/s][A
28it [00:17,  1.74it/s][A
29it [00:18,  1.74it/s][A
30it [00:19,  1.74it/s][A
31it [00:19,  1.72it/s][A
32it [00:20,  1.72it/s][A
33it [00:20,  1.73it/s][A
34it [00:21,  1.73it/s][A
35it [00:21,  1.71it/s][A
36it [00:22,  1.72it/s][A
37it [00:23,  

HellaSwag score: 0.259375


  4%|▍         | 31477/800000 [3:45:04<91:35:21,  2.33it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 11%|█         | 87999/800000 [5:11:36<85:15:11,  2.32it/s]

PREDS: the first thing that you need to know about the history of the united states is that the united states was founded in 1788. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united states in 1789. the united states was founded in 1789 by the united s


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.77it/s][A
11it [00:06,  1.77it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.77it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.77it/s][A
24it [00:13,  1.77it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.77it/s][A
28it [00:15,  1.77it/s][A
29it [00:16,  1.77it/s][A
30it [00:16,  1.77it/s][A
31it [00:17,  1.77it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.77it/s][A
34it [00:19,  1.77it/s][A
35it [00:19,  1.77it/s][A
36it [00:20,  1.77it/s][A
37it [00:20,  

HellaSwag score: 0.259375


 11%|█         | 89252/800000 [5:22:36<85:38:02,  2.31it/s]
 12%|█▏        | 95045/800000 [39:55<80:57:57,  2.42it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 24%|██▍       | 191999/800000 [1:35:34<71:45:56,  2.35it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the existing product and is compatible with the ex


0it [00:00, ?it/s][A
1it [00:00,  1.75it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.75it/s][A
18it [00:10,  1.75it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.75it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 24%|██▍       | 192257/800000 [1:39:40<73:24:39,  2.30it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 41%|████      | 327999/800000 [1:50:02<55:25:45,  2.37it/s]

PREDS: the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing yo


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.69it/s][A
13it [00:07,  1.71it/s][A
14it [00:08,  1.73it/s][A
15it [00:08,  1.74it/s][A
16it [00:09,  1.75it/s][A
17it [00:09,  1.75it/s][A
18it [00:10,  1.75it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 41%|████      | 329191/800000 [2:00:35<56:17:46,  2.32it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 58%|█████▊    | 462551/800000 [1:57:04<40:25:12,  2.32it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 63%|██████▎   | 503999/800000 [1:32:59<34:59:04,  2.35it/s]

PREDS: the first step in the process of creating a new product is to create a new product. this is done by creating a new product that is not yet ready for market. the process of creating a new product is called a product design. a product design is a process that is used to create a new product that is not yet ready for market. a product design is a process that is used to create a new product that is not yet ready for market. a product design is a process that is used to create a new product that is not yet ready for market. a product design is a process that is used to create a new product that is not yet ready for market. a product design is a process that is used to create a new product that is not yet ready for market. a product design is a process that is used to create a new product that is not yet ready for market. a product design is a process that is used to create a new product that is not yet ready for market. a product design is a process that is used to create a new prod


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.72it/s][A
12it [00:06,  1.73it/s][A
13it [00:07,  1.75it/s][A
14it [00:07,  1.75it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.77it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.77it/s][A
23it [00:13,  1.77it/s][A
24it [00:13,  1.77it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.77it/s][A
28it [00:15,  1.77it/s][A
29it [00:16,  1.77it/s][A
30it [00:17,  1.77it/s][A
31it [00:17,  1.77it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.77it/s][A
34it [00:19,  1.77it/s][A
35it [00:19,  1.77it/s][A
36it [00:20,  1.77it/s][A
37it [00:20,  

HellaSwag score: 0.259375


 64%|██████▍   | 511999/800000 [2:30:35<34:14:07,  2.34it/s]

PREDS: the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing yo


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.77it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.77it/s][A
19it [00:10,  1.77it/s][A
20it [00:11,  1.77it/s][A
21it [00:11,  1.77it/s][A
22it [00:12,  1.77it/s][A
23it [00:13,  1.77it/s][A
24it [00:13,  1.77it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:20,  

HellaSwag score: 0.259375


 65%|██████▍   | 519999/800000 [3:28:05<33:21:19,  2.33it/s]

PREDS: the first thing that comes to mind when you think of a new technology is the internet. the internet is a great way to connect with people and places that are not only connected to the internet, but also to the world. the internet is a great way to connect with people and places that are not only connected to the internet, but also to the world. the internet is a great way to connect with people and places that are not only connected to the internet, but also to the world. the internet is a great way to connect with people and places that are not only connected to the internet, but also to the world. the internet is a great way to connect with people and places that are not only connected to the internet, but also to the world. the internet is a great way to connect with people and places that are not only connected to the internet, but also to the world. the internet is a great way to connect with people and places that are not only connected to the internet, but also to the wor


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.70it/s][A
13it [00:07,  1.72it/s][A
14it [00:08,  1.73it/s][A
15it [00:08,  1.74it/s][A
16it [00:09,  1.75it/s][A
17it [00:09,  1.75it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.77it/s][A
28it [00:15,  1.77it/s][A
29it [00:16,  1.77it/s][A
30it [00:17,  1.77it/s][A
31it [00:17,  1.77it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.77it/s][A
34it [00:19,  1.77it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.77it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 66%|██████▌   | 527999/800000 [4:25:39<32:27:02,  2.33it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is more than just a product or service. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a product that is designed to be used by a particular company or organization. the product is a pr


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.77it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.77it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.77it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:20,  

HellaSwag score: 0.259375


 67%|██████▋   | 535512/800000 [5:19:37<31:34:17,  2.33it/s]
 67%|██████▋   | 535999/800000 [03:32<32:03:29,  2.29it/s]

PREDS: the first step in the process of creating a new product is to create a new product. this is done by creating a new product that is not only a product but also a product that is not a product but also a product that is not a product or a product. the first step in creating a new product is to create a new product that is not a product but a product that is not a product or a product. the first step in creating a new product is to create a new product that is not a product but a product that is not a product or a product. the second step is to create a new product that is not a product but a product that is not a product or a product. the third step is to create a new product that is not a product but a product that is not a product or a product. the fourth step is to create a new product that is not a product but a product that is not a product or a product. the fourth step is to create a new product that is not a product or a product but a product that is not a product or a prod


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.71it/s][A
11it [00:06,  1.72it/s][A
12it [00:06,  1.74it/s][A
13it [00:07,  1.75it/s][A
14it [00:08,  1.75it/s][A
15it [00:08,  1.75it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 68%|██████▊   | 543999/800000 [1:01:08<30:44:15,  2.31it/s]

PREDS: the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing yo


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 69%|██████▉   | 551999/800000 [1:58:38<29:44:39,  2.32it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is not only good for the customer but also for the customer. the first step in creating a new product is to create a new product that is not only good for the customer but also for the customer. the first step in creating a new product is to create a new product that is not only good for the customer but also for the customer. the first step in creating a new product is to create a new product that is not only good for the customer but also for the customer. the first step in creating a new product is to create a new product that is not only good for the customer but also for the customer. the first step in creating a new product is to create a new product that is not only good for the customer but also for the customer. the first step in creating a new product is to create a new product that is not only good for the customer but also for the customer. the second step in creating a new produc


0it [00:00, ?it/s][A
1it [00:00,  1.75it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.71it/s][A
12it [00:06,  1.73it/s][A
13it [00:07,  1.74it/s][A
14it [00:07,  1.75it/s][A
15it [00:08,  1.75it/s][A
16it [00:09,  1.75it/s][A
17it [00:09,  1.75it/s][A
18it [00:10,  1.75it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 70%|██████▉   | 559999/800000 [2:56:13<28:47:09,  2.32it/s]

PREDS: the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing yo


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.75it/s][A
3it [00:01,  1.75it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.75it/s][A
6it [00:03,  1.75it/s][A
7it [00:03,  1.75it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.75it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.75it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.75it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.75it/s][A
22it [00:12,  1.75it/s][A
23it [00:13,  1.75it/s][A
24it [00:13,  1.75it/s][A
25it [00:14,  1.75it/s][A
26it [00:14,  1.75it/s][A
27it [00:15,  1.75it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.75it/s][A
30it [00:17,  1.75it/s][A
31it [00:17,  1.75it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.75it/s][A
34it [00:19,  1.75it/s][A
35it [00:19,  1.75it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 71%|███████   | 567999/800000 [3:53:43<27:49:05,  2.32it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to the consumer, but also to the consumer. this is done by creating a new product that is not only attractive to


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.70it/s][A
13it [00:07,  1.72it/s][A
14it [00:08,  1.73it/s][A
15it [00:08,  1.74it/s][A
16it [00:09,  1.74it/s][A
17it [00:09,  1.75it/s][A
18it [00:10,  1.75it/s][A
19it [00:10,  1.75it/s][A
20it [00:11,  1.75it/s][A
21it [00:12,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.75it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.75it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 72%|███████▏  | 575999/800000 [4:51:18<26:51:44,  2.32it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is not only good for the environment but also for the people who are involved in the process. the first step in creating a new product is to create a new product that is not only good for the environment but also for the people who are involved in the process. the first step in creating a new product is to create a new product that is not only good for the environment but also for the people who are involved in the process. the first step in creating a new product is to create a new product that is not only good for the environment but also for the people who are involved in the process. the second step is to create a new product that is not only good for the environment but also for the people who are involved in the process. the third step is to create a new product that is not only good for the people who are involved in the process. the third step is to create a new product that is not on


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 73%|███████▎  | 580138/800000 [5:22:08<26:27:08,  2.31it/s]
 73%|███████▎  | 583999/800000 [26:40<24:52:02,  2.41it/s]

PREDS: the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing you need to know about the first thing yo


0it [00:00, ?it/s][A
1it [00:00,  1.75it/s][A
2it [00:01,  1.75it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.70it/s][A
12it [00:06,  1.72it/s][A
13it [00:07,  1.73it/s][A
14it [00:08,  1.74it/s][A
15it [00:08,  1.74it/s][A
16it [00:09,  1.75it/s][A
17it [00:09,  1.75it/s][A
18it [00:10,  1.75it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:12,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.75it/s][A
32it [00:18,  1.75it/s][A
33it [00:18,  1.75it/s][A
34it [00:19,  1.75it/s][A
35it [00:19,  1.75it/s][A
36it [00:20,  1.75it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 74%|███████▍  | 591999/800000 [1:24:14<24:37:24,  2.35it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is not only good for the environment, but also good for the environment. the first step in creating a new product is to create a new product that is not only good for the environment, but also good for the environment. the first step in creating a new product is to create a new product that is not only good for the environment, but also good for the environment. the first step in creating a new product is to create a new product that is not only good for the environment, but also good for the environment. the second step is to create a new product that is not only good for the environment, but also good for the environment. the third step is to create a new product that is not only good for the environment, but also good for the environment. the third step is to create a new product that is not only good for the environment, but also good for the environment. the third step is to create a new


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.76it/s][A
3it [00:01,  1.76it/s][A
4it [00:02,  1.76it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.76it/s][A
8it [00:04,  1.76it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.75it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.76it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.76it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 74%|███████▍  | 595689/800000 [1:51:58<24:31:02,  2.31it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 88%|████████▊ | 703999/800000 [4:08:23<11:28:59,  2.32it/s]

PREDS: the first thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the first thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the second thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the third thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the fourth thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the fourth thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the fourth thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the fourth thing that comes to mind when you think of a new product is the fact that it is made from recycled materials. the fifth thing that comes t


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.77it/s][A
11it [00:06,  1.77it/s][A
12it [00:06,  1.77it/s][A
13it [00:07,  1.77it/s][A
14it [00:07,  1.77it/s][A
15it [00:08,  1.77it/s][A
16it [00:09,  1.77it/s][A
17it [00:09,  1.77it/s][A
18it [00:10,  1.77it/s][A
19it [00:10,  1.77it/s][A
20it [00:11,  1.77it/s][A
21it [00:11,  1.77it/s][A
22it [00:12,  1.77it/s][A
23it [00:13,  1.77it/s][A
24it [00:13,  1.77it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.77it/s][A
28it [00:15,  1.77it/s][A
29it [00:16,  1.77it/s][A
30it [00:16,  1.77it/s][A
31it [00:17,  1.77it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.77it/s][A
34it [00:19,  1.77it/s][A
35it [00:19,  1.77it/s][A
36it [00:20,  1.77it/s][A
37it [00:20,  

HellaSwag score: 0.259375


 89%|████████▉ | 711999/800000 [5:05:58<10:31:56,  2.32it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is not only good for the consumer, but also for the consumer. this is the first step in the process of creating a new product that is not only good for the consumer, but also for the consumer. the first step in creating a new product is to create a new product that is not only good for the consumer, but also for the consumer. this is the first step in the process of creating a new product that is not only good for the consumer, but also for the consumer. this is the second step in the process of creating a new product that is not only good for the consumer, but also for the consumer. this is the third step in the process of creating a new product that is not only good for the consumer, but also for the consumer. this is the third step in the process of creating a new product that is not only good for the consumer, but also for the consumer. this is the third step in the process of creating a 


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.76it/s][A
6it [00:03,  1.76it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.77it/s][A
11it [00:06,  1.77it/s][A
12it [00:06,  1.77it/s][A
13it [00:07,  1.77it/s][A
14it [00:07,  1.72it/s][A
15it [00:08,  1.72it/s][A
16it [00:09,  1.74it/s][A
17it [00:09,  1.74it/s][A
18it [00:10,  1.75it/s][A
19it [00:10,  1.75it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.77it/s][A
28it [00:15,  1.77it/s][A
29it [00:16,  1.77it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.77it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.77it/s][A
34it [00:19,  1.77it/s][A
35it [00:19,  1.77it/s][A
36it [00:20,  1.77it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 89%|████████▉ | 714016/800000 [5:22:08<10:20:42,  2.31it/s]
 90%|████████▉ | 719999/800000 [41:19<9:12:37,  2.41it/s] 

PREDS: the first step in the process of creating a new product is to create a new product that is not only a product but also a product that is not only a product but also a product that is not only a product but also a product that is not a product but also a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not a product but a product that is not


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.76it/s][A
10it [00:05,  1.76it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.77it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.77it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.76it/s][A
28it [00:15,  1.76it/s][A
29it [00:16,  1.76it/s][A
30it [00:17,  1.76it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.77it/s][A
34it [00:19,  1.77it/s][A
35it [00:19,  1.77it/s][A
36it [00:20,  1.77it/s][A
37it [00:20,  

HellaSwag score: 0.259375


 91%|█████████ | 727999/800000 [1:38:54<8:29:17,  2.36it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is not only aesthetically pleasing but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. the first step in creating a new product is to create a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by c


0it [00:00, ?it/s][A
1it [00:00,  1.76it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.77it/s][A
11it [00:06,  1.76it/s][A
12it [00:06,  1.77it/s][A
13it [00:07,  1.71it/s][A
14it [00:07,  1.73it/s][A
15it [00:08,  1.74it/s][A
16it [00:09,  1.75it/s][A
17it [00:09,  1.75it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.76it/s][A
26it [00:14,  1.77it/s][A
27it [00:15,  1.77it/s][A
28it [00:15,  1.77it/s][A
29it [00:16,  1.77it/s][A
30it [00:17,  1.77it/s][A
31it [00:17,  1.77it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.77it/s][A
34it [00:19,  1.77it/s][A
35it [00:19,  1.77it/s][A
36it [00:20,  1.77it/s][A
37it [00:21,  

HellaSwag score: 0.259375


 92%|█████████▏| 735999/800000 [2:36:29<7:35:37,  2.34it/s]

PREDS: the first step in the process of creating a new product is to create a new product that is not only aesthetically pleasing but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. the first step in creating a new product is to create a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by creating a new product that is not only aesthetically pleasing, but also aesthetically pleasing. this is done by c


0it [00:00, ?it/s][A
1it [00:00,  1.77it/s][A
2it [00:01,  1.77it/s][A
3it [00:01,  1.77it/s][A
4it [00:02,  1.77it/s][A
5it [00:02,  1.77it/s][A
6it [00:03,  1.77it/s][A
7it [00:03,  1.77it/s][A
8it [00:04,  1.77it/s][A
9it [00:05,  1.77it/s][A
10it [00:05,  1.77it/s][A
11it [00:06,  1.77it/s][A
12it [00:06,  1.76it/s][A
13it [00:07,  1.76it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.76it/s][A
16it [00:09,  1.76it/s][A
17it [00:09,  1.76it/s][A
18it [00:10,  1.76it/s][A
19it [00:10,  1.76it/s][A
20it [00:11,  1.76it/s][A
21it [00:11,  1.76it/s][A
22it [00:12,  1.76it/s][A
23it [00:13,  1.76it/s][A
24it [00:13,  1.76it/s][A
25it [00:14,  1.77it/s][A
26it [00:14,  1.76it/s][A
27it [00:15,  1.77it/s][A
28it [00:15,  1.77it/s][A
29it [00:16,  1.77it/s][A
30it [00:17,  1.77it/s][A
31it [00:17,  1.76it/s][A
32it [00:18,  1.77it/s][A
33it [00:18,  1.76it/s][A
34it [00:19,  1.76it/s][A
35it [00:19,  1.76it/s][A
36it [00:20,  1.77it/s][A
37it [00:20,  

HellaSwag score: 0.259375


 93%|█████████▎| 743405/800000 [3:29:46<6:43:58,  2.33it/s]

In [None]:
# FOR TESTING

# Compute HellaSwag score
import numpy as np

hellaswag_accs = []
from tqdm import tqdm
import itertools
for _, batch in tqdm(enumerate(itertools.islice(get_batched_examples(hellaswag_ds, 2, seq_len, START_TOK, END_TOK, split=None), 4))): 
#for _, batch in tqdm(enumerate(get_batched_examples(hellaswag_ds, 1, 400, START_TOK, END_TOK, split=None))):
    choices_vals = []
    x, y, _, y_mask, _, _, y_indices = batch
    choices, labels = unpack_hellaswag_batched_x(x)
    
    for choice in choices:
        y, y_mask = concatenate_hellaswag_y_and_choice(y, choice, END_TOK) # no need to return new y_indices for now.
        choice_log_probs = log_probs(params, jnp.array(y), jnp.array(y_mask), jnp.array(y_indices))
        choices_vals.append(choice_log_probs)
    choices_vals = np.array(choices_vals).transpose()
    hellaswag_accs.extend(np.argmax(choices_vals, axis=1)==labels)

#print("hellaswag_accs", hellaswag_accs)
hellaswag_acc = sum(hellaswag_accs)/len(hellaswag_accs)
print(hellaswag_acc)


In [None]:
### Final test predictions + BLEU computation
x_tokens_per_batch = 15000 #For variable batch len, we don't use it as we can fit less data (paper does 25k)

print(f'Few predictions for validation dataset')
y_sample = predict(params, jnp.array(x_eval), seq_len, START_TOK, END_TOK)
y_sample = tuple([item.tolist() for item in y_sample])
for detekonized_x_eval, detokenized_y_eval, detokenized_y_sample in zip(detokenize(x_eval), detokenize(y_eval), detokenize(y_sample)):
    print(f'X:{detekonized_x_eval}\tY: {detokenized_y_eval} \tPREDS: {detokenized_y_sample}\n')
    references.append(detokenized_y_eval)
    predictions.append(detokenized_y_sample)

print(f'Computing BLEU for validation dataset')
import evaluate
references = [] 
predictions = []
for _, (x, y) in tqdm(enumerate(get_batched_examples_per_length(ds, x_tokens_per_batch, split="validation"))):
    y_sample = predict(params, jnp.array(x), seq_len, START_TOK, END_TOK)
    y_sample = tuple([item.tolist() for item in y_sample])
    for detekonized_x_eval, detokenized_y_eval, detokenized_y_sample in zip(detokenize(x), detokenize(y), detokenize(y_sample)):
        references.append(detokenized_y_eval)
        predictions.append(detokenized_y_sample)

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)