In [1]:
from datasets import Dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import zstandard as zstd
import chess.pgn
import io
import re
from torch.utils.data import DataLoader
import math
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import stockfish
import chess
import chess.engine

  from .autonotebook import tqdm as notebook_tqdm


# READ DATA

note to reader: this is where you can determine data size by choosing N when running read_first_n_games_as_strings

In [2]:
def format_pgn(text):

    # Remove metadata lines (anything within square brackets)
    text = re.sub(r'\[.*?\]', '', text)

    # Remove the result of the game (win, loss, or draw)
    text = re.sub(r'\s*(1-0|0-1|1/2-1/2)\s*', '', text)  # Remove game result

    # Remove excessive spaces and newlines
    text = ' '.join(text.split())

    # Remove `{ ... }` (comments or analysis)
    text = re.sub(r'\{.*?\}', '', text)

    # Normalize spaces 
    text = ' '.join(text.split())  

    # Define patterns for different components
    move_number_pattern = re.compile(r'(\d+\.)')  # Move numbers (e.g., "1.")
    piece_pattern = re.compile(r'([KQRBN])')  # Chess pieces (e.g., "N", "K")
    square_pattern = re.compile(r'([a-h][1-8])')  # Board squares (e4, d5, etc.)
    special_move_pattern = re.compile(r'(O-O|O-O-O|\+|#|x|=Q|=R|=B|=N)')  # Castling, check, capture, promotions
    
    # Ensure move numbers, pieces, and special moves are space-separated
    text = move_number_pattern.sub(r'\1 ', text)  # Move number spacing
    text = piece_pattern.sub(r'\1 ', text)  # Piece spacing
    text = special_move_pattern.sub(r' \1 ', text)  # Special moves spacing

    text += ' <EOS>'
    
    return ' '.join(text.split()) 


def read_first_n_games_as_strings(file_path, n=100):
    with open(file_path, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        decompressed = dctx.stream_reader(f)
        pgn_text = io.TextIOWrapper(decompressed, encoding='utf-8')

        games = []
        for _ in range(n):
            game = chess.pgn.read_game(pgn_text)
            if game is None:
                break  # Stop if no more games are available
            
            # Convert game to string
            game_str = io.StringIO()
            game.accept(chess.pgn.StringExporter())  # Corrected line
            games.append(format_pgn(str(game)))

    return games


# PREP DATA FOR TOKENIZER

In [None]:
# Replace with your file path
file_path = "INSERT_PATH_HERE"
# Replace with number of games
number_of_games = 100000
games_as_strings = read_first_n_games_as_strings(file_path, n=number_of_games)

# Print the first game's PGN as a string
print(games_as_strings[0])

# Convert to Hugging Face Dataset format
dataset = Dataset.from_dict({"text": games_as_strings})

train_data, val_data = train_test_split(dataset["text"], test_size=0.2, random_state=42)

train_dataset = Dataset.from_dict({"text": train_data})
val_dataset = Dataset.from_dict({"text": val_data})

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})


1. d4 1. .. N f6 2. f3 2. .. d5 3. c4 3. .. c6 4. e3 4. .. g6 5. N c3 5. .. B g7 6. B d3 6. .. O-O 7. N ge2 7. .. N a6 8. a3 8. .. N c7 9. b4 9. .. R e8 10. O-O 10. .. B d7 11. c5 11. .. e5 12. h3 12. .. e x d4 13. N x d4 13. .. N e6 14. R b1 14. .. N x d4 15. e x d4 15. .. N h5 16. N e2 16. .. Q f6 17. B c2 17. .. B f5 18. B x f5 18. .. Q x f5 19. B b2 19. .. N f4 20. N x f4 20. .. Q x f4 21. Q c2 21. .. R e3 22. B c1 22. .. B x d4 23. B x e3 23. .. B x e3 + 24. K h1 24. .. h5 25. Q e2 25. .. d4 26. R be1 26. .. R e8 27. Q d3 27. .. R e6 28. R e2 28. .. Q g3 29. R x e3 29. .. d x e3 30. Q d8 + 30. .. K g7 31. Q d3 31. .. e2 32. R g1 32. .. e1 =Q 33. R x e1 33. .. R x e1 + <EOS>


# TRAIN TOKENIZER

note to reader: if experimenting with model size, adjust tokenizer name accordingly below

In [5]:
# note to reader: if experimenting with model size, adjust tokenizer name accordingly below
tokenizer_name = "gpt2"
old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Define special tokens
special_tokens = {
    "eos_token": "<EOS>",
    "pad_token": "[PAD]"
}

# Resize the tokenizer with the new EOS token
old_tokenizer.add_special_tokens(special_tokens)

# Train a new tokenizer using the updated old_tokenizer
tokenizer = old_tokenizer.train_new_from_iterator(dataset['train'], 120)






# TOKENIZE DATA

In [6]:
# Tokenize Function
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=128,
        return_special_tokens_mask=True
    )


# Tokenize Dataset
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

train_subset = tokenized_datasets["train"]
eval_subset = tokenized_datasets["validation"]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 80000/80000 [00:15<00:00, 5148.39 examples/s]
Map: 100%|██████████| 20000/20000 [00:03<00:00, 5152.78 examples/s]


# SET UP AND CONFIGURE MODEL

In [None]:
# Create GPT model configuration
# note to reader: you can modify configuration to see how it affects chess performance
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,  # Match tokenizer's vocab size
    n_positions=128,
    n_embd=768,
    n_layer=12,
    n_head=12
)

# Initialize model
model = GPT2LMHeadModel(config)


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False 
)

training_args = TrainingArguments(
    output_dir="./gpt-chess",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    warmup_steps=500,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()



  trainer = Trainer(
  0%|          | 10/30000 [00:03<2:42:54,  3.07it/s]

{'loss': 5.3283, 'grad_norm': 70.04059600830078, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 20/30000 [00:06<2:39:13,  3.14it/s]

{'loss': 3.6905, 'grad_norm': 16.219356536865234, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 30/30000 [00:09<2:39:28,  3.13it/s]

{'loss': 2.8198, 'grad_norm': 11.553008079528809, 'learning_rate': 3e-06, 'epoch': 0.0}


  0%|          | 40/30000 [00:13<2:40:35,  3.11it/s]

{'loss': 2.2878, 'grad_norm': 6.791100025177002, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


  0%|          | 50/30000 [00:16<2:42:19,  3.08it/s]

{'loss': 1.9267, 'grad_norm': 6.1818156242370605, 'learning_rate': 5e-06, 'epoch': 0.01}


  0%|          | 60/30000 [00:19<2:40:07,  3.12it/s]

{'loss': 1.6298, 'grad_norm': 9.23494815826416, 'learning_rate': 6e-06, 'epoch': 0.01}


  0%|          | 70/30000 [00:22<2:38:31,  3.15it/s]

{'loss': 1.5016, 'grad_norm': 22.694639205932617, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}


  0%|          | 80/30000 [00:26<2:41:40,  3.08it/s]

{'loss': 1.4019, 'grad_norm': 19.0913028717041, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


  0%|          | 90/30000 [00:29<2:38:07,  3.15it/s]

{'loss': 1.3351, 'grad_norm': 14.423232078552246, 'learning_rate': 9e-06, 'epoch': 0.01}


  0%|          | 100/30000 [00:32<2:37:27,  3.16it/s]

{'loss': 1.2299, 'grad_norm': 7.854684829711914, 'learning_rate': 1e-05, 'epoch': 0.01}


  0%|          | 110/30000 [00:35<2:37:23,  3.17it/s]

{'loss': 1.1814, 'grad_norm': 12.353670120239258, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


  0%|          | 120/30000 [00:38<2:37:40,  3.16it/s]

{'loss': 1.1145, 'grad_norm': 11.964996337890625, 'learning_rate': 1.2e-05, 'epoch': 0.01}


  0%|          | 130/30000 [00:41<2:38:22,  3.14it/s]

{'loss': 1.0974, 'grad_norm': 5.788852214813232, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


  0%|          | 140/30000 [00:45<2:36:10,  3.19it/s]

{'loss': 1.0792, 'grad_norm': 6.248486518859863, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.01}


  0%|          | 150/30000 [00:48<2:36:06,  3.19it/s]

{'loss': 1.05, 'grad_norm': 7.796597480773926, 'learning_rate': 1.5e-05, 'epoch': 0.01}


  1%|          | 160/30000 [00:51<2:35:43,  3.19it/s]

{'loss': 1.0354, 'grad_norm': 6.947649955749512, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}


  1%|          | 170/30000 [00:54<2:35:44,  3.19it/s]

{'loss': 0.9953, 'grad_norm': 9.90211296081543, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.02}


  1%|          | 180/30000 [00:57<2:35:30,  3.20it/s]

{'loss': 1.0169, 'grad_norm': 5.466711521148682, 'learning_rate': 1.8e-05, 'epoch': 0.02}


  1%|          | 190/30000 [01:00<2:36:14,  3.18it/s]

{'loss': 1.0071, 'grad_norm': 6.516955852508545, 'learning_rate': 1.9e-05, 'epoch': 0.02}


  1%|          | 200/30000 [01:03<2:35:34,  3.19it/s]

{'loss': 0.9654, 'grad_norm': 6.780934810638428, 'learning_rate': 2e-05, 'epoch': 0.02}


  1%|          | 210/30000 [01:07<2:36:09,  3.18it/s]

{'loss': 0.9805, 'grad_norm': 8.059958457946777, 'learning_rate': 2.1e-05, 'epoch': 0.02}


  1%|          | 220/30000 [01:10<2:35:43,  3.19it/s]

{'loss': 0.9435, 'grad_norm': 7.758155345916748, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.02}


  1%|          | 230/30000 [01:13<2:35:26,  3.19it/s]

{'loss': 0.9528, 'grad_norm': 6.256776332855225, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.02}


  1%|          | 240/30000 [01:16<2:35:44,  3.18it/s]

{'loss': 0.8939, 'grad_norm': 6.576498508453369, 'learning_rate': 2.4e-05, 'epoch': 0.02}


  1%|          | 250/30000 [01:19<2:35:33,  3.19it/s]

{'loss': 0.9489, 'grad_norm': 4.810886383056641, 'learning_rate': 2.5e-05, 'epoch': 0.03}


  1%|          | 260/30000 [01:22<2:35:33,  3.19it/s]

{'loss': 0.8888, 'grad_norm': 6.324295520782471, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.03}


  1%|          | 270/30000 [01:25<2:36:05,  3.17it/s]

{'loss': 0.9002, 'grad_norm': 5.260014533996582, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.03}


  1%|          | 280/30000 [01:29<2:36:04,  3.17it/s]

{'loss': 0.8606, 'grad_norm': 4.550217628479004, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.03}


  1%|          | 290/30000 [01:32<2:38:15,  3.13it/s]

{'loss': 0.8836, 'grad_norm': 4.073721408843994, 'learning_rate': 2.9e-05, 'epoch': 0.03}


  1%|          | 300/30000 [01:35<2:34:57,  3.19it/s]

{'loss': 0.8642, 'grad_norm': 4.147672176361084, 'learning_rate': 3e-05, 'epoch': 0.03}


  1%|          | 310/30000 [01:38<2:34:47,  3.20it/s]

{'loss': 0.8252, 'grad_norm': 4.560773849487305, 'learning_rate': 3.1e-05, 'epoch': 0.03}


  1%|          | 320/30000 [01:41<2:35:24,  3.18it/s]

{'loss': 0.849, 'grad_norm': 6.568504333496094, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.03}


  1%|          | 330/30000 [01:44<2:35:22,  3.18it/s]

{'loss': 0.8299, 'grad_norm': 5.5851054191589355, 'learning_rate': 3.3e-05, 'epoch': 0.03}


  1%|          | 340/30000 [01:47<2:35:49,  3.17it/s]

{'loss': 0.8159, 'grad_norm': 6.477124214172363, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.03}


  1%|          | 350/30000 [01:51<2:35:28,  3.18it/s]

{'loss': 0.7786, 'grad_norm': 5.009265422821045, 'learning_rate': 3.5e-05, 'epoch': 0.04}


  1%|          | 360/30000 [01:54<2:34:57,  3.19it/s]

{'loss': 0.8063, 'grad_norm': 6.9077229499816895, 'learning_rate': 3.6e-05, 'epoch': 0.04}


  1%|          | 370/30000 [01:57<2:34:56,  3.19it/s]

{'loss': 0.7785, 'grad_norm': 5.779118537902832, 'learning_rate': 3.7e-05, 'epoch': 0.04}


  1%|▏         | 380/30000 [02:00<2:34:34,  3.19it/s]

{'loss': 0.7583, 'grad_norm': 5.0579118728637695, 'learning_rate': 3.8e-05, 'epoch': 0.04}


  1%|▏         | 390/30000 [02:03<2:34:22,  3.20it/s]

{'loss': 0.7825, 'grad_norm': 6.0821356773376465, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.04}


  1%|▏         | 400/30000 [02:06<2:34:28,  3.19it/s]

{'loss': 0.7162, 'grad_norm': 4.717190265655518, 'learning_rate': 4e-05, 'epoch': 0.04}


  1%|▏         | 410/30000 [02:09<2:34:22,  3.19it/s]

{'loss': 0.6987, 'grad_norm': 3.6956446170806885, 'learning_rate': 4.1e-05, 'epoch': 0.04}


  1%|▏         | 420/30000 [02:13<2:34:13,  3.20it/s]

{'loss': 0.6762, 'grad_norm': 4.208924293518066, 'learning_rate': 4.2e-05, 'epoch': 0.04}


  1%|▏         | 430/30000 [02:16<2:34:16,  3.19it/s]

{'loss': 0.6518, 'grad_norm': 5.724032878875732, 'learning_rate': 4.3e-05, 'epoch': 0.04}


  1%|▏         | 440/30000 [02:19<2:34:26,  3.19it/s]

{'loss': 0.6727, 'grad_norm': 4.345970630645752, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.04}


  2%|▏         | 450/30000 [02:22<2:34:22,  3.19it/s]

{'loss': 0.6158, 'grad_norm': 4.983090877532959, 'learning_rate': 4.5e-05, 'epoch': 0.04}


  2%|▏         | 460/30000 [02:25<2:34:13,  3.19it/s]

{'loss': 0.6071, 'grad_norm': 5.513019561767578, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.05}


  2%|▏         | 470/30000 [02:28<2:34:30,  3.19it/s]

{'loss': 0.6248, 'grad_norm': 3.0954699516296387, 'learning_rate': 4.7e-05, 'epoch': 0.05}


  2%|▏         | 480/30000 [02:31<2:34:07,  3.19it/s]

{'loss': 0.5778, 'grad_norm': 3.5344398021698, 'learning_rate': 4.8e-05, 'epoch': 0.05}


  2%|▏         | 490/30000 [02:34<2:35:16,  3.17it/s]

{'loss': 0.5738, 'grad_norm': 2.6311066150665283, 'learning_rate': 4.9e-05, 'epoch': 0.05}


  2%|▏         | 500/30000 [02:38<2:34:12,  3.19it/s]

{'loss': 0.5681, 'grad_norm': 3.4116475582122803, 'learning_rate': 5e-05, 'epoch': 0.05}


  2%|▏         | 510/30000 [02:41<2:34:13,  3.19it/s]

{'loss': 0.5472, 'grad_norm': 3.5303609371185303, 'learning_rate': 4.998305084745763e-05, 'epoch': 0.05}


  2%|▏         | 520/30000 [02:44<2:33:59,  3.19it/s]

{'loss': 0.5302, 'grad_norm': 2.6192336082458496, 'learning_rate': 4.9966101694915254e-05, 'epoch': 0.05}


  2%|▏         | 530/30000 [02:47<2:34:18,  3.18it/s]

{'loss': 0.553, 'grad_norm': 2.5639052391052246, 'learning_rate': 4.9949152542372884e-05, 'epoch': 0.05}


  2%|▏         | 540/30000 [02:50<2:35:53,  3.15it/s]

{'loss': 0.5208, 'grad_norm': 2.99933123588562, 'learning_rate': 4.993220338983051e-05, 'epoch': 0.05}


  2%|▏         | 550/30000 [02:53<2:37:38,  3.11it/s]

{'loss': 0.5481, 'grad_norm': 2.5154922008514404, 'learning_rate': 4.991525423728814e-05, 'epoch': 0.06}


  2%|▏         | 560/30000 [02:57<2:36:16,  3.14it/s]

{'loss': 0.5202, 'grad_norm': 2.827075958251953, 'learning_rate': 4.9898305084745765e-05, 'epoch': 0.06}


  2%|▏         | 570/30000 [03:00<2:35:45,  3.15it/s]

{'loss': 0.4942, 'grad_norm': 3.108485221862793, 'learning_rate': 4.9881355932203394e-05, 'epoch': 0.06}


  2%|▏         | 580/30000 [03:03<2:34:57,  3.16it/s]

{'loss': 0.4882, 'grad_norm': 2.1722939014434814, 'learning_rate': 4.9864406779661024e-05, 'epoch': 0.06}


  2%|▏         | 590/30000 [03:06<2:34:40,  3.17it/s]

{'loss': 0.4649, 'grad_norm': 2.330479383468628, 'learning_rate': 4.9847457627118646e-05, 'epoch': 0.06}


  2%|▏         | 600/30000 [03:09<2:34:43,  3.17it/s]

{'loss': 0.4944, 'grad_norm': 2.8331964015960693, 'learning_rate': 4.9830508474576276e-05, 'epoch': 0.06}


  2%|▏         | 610/30000 [03:12<2:34:41,  3.17it/s]

{'loss': 0.501, 'grad_norm': 2.278212547302246, 'learning_rate': 4.98135593220339e-05, 'epoch': 0.06}


  2%|▏         | 620/30000 [03:16<2:34:46,  3.16it/s]

{'loss': 0.4936, 'grad_norm': 1.9440912008285522, 'learning_rate': 4.979661016949153e-05, 'epoch': 0.06}


  2%|▏         | 630/30000 [03:19<2:34:49,  3.16it/s]

{'loss': 0.4748, 'grad_norm': 1.5835989713668823, 'learning_rate': 4.977966101694915e-05, 'epoch': 0.06}


  2%|▏         | 640/30000 [03:22<2:36:22,  3.13it/s]

{'loss': 0.4755, 'grad_norm': 2.199092149734497, 'learning_rate': 4.976271186440678e-05, 'epoch': 0.06}


  2%|▏         | 650/30000 [03:25<2:36:12,  3.13it/s]

{'loss': 0.4975, 'grad_norm': 2.3522839546203613, 'learning_rate': 4.974576271186441e-05, 'epoch': 0.07}


  2%|▏         | 660/30000 [03:28<2:34:31,  3.16it/s]

{'loss': 0.5024, 'grad_norm': 1.8271058797836304, 'learning_rate': 4.972881355932204e-05, 'epoch': 0.07}


  2%|▏         | 670/30000 [03:32<2:35:11,  3.15it/s]

{'loss': 0.4792, 'grad_norm': 2.004955291748047, 'learning_rate': 4.971186440677966e-05, 'epoch': 0.07}


  2%|▏         | 680/30000 [03:35<2:33:45,  3.18it/s]

{'loss': 0.4872, 'grad_norm': 2.0533807277679443, 'learning_rate': 4.969491525423729e-05, 'epoch': 0.07}


  2%|▏         | 690/30000 [03:38<2:33:43,  3.18it/s]

{'loss': 0.4666, 'grad_norm': 1.6132811307907104, 'learning_rate': 4.967796610169492e-05, 'epoch': 0.07}


  2%|▏         | 700/30000 [03:41<2:33:24,  3.18it/s]

{'loss': 0.4511, 'grad_norm': 1.4302833080291748, 'learning_rate': 4.966101694915254e-05, 'epoch': 0.07}


  2%|▏         | 710/30000 [03:44<2:32:47,  3.20it/s]

{'loss': 0.4789, 'grad_norm': 2.0715994834899902, 'learning_rate': 4.964406779661017e-05, 'epoch': 0.07}


  2%|▏         | 720/30000 [03:47<2:34:26,  3.16it/s]

{'loss': 0.4645, 'grad_norm': 2.135833740234375, 'learning_rate': 4.96271186440678e-05, 'epoch': 0.07}


  2%|▏         | 730/30000 [03:51<2:35:54,  3.13it/s]

{'loss': 0.4534, 'grad_norm': 1.2334096431732178, 'learning_rate': 4.961016949152543e-05, 'epoch': 0.07}


  2%|▏         | 740/30000 [03:54<2:34:42,  3.15it/s]

{'loss': 0.4592, 'grad_norm': 1.587831735610962, 'learning_rate': 4.959322033898305e-05, 'epoch': 0.07}


  2%|▎         | 750/30000 [03:57<2:35:05,  3.14it/s]

{'loss': 0.4608, 'grad_norm': 1.1513001918792725, 'learning_rate': 4.957627118644068e-05, 'epoch': 0.07}


  3%|▎         | 760/30000 [04:00<2:33:02,  3.18it/s]

{'loss': 0.4519, 'grad_norm': 1.4829697608947754, 'learning_rate': 4.955932203389831e-05, 'epoch': 0.08}


  3%|▎         | 770/30000 [04:03<2:34:42,  3.15it/s]

{'loss': 0.4316, 'grad_norm': 1.248335599899292, 'learning_rate': 4.9542372881355934e-05, 'epoch': 0.08}


  3%|▎         | 780/30000 [04:06<2:40:41,  3.03it/s]

{'loss': 0.4585, 'grad_norm': 1.6668416261672974, 'learning_rate': 4.952542372881356e-05, 'epoch': 0.08}


  3%|▎         | 790/30000 [04:10<2:36:21,  3.11it/s]

{'loss': 0.4406, 'grad_norm': 1.626022219657898, 'learning_rate': 4.950847457627119e-05, 'epoch': 0.08}


  3%|▎         | 800/30000 [04:13<2:37:07,  3.10it/s]

{'loss': 0.4307, 'grad_norm': 1.7203913927078247, 'learning_rate': 4.9491525423728815e-05, 'epoch': 0.08}


  3%|▎         | 810/30000 [04:16<2:33:11,  3.18it/s]

{'loss': 0.4508, 'grad_norm': 1.2867565155029297, 'learning_rate': 4.9474576271186444e-05, 'epoch': 0.08}


  3%|▎         | 820/30000 [04:19<2:33:25,  3.17it/s]

{'loss': 0.4407, 'grad_norm': 1.1592384576797485, 'learning_rate': 4.945762711864407e-05, 'epoch': 0.08}


  3%|▎         | 830/30000 [04:22<2:34:00,  3.16it/s]

{'loss': 0.4492, 'grad_norm': 2.067692756652832, 'learning_rate': 4.9440677966101696e-05, 'epoch': 0.08}


  3%|▎         | 840/30000 [04:25<2:33:42,  3.16it/s]

{'loss': 0.4428, 'grad_norm': 1.4824312925338745, 'learning_rate': 4.9423728813559326e-05, 'epoch': 0.08}


  3%|▎         | 850/30000 [04:29<2:33:19,  3.17it/s]

{'loss': 0.4337, 'grad_norm': 1.395923137664795, 'learning_rate': 4.940677966101695e-05, 'epoch': 0.09}


  3%|▎         | 860/30000 [04:32<2:33:33,  3.16it/s]

{'loss': 0.4424, 'grad_norm': 1.1640069484710693, 'learning_rate': 4.938983050847458e-05, 'epoch': 0.09}


  3%|▎         | 870/30000 [04:35<2:33:09,  3.17it/s]

{'loss': 0.4388, 'grad_norm': 1.3930507898330688, 'learning_rate': 4.937288135593221e-05, 'epoch': 0.09}


  3%|▎         | 880/30000 [04:38<2:32:56,  3.17it/s]

{'loss': 0.4313, 'grad_norm': 1.0758049488067627, 'learning_rate': 4.935593220338983e-05, 'epoch': 0.09}


  3%|▎         | 890/30000 [04:41<2:32:54,  3.17it/s]

{'loss': 0.4329, 'grad_norm': 1.6339757442474365, 'learning_rate': 4.933898305084746e-05, 'epoch': 0.09}


  3%|▎         | 900/30000 [04:44<2:33:26,  3.16it/s]

{'loss': 0.4151, 'grad_norm': 1.198546290397644, 'learning_rate': 4.932203389830509e-05, 'epoch': 0.09}


  3%|▎         | 910/30000 [04:48<2:32:49,  3.17it/s]

{'loss': 0.425, 'grad_norm': 1.5617865324020386, 'learning_rate': 4.930508474576271e-05, 'epoch': 0.09}


  3%|▎         | 920/30000 [04:51<2:32:35,  3.18it/s]

{'loss': 0.4283, 'grad_norm': 1.4114736318588257, 'learning_rate': 4.928813559322034e-05, 'epoch': 0.09}


  3%|▎         | 930/30000 [04:54<2:35:58,  3.11it/s]

{'loss': 0.4246, 'grad_norm': 1.3082576990127563, 'learning_rate': 4.927118644067797e-05, 'epoch': 0.09}


  3%|▎         | 940/30000 [04:57<2:32:40,  3.17it/s]

{'loss': 0.4273, 'grad_norm': 1.312103033065796, 'learning_rate': 4.92542372881356e-05, 'epoch': 0.09}


  3%|▎         | 950/30000 [05:00<2:32:26,  3.18it/s]

{'loss': 0.4224, 'grad_norm': 1.0328887701034546, 'learning_rate': 4.923728813559322e-05, 'epoch': 0.1}


  3%|▎         | 960/30000 [05:03<2:32:30,  3.17it/s]

{'loss': 0.4421, 'grad_norm': 1.1844892501831055, 'learning_rate': 4.922033898305085e-05, 'epoch': 0.1}


  3%|▎         | 970/30000 [05:06<2:32:01,  3.18it/s]

{'loss': 0.4265, 'grad_norm': 1.8886964321136475, 'learning_rate': 4.920338983050848e-05, 'epoch': 0.1}


  3%|▎         | 980/30000 [05:10<2:32:23,  3.17it/s]

{'loss': 0.4255, 'grad_norm': 1.1740758419036865, 'learning_rate': 4.91864406779661e-05, 'epoch': 0.1}


  3%|▎         | 990/30000 [05:13<2:31:57,  3.18it/s]

{'loss': 0.4298, 'grad_norm': 1.347285509109497, 'learning_rate': 4.916949152542373e-05, 'epoch': 0.1}


  3%|▎         | 1000/30000 [05:16<2:32:30,  3.17it/s]

{'loss': 0.4229, 'grad_norm': 1.5065795183181763, 'learning_rate': 4.915254237288136e-05, 'epoch': 0.1}


  3%|▎         | 1010/30000 [05:19<2:31:58,  3.18it/s]

{'loss': 0.4256, 'grad_norm': 1.353600263595581, 'learning_rate': 4.913559322033899e-05, 'epoch': 0.1}


  3%|▎         | 1020/30000 [05:22<2:32:03,  3.18it/s]

{'loss': 0.4099, 'grad_norm': 1.5258965492248535, 'learning_rate': 4.9118644067796607e-05, 'epoch': 0.1}


  3%|▎         | 1030/30000 [05:25<2:32:17,  3.17it/s]

{'loss': 0.4042, 'grad_norm': 1.3298282623291016, 'learning_rate': 4.9101694915254236e-05, 'epoch': 0.1}


  3%|▎         | 1040/30000 [05:29<2:32:11,  3.17it/s]

{'loss': 0.4055, 'grad_norm': 1.2595540285110474, 'learning_rate': 4.9084745762711865e-05, 'epoch': 0.1}


  4%|▎         | 1050/30000 [05:32<2:31:56,  3.18it/s]

{'loss': 0.4213, 'grad_norm': 1.2365326881408691, 'learning_rate': 4.9067796610169495e-05, 'epoch': 0.1}


  4%|▎         | 1060/30000 [05:35<2:31:52,  3.18it/s]

{'loss': 0.4012, 'grad_norm': 1.6509685516357422, 'learning_rate': 4.905084745762712e-05, 'epoch': 0.11}


  4%|▎         | 1070/30000 [05:38<2:31:34,  3.18it/s]

{'loss': 0.4169, 'grad_norm': 1.0216946601867676, 'learning_rate': 4.9033898305084746e-05, 'epoch': 0.11}


  4%|▎         | 1080/30000 [05:41<2:31:44,  3.18it/s]

{'loss': 0.3983, 'grad_norm': 1.2977769374847412, 'learning_rate': 4.9016949152542376e-05, 'epoch': 0.11}


  4%|▎         | 1090/30000 [05:44<2:32:13,  3.17it/s]

{'loss': 0.3966, 'grad_norm': 1.0821563005447388, 'learning_rate': 4.9e-05, 'epoch': 0.11}


  4%|▎         | 1100/30000 [05:47<2:31:46,  3.17it/s]

{'loss': 0.4113, 'grad_norm': 1.285447120666504, 'learning_rate': 4.898305084745763e-05, 'epoch': 0.11}


  4%|▎         | 1110/30000 [05:51<2:31:32,  3.18it/s]

{'loss': 0.4034, 'grad_norm': 1.2509238719940186, 'learning_rate': 4.896610169491526e-05, 'epoch': 0.11}


  4%|▎         | 1120/30000 [05:54<2:31:38,  3.17it/s]

{'loss': 0.3952, 'grad_norm': 1.2434766292572021, 'learning_rate': 4.8949152542372886e-05, 'epoch': 0.11}


  4%|▍         | 1130/30000 [05:57<2:31:19,  3.18it/s]

{'loss': 0.3976, 'grad_norm': 1.0600134134292603, 'learning_rate': 4.893220338983051e-05, 'epoch': 0.11}


  4%|▍         | 1140/30000 [06:00<2:31:07,  3.18it/s]

{'loss': 0.4416, 'grad_norm': 1.2557756900787354, 'learning_rate': 4.891525423728814e-05, 'epoch': 0.11}


  4%|▍         | 1150/30000 [06:03<2:32:02,  3.16it/s]

{'loss': 0.4309, 'grad_norm': 0.9953380823135376, 'learning_rate': 4.889830508474577e-05, 'epoch': 0.12}


  4%|▍         | 1160/30000 [06:06<2:32:16,  3.16it/s]

{'loss': 0.4034, 'grad_norm': 1.4899879693984985, 'learning_rate': 4.888135593220339e-05, 'epoch': 0.12}


  4%|▍         | 1170/30000 [06:10<2:40:17,  3.00it/s]

{'loss': 0.4134, 'grad_norm': 1.086102843284607, 'learning_rate': 4.886440677966102e-05, 'epoch': 0.12}


  4%|▍         | 1180/30000 [06:13<2:32:11,  3.16it/s]

{'loss': 0.3858, 'grad_norm': 1.173025369644165, 'learning_rate': 4.884745762711865e-05, 'epoch': 0.12}


  4%|▍         | 1190/30000 [06:16<2:37:15,  3.05it/s]

{'loss': 0.4071, 'grad_norm': 1.2425107955932617, 'learning_rate': 4.883050847457628e-05, 'epoch': 0.12}


  4%|▍         | 1200/30000 [06:19<2:35:27,  3.09it/s]

{'loss': 0.3799, 'grad_norm': 0.9419765472412109, 'learning_rate': 4.88135593220339e-05, 'epoch': 0.12}


  4%|▍         | 1210/30000 [06:23<2:35:53,  3.08it/s]

{'loss': 0.3959, 'grad_norm': 1.2378581762313843, 'learning_rate': 4.879661016949153e-05, 'epoch': 0.12}


  4%|▍         | 1220/30000 [06:26<2:32:15,  3.15it/s]

{'loss': 0.4298, 'grad_norm': 0.8184176683425903, 'learning_rate': 4.877966101694916e-05, 'epoch': 0.12}


  4%|▍         | 1230/30000 [06:29<2:38:12,  3.03it/s]

{'loss': 0.3787, 'grad_norm': 0.8219221234321594, 'learning_rate': 4.876271186440678e-05, 'epoch': 0.12}


  4%|▍         | 1240/30000 [06:32<2:35:41,  3.08it/s]

{'loss': 0.4036, 'grad_norm': 1.5246723890304565, 'learning_rate': 4.8745762711864405e-05, 'epoch': 0.12}


  4%|▍         | 1250/30000 [06:35<2:32:03,  3.15it/s]

{'loss': 0.3866, 'grad_norm': 1.1189311742782593, 'learning_rate': 4.8728813559322034e-05, 'epoch': 0.12}


  4%|▍         | 1260/30000 [06:39<2:32:12,  3.15it/s]

{'loss': 0.3966, 'grad_norm': 1.2133716344833374, 'learning_rate': 4.8711864406779663e-05, 'epoch': 0.13}


  4%|▍         | 1270/30000 [06:42<2:34:29,  3.10it/s]

{'loss': 0.4043, 'grad_norm': 1.0856636762619019, 'learning_rate': 4.8694915254237286e-05, 'epoch': 0.13}


  4%|▍         | 1280/30000 [06:45<2:33:40,  3.11it/s]

{'loss': 0.3744, 'grad_norm': 0.7918602228164673, 'learning_rate': 4.8677966101694915e-05, 'epoch': 0.13}


  4%|▍         | 1290/30000 [06:48<2:35:38,  3.07it/s]

{'loss': 0.3748, 'grad_norm': 1.5575339794158936, 'learning_rate': 4.8661016949152545e-05, 'epoch': 0.13}


  4%|▍         | 1300/30000 [06:52<2:34:15,  3.10it/s]

{'loss': 0.3831, 'grad_norm': 2.5063393115997314, 'learning_rate': 4.8644067796610174e-05, 'epoch': 0.13}


  4%|▍         | 1310/30000 [06:55<2:35:27,  3.08it/s]

{'loss': 0.3757, 'grad_norm': 1.4116849899291992, 'learning_rate': 4.86271186440678e-05, 'epoch': 0.13}


  4%|▍         | 1320/30000 [06:58<2:33:58,  3.10it/s]

{'loss': 0.3913, 'grad_norm': 0.9483572244644165, 'learning_rate': 4.8610169491525426e-05, 'epoch': 0.13}


  4%|▍         | 1330/30000 [07:01<2:33:15,  3.12it/s]

{'loss': 0.3869, 'grad_norm': 0.7904651165008545, 'learning_rate': 4.8593220338983055e-05, 'epoch': 0.13}


  4%|▍         | 1340/30000 [07:05<2:34:33,  3.09it/s]

{'loss': 0.3854, 'grad_norm': 1.1147615909576416, 'learning_rate': 4.857627118644068e-05, 'epoch': 0.13}


  4%|▍         | 1350/30000 [07:08<2:32:06,  3.14it/s]

{'loss': 0.3859, 'grad_norm': 1.6010453701019287, 'learning_rate': 4.855932203389831e-05, 'epoch': 0.14}


  5%|▍         | 1360/30000 [07:11<2:31:42,  3.15it/s]

{'loss': 0.4, 'grad_norm': 0.8043802380561829, 'learning_rate': 4.8542372881355937e-05, 'epoch': 0.14}


  5%|▍         | 1370/30000 [07:14<2:31:04,  3.16it/s]

{'loss': 0.3899, 'grad_norm': 1.122285008430481, 'learning_rate': 4.8525423728813566e-05, 'epoch': 0.14}


  5%|▍         | 1380/30000 [07:17<2:31:29,  3.15it/s]

{'loss': 0.3775, 'grad_norm': 1.1975268125534058, 'learning_rate': 4.850847457627119e-05, 'epoch': 0.14}


  5%|▍         | 1390/30000 [07:20<2:33:26,  3.11it/s]

{'loss': 0.372, 'grad_norm': 0.8384429812431335, 'learning_rate': 4.849152542372882e-05, 'epoch': 0.14}


  5%|▍         | 1400/30000 [07:24<2:31:46,  3.14it/s]

{'loss': 0.4062, 'grad_norm': 1.0306510925292969, 'learning_rate': 4.847457627118645e-05, 'epoch': 0.14}


  5%|▍         | 1410/30000 [07:27<2:31:03,  3.15it/s]

{'loss': 0.3935, 'grad_norm': 1.0973999500274658, 'learning_rate': 4.845762711864407e-05, 'epoch': 0.14}


  5%|▍         | 1420/30000 [07:30<2:33:09,  3.11it/s]

{'loss': 0.3639, 'grad_norm': 1.2971346378326416, 'learning_rate': 4.84406779661017e-05, 'epoch': 0.14}


  5%|▍         | 1430/30000 [07:33<2:30:59,  3.15it/s]

{'loss': 0.3738, 'grad_norm': 1.442936658859253, 'learning_rate': 4.842372881355933e-05, 'epoch': 0.14}


  5%|▍         | 1440/30000 [07:36<2:30:30,  3.16it/s]

{'loss': 0.3708, 'grad_norm': 1.1164947748184204, 'learning_rate': 4.840677966101695e-05, 'epoch': 0.14}


  5%|▍         | 1450/30000 [07:40<2:31:41,  3.14it/s]

{'loss': 0.363, 'grad_norm': 0.8720357418060303, 'learning_rate': 4.8389830508474574e-05, 'epoch': 0.14}


  5%|▍         | 1460/30000 [07:43<2:30:39,  3.16it/s]

{'loss': 0.3709, 'grad_norm': 1.2248213291168213, 'learning_rate': 4.83728813559322e-05, 'epoch': 0.15}


  5%|▍         | 1470/30000 [07:46<2:30:13,  3.17it/s]

{'loss': 0.3795, 'grad_norm': 0.8441534042358398, 'learning_rate': 4.835593220338983e-05, 'epoch': 0.15}


  5%|▍         | 1480/30000 [07:49<2:31:36,  3.14it/s]

{'loss': 0.3689, 'grad_norm': 0.971271276473999, 'learning_rate': 4.833898305084746e-05, 'epoch': 0.15}


  5%|▍         | 1490/30000 [07:52<2:30:31,  3.16it/s]

{'loss': 0.3607, 'grad_norm': 0.9574868679046631, 'learning_rate': 4.8322033898305084e-05, 'epoch': 0.15}


  5%|▌         | 1500/30000 [07:56<2:29:51,  3.17it/s]

{'loss': 0.3567, 'grad_norm': 1.1134191751480103, 'learning_rate': 4.8305084745762714e-05, 'epoch': 0.15}


  5%|▌         | 1510/30000 [07:59<2:30:08,  3.16it/s]

{'loss': 0.3765, 'grad_norm': 0.804233968257904, 'learning_rate': 4.828813559322034e-05, 'epoch': 0.15}


  5%|▌         | 1520/30000 [08:02<2:31:40,  3.13it/s]

{'loss': 0.3477, 'grad_norm': 1.3623058795928955, 'learning_rate': 4.8271186440677966e-05, 'epoch': 0.15}


  5%|▌         | 1530/30000 [08:05<2:31:40,  3.13it/s]

{'loss': 0.3756, 'grad_norm': 1.1110016107559204, 'learning_rate': 4.8254237288135595e-05, 'epoch': 0.15}


  5%|▌         | 1540/30000 [08:08<2:30:13,  3.16it/s]

{'loss': 0.3451, 'grad_norm': 0.9339264631271362, 'learning_rate': 4.8237288135593224e-05, 'epoch': 0.15}


  5%|▌         | 1550/30000 [08:11<2:29:39,  3.17it/s]

{'loss': 0.3689, 'grad_norm': 1.2003973722457886, 'learning_rate': 4.822033898305085e-05, 'epoch': 0.15}


  5%|▌         | 1560/30000 [08:15<2:30:14,  3.16it/s]

{'loss': 0.3598, 'grad_norm': 1.352394938468933, 'learning_rate': 4.8203389830508476e-05, 'epoch': 0.16}


  5%|▌         | 1570/30000 [08:18<2:30:26,  3.15it/s]

{'loss': 0.3527, 'grad_norm': 1.2063566446304321, 'learning_rate': 4.8186440677966105e-05, 'epoch': 0.16}


  5%|▌         | 1580/30000 [08:21<2:30:16,  3.15it/s]

{'loss': 0.3355, 'grad_norm': 0.8053338527679443, 'learning_rate': 4.8169491525423735e-05, 'epoch': 0.16}


  5%|▌         | 1590/30000 [08:24<2:30:50,  3.14it/s]

{'loss': 0.3403, 'grad_norm': 0.7543646097183228, 'learning_rate': 4.815254237288136e-05, 'epoch': 0.16}


  5%|▌         | 1600/30000 [08:27<2:31:01,  3.13it/s]

{'loss': 0.38, 'grad_norm': 1.1638081073760986, 'learning_rate': 4.813559322033899e-05, 'epoch': 0.16}


  5%|▌         | 1610/30000 [08:31<2:29:57,  3.16it/s]

{'loss': 0.3625, 'grad_norm': 0.809240460395813, 'learning_rate': 4.8118644067796616e-05, 'epoch': 0.16}


  5%|▌         | 1620/30000 [08:34<2:30:52,  3.14it/s]

{'loss': 0.358, 'grad_norm': 0.9260777235031128, 'learning_rate': 4.810169491525424e-05, 'epoch': 0.16}


  5%|▌         | 1630/30000 [08:37<2:30:05,  3.15it/s]

{'loss': 0.3678, 'grad_norm': 0.8828718066215515, 'learning_rate': 4.808474576271187e-05, 'epoch': 0.16}


  5%|▌         | 1640/30000 [08:40<2:29:38,  3.16it/s]

{'loss': 0.3757, 'grad_norm': 1.1020135879516602, 'learning_rate': 4.80677966101695e-05, 'epoch': 0.16}


  6%|▌         | 1650/30000 [08:43<2:29:30,  3.16it/s]

{'loss': 0.375, 'grad_norm': 0.729888379573822, 'learning_rate': 4.805084745762712e-05, 'epoch': 0.17}


  6%|▌         | 1660/30000 [08:46<2:30:12,  3.14it/s]

{'loss': 0.3726, 'grad_norm': 0.682757556438446, 'learning_rate': 4.803389830508474e-05, 'epoch': 0.17}


  6%|▌         | 1670/30000 [08:50<2:31:05,  3.13it/s]

{'loss': 0.3503, 'grad_norm': 1.1087316274642944, 'learning_rate': 4.801694915254237e-05, 'epoch': 0.17}


  6%|▌         | 1680/30000 [08:53<2:31:12,  3.12it/s]

{'loss': 0.3546, 'grad_norm': 1.1580698490142822, 'learning_rate': 4.8e-05, 'epoch': 0.17}


  6%|▌         | 1690/30000 [08:56<2:30:51,  3.13it/s]

{'loss': 0.37, 'grad_norm': 0.8106649518013, 'learning_rate': 4.798305084745763e-05, 'epoch': 0.17}


  6%|▌         | 1700/30000 [08:59<2:29:28,  3.16it/s]

{'loss': 0.3769, 'grad_norm': 1.2029262781143188, 'learning_rate': 4.796610169491525e-05, 'epoch': 0.17}


  6%|▌         | 1710/30000 [09:03<2:29:13,  3.16it/s]

{'loss': 0.3638, 'grad_norm': 0.9590532183647156, 'learning_rate': 4.794915254237288e-05, 'epoch': 0.17}


  6%|▌         | 1720/30000 [09:06<2:29:02,  3.16it/s]

{'loss': 0.3687, 'grad_norm': 0.8583996295928955, 'learning_rate': 4.793220338983051e-05, 'epoch': 0.17}


  6%|▌         | 1730/30000 [09:09<2:29:03,  3.16it/s]

{'loss': 0.3576, 'grad_norm': 0.7748932838439941, 'learning_rate': 4.7915254237288134e-05, 'epoch': 0.17}


  6%|▌         | 1740/30000 [09:12<2:28:34,  3.17it/s]

{'loss': 0.3495, 'grad_norm': 1.0309680700302124, 'learning_rate': 4.7898305084745764e-05, 'epoch': 0.17}


  6%|▌         | 1750/30000 [09:15<2:28:51,  3.16it/s]

{'loss': 0.3358, 'grad_norm': 0.8370432257652283, 'learning_rate': 4.788135593220339e-05, 'epoch': 0.17}


  6%|▌         | 1760/30000 [09:18<2:29:01,  3.16it/s]

{'loss': 0.3457, 'grad_norm': 0.9213093519210815, 'learning_rate': 4.786440677966102e-05, 'epoch': 0.18}


  6%|▌         | 1770/30000 [09:22<2:29:11,  3.15it/s]

{'loss': 0.3746, 'grad_norm': 1.0132672786712646, 'learning_rate': 4.7847457627118645e-05, 'epoch': 0.18}


  6%|▌         | 1780/30000 [09:25<2:28:38,  3.16it/s]

{'loss': 0.3389, 'grad_norm': 0.9431884288787842, 'learning_rate': 4.7830508474576274e-05, 'epoch': 0.18}


  6%|▌         | 1790/30000 [09:28<2:30:52,  3.12it/s]

{'loss': 0.3444, 'grad_norm': 1.0321259498596191, 'learning_rate': 4.7813559322033904e-05, 'epoch': 0.18}


  6%|▌         | 1800/30000 [09:31<2:28:34,  3.16it/s]

{'loss': 0.3435, 'grad_norm': 0.8768668174743652, 'learning_rate': 4.7796610169491526e-05, 'epoch': 0.18}


  6%|▌         | 1810/30000 [09:34<2:28:26,  3.17it/s]

{'loss': 0.3622, 'grad_norm': 0.7711781859397888, 'learning_rate': 4.7779661016949156e-05, 'epoch': 0.18}


  6%|▌         | 1820/30000 [09:37<2:29:04,  3.15it/s]

{'loss': 0.3649, 'grad_norm': 0.677715539932251, 'learning_rate': 4.7762711864406785e-05, 'epoch': 0.18}


  6%|▌         | 1830/30000 [09:41<2:28:04,  3.17it/s]

{'loss': 0.339, 'grad_norm': 0.7992697954177856, 'learning_rate': 4.7745762711864414e-05, 'epoch': 0.18}


  6%|▌         | 1840/30000 [09:44<2:28:37,  3.16it/s]

{'loss': 0.344, 'grad_norm': 1.261090874671936, 'learning_rate': 4.772881355932204e-05, 'epoch': 0.18}


  6%|▌         | 1850/30000 [09:47<2:28:11,  3.17it/s]

{'loss': 0.3484, 'grad_norm': 1.039085865020752, 'learning_rate': 4.7711864406779666e-05, 'epoch': 0.18}


  6%|▌         | 1860/30000 [09:50<2:28:33,  3.16it/s]

{'loss': 0.359, 'grad_norm': 0.7471023797988892, 'learning_rate': 4.769491525423729e-05, 'epoch': 0.19}


  6%|▌         | 1870/30000 [09:53<2:28:08,  3.16it/s]

{'loss': 0.3575, 'grad_norm': 0.8344559669494629, 'learning_rate': 4.767796610169492e-05, 'epoch': 0.19}


  6%|▋         | 1880/30000 [09:56<2:28:17,  3.16it/s]

{'loss': 0.3423, 'grad_norm': 1.4285144805908203, 'learning_rate': 4.766101694915254e-05, 'epoch': 0.19}


  6%|▋         | 1890/30000 [10:00<2:27:59,  3.17it/s]

{'loss': 0.3523, 'grad_norm': 1.2939354181289673, 'learning_rate': 4.764406779661017e-05, 'epoch': 0.19}


  6%|▋         | 1900/30000 [10:03<2:27:34,  3.17it/s]

{'loss': 0.3701, 'grad_norm': 1.2789169549942017, 'learning_rate': 4.76271186440678e-05, 'epoch': 0.19}


  6%|▋         | 1910/30000 [10:06<2:28:03,  3.16it/s]

{'loss': 0.34, 'grad_norm': 0.8099216222763062, 'learning_rate': 4.761016949152542e-05, 'epoch': 0.19}


  6%|▋         | 1920/30000 [10:09<2:27:48,  3.17it/s]

{'loss': 0.337, 'grad_norm': 0.9430946707725525, 'learning_rate': 4.759322033898305e-05, 'epoch': 0.19}


  6%|▋         | 1930/30000 [10:12<2:27:48,  3.17it/s]

{'loss': 0.3341, 'grad_norm': 0.9581459760665894, 'learning_rate': 4.757627118644068e-05, 'epoch': 0.19}


  6%|▋         | 1940/30000 [10:15<2:27:49,  3.16it/s]

{'loss': 0.3377, 'grad_norm': 0.7859282493591309, 'learning_rate': 4.755932203389831e-05, 'epoch': 0.19}


  6%|▋         | 1950/30000 [10:19<2:27:41,  3.17it/s]

{'loss': 0.3454, 'grad_norm': 0.809521496295929, 'learning_rate': 4.754237288135593e-05, 'epoch': 0.2}


  7%|▋         | 1960/30000 [10:22<2:27:48,  3.16it/s]

{'loss': 0.331, 'grad_norm': 0.935749888420105, 'learning_rate': 4.752542372881356e-05, 'epoch': 0.2}


  7%|▋         | 1970/30000 [10:25<2:28:12,  3.15it/s]

{'loss': 0.3385, 'grad_norm': 0.987437903881073, 'learning_rate': 4.750847457627119e-05, 'epoch': 0.2}


  7%|▋         | 1980/30000 [10:28<2:28:14,  3.15it/s]

{'loss': 0.3255, 'grad_norm': 0.7946547269821167, 'learning_rate': 4.7491525423728814e-05, 'epoch': 0.2}


  7%|▋         | 1990/30000 [10:31<2:27:22,  3.17it/s]

{'loss': 0.3355, 'grad_norm': 1.1954925060272217, 'learning_rate': 4.747457627118644e-05, 'epoch': 0.2}


  7%|▋         | 2000/30000 [10:34<2:27:22,  3.17it/s]

{'loss': 0.3348, 'grad_norm': 0.8051740527153015, 'learning_rate': 4.745762711864407e-05, 'epoch': 0.2}


  7%|▋         | 2010/30000 [10:38<2:27:21,  3.17it/s]

{'loss': 0.3434, 'grad_norm': 0.8887600898742676, 'learning_rate': 4.74406779661017e-05, 'epoch': 0.2}


  7%|▋         | 2020/30000 [10:41<2:27:16,  3.17it/s]

{'loss': 0.3278, 'grad_norm': 0.9088568687438965, 'learning_rate': 4.7423728813559325e-05, 'epoch': 0.2}


  7%|▋         | 2030/30000 [10:44<2:27:16,  3.17it/s]

{'loss': 0.3361, 'grad_norm': 1.071208119392395, 'learning_rate': 4.7406779661016954e-05, 'epoch': 0.2}


  7%|▋         | 2040/30000 [10:47<2:28:55,  3.13it/s]

{'loss': 0.3474, 'grad_norm': 0.9355261921882629, 'learning_rate': 4.738983050847458e-05, 'epoch': 0.2}


  7%|▋         | 2050/30000 [10:50<2:27:15,  3.16it/s]

{'loss': 0.3336, 'grad_norm': 0.879034698009491, 'learning_rate': 4.7372881355932206e-05, 'epoch': 0.2}


  7%|▋         | 2060/30000 [10:53<2:27:32,  3.16it/s]

{'loss': 0.3432, 'grad_norm': 0.7349750399589539, 'learning_rate': 4.735593220338983e-05, 'epoch': 0.21}


  7%|▋         | 2070/30000 [10:57<2:27:29,  3.16it/s]

{'loss': 0.3494, 'grad_norm': 0.888654351234436, 'learning_rate': 4.733898305084746e-05, 'epoch': 0.21}


  7%|▋         | 2080/30000 [11:00<2:27:05,  3.16it/s]

{'loss': 0.3352, 'grad_norm': 1.161085844039917, 'learning_rate': 4.732203389830509e-05, 'epoch': 0.21}


  7%|▋         | 2090/30000 [11:03<2:27:05,  3.16it/s]

{'loss': 0.3303, 'grad_norm': 0.8569942712783813, 'learning_rate': 4.730508474576271e-05, 'epoch': 0.21}


  7%|▋         | 2100/30000 [11:06<2:29:31,  3.11it/s]

{'loss': 0.3235, 'grad_norm': 0.8193941116333008, 'learning_rate': 4.728813559322034e-05, 'epoch': 0.21}


  7%|▋         | 2110/30000 [11:09<2:25:29,  3.20it/s]

{'loss': 0.3394, 'grad_norm': 0.7020472288131714, 'learning_rate': 4.727118644067797e-05, 'epoch': 0.21}


  7%|▋         | 2120/30000 [11:12<2:25:11,  3.20it/s]

{'loss': 0.3113, 'grad_norm': 0.6904006600379944, 'learning_rate': 4.72542372881356e-05, 'epoch': 0.21}


  7%|▋         | 2130/30000 [11:15<2:25:06,  3.20it/s]

{'loss': 0.3107, 'grad_norm': 0.79493647813797, 'learning_rate': 4.723728813559322e-05, 'epoch': 0.21}


  7%|▋         | 2140/30000 [11:19<2:25:03,  3.20it/s]

{'loss': 0.3414, 'grad_norm': 0.9330780506134033, 'learning_rate': 4.722033898305085e-05, 'epoch': 0.21}


  7%|▋         | 2150/30000 [11:22<2:24:59,  3.20it/s]

{'loss': 0.3264, 'grad_norm': 0.8997758030891418, 'learning_rate': 4.720338983050848e-05, 'epoch': 0.21}


  7%|▋         | 2160/30000 [11:25<2:25:04,  3.20it/s]

{'loss': 0.3272, 'grad_norm': 0.9835037589073181, 'learning_rate': 4.71864406779661e-05, 'epoch': 0.22}


  7%|▋         | 2170/30000 [11:28<2:24:55,  3.20it/s]

{'loss': 0.3596, 'grad_norm': 0.9096139669418335, 'learning_rate': 4.716949152542373e-05, 'epoch': 0.22}


  7%|▋         | 2180/30000 [11:31<2:24:47,  3.20it/s]

{'loss': 0.342, 'grad_norm': 0.7340894937515259, 'learning_rate': 4.715254237288136e-05, 'epoch': 0.22}


  7%|▋         | 2190/30000 [11:34<2:24:50,  3.20it/s]

{'loss': 0.3399, 'grad_norm': 1.0887576341629028, 'learning_rate': 4.713559322033898e-05, 'epoch': 0.22}


  7%|▋         | 2200/30000 [11:37<2:24:38,  3.20it/s]

{'loss': 0.315, 'grad_norm': 0.7309188842773438, 'learning_rate': 4.711864406779661e-05, 'epoch': 0.22}


  7%|▋         | 2210/30000 [11:41<2:24:38,  3.20it/s]

{'loss': 0.3308, 'grad_norm': 0.7574307322502136, 'learning_rate': 4.710169491525424e-05, 'epoch': 0.22}


  7%|▋         | 2220/30000 [11:44<2:24:46,  3.20it/s]

{'loss': 0.3251, 'grad_norm': 1.2076176404953003, 'learning_rate': 4.708474576271187e-05, 'epoch': 0.22}


  7%|▋         | 2230/30000 [11:47<2:24:39,  3.20it/s]

{'loss': 0.3264, 'grad_norm': 0.8537784814834595, 'learning_rate': 4.7067796610169493e-05, 'epoch': 0.22}


  7%|▋         | 2240/30000 [11:50<2:24:36,  3.20it/s]

{'loss': 0.3259, 'grad_norm': 0.850348174571991, 'learning_rate': 4.705084745762712e-05, 'epoch': 0.22}


  8%|▊         | 2250/30000 [11:53<2:24:25,  3.20it/s]

{'loss': 0.3249, 'grad_norm': 0.7213231921195984, 'learning_rate': 4.703389830508475e-05, 'epoch': 0.23}


  8%|▊         | 2260/30000 [11:56<2:24:26,  3.20it/s]

{'loss': 0.3286, 'grad_norm': 0.8669227957725525, 'learning_rate': 4.7016949152542375e-05, 'epoch': 0.23}


  8%|▊         | 2270/30000 [11:59<2:24:29,  3.20it/s]

{'loss': 0.3475, 'grad_norm': 0.7586889863014221, 'learning_rate': 4.7e-05, 'epoch': 0.23}


  8%|▊         | 2280/30000 [12:02<2:24:30,  3.20it/s]

{'loss': 0.3711, 'grad_norm': 1.6497501134872437, 'learning_rate': 4.6983050847457627e-05, 'epoch': 0.23}


  8%|▊         | 2290/30000 [12:06<2:24:18,  3.20it/s]

{'loss': 0.3445, 'grad_norm': 1.1341150999069214, 'learning_rate': 4.6966101694915256e-05, 'epoch': 0.23}


  8%|▊         | 2300/30000 [12:09<2:24:24,  3.20it/s]

{'loss': 0.3461, 'grad_norm': 1.1463305950164795, 'learning_rate': 4.694915254237288e-05, 'epoch': 0.23}


  8%|▊         | 2310/30000 [12:12<2:24:14,  3.20it/s]

{'loss': 0.3171, 'grad_norm': 0.8808380365371704, 'learning_rate': 4.693220338983051e-05, 'epoch': 0.23}


  8%|▊         | 2320/30000 [12:15<2:23:58,  3.20it/s]

{'loss': 0.3351, 'grad_norm': 0.9201961755752563, 'learning_rate': 4.691525423728814e-05, 'epoch': 0.23}


  8%|▊         | 2330/30000 [12:18<2:24:13,  3.20it/s]

{'loss': 0.3264, 'grad_norm': 0.6472340822219849, 'learning_rate': 4.6898305084745767e-05, 'epoch': 0.23}


  8%|▊         | 2340/30000 [12:21<2:23:55,  3.20it/s]

{'loss': 0.3079, 'grad_norm': 0.7606852054595947, 'learning_rate': 4.688135593220339e-05, 'epoch': 0.23}


  8%|▊         | 2350/30000 [12:24<2:24:06,  3.20it/s]

{'loss': 0.3298, 'grad_norm': 0.6877907514572144, 'learning_rate': 4.686440677966102e-05, 'epoch': 0.23}


  8%|▊         | 2360/30000 [12:27<2:24:16,  3.19it/s]

{'loss': 0.3171, 'grad_norm': 0.6647066473960876, 'learning_rate': 4.684745762711865e-05, 'epoch': 0.24}


  8%|▊         | 2370/30000 [12:31<2:30:54,  3.05it/s]

{'loss': 0.3404, 'grad_norm': 1.0189846754074097, 'learning_rate': 4.683050847457627e-05, 'epoch': 0.24}


  8%|▊         | 2380/30000 [12:34<2:30:07,  3.07it/s]

{'loss': 0.3325, 'grad_norm': 0.7401015162467957, 'learning_rate': 4.68135593220339e-05, 'epoch': 0.24}


  8%|▊         | 2390/30000 [12:37<2:27:00,  3.13it/s]

{'loss': 0.3396, 'grad_norm': 0.8039404153823853, 'learning_rate': 4.679661016949153e-05, 'epoch': 0.24}


  8%|▊         | 2400/30000 [12:40<2:30:22,  3.06it/s]

{'loss': 0.3376, 'grad_norm': 0.7432248592376709, 'learning_rate': 4.677966101694916e-05, 'epoch': 0.24}


  8%|▊         | 2410/30000 [12:44<2:26:41,  3.13it/s]

{'loss': 0.3154, 'grad_norm': 0.648418128490448, 'learning_rate': 4.676271186440678e-05, 'epoch': 0.24}


  8%|▊         | 2420/30000 [12:47<2:26:48,  3.13it/s]

{'loss': 0.3225, 'grad_norm': 0.6442493796348572, 'learning_rate': 4.674576271186441e-05, 'epoch': 0.24}


  8%|▊         | 2430/30000 [12:50<2:25:13,  3.16it/s]

{'loss': 0.3157, 'grad_norm': 0.7076245546340942, 'learning_rate': 4.672881355932204e-05, 'epoch': 0.24}


  8%|▊         | 2440/30000 [12:53<2:24:32,  3.18it/s]

{'loss': 0.321, 'grad_norm': 0.8574250936508179, 'learning_rate': 4.671186440677966e-05, 'epoch': 0.24}


  8%|▊         | 2450/30000 [12:56<2:24:41,  3.17it/s]

{'loss': 0.335, 'grad_norm': 0.7622602581977844, 'learning_rate': 4.669491525423729e-05, 'epoch': 0.24}


  8%|▊         | 2460/30000 [12:59<2:24:28,  3.18it/s]

{'loss': 0.3486, 'grad_norm': 0.8120979070663452, 'learning_rate': 4.667796610169492e-05, 'epoch': 0.25}


  8%|▊         | 2470/30000 [13:03<2:24:42,  3.17it/s]

{'loss': 0.3228, 'grad_norm': 0.7588837742805481, 'learning_rate': 4.666101694915255e-05, 'epoch': 0.25}


  8%|▊         | 2480/30000 [13:06<2:24:26,  3.18it/s]

{'loss': 0.3154, 'grad_norm': 0.6959908604621887, 'learning_rate': 4.6644067796610166e-05, 'epoch': 0.25}


  8%|▊         | 2490/30000 [13:09<2:24:23,  3.18it/s]

{'loss': 0.3289, 'grad_norm': 0.7313858866691589, 'learning_rate': 4.6627118644067795e-05, 'epoch': 0.25}


  8%|▊         | 2500/30000 [13:12<2:24:24,  3.17it/s]

{'loss': 0.3245, 'grad_norm': 0.6637462377548218, 'learning_rate': 4.6610169491525425e-05, 'epoch': 0.25}


  8%|▊         | 2510/30000 [13:15<2:23:49,  3.19it/s]

{'loss': 0.3187, 'grad_norm': 0.7501656413078308, 'learning_rate': 4.6593220338983054e-05, 'epoch': 0.25}


  8%|▊         | 2520/30000 [13:18<2:23:55,  3.18it/s]

{'loss': 0.3201, 'grad_norm': 0.7578541040420532, 'learning_rate': 4.657627118644068e-05, 'epoch': 0.25}


  8%|▊         | 2530/30000 [13:22<2:23:54,  3.18it/s]

{'loss': 0.3084, 'grad_norm': 0.689811646938324, 'learning_rate': 4.6559322033898306e-05, 'epoch': 0.25}


  8%|▊         | 2540/30000 [13:25<2:23:55,  3.18it/s]

{'loss': 0.3077, 'grad_norm': 0.6966248750686646, 'learning_rate': 4.6542372881355935e-05, 'epoch': 0.25}


  8%|▊         | 2550/30000 [13:28<2:23:40,  3.18it/s]

{'loss': 0.3215, 'grad_norm': 0.6827937364578247, 'learning_rate': 4.652542372881356e-05, 'epoch': 0.26}


  9%|▊         | 2560/30000 [13:31<2:23:43,  3.18it/s]

{'loss': 0.304, 'grad_norm': 0.7228596806526184, 'learning_rate': 4.650847457627119e-05, 'epoch': 0.26}


  9%|▊         | 2570/30000 [13:34<2:23:41,  3.18it/s]

{'loss': 0.3308, 'grad_norm': 0.8091414570808411, 'learning_rate': 4.649152542372882e-05, 'epoch': 0.26}


  9%|▊         | 2580/30000 [13:37<2:23:27,  3.19it/s]

{'loss': 0.3253, 'grad_norm': 1.177407145500183, 'learning_rate': 4.6474576271186446e-05, 'epoch': 0.26}


  9%|▊         | 2590/30000 [13:40<2:23:39,  3.18it/s]

{'loss': 0.3177, 'grad_norm': 1.007314682006836, 'learning_rate': 4.645762711864407e-05, 'epoch': 0.26}


  9%|▊         | 2600/30000 [13:44<2:23:41,  3.18it/s]

{'loss': 0.3194, 'grad_norm': 0.7474990487098694, 'learning_rate': 4.64406779661017e-05, 'epoch': 0.26}


  9%|▊         | 2610/30000 [13:47<2:23:28,  3.18it/s]

{'loss': 0.3065, 'grad_norm': 0.7376142740249634, 'learning_rate': 4.642372881355933e-05, 'epoch': 0.26}


  9%|▊         | 2620/30000 [13:50<2:23:38,  3.18it/s]

{'loss': 0.3241, 'grad_norm': 0.7967848777770996, 'learning_rate': 4.640677966101695e-05, 'epoch': 0.26}


  9%|▉         | 2630/30000 [13:53<2:23:19,  3.18it/s]

{'loss': 0.3344, 'grad_norm': 0.7623135447502136, 'learning_rate': 4.638983050847458e-05, 'epoch': 0.26}


  9%|▉         | 2640/30000 [13:56<2:23:27,  3.18it/s]

{'loss': 0.3189, 'grad_norm': 0.8136531114578247, 'learning_rate': 4.637288135593221e-05, 'epoch': 0.26}


  9%|▉         | 2650/30000 [13:59<2:23:18,  3.18it/s]

{'loss': 0.3173, 'grad_norm': 1.1426308155059814, 'learning_rate': 4.635593220338984e-05, 'epoch': 0.27}


  9%|▉         | 2660/30000 [14:02<2:23:14,  3.18it/s]

{'loss': 0.3192, 'grad_norm': 0.9064581990242004, 'learning_rate': 4.633898305084746e-05, 'epoch': 0.27}


  9%|▉         | 2670/30000 [14:06<2:23:15,  3.18it/s]

{'loss': 0.3213, 'grad_norm': 0.663105845451355, 'learning_rate': 4.632203389830509e-05, 'epoch': 0.27}


  9%|▉         | 2680/30000 [14:09<2:23:06,  3.18it/s]

{'loss': 0.3101, 'grad_norm': 1.0954618453979492, 'learning_rate': 4.630508474576272e-05, 'epoch': 0.27}


  9%|▉         | 2690/30000 [14:12<2:22:53,  3.19it/s]

{'loss': 0.312, 'grad_norm': 0.5862448215484619, 'learning_rate': 4.628813559322034e-05, 'epoch': 0.27}


  9%|▉         | 2700/30000 [14:15<2:22:49,  3.19it/s]

{'loss': 0.3265, 'grad_norm': 0.8061392903327942, 'learning_rate': 4.6271186440677964e-05, 'epoch': 0.27}


  9%|▉         | 2710/30000 [14:18<2:23:19,  3.17it/s]

{'loss': 0.3086, 'grad_norm': 1.1001222133636475, 'learning_rate': 4.6254237288135594e-05, 'epoch': 0.27}


  9%|▉         | 2720/30000 [14:21<2:22:56,  3.18it/s]

{'loss': 0.3306, 'grad_norm': 0.7872691750526428, 'learning_rate': 4.623728813559322e-05, 'epoch': 0.27}


  9%|▉         | 2730/30000 [14:24<2:22:49,  3.18it/s]

{'loss': 0.3096, 'grad_norm': 1.1048153638839722, 'learning_rate': 4.6220338983050846e-05, 'epoch': 0.27}


  9%|▉         | 2740/30000 [14:28<2:22:55,  3.18it/s]

{'loss': 0.3436, 'grad_norm': 0.7163332104682922, 'learning_rate': 4.6203389830508475e-05, 'epoch': 0.27}


  9%|▉         | 2750/30000 [14:31<2:22:43,  3.18it/s]

{'loss': 0.301, 'grad_norm': 0.8418437242507935, 'learning_rate': 4.6186440677966104e-05, 'epoch': 0.28}


  9%|▉         | 2760/30000 [14:34<2:25:14,  3.13it/s]

{'loss': 0.3209, 'grad_norm': 0.865027666091919, 'learning_rate': 4.6169491525423734e-05, 'epoch': 0.28}


  9%|▉         | 2770/30000 [14:37<2:23:42,  3.16it/s]

{'loss': 0.3011, 'grad_norm': 0.6741382479667664, 'learning_rate': 4.6152542372881356e-05, 'epoch': 0.28}


  9%|▉         | 2780/30000 [14:40<2:23:24,  3.16it/s]

{'loss': 0.2986, 'grad_norm': 0.780096709728241, 'learning_rate': 4.6135593220338986e-05, 'epoch': 0.28}


  9%|▉         | 2790/30000 [14:43<2:22:46,  3.18it/s]

{'loss': 0.3441, 'grad_norm': 0.9832929968833923, 'learning_rate': 4.6118644067796615e-05, 'epoch': 0.28}


  9%|▉         | 2800/30000 [14:47<2:22:18,  3.19it/s]

{'loss': 0.3113, 'grad_norm': 1.0042020082473755, 'learning_rate': 4.610169491525424e-05, 'epoch': 0.28}


  9%|▉         | 2810/30000 [14:50<2:22:25,  3.18it/s]

{'loss': 0.3299, 'grad_norm': 0.644050121307373, 'learning_rate': 4.608474576271187e-05, 'epoch': 0.28}


  9%|▉         | 2820/30000 [14:53<2:22:29,  3.18it/s]

{'loss': 0.3219, 'grad_norm': 0.7469609975814819, 'learning_rate': 4.6067796610169496e-05, 'epoch': 0.28}


  9%|▉         | 2830/30000 [14:56<2:22:36,  3.18it/s]

{'loss': 0.3156, 'grad_norm': 1.2194548845291138, 'learning_rate': 4.605084745762712e-05, 'epoch': 0.28}


  9%|▉         | 2840/30000 [14:59<2:22:14,  3.18it/s]

{'loss': 0.3213, 'grad_norm': 0.6565119028091431, 'learning_rate': 4.603389830508475e-05, 'epoch': 0.28}


 10%|▉         | 2850/30000 [15:02<2:22:14,  3.18it/s]

{'loss': 0.3135, 'grad_norm': 0.7264887690544128, 'learning_rate': 4.601694915254238e-05, 'epoch': 0.28}


 10%|▉         | 2860/30000 [15:05<2:22:05,  3.18it/s]

{'loss': 0.3199, 'grad_norm': 0.6591058373451233, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.29}


 10%|▉         | 2870/30000 [15:09<2:22:17,  3.18it/s]

{'loss': 0.2992, 'grad_norm': 0.6788986325263977, 'learning_rate': 4.598305084745763e-05, 'epoch': 0.29}


 10%|▉         | 2880/30000 [15:12<2:22:57,  3.16it/s]

{'loss': 0.3032, 'grad_norm': 0.676214873790741, 'learning_rate': 4.596610169491526e-05, 'epoch': 0.29}


 10%|▉         | 2890/30000 [15:15<2:21:54,  3.18it/s]

{'loss': 0.3059, 'grad_norm': 0.6698433756828308, 'learning_rate': 4.594915254237288e-05, 'epoch': 0.29}


 10%|▉         | 2900/30000 [15:18<2:21:55,  3.18it/s]

{'loss': 0.3287, 'grad_norm': 0.8350700736045837, 'learning_rate': 4.593220338983051e-05, 'epoch': 0.29}


 10%|▉         | 2910/30000 [15:21<2:21:58,  3.18it/s]

{'loss': 0.3117, 'grad_norm': 0.8836612701416016, 'learning_rate': 4.591525423728813e-05, 'epoch': 0.29}


 10%|▉         | 2920/30000 [15:24<2:21:52,  3.18it/s]

{'loss': 0.3265, 'grad_norm': 0.7106208205223083, 'learning_rate': 4.589830508474576e-05, 'epoch': 0.29}


 10%|▉         | 2930/30000 [15:27<2:21:42,  3.18it/s]

{'loss': 0.3076, 'grad_norm': 1.0139248371124268, 'learning_rate': 4.588135593220339e-05, 'epoch': 0.29}


 10%|▉         | 2940/30000 [15:31<2:21:51,  3.18it/s]

{'loss': 0.3181, 'grad_norm': 0.6011265516281128, 'learning_rate': 4.5864406779661014e-05, 'epoch': 0.29}


 10%|▉         | 2950/30000 [15:34<2:21:43,  3.18it/s]

{'loss': 0.3116, 'grad_norm': 0.7079651951789856, 'learning_rate': 4.5847457627118644e-05, 'epoch': 0.29}


 10%|▉         | 2960/30000 [15:37<2:21:46,  3.18it/s]

{'loss': 0.3164, 'grad_norm': 0.7123414278030396, 'learning_rate': 4.583050847457627e-05, 'epoch': 0.3}


 10%|▉         | 2970/30000 [15:40<2:21:52,  3.18it/s]

{'loss': 0.3149, 'grad_norm': 0.8165867328643799, 'learning_rate': 4.58135593220339e-05, 'epoch': 0.3}


 10%|▉         | 2980/30000 [15:43<2:21:53,  3.17it/s]

{'loss': 0.3051, 'grad_norm': 0.8002784252166748, 'learning_rate': 4.5796610169491525e-05, 'epoch': 0.3}


 10%|▉         | 2990/30000 [15:46<2:21:41,  3.18it/s]

{'loss': 0.3223, 'grad_norm': 0.6662521362304688, 'learning_rate': 4.5779661016949154e-05, 'epoch': 0.3}


 10%|█         | 3000/30000 [15:50<2:21:30,  3.18it/s]

{'loss': 0.3155, 'grad_norm': 0.8360806703567505, 'learning_rate': 4.5762711864406784e-05, 'epoch': 0.3}


 10%|█         | 3010/30000 [15:53<2:21:20,  3.18it/s]

{'loss': 0.3084, 'grad_norm': 1.0684542655944824, 'learning_rate': 4.5745762711864406e-05, 'epoch': 0.3}


 10%|█         | 3020/30000 [15:56<2:21:18,  3.18it/s]

{'loss': 0.3054, 'grad_norm': 0.647672712802887, 'learning_rate': 4.5728813559322036e-05, 'epoch': 0.3}


 10%|█         | 3030/30000 [15:59<2:21:30,  3.18it/s]

{'loss': 0.3157, 'grad_norm': 0.7394247055053711, 'learning_rate': 4.5711864406779665e-05, 'epoch': 0.3}


 10%|█         | 3040/30000 [16:02<2:21:06,  3.18it/s]

{'loss': 0.3345, 'grad_norm': 0.8475285172462463, 'learning_rate': 4.5694915254237294e-05, 'epoch': 0.3}


 10%|█         | 3050/30000 [16:05<2:21:06,  3.18it/s]

{'loss': 0.313, 'grad_norm': 0.6530641317367554, 'learning_rate': 4.567796610169492e-05, 'epoch': 0.3}


 10%|█         | 3060/30000 [16:08<2:21:10,  3.18it/s]

{'loss': 0.3197, 'grad_norm': 0.8220323324203491, 'learning_rate': 4.5661016949152546e-05, 'epoch': 0.31}


 10%|█         | 3070/30000 [16:12<2:21:06,  3.18it/s]

{'loss': 0.3166, 'grad_norm': 0.8571009039878845, 'learning_rate': 4.5644067796610176e-05, 'epoch': 0.31}


 10%|█         | 3080/30000 [16:15<2:21:00,  3.18it/s]

{'loss': 0.3246, 'grad_norm': 0.7428576350212097, 'learning_rate': 4.56271186440678e-05, 'epoch': 0.31}


 10%|█         | 3090/30000 [16:18<2:20:55,  3.18it/s]

{'loss': 0.3067, 'grad_norm': 0.821925699710846, 'learning_rate': 4.561016949152543e-05, 'epoch': 0.31}


 10%|█         | 3100/30000 [16:21<2:20:54,  3.18it/s]

{'loss': 0.3114, 'grad_norm': 0.7163041234016418, 'learning_rate': 4.559322033898305e-05, 'epoch': 0.31}


 10%|█         | 3110/30000 [16:24<2:21:18,  3.17it/s]

{'loss': 0.3193, 'grad_norm': 0.7777540683746338, 'learning_rate': 4.557627118644068e-05, 'epoch': 0.31}


 10%|█         | 3120/30000 [16:27<2:20:58,  3.18it/s]

{'loss': 0.3097, 'grad_norm': 0.6904956698417664, 'learning_rate': 4.55593220338983e-05, 'epoch': 0.31}


 10%|█         | 3130/30000 [16:30<2:22:45,  3.14it/s]

{'loss': 0.297, 'grad_norm': 0.6952459812164307, 'learning_rate': 4.554237288135593e-05, 'epoch': 0.31}


 10%|█         | 3140/30000 [16:34<2:20:53,  3.18it/s]

{'loss': 0.3025, 'grad_norm': 0.6574915051460266, 'learning_rate': 4.552542372881356e-05, 'epoch': 0.31}


 10%|█         | 3150/30000 [16:37<2:23:17,  3.12it/s]

{'loss': 0.3142, 'grad_norm': 0.8075567483901978, 'learning_rate': 4.550847457627119e-05, 'epoch': 0.32}


 11%|█         | 3160/30000 [16:40<2:22:05,  3.15it/s]

{'loss': 0.2978, 'grad_norm': 0.981246292591095, 'learning_rate': 4.549152542372881e-05, 'epoch': 0.32}


 11%|█         | 3170/30000 [16:43<2:20:55,  3.17it/s]

{'loss': 0.3289, 'grad_norm': 0.7413700222969055, 'learning_rate': 4.547457627118644e-05, 'epoch': 0.32}


 11%|█         | 3180/30000 [16:46<2:20:33,  3.18it/s]

{'loss': 0.2952, 'grad_norm': 0.6219881176948547, 'learning_rate': 4.545762711864407e-05, 'epoch': 0.32}


 11%|█         | 3190/30000 [16:49<2:20:26,  3.18it/s]

{'loss': 0.3202, 'grad_norm': 0.6946741342544556, 'learning_rate': 4.5440677966101694e-05, 'epoch': 0.32}


 11%|█         | 3200/30000 [16:53<2:20:34,  3.18it/s]

{'loss': 0.3126, 'grad_norm': 0.7144635915756226, 'learning_rate': 4.542372881355932e-05, 'epoch': 0.32}


 11%|█         | 3210/30000 [16:56<2:20:14,  3.18it/s]

{'loss': 0.3047, 'grad_norm': 0.8672412037849426, 'learning_rate': 4.540677966101695e-05, 'epoch': 0.32}


 11%|█         | 3220/30000 [16:59<2:20:32,  3.18it/s]

{'loss': 0.3125, 'grad_norm': 0.7986621260643005, 'learning_rate': 4.538983050847458e-05, 'epoch': 0.32}


 11%|█         | 3230/30000 [17:02<2:20:24,  3.18it/s]

{'loss': 0.3188, 'grad_norm': 0.7876445651054382, 'learning_rate': 4.5372881355932205e-05, 'epoch': 0.32}


 11%|█         | 3240/30000 [17:05<2:20:05,  3.18it/s]

{'loss': 0.307, 'grad_norm': 0.7481833696365356, 'learning_rate': 4.5355932203389834e-05, 'epoch': 0.32}


 11%|█         | 3250/30000 [17:08<2:20:19,  3.18it/s]

{'loss': 0.3064, 'grad_norm': 0.7647323608398438, 'learning_rate': 4.533898305084746e-05, 'epoch': 0.33}


 11%|█         | 3260/30000 [17:11<2:20:10,  3.18it/s]

{'loss': 0.2986, 'grad_norm': 0.6745109558105469, 'learning_rate': 4.5322033898305086e-05, 'epoch': 0.33}


 11%|█         | 3270/30000 [17:15<2:19:58,  3.18it/s]

{'loss': 0.3239, 'grad_norm': 0.6989058256149292, 'learning_rate': 4.5305084745762715e-05, 'epoch': 0.33}


 11%|█         | 3280/30000 [17:18<2:20:05,  3.18it/s]

{'loss': 0.3031, 'grad_norm': 0.6877852082252502, 'learning_rate': 4.5288135593220345e-05, 'epoch': 0.33}


 11%|█         | 3290/30000 [17:21<2:19:51,  3.18it/s]

{'loss': 0.3052, 'grad_norm': 0.6913958191871643, 'learning_rate': 4.5271186440677974e-05, 'epoch': 0.33}


 11%|█         | 3300/30000 [17:24<2:20:07,  3.18it/s]

{'loss': 0.2997, 'grad_norm': 0.7488762140274048, 'learning_rate': 4.5254237288135596e-05, 'epoch': 0.33}


 11%|█         | 3310/30000 [17:27<2:19:55,  3.18it/s]

{'loss': 0.3083, 'grad_norm': 0.8322533965110779, 'learning_rate': 4.523728813559322e-05, 'epoch': 0.33}


 11%|█         | 3320/30000 [17:30<2:19:35,  3.19it/s]

{'loss': 0.2966, 'grad_norm': 0.6794643402099609, 'learning_rate': 4.522033898305085e-05, 'epoch': 0.33}


 11%|█         | 3330/30000 [17:33<2:19:44,  3.18it/s]

{'loss': 0.3018, 'grad_norm': 0.730298638343811, 'learning_rate': 4.520338983050848e-05, 'epoch': 0.33}


 11%|█         | 3340/30000 [17:37<2:19:46,  3.18it/s]

{'loss': 0.3059, 'grad_norm': 0.6124799847602844, 'learning_rate': 4.51864406779661e-05, 'epoch': 0.33}


 11%|█         | 3350/30000 [17:40<2:19:30,  3.18it/s]

{'loss': 0.2859, 'grad_norm': 0.742633044719696, 'learning_rate': 4.516949152542373e-05, 'epoch': 0.34}


 11%|█         | 3360/30000 [17:43<2:19:55,  3.17it/s]

{'loss': 0.2895, 'grad_norm': 1.208903193473816, 'learning_rate': 4.515254237288136e-05, 'epoch': 0.34}


 11%|█         | 3370/30000 [17:46<2:19:29,  3.18it/s]

{'loss': 0.2915, 'grad_norm': 0.8377069234848022, 'learning_rate': 4.513559322033898e-05, 'epoch': 0.34}


 11%|█▏        | 3380/30000 [17:49<2:19:29,  3.18it/s]

{'loss': 0.3099, 'grad_norm': 1.1568325757980347, 'learning_rate': 4.511864406779661e-05, 'epoch': 0.34}


 11%|█▏        | 3390/30000 [17:52<2:19:31,  3.18it/s]

{'loss': 0.293, 'grad_norm': 0.6657543182373047, 'learning_rate': 4.510169491525424e-05, 'epoch': 0.34}


 11%|█▏        | 3400/30000 [17:56<2:19:24,  3.18it/s]

{'loss': 0.2968, 'grad_norm': 0.8737444281578064, 'learning_rate': 4.508474576271187e-05, 'epoch': 0.34}


 11%|█▏        | 3410/30000 [17:59<2:19:26,  3.18it/s]

{'loss': 0.3089, 'grad_norm': 0.8269426822662354, 'learning_rate': 4.506779661016949e-05, 'epoch': 0.34}


 11%|█▏        | 3420/30000 [18:02<2:19:15,  3.18it/s]

{'loss': 0.3072, 'grad_norm': 0.7373241782188416, 'learning_rate': 4.505084745762712e-05, 'epoch': 0.34}


 11%|█▏        | 3430/30000 [18:05<2:19:10,  3.18it/s]

{'loss': 0.3096, 'grad_norm': 0.7728370428085327, 'learning_rate': 4.503389830508475e-05, 'epoch': 0.34}


 11%|█▏        | 3440/30000 [18:08<2:19:07,  3.18it/s]

{'loss': 0.2927, 'grad_norm': 0.6753607392311096, 'learning_rate': 4.5016949152542373e-05, 'epoch': 0.34}


 12%|█▏        | 3450/30000 [18:11<2:19:05,  3.18it/s]

{'loss': 0.2967, 'grad_norm': 0.6159091591835022, 'learning_rate': 4.5e-05, 'epoch': 0.34}


 12%|█▏        | 3460/30000 [18:14<2:19:00,  3.18it/s]

{'loss': 0.2946, 'grad_norm': 0.6400763988494873, 'learning_rate': 4.498305084745763e-05, 'epoch': 0.35}


 12%|█▏        | 3470/30000 [18:18<2:18:46,  3.19it/s]

{'loss': 0.3186, 'grad_norm': 0.8297039866447449, 'learning_rate': 4.4966101694915255e-05, 'epoch': 0.35}


 12%|█▏        | 3480/30000 [18:21<2:18:55,  3.18it/s]

{'loss': 0.2971, 'grad_norm': 0.9520801901817322, 'learning_rate': 4.4949152542372884e-05, 'epoch': 0.35}


 12%|█▏        | 3490/30000 [18:24<2:18:45,  3.18it/s]

{'loss': 0.3055, 'grad_norm': 0.662495493888855, 'learning_rate': 4.4932203389830513e-05, 'epoch': 0.35}


 12%|█▏        | 3500/30000 [18:27<2:19:04,  3.18it/s]

{'loss': 0.2844, 'grad_norm': 0.6864296197891235, 'learning_rate': 4.491525423728814e-05, 'epoch': 0.35}


 12%|█▏        | 3510/30000 [18:30<2:18:54,  3.18it/s]

{'loss': 0.3021, 'grad_norm': 0.720084011554718, 'learning_rate': 4.4898305084745765e-05, 'epoch': 0.35}


 12%|█▏        | 3520/30000 [18:33<2:18:41,  3.18it/s]

{'loss': 0.3039, 'grad_norm': 0.7263253927230835, 'learning_rate': 4.488135593220339e-05, 'epoch': 0.35}


 12%|█▏        | 3530/30000 [18:36<2:18:30,  3.18it/s]

{'loss': 0.3091, 'grad_norm': 0.8455332517623901, 'learning_rate': 4.486440677966102e-05, 'epoch': 0.35}


 12%|█▏        | 3540/30000 [18:40<2:18:45,  3.18it/s]

{'loss': 0.2989, 'grad_norm': 0.7190749049186707, 'learning_rate': 4.484745762711865e-05, 'epoch': 0.35}


 12%|█▏        | 3550/30000 [18:43<2:18:35,  3.18it/s]

{'loss': 0.3125, 'grad_norm': 1.120638370513916, 'learning_rate': 4.483050847457627e-05, 'epoch': 0.35}


 12%|█▏        | 3560/30000 [18:46<2:18:33,  3.18it/s]

{'loss': 0.319, 'grad_norm': 0.6178040504455566, 'learning_rate': 4.48135593220339e-05, 'epoch': 0.36}


 12%|█▏        | 3570/30000 [18:49<2:18:20,  3.18it/s]

{'loss': 0.2867, 'grad_norm': 0.6916740536689758, 'learning_rate': 4.479661016949153e-05, 'epoch': 0.36}


 12%|█▏        | 3580/30000 [18:52<2:18:34,  3.18it/s]

{'loss': 0.3024, 'grad_norm': 0.6078788638114929, 'learning_rate': 4.477966101694915e-05, 'epoch': 0.36}


 12%|█▏        | 3590/30000 [18:55<2:18:16,  3.18it/s]

{'loss': 0.2756, 'grad_norm': 0.6399679780006409, 'learning_rate': 4.476271186440678e-05, 'epoch': 0.36}


 12%|█▏        | 3600/30000 [18:58<2:18:13,  3.18it/s]

{'loss': 0.2909, 'grad_norm': 0.8625954985618591, 'learning_rate': 4.474576271186441e-05, 'epoch': 0.36}


 12%|█▏        | 3610/30000 [19:02<2:18:11,  3.18it/s]

{'loss': 0.2903, 'grad_norm': 0.8666464686393738, 'learning_rate': 4.472881355932204e-05, 'epoch': 0.36}


 12%|█▏        | 3620/30000 [19:05<2:18:18,  3.18it/s]

{'loss': 0.2964, 'grad_norm': 0.672201931476593, 'learning_rate': 4.471186440677966e-05, 'epoch': 0.36}


 12%|█▏        | 3630/30000 [19:08<2:18:12,  3.18it/s]

{'loss': 0.3008, 'grad_norm': 0.8473816514015198, 'learning_rate': 4.469491525423729e-05, 'epoch': 0.36}


 12%|█▏        | 3640/30000 [19:11<2:18:10,  3.18it/s]

{'loss': 0.3015, 'grad_norm': 0.7352260947227478, 'learning_rate': 4.467796610169492e-05, 'epoch': 0.36}


 12%|█▏        | 3650/30000 [19:14<2:18:06,  3.18it/s]

{'loss': 0.3082, 'grad_norm': 0.8318928480148315, 'learning_rate': 4.466101694915254e-05, 'epoch': 0.36}


 12%|█▏        | 3660/30000 [19:17<2:18:03,  3.18it/s]

{'loss': 0.2936, 'grad_norm': 0.8086314797401428, 'learning_rate': 4.464406779661017e-05, 'epoch': 0.37}


 12%|█▏        | 3670/30000 [19:20<2:18:11,  3.18it/s]

{'loss': 0.2975, 'grad_norm': 0.8824827671051025, 'learning_rate': 4.46271186440678e-05, 'epoch': 0.37}


 12%|█▏        | 3680/30000 [19:24<2:17:47,  3.18it/s]

{'loss': 0.2798, 'grad_norm': 0.58163982629776, 'learning_rate': 4.461016949152543e-05, 'epoch': 0.37}


 12%|█▏        | 3690/30000 [19:27<2:17:47,  3.18it/s]

{'loss': 0.2937, 'grad_norm': 0.7239499688148499, 'learning_rate': 4.459322033898305e-05, 'epoch': 0.37}


 12%|█▏        | 3700/30000 [19:30<2:17:48,  3.18it/s]

{'loss': 0.2931, 'grad_norm': 0.647504448890686, 'learning_rate': 4.457627118644068e-05, 'epoch': 0.37}


 12%|█▏        | 3710/30000 [19:33<2:17:54,  3.18it/s]

{'loss': 0.2919, 'grad_norm': 0.6202300190925598, 'learning_rate': 4.455932203389831e-05, 'epoch': 0.37}


 12%|█▏        | 3720/30000 [19:36<2:17:38,  3.18it/s]

{'loss': 0.2843, 'grad_norm': 0.605057418346405, 'learning_rate': 4.4542372881355934e-05, 'epoch': 0.37}


 12%|█▏        | 3730/30000 [19:39<2:17:45,  3.18it/s]

{'loss': 0.2842, 'grad_norm': 0.5957178473472595, 'learning_rate': 4.452542372881356e-05, 'epoch': 0.37}


 12%|█▏        | 3740/30000 [19:43<2:19:02,  3.15it/s]

{'loss': 0.2922, 'grad_norm': 0.6866041421890259, 'learning_rate': 4.4508474576271186e-05, 'epoch': 0.37}


 12%|█▎        | 3750/30000 [19:46<2:17:44,  3.18it/s]

{'loss': 0.3046, 'grad_norm': 0.9705011248588562, 'learning_rate': 4.4491525423728816e-05, 'epoch': 0.38}


 13%|█▎        | 3760/30000 [19:49<2:17:34,  3.18it/s]

{'loss': 0.2825, 'grad_norm': 0.7396829724311829, 'learning_rate': 4.447457627118644e-05, 'epoch': 0.38}


 13%|█▎        | 3770/30000 [19:52<2:18:46,  3.15it/s]

{'loss': 0.3004, 'grad_norm': 0.7666935920715332, 'learning_rate': 4.445762711864407e-05, 'epoch': 0.38}


 13%|█▎        | 3780/30000 [19:55<2:17:36,  3.18it/s]

{'loss': 0.2965, 'grad_norm': 0.6285961270332336, 'learning_rate': 4.44406779661017e-05, 'epoch': 0.38}


 13%|█▎        | 3790/30000 [19:58<2:17:24,  3.18it/s]

{'loss': 0.3007, 'grad_norm': 0.6948471069335938, 'learning_rate': 4.4423728813559326e-05, 'epoch': 0.38}


 13%|█▎        | 3800/30000 [20:01<2:17:26,  3.18it/s]

{'loss': 0.2929, 'grad_norm': 0.8306883573532104, 'learning_rate': 4.440677966101695e-05, 'epoch': 0.38}


 13%|█▎        | 3810/30000 [20:05<2:17:26,  3.18it/s]

{'loss': 0.3087, 'grad_norm': 0.5944392085075378, 'learning_rate': 4.438983050847458e-05, 'epoch': 0.38}


 13%|█▎        | 3820/30000 [20:08<2:17:08,  3.18it/s]

{'loss': 0.2989, 'grad_norm': 1.0077245235443115, 'learning_rate': 4.437288135593221e-05, 'epoch': 0.38}


 13%|█▎        | 3830/30000 [20:11<2:17:28,  3.17it/s]

{'loss': 0.2967, 'grad_norm': 0.7594165802001953, 'learning_rate': 4.435593220338983e-05, 'epoch': 0.38}


 13%|█▎        | 3840/30000 [20:14<2:17:05,  3.18it/s]

{'loss': 0.3007, 'grad_norm': 0.6940098404884338, 'learning_rate': 4.433898305084746e-05, 'epoch': 0.38}


 13%|█▎        | 3850/30000 [20:17<2:16:57,  3.18it/s]

{'loss': 0.286, 'grad_norm': 0.6032402515411377, 'learning_rate': 4.432203389830509e-05, 'epoch': 0.39}


 13%|█▎        | 3860/30000 [20:20<2:17:09,  3.18it/s]

{'loss': 0.3065, 'grad_norm': 0.7396628260612488, 'learning_rate': 4.430508474576272e-05, 'epoch': 0.39}


 13%|█▎        | 3870/30000 [20:24<2:16:55,  3.18it/s]

{'loss': 0.3002, 'grad_norm': 0.8710162043571472, 'learning_rate': 4.428813559322034e-05, 'epoch': 0.39}


 13%|█▎        | 3880/30000 [20:27<2:16:57,  3.18it/s]

{'loss': 0.3058, 'grad_norm': 0.854247510433197, 'learning_rate': 4.427118644067797e-05, 'epoch': 0.39}


 13%|█▎        | 3890/30000 [20:30<2:16:55,  3.18it/s]

{'loss': 0.2916, 'grad_norm': 0.669819176197052, 'learning_rate': 4.42542372881356e-05, 'epoch': 0.39}


 13%|█▎        | 3900/30000 [20:33<2:16:32,  3.19it/s]

{'loss': 0.2975, 'grad_norm': 0.5871636271476746, 'learning_rate': 4.423728813559322e-05, 'epoch': 0.39}


 13%|█▎        | 3910/30000 [20:36<2:16:45,  3.18it/s]

{'loss': 0.2973, 'grad_norm': 0.6900805830955505, 'learning_rate': 4.422033898305085e-05, 'epoch': 0.39}


 13%|█▎        | 3920/30000 [20:39<2:16:38,  3.18it/s]

{'loss': 0.2888, 'grad_norm': 1.0232573747634888, 'learning_rate': 4.420338983050848e-05, 'epoch': 0.39}


 13%|█▎        | 3930/30000 [20:42<2:16:37,  3.18it/s]

{'loss': 0.2775, 'grad_norm': 0.692487359046936, 'learning_rate': 4.41864406779661e-05, 'epoch': 0.39}


 13%|█▎        | 3940/30000 [20:46<2:16:32,  3.18it/s]

{'loss': 0.2929, 'grad_norm': 0.678026020526886, 'learning_rate': 4.4169491525423726e-05, 'epoch': 0.39}


 13%|█▎        | 3950/30000 [20:49<2:16:28,  3.18it/s]

{'loss': 0.2771, 'grad_norm': 0.6358934640884399, 'learning_rate': 4.4152542372881355e-05, 'epoch': 0.4}


 13%|█▎        | 3960/30000 [20:52<2:16:27,  3.18it/s]

{'loss': 0.281, 'grad_norm': 0.7119143009185791, 'learning_rate': 4.4135593220338984e-05, 'epoch': 0.4}


 13%|█▎        | 3970/30000 [20:55<2:16:23,  3.18it/s]

{'loss': 0.285, 'grad_norm': 0.6681285500526428, 'learning_rate': 4.4118644067796614e-05, 'epoch': 0.4}


 13%|█▎        | 3980/30000 [20:58<2:16:27,  3.18it/s]

{'loss': 0.2897, 'grad_norm': 0.7258105874061584, 'learning_rate': 4.4101694915254236e-05, 'epoch': 0.4}


 13%|█▎        | 3990/30000 [21:01<2:16:22,  3.18it/s]

{'loss': 0.2684, 'grad_norm': 0.7384130358695984, 'learning_rate': 4.4084745762711866e-05, 'epoch': 0.4}


 13%|█▎        | 4000/30000 [21:04<2:16:18,  3.18it/s]

{'loss': 0.3119, 'grad_norm': 0.6597203016281128, 'learning_rate': 4.4067796610169495e-05, 'epoch': 0.4}


 13%|█▎        | 4010/30000 [21:08<2:16:28,  3.17it/s]

{'loss': 0.2917, 'grad_norm': 0.6696610450744629, 'learning_rate': 4.405084745762712e-05, 'epoch': 0.4}


 13%|█▎        | 4020/30000 [21:11<2:16:02,  3.18it/s]

{'loss': 0.3131, 'grad_norm': 1.0624117851257324, 'learning_rate': 4.403389830508475e-05, 'epoch': 0.4}


 13%|█▎        | 4030/30000 [21:14<2:16:05,  3.18it/s]

{'loss': 0.2859, 'grad_norm': 0.6979932188987732, 'learning_rate': 4.4016949152542376e-05, 'epoch': 0.4}


 13%|█▎        | 4040/30000 [21:17<2:17:04,  3.16it/s]

{'loss': 0.3004, 'grad_norm': 0.9050206542015076, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.4}


 14%|█▎        | 4050/30000 [21:20<2:16:00,  3.18it/s]

{'loss': 0.2981, 'grad_norm': 0.630763828754425, 'learning_rate': 4.398305084745763e-05, 'epoch': 0.41}


 14%|█▎        | 4060/30000 [21:23<2:15:53,  3.18it/s]

{'loss': 0.283, 'grad_norm': 0.6298004984855652, 'learning_rate': 4.396610169491526e-05, 'epoch': 0.41}


 14%|█▎        | 4070/30000 [21:26<2:15:49,  3.18it/s]

{'loss': 0.2961, 'grad_norm': 0.7767544388771057, 'learning_rate': 4.394915254237289e-05, 'epoch': 0.41}


 14%|█▎        | 4080/30000 [21:30<2:15:50,  3.18it/s]

{'loss': 0.2981, 'grad_norm': 0.6339232921600342, 'learning_rate': 4.393220338983051e-05, 'epoch': 0.41}


 14%|█▎        | 4090/30000 [21:33<2:15:48,  3.18it/s]

{'loss': 0.3019, 'grad_norm': 0.5822775959968567, 'learning_rate': 4.391525423728814e-05, 'epoch': 0.41}


 14%|█▎        | 4100/30000 [21:36<2:15:50,  3.18it/s]

{'loss': 0.311, 'grad_norm': 0.7194923758506775, 'learning_rate': 4.389830508474577e-05, 'epoch': 0.41}


 14%|█▎        | 4110/30000 [21:39<2:15:50,  3.18it/s]

{'loss': 0.3026, 'grad_norm': 0.6304830312728882, 'learning_rate': 4.388135593220339e-05, 'epoch': 0.41}


 14%|█▎        | 4120/30000 [21:42<2:15:32,  3.18it/s]

{'loss': 0.2792, 'grad_norm': 0.5512552857398987, 'learning_rate': 4.386440677966102e-05, 'epoch': 0.41}


 14%|█▍        | 4130/30000 [21:45<2:15:39,  3.18it/s]

{'loss': 0.3018, 'grad_norm': 0.7920676469802856, 'learning_rate': 4.384745762711865e-05, 'epoch': 0.41}


 14%|█▍        | 4140/30000 [21:48<2:15:24,  3.18it/s]

{'loss': 0.3133, 'grad_norm': 0.7565373182296753, 'learning_rate': 4.383050847457627e-05, 'epoch': 0.41}


 14%|█▍        | 4150/30000 [21:52<2:15:48,  3.17it/s]

{'loss': 0.281, 'grad_norm': 0.6444233655929565, 'learning_rate': 4.38135593220339e-05, 'epoch': 0.41}


 14%|█▍        | 4160/30000 [21:55<2:15:18,  3.18it/s]

{'loss': 0.3016, 'grad_norm': 0.6607990860939026, 'learning_rate': 4.3796610169491524e-05, 'epoch': 0.42}


 14%|█▍        | 4170/30000 [21:58<2:15:28,  3.18it/s]

{'loss': 0.2691, 'grad_norm': 0.6417115330696106, 'learning_rate': 4.377966101694915e-05, 'epoch': 0.42}


 14%|█▍        | 4180/30000 [22:01<2:15:27,  3.18it/s]

{'loss': 0.2929, 'grad_norm': 1.1183278560638428, 'learning_rate': 4.376271186440678e-05, 'epoch': 0.42}


 14%|█▍        | 4190/30000 [22:04<2:15:26,  3.18it/s]

{'loss': 0.3174, 'grad_norm': 0.6461109519004822, 'learning_rate': 4.3745762711864405e-05, 'epoch': 0.42}


 14%|█▍        | 4200/30000 [22:07<2:15:16,  3.18it/s]

{'loss': 0.2876, 'grad_norm': 0.664521336555481, 'learning_rate': 4.3728813559322035e-05, 'epoch': 0.42}


 14%|█▍        | 4210/30000 [22:11<2:15:05,  3.18it/s]

{'loss': 0.2959, 'grad_norm': 0.8248205184936523, 'learning_rate': 4.3711864406779664e-05, 'epoch': 0.42}


 14%|█▍        | 4220/30000 [22:14<2:15:13,  3.18it/s]

{'loss': 0.2892, 'grad_norm': 0.6747873425483704, 'learning_rate': 4.3694915254237286e-05, 'epoch': 0.42}


 14%|█▍        | 4230/30000 [22:17<2:15:10,  3.18it/s]

{'loss': 0.2757, 'grad_norm': 0.6455169320106506, 'learning_rate': 4.3677966101694916e-05, 'epoch': 0.42}


 14%|█▍        | 4240/30000 [22:20<2:14:58,  3.18it/s]

{'loss': 0.2905, 'grad_norm': 0.5854437947273254, 'learning_rate': 4.3661016949152545e-05, 'epoch': 0.42}


 14%|█▍        | 4250/30000 [22:23<2:14:59,  3.18it/s]

{'loss': 0.274, 'grad_norm': 0.6828842163085938, 'learning_rate': 4.3644067796610175e-05, 'epoch': 0.42}


 14%|█▍        | 4260/30000 [22:26<2:15:05,  3.18it/s]

{'loss': 0.2853, 'grad_norm': 0.8037856817245483, 'learning_rate': 4.36271186440678e-05, 'epoch': 0.43}


 14%|█▍        | 4270/30000 [22:29<2:14:43,  3.18it/s]

{'loss': 0.2713, 'grad_norm': 0.680202066898346, 'learning_rate': 4.3610169491525426e-05, 'epoch': 0.43}


 14%|█▍        | 4280/30000 [22:33<2:14:45,  3.18it/s]

{'loss': 0.2823, 'grad_norm': 0.6765546798706055, 'learning_rate': 4.3593220338983056e-05, 'epoch': 0.43}


 14%|█▍        | 4290/30000 [22:36<2:14:50,  3.18it/s]

{'loss': 0.295, 'grad_norm': 0.8302549123764038, 'learning_rate': 4.357627118644068e-05, 'epoch': 0.43}


 14%|█▍        | 4300/30000 [22:39<2:14:49,  3.18it/s]

{'loss': 0.2889, 'grad_norm': 0.5445040464401245, 'learning_rate': 4.355932203389831e-05, 'epoch': 0.43}


 14%|█▍        | 4310/30000 [22:42<2:14:30,  3.18it/s]

{'loss': 0.2956, 'grad_norm': 0.7494840621948242, 'learning_rate': 4.354237288135594e-05, 'epoch': 0.43}


 14%|█▍        | 4320/30000 [22:45<2:14:36,  3.18it/s]

{'loss': 0.3042, 'grad_norm': 0.6187521815299988, 'learning_rate': 4.3525423728813566e-05, 'epoch': 0.43}


 14%|█▍        | 4330/30000 [22:48<2:14:36,  3.18it/s]

{'loss': 0.2985, 'grad_norm': 0.6135462522506714, 'learning_rate': 4.350847457627119e-05, 'epoch': 0.43}


 14%|█▍        | 4340/30000 [22:51<2:14:28,  3.18it/s]

{'loss': 0.2822, 'grad_norm': 0.6313056349754333, 'learning_rate': 4.349152542372882e-05, 'epoch': 0.43}


 14%|█▍        | 4350/30000 [22:55<2:14:35,  3.18it/s]

{'loss': 0.2935, 'grad_norm': 0.7562248706817627, 'learning_rate': 4.347457627118644e-05, 'epoch': 0.43}


 15%|█▍        | 4360/30000 [22:58<2:14:26,  3.18it/s]

{'loss': 0.2861, 'grad_norm': 1.6644906997680664, 'learning_rate': 4.345762711864407e-05, 'epoch': 0.44}


 15%|█▍        | 4370/30000 [23:01<2:14:28,  3.18it/s]

{'loss': 0.3057, 'grad_norm': 0.6986786723136902, 'learning_rate': 4.344067796610169e-05, 'epoch': 0.44}


 15%|█▍        | 4380/30000 [23:04<2:14:37,  3.17it/s]

{'loss': 0.2983, 'grad_norm': 0.7721360921859741, 'learning_rate': 4.342372881355932e-05, 'epoch': 0.44}


 15%|█▍        | 4390/30000 [23:07<2:14:17,  3.18it/s]

{'loss': 0.3098, 'grad_norm': 0.7757081985473633, 'learning_rate': 4.340677966101695e-05, 'epoch': 0.44}


 15%|█▍        | 4400/30000 [23:10<2:14:05,  3.18it/s]

{'loss': 0.3089, 'grad_norm': 0.6181268692016602, 'learning_rate': 4.3389830508474574e-05, 'epoch': 0.44}


 15%|█▍        | 4410/30000 [23:13<2:14:05,  3.18it/s]

{'loss': 0.2985, 'grad_norm': 0.6550773978233337, 'learning_rate': 4.3372881355932203e-05, 'epoch': 0.44}


 15%|█▍        | 4420/30000 [23:17<2:14:04,  3.18it/s]

{'loss': 0.2949, 'grad_norm': 0.7800536155700684, 'learning_rate': 4.335593220338983e-05, 'epoch': 0.44}


 15%|█▍        | 4430/30000 [23:20<2:14:05,  3.18it/s]

{'loss': 0.2963, 'grad_norm': 0.6315591335296631, 'learning_rate': 4.333898305084746e-05, 'epoch': 0.44}


 15%|█▍        | 4440/30000 [23:23<2:15:33,  3.14it/s]

{'loss': 0.2925, 'grad_norm': 0.5890361666679382, 'learning_rate': 4.3322033898305085e-05, 'epoch': 0.44}


 15%|█▍        | 4450/30000 [23:26<2:13:56,  3.18it/s]

{'loss': 0.2912, 'grad_norm': 0.5768073201179504, 'learning_rate': 4.3305084745762714e-05, 'epoch': 0.45}


 15%|█▍        | 4460/30000 [23:29<2:14:01,  3.18it/s]

{'loss': 0.3009, 'grad_norm': 0.6370007395744324, 'learning_rate': 4.3288135593220343e-05, 'epoch': 0.45}


 15%|█▍        | 4470/30000 [23:32<2:13:57,  3.18it/s]

{'loss': 0.2888, 'grad_norm': 0.6716703772544861, 'learning_rate': 4.3271186440677966e-05, 'epoch': 0.45}


 15%|█▍        | 4480/30000 [23:36<2:13:38,  3.18it/s]

{'loss': 0.2846, 'grad_norm': 0.6017683148384094, 'learning_rate': 4.3254237288135595e-05, 'epoch': 0.45}


 15%|█▍        | 4490/30000 [23:39<2:13:56,  3.17it/s]

{'loss': 0.3084, 'grad_norm': 0.7607077956199646, 'learning_rate': 4.3237288135593225e-05, 'epoch': 0.45}


 15%|█▌        | 4500/30000 [23:42<2:13:38,  3.18it/s]

{'loss': 0.2735, 'grad_norm': 0.6620991230010986, 'learning_rate': 4.3220338983050854e-05, 'epoch': 0.45}


 15%|█▌        | 4510/30000 [23:45<2:13:42,  3.18it/s]

{'loss': 0.2777, 'grad_norm': 0.6640531420707703, 'learning_rate': 4.3203389830508477e-05, 'epoch': 0.45}


 15%|█▌        | 4520/30000 [23:48<2:13:43,  3.18it/s]

{'loss': 0.2991, 'grad_norm': 0.8698699474334717, 'learning_rate': 4.3186440677966106e-05, 'epoch': 0.45}


 15%|█▌        | 4530/30000 [23:51<2:13:13,  3.19it/s]

{'loss': 0.2976, 'grad_norm': 0.6387079954147339, 'learning_rate': 4.3169491525423735e-05, 'epoch': 0.45}


 15%|█▌        | 4540/30000 [23:54<2:13:20,  3.18it/s]

{'loss': 0.2799, 'grad_norm': 0.6842662692070007, 'learning_rate': 4.315254237288136e-05, 'epoch': 0.45}


 15%|█▌        | 4550/30000 [23:58<2:13:16,  3.18it/s]

{'loss': 0.2723, 'grad_norm': 0.5977845788002014, 'learning_rate': 4.313559322033899e-05, 'epoch': 0.46}


 15%|█▌        | 4560/30000 [24:01<2:13:26,  3.18it/s]

{'loss': 0.2844, 'grad_norm': 0.609002411365509, 'learning_rate': 4.311864406779661e-05, 'epoch': 0.46}


 15%|█▌        | 4570/30000 [24:04<2:14:47,  3.14it/s]

{'loss': 0.2739, 'grad_norm': 0.8653842210769653, 'learning_rate': 4.310169491525424e-05, 'epoch': 0.46}


 15%|█▌        | 4580/30000 [24:07<2:13:18,  3.18it/s]

{'loss': 0.2998, 'grad_norm': 0.7320913672447205, 'learning_rate': 4.308474576271186e-05, 'epoch': 0.46}


 15%|█▌        | 4590/30000 [24:10<2:13:24,  3.17it/s]

{'loss': 0.2769, 'grad_norm': 0.5556861162185669, 'learning_rate': 4.306779661016949e-05, 'epoch': 0.46}


 15%|█▌        | 4600/30000 [24:13<2:14:59,  3.14it/s]

{'loss': 0.2787, 'grad_norm': 0.6557185649871826, 'learning_rate': 4.305084745762712e-05, 'epoch': 0.46}


 15%|█▌        | 4610/30000 [24:17<2:13:48,  3.16it/s]

{'loss': 0.2813, 'grad_norm': 0.5316507816314697, 'learning_rate': 4.303389830508475e-05, 'epoch': 0.46}


 15%|█▌        | 4620/30000 [24:20<2:13:05,  3.18it/s]

{'loss': 0.3001, 'grad_norm': 0.6814776659011841, 'learning_rate': 4.301694915254237e-05, 'epoch': 0.46}


 15%|█▌        | 4630/30000 [24:23<2:13:40,  3.16it/s]

{'loss': 0.2919, 'grad_norm': 0.6825424432754517, 'learning_rate': 4.3e-05, 'epoch': 0.46}


 15%|█▌        | 4640/30000 [24:26<2:13:01,  3.18it/s]

{'loss': 0.2865, 'grad_norm': 0.4912846088409424, 'learning_rate': 4.298305084745763e-05, 'epoch': 0.46}


 16%|█▌        | 4650/30000 [24:29<2:14:12,  3.15it/s]

{'loss': 0.286, 'grad_norm': 0.6561529040336609, 'learning_rate': 4.2966101694915254e-05, 'epoch': 0.47}


 16%|█▌        | 4660/30000 [24:32<2:14:45,  3.13it/s]

{'loss': 0.2747, 'grad_norm': 0.7423417568206787, 'learning_rate': 4.294915254237288e-05, 'epoch': 0.47}


 16%|█▌        | 4670/30000 [24:36<2:12:16,  3.19it/s]

{'loss': 0.319, 'grad_norm': 0.619339108467102, 'learning_rate': 4.293220338983051e-05, 'epoch': 0.47}


 16%|█▌        | 4680/30000 [24:39<2:12:12,  3.19it/s]

{'loss': 0.2652, 'grad_norm': 0.5868847966194153, 'learning_rate': 4.291525423728814e-05, 'epoch': 0.47}


 16%|█▌        | 4690/30000 [24:42<2:11:58,  3.20it/s]

{'loss': 0.286, 'grad_norm': 0.6615862846374512, 'learning_rate': 4.2898305084745764e-05, 'epoch': 0.47}


 16%|█▌        | 4700/30000 [24:45<2:13:59,  3.15it/s]

{'loss': 0.2849, 'grad_norm': 0.8710559606552124, 'learning_rate': 4.2881355932203394e-05, 'epoch': 0.47}


 16%|█▌        | 4710/30000 [24:48<2:13:31,  3.16it/s]

{'loss': 0.2886, 'grad_norm': 1.0081723928451538, 'learning_rate': 4.286440677966102e-05, 'epoch': 0.47}


 16%|█▌        | 4720/30000 [24:51<2:13:19,  3.16it/s]

{'loss': 0.2938, 'grad_norm': 0.6442426443099976, 'learning_rate': 4.2847457627118645e-05, 'epoch': 0.47}


 16%|█▌        | 4730/30000 [24:54<2:13:24,  3.16it/s]

{'loss': 0.28, 'grad_norm': 0.82573002576828, 'learning_rate': 4.2830508474576275e-05, 'epoch': 0.47}


 16%|█▌        | 4740/30000 [24:58<2:12:32,  3.18it/s]

{'loss': 0.2735, 'grad_norm': 0.8363503217697144, 'learning_rate': 4.2813559322033904e-05, 'epoch': 0.47}


 16%|█▌        | 4750/30000 [25:01<2:11:43,  3.19it/s]

{'loss': 0.2837, 'grad_norm': 0.8310210704803467, 'learning_rate': 4.279661016949153e-05, 'epoch': 0.47}


 16%|█▌        | 4760/30000 [25:04<2:11:29,  3.20it/s]

{'loss': 0.2977, 'grad_norm': 0.8956178426742554, 'learning_rate': 4.277966101694915e-05, 'epoch': 0.48}


 16%|█▌        | 4770/30000 [25:07<2:11:48,  3.19it/s]

{'loss': 0.3081, 'grad_norm': 0.5845305323600769, 'learning_rate': 4.276271186440678e-05, 'epoch': 0.48}


 16%|█▌        | 4780/30000 [25:10<2:11:25,  3.20it/s]

{'loss': 0.2847, 'grad_norm': 1.8642065525054932, 'learning_rate': 4.274576271186441e-05, 'epoch': 0.48}


 16%|█▌        | 4790/30000 [25:13<2:11:13,  3.20it/s]

{'loss': 0.2892, 'grad_norm': 0.5979171991348267, 'learning_rate': 4.272881355932204e-05, 'epoch': 0.48}


 16%|█▌        | 4800/30000 [25:16<2:11:09,  3.20it/s]

{'loss': 0.2642, 'grad_norm': 0.6995040774345398, 'learning_rate': 4.271186440677966e-05, 'epoch': 0.48}


 16%|█▌        | 4810/30000 [25:20<2:11:09,  3.20it/s]

{'loss': 0.2864, 'grad_norm': 0.614086925983429, 'learning_rate': 4.269491525423729e-05, 'epoch': 0.48}


 16%|█▌        | 4820/30000 [25:23<2:11:24,  3.19it/s]

{'loss': 0.2884, 'grad_norm': 0.8237701058387756, 'learning_rate': 4.267796610169492e-05, 'epoch': 0.48}


 16%|█▌        | 4830/30000 [25:26<2:10:56,  3.20it/s]

{'loss': 0.2897, 'grad_norm': 0.7463858723640442, 'learning_rate': 4.266101694915254e-05, 'epoch': 0.48}


 16%|█▌        | 4840/30000 [25:29<2:10:54,  3.20it/s]

{'loss': 0.272, 'grad_norm': 0.5537822246551514, 'learning_rate': 4.264406779661017e-05, 'epoch': 0.48}


 16%|█▌        | 4850/30000 [25:32<2:10:52,  3.20it/s]

{'loss': 0.2931, 'grad_norm': 0.6183093786239624, 'learning_rate': 4.26271186440678e-05, 'epoch': 0.48}


 16%|█▌        | 4860/30000 [25:35<2:10:54,  3.20it/s]

{'loss': 0.2926, 'grad_norm': 0.6729761362075806, 'learning_rate': 4.261016949152542e-05, 'epoch': 0.49}


 16%|█▌        | 4870/30000 [25:38<2:10:58,  3.20it/s]

{'loss': 0.2879, 'grad_norm': 0.8168068528175354, 'learning_rate': 4.259322033898305e-05, 'epoch': 0.49}


 16%|█▋        | 4880/30000 [25:41<2:10:43,  3.20it/s]

{'loss': 0.297, 'grad_norm': 0.617422342300415, 'learning_rate': 4.257627118644068e-05, 'epoch': 0.49}


 16%|█▋        | 4890/30000 [25:45<2:10:43,  3.20it/s]

{'loss': 0.296, 'grad_norm': 0.6326867938041687, 'learning_rate': 4.255932203389831e-05, 'epoch': 0.49}


 16%|█▋        | 4900/30000 [25:48<2:10:39,  3.20it/s]

{'loss': 0.2928, 'grad_norm': 0.7188516855239868, 'learning_rate': 4.254237288135593e-05, 'epoch': 0.49}


 16%|█▋        | 4910/30000 [25:51<2:10:45,  3.20it/s]

{'loss': 0.2821, 'grad_norm': 0.8448076248168945, 'learning_rate': 4.252542372881356e-05, 'epoch': 0.49}


 16%|█▋        | 4920/30000 [25:54<2:10:32,  3.20it/s]

{'loss': 0.3015, 'grad_norm': 0.639301061630249, 'learning_rate': 4.250847457627119e-05, 'epoch': 0.49}


 16%|█▋        | 4930/30000 [25:57<2:10:50,  3.19it/s]

{'loss': 0.2908, 'grad_norm': 0.9073476791381836, 'learning_rate': 4.2491525423728814e-05, 'epoch': 0.49}


 16%|█▋        | 4940/30000 [26:00<2:10:31,  3.20it/s]

{'loss': 0.2695, 'grad_norm': 0.7001325488090515, 'learning_rate': 4.2474576271186444e-05, 'epoch': 0.49}


 16%|█▋        | 4950/30000 [26:03<2:11:25,  3.18it/s]

{'loss': 0.2852, 'grad_norm': 0.5298324227333069, 'learning_rate': 4.245762711864407e-05, 'epoch': 0.49}


 17%|█▋        | 4960/30000 [26:26<17:40:02,  2.54s/it]

{'loss': 0.2968, 'grad_norm': 0.6209350824356079, 'learning_rate': 4.24406779661017e-05, 'epoch': 0.5}


 17%|█▋        | 4970/30000 [26:52<17:48:00,  2.56s/it]

{'loss': 0.2988, 'grad_norm': 0.5505105257034302, 'learning_rate': 4.242372881355932e-05, 'epoch': 0.5}


 17%|█▋        | 4980/30000 [27:17<17:35:43,  2.53s/it]

{'loss': 0.2885, 'grad_norm': 0.6814633011817932, 'learning_rate': 4.240677966101695e-05, 'epoch': 0.5}


 17%|█▋        | 4990/30000 [27:37<9:55:18,  1.43s/it] 

{'loss': 0.2752, 'grad_norm': 0.6262236833572388, 'learning_rate': 4.238983050847458e-05, 'epoch': 0.5}


 17%|█▋        | 5000/30000 [27:40<2:21:32,  2.94it/s]

{'loss': 0.2629, 'grad_norm': 0.6490533351898193, 'learning_rate': 4.2372881355932206e-05, 'epoch': 0.5}


 17%|█▋        | 5010/30000 [27:44<2:09:52,  3.21it/s]

{'loss': 0.2776, 'grad_norm': 0.6156919002532959, 'learning_rate': 4.235593220338983e-05, 'epoch': 0.5}


 17%|█▋        | 5020/30000 [27:47<2:09:18,  3.22it/s]

{'loss': 0.2845, 'grad_norm': 0.5862128734588623, 'learning_rate': 4.233898305084746e-05, 'epoch': 0.5}


 17%|█▋        | 5030/30000 [27:50<2:09:08,  3.22it/s]

{'loss': 0.2694, 'grad_norm': 0.6005709767341614, 'learning_rate': 4.232203389830509e-05, 'epoch': 0.5}


 17%|█▋        | 5040/30000 [27:53<2:09:10,  3.22it/s]

{'loss': 0.298, 'grad_norm': 0.6145910620689392, 'learning_rate': 4.230508474576271e-05, 'epoch': 0.5}


 17%|█▋        | 5050/30000 [27:56<2:09:03,  3.22it/s]

{'loss': 0.2896, 'grad_norm': 0.6016186475753784, 'learning_rate': 4.228813559322034e-05, 'epoch': 0.51}


 17%|█▋        | 5060/30000 [27:59<2:09:08,  3.22it/s]

{'loss': 0.2943, 'grad_norm': 0.6412099599838257, 'learning_rate': 4.227118644067797e-05, 'epoch': 0.51}


 17%|█▋        | 5070/30000 [28:02<2:09:06,  3.22it/s]

{'loss': 0.282, 'grad_norm': 0.6015849709510803, 'learning_rate': 4.22542372881356e-05, 'epoch': 0.51}


 17%|█▋        | 5080/30000 [28:05<2:09:47,  3.20it/s]

{'loss': 0.2876, 'grad_norm': 0.8282114863395691, 'learning_rate': 4.223728813559322e-05, 'epoch': 0.51}


 17%|█▋        | 5090/30000 [28:08<2:08:55,  3.22it/s]

{'loss': 0.2919, 'grad_norm': 0.7199591398239136, 'learning_rate': 4.222033898305085e-05, 'epoch': 0.51}


 17%|█▋        | 5100/30000 [28:12<2:08:47,  3.22it/s]

{'loss': 0.2809, 'grad_norm': 0.6786491274833679, 'learning_rate': 4.220338983050848e-05, 'epoch': 0.51}


 17%|█▋        | 5110/30000 [28:15<2:08:36,  3.23it/s]

{'loss': 0.295, 'grad_norm': 0.5459167957305908, 'learning_rate': 4.21864406779661e-05, 'epoch': 0.51}


 17%|█▋        | 5120/30000 [28:18<2:08:30,  3.23it/s]

{'loss': 0.2955, 'grad_norm': 0.5395411849021912, 'learning_rate': 4.216949152542373e-05, 'epoch': 0.51}


 17%|█▋        | 5130/30000 [28:21<2:10:25,  3.18it/s]

{'loss': 0.2871, 'grad_norm': 0.6193023324012756, 'learning_rate': 4.215254237288136e-05, 'epoch': 0.51}


 17%|█▋        | 5140/30000 [28:24<2:13:22,  3.11it/s]

{'loss': 0.2823, 'grad_norm': 0.6906324028968811, 'learning_rate': 4.213559322033899e-05, 'epoch': 0.51}


 17%|█▋        | 5150/30000 [28:27<2:11:00,  3.16it/s]

{'loss': 0.285, 'grad_norm': 0.6871585845947266, 'learning_rate': 4.211864406779661e-05, 'epoch': 0.52}


 17%|█▋        | 5160/30000 [28:30<2:11:38,  3.14it/s]

{'loss': 0.272, 'grad_norm': 0.5280659794807434, 'learning_rate': 4.210169491525424e-05, 'epoch': 0.52}


 17%|█▋        | 5170/30000 [28:34<2:11:25,  3.15it/s]

{'loss': 0.2749, 'grad_norm': 0.5069794654846191, 'learning_rate': 4.208474576271187e-05, 'epoch': 0.52}


 17%|█▋        | 5180/30000 [28:37<2:12:41,  3.12it/s]

{'loss': 0.2841, 'grad_norm': 0.5572595596313477, 'learning_rate': 4.2067796610169494e-05, 'epoch': 0.52}


 17%|█▋        | 5190/30000 [28:40<2:11:37,  3.14it/s]

{'loss': 0.2933, 'grad_norm': 0.6446375250816345, 'learning_rate': 4.2050847457627116e-05, 'epoch': 0.52}


 17%|█▋        | 5200/30000 [28:43<2:09:50,  3.18it/s]

{'loss': 0.2994, 'grad_norm': 0.6846193671226501, 'learning_rate': 4.2033898305084746e-05, 'epoch': 0.52}


 17%|█▋        | 5210/30000 [28:46<2:10:04,  3.18it/s]

{'loss': 0.2825, 'grad_norm': 0.5557406544685364, 'learning_rate': 4.2016949152542375e-05, 'epoch': 0.52}


 17%|█▋        | 5220/30000 [28:50<2:09:45,  3.18it/s]

{'loss': 0.2844, 'grad_norm': 0.7251440286636353, 'learning_rate': 4.2e-05, 'epoch': 0.52}


 17%|█▋        | 5230/30000 [28:53<2:10:28,  3.16it/s]

{'loss': 0.283, 'grad_norm': 0.5210162401199341, 'learning_rate': 4.198305084745763e-05, 'epoch': 0.52}


 17%|█▋        | 5240/30000 [28:56<2:09:44,  3.18it/s]

{'loss': 0.2969, 'grad_norm': 1.0637154579162598, 'learning_rate': 4.1966101694915256e-05, 'epoch': 0.52}


 18%|█▊        | 5250/30000 [28:59<2:09:42,  3.18it/s]

{'loss': 0.2934, 'grad_norm': 0.5717556476593018, 'learning_rate': 4.1949152542372886e-05, 'epoch': 0.53}


 18%|█▊        | 5260/30000 [29:02<2:09:38,  3.18it/s]

{'loss': 0.2784, 'grad_norm': 0.6115550398826599, 'learning_rate': 4.193220338983051e-05, 'epoch': 0.53}


 18%|█▊        | 5270/30000 [29:05<2:09:30,  3.18it/s]

{'loss': 0.2779, 'grad_norm': 1.1976706981658936, 'learning_rate': 4.191525423728814e-05, 'epoch': 0.53}


 18%|█▊        | 5280/30000 [29:08<2:09:22,  3.18it/s]

{'loss': 0.2791, 'grad_norm': 0.5482666492462158, 'learning_rate': 4.189830508474577e-05, 'epoch': 0.53}


 18%|█▊        | 5290/30000 [29:12<2:09:25,  3.18it/s]

{'loss': 0.2976, 'grad_norm': 0.7212347388267517, 'learning_rate': 4.188135593220339e-05, 'epoch': 0.53}


 18%|█▊        | 5300/30000 [29:15<2:09:12,  3.19it/s]

{'loss': 0.2834, 'grad_norm': 0.6305532455444336, 'learning_rate': 4.186440677966102e-05, 'epoch': 0.53}


 18%|█▊        | 5310/30000 [29:18<2:09:11,  3.18it/s]

{'loss': 0.2803, 'grad_norm': 0.7392553091049194, 'learning_rate': 4.184745762711865e-05, 'epoch': 0.53}


 18%|█▊        | 5320/30000 [29:21<2:09:16,  3.18it/s]

{'loss': 0.2824, 'grad_norm': 0.9217426180839539, 'learning_rate': 4.183050847457628e-05, 'epoch': 0.53}


 18%|█▊        | 5330/30000 [29:24<2:09:54,  3.16it/s]

{'loss': 0.2826, 'grad_norm': 0.6268206238746643, 'learning_rate': 4.18135593220339e-05, 'epoch': 0.53}


 18%|█▊        | 5340/30000 [29:27<2:13:51,  3.07it/s]

{'loss': 0.2892, 'grad_norm': 0.5884392261505127, 'learning_rate': 4.179661016949153e-05, 'epoch': 0.53}


 18%|█▊        | 5350/30000 [29:31<2:15:06,  3.04it/s]

{'loss': 0.274, 'grad_norm': 0.7455472350120544, 'learning_rate': 4.177966101694916e-05, 'epoch': 0.54}


 18%|█▊        | 5360/30000 [29:34<2:09:13,  3.18it/s]

{'loss': 0.287, 'grad_norm': 0.6756070256233215, 'learning_rate': 4.176271186440678e-05, 'epoch': 0.54}


 18%|█▊        | 5370/30000 [29:37<2:19:22,  2.95it/s]

{'loss': 0.2753, 'grad_norm': 0.6311599612236023, 'learning_rate': 4.174576271186441e-05, 'epoch': 0.54}


 18%|█▊        | 5380/30000 [29:41<2:17:01,  2.99it/s]

{'loss': 0.2765, 'grad_norm': 0.48132607340812683, 'learning_rate': 4.172881355932204e-05, 'epoch': 0.54}


 18%|█▊        | 5390/30000 [29:44<2:09:39,  3.16it/s]

{'loss': 0.275, 'grad_norm': 0.5650287866592407, 'learning_rate': 4.171186440677966e-05, 'epoch': 0.54}


 18%|█▊        | 5400/30000 [29:47<2:09:25,  3.17it/s]

{'loss': 0.2641, 'grad_norm': 0.7200319170951843, 'learning_rate': 4.1694915254237285e-05, 'epoch': 0.54}


 18%|█▊        | 5410/30000 [29:50<2:09:46,  3.16it/s]

{'loss': 0.2737, 'grad_norm': 0.5928342938423157, 'learning_rate': 4.1677966101694915e-05, 'epoch': 0.54}


 18%|█▊        | 5420/30000 [29:53<2:10:22,  3.14it/s]

{'loss': 0.2907, 'grad_norm': 0.6249415278434753, 'learning_rate': 4.1661016949152544e-05, 'epoch': 0.54}


 18%|█▊        | 5430/30000 [29:57<2:08:17,  3.19it/s]

{'loss': 0.2799, 'grad_norm': 0.6058959364891052, 'learning_rate': 4.164406779661017e-05, 'epoch': 0.54}


 18%|█▊        | 5440/30000 [30:00<2:09:27,  3.16it/s]

{'loss': 0.2658, 'grad_norm': 0.5605167150497437, 'learning_rate': 4.1627118644067796e-05, 'epoch': 0.54}


 18%|█▊        | 5450/30000 [30:03<2:08:43,  3.18it/s]

{'loss': 0.2813, 'grad_norm': 0.6046028137207031, 'learning_rate': 4.1610169491525425e-05, 'epoch': 0.55}


 18%|█▊        | 5460/30000 [30:06<2:08:44,  3.18it/s]

{'loss': 0.3015, 'grad_norm': 0.6057665348052979, 'learning_rate': 4.1593220338983055e-05, 'epoch': 0.55}


 18%|█▊        | 5470/30000 [30:09<2:08:08,  3.19it/s]

{'loss': 0.2882, 'grad_norm': 0.6453701853752136, 'learning_rate': 4.157627118644068e-05, 'epoch': 0.55}


 18%|█▊        | 5480/30000 [30:12<2:09:06,  3.17it/s]

{'loss': 0.2891, 'grad_norm': 0.742391049861908, 'learning_rate': 4.1559322033898307e-05, 'epoch': 0.55}


 18%|█▊        | 5490/30000 [30:16<2:08:21,  3.18it/s]

{'loss': 0.2837, 'grad_norm': 0.5816985964775085, 'learning_rate': 4.1542372881355936e-05, 'epoch': 0.55}


 18%|█▊        | 5500/30000 [30:19<2:07:48,  3.19it/s]

{'loss': 0.2764, 'grad_norm': 0.5582712888717651, 'learning_rate': 4.152542372881356e-05, 'epoch': 0.55}


 18%|█▊        | 5510/30000 [30:22<2:08:28,  3.18it/s]

{'loss': 0.2729, 'grad_norm': 0.6294713020324707, 'learning_rate': 4.150847457627119e-05, 'epoch': 0.55}


 18%|█▊        | 5520/30000 [30:25<2:10:21,  3.13it/s]

{'loss': 0.2789, 'grad_norm': 0.5672255158424377, 'learning_rate': 4.149152542372882e-05, 'epoch': 0.55}


 18%|█▊        | 5530/30000 [30:28<2:09:57,  3.14it/s]

{'loss': 0.2735, 'grad_norm': 0.7498718500137329, 'learning_rate': 4.1474576271186446e-05, 'epoch': 0.55}


 18%|█▊        | 5540/30000 [30:31<2:11:31,  3.10it/s]

{'loss': 0.2918, 'grad_norm': 0.5607385039329529, 'learning_rate': 4.145762711864407e-05, 'epoch': 0.55}


 18%|█▊        | 5550/30000 [30:35<2:08:33,  3.17it/s]

{'loss': 0.2833, 'grad_norm': 0.505148708820343, 'learning_rate': 4.14406779661017e-05, 'epoch': 0.56}


 19%|█▊        | 5560/30000 [30:38<2:08:03,  3.18it/s]

{'loss': 0.2626, 'grad_norm': 0.5611065626144409, 'learning_rate': 4.142372881355933e-05, 'epoch': 0.56}


 19%|█▊        | 5570/30000 [30:41<2:07:56,  3.18it/s]

{'loss': 0.2769, 'grad_norm': 0.5830212235450745, 'learning_rate': 4.140677966101695e-05, 'epoch': 0.56}


 19%|█▊        | 5580/30000 [30:44<2:08:57,  3.16it/s]

{'loss': 0.2922, 'grad_norm': 0.6274731159210205, 'learning_rate': 4.138983050847458e-05, 'epoch': 0.56}


 19%|█▊        | 5590/30000 [30:47<2:08:27,  3.17it/s]

{'loss': 0.2619, 'grad_norm': 0.6641478538513184, 'learning_rate': 4.13728813559322e-05, 'epoch': 0.56}


 19%|█▊        | 5600/30000 [30:50<2:09:16,  3.15it/s]

{'loss': 0.2739, 'grad_norm': 0.5943946838378906, 'learning_rate': 4.135593220338983e-05, 'epoch': 0.56}


 19%|█▊        | 5610/30000 [30:54<2:11:45,  3.09it/s]

{'loss': 0.2748, 'grad_norm': 0.5663438439369202, 'learning_rate': 4.1338983050847454e-05, 'epoch': 0.56}


 19%|█▊        | 5620/30000 [30:57<2:12:30,  3.07it/s]

{'loss': 0.2837, 'grad_norm': 0.7690475583076477, 'learning_rate': 4.1322033898305084e-05, 'epoch': 0.56}


 19%|█▉        | 5630/30000 [31:00<2:16:13,  2.98it/s]

{'loss': 0.2751, 'grad_norm': 0.6612385511398315, 'learning_rate': 4.130508474576271e-05, 'epoch': 0.56}


 19%|█▉        | 5640/30000 [31:04<2:13:57,  3.03it/s]

{'loss': 0.2704, 'grad_norm': 0.6085260510444641, 'learning_rate': 4.128813559322034e-05, 'epoch': 0.56}


 19%|█▉        | 5650/30000 [31:07<2:13:06,  3.05it/s]

{'loss': 0.2789, 'grad_norm': 0.5224882960319519, 'learning_rate': 4.1271186440677965e-05, 'epoch': 0.56}


 19%|█▉        | 5660/30000 [31:10<2:09:38,  3.13it/s]

{'loss': 0.2662, 'grad_norm': 0.6961203813552856, 'learning_rate': 4.1254237288135594e-05, 'epoch': 0.57}


 19%|█▉        | 5670/30000 [31:13<2:18:25,  2.93it/s]

{'loss': 0.2803, 'grad_norm': 0.7409279942512512, 'learning_rate': 4.1237288135593223e-05, 'epoch': 0.57}


 19%|█▉        | 5680/30000 [31:17<2:10:48,  3.10it/s]

{'loss': 0.2693, 'grad_norm': 0.542153537273407, 'learning_rate': 4.1220338983050846e-05, 'epoch': 0.57}


 19%|█▉        | 5690/30000 [31:20<2:09:22,  3.13it/s]

{'loss': 0.2783, 'grad_norm': 0.49224403500556946, 'learning_rate': 4.1203389830508475e-05, 'epoch': 0.57}


 19%|█▉        | 5700/30000 [31:23<2:09:02,  3.14it/s]

{'loss': 0.2862, 'grad_norm': 0.7906402349472046, 'learning_rate': 4.1186440677966105e-05, 'epoch': 0.57}


 19%|█▉        | 5710/30000 [31:26<2:09:13,  3.13it/s]

{'loss': 0.2916, 'grad_norm': 0.7612155079841614, 'learning_rate': 4.1169491525423734e-05, 'epoch': 0.57}


 19%|█▉        | 5720/30000 [31:29<2:09:25,  3.13it/s]

{'loss': 0.2867, 'grad_norm': 0.525444746017456, 'learning_rate': 4.115254237288136e-05, 'epoch': 0.57}


 19%|█▉        | 5730/30000 [31:33<2:13:17,  3.03it/s]

{'loss': 0.2683, 'grad_norm': 1.111773133277893, 'learning_rate': 4.1135593220338986e-05, 'epoch': 0.57}


 19%|█▉        | 5740/30000 [31:36<2:09:02,  3.13it/s]

{'loss': 0.2656, 'grad_norm': 0.5602926015853882, 'learning_rate': 4.1118644067796615e-05, 'epoch': 0.57}


 19%|█▉        | 5750/30000 [31:39<2:08:57,  3.13it/s]

{'loss': 0.2633, 'grad_norm': 0.6150907278060913, 'learning_rate': 4.110169491525424e-05, 'epoch': 0.57}


 19%|█▉        | 5760/30000 [31:42<2:08:47,  3.14it/s]

{'loss': 0.2866, 'grad_norm': 0.8921559453010559, 'learning_rate': 4.108474576271187e-05, 'epoch': 0.58}


 19%|█▉        | 5770/30000 [31:46<2:11:39,  3.07it/s]

{'loss': 0.2933, 'grad_norm': 0.614511251449585, 'learning_rate': 4.10677966101695e-05, 'epoch': 0.58}


 19%|█▉        | 5780/30000 [31:49<2:10:05,  3.10it/s]

{'loss': 0.2798, 'grad_norm': 0.5968562960624695, 'learning_rate': 4.1050847457627126e-05, 'epoch': 0.58}


 19%|█▉        | 5790/30000 [31:52<2:11:04,  3.08it/s]

{'loss': 0.2788, 'grad_norm': 0.883049726486206, 'learning_rate': 4.103389830508475e-05, 'epoch': 0.58}


 19%|█▉        | 5800/30000 [31:55<2:08:24,  3.14it/s]

{'loss': 0.2614, 'grad_norm': 0.613979697227478, 'learning_rate': 4.101694915254237e-05, 'epoch': 0.58}


 19%|█▉        | 5810/30000 [31:58<2:07:30,  3.16it/s]

{'loss': 0.281, 'grad_norm': 0.6161765456199646, 'learning_rate': 4.1e-05, 'epoch': 0.58}


 19%|█▉        | 5820/30000 [32:02<2:07:32,  3.16it/s]

{'loss': 0.2788, 'grad_norm': 0.5487979054450989, 'learning_rate': 4.098305084745763e-05, 'epoch': 0.58}


 19%|█▉        | 5830/30000 [32:05<2:11:30,  3.06it/s]

{'loss': 0.2745, 'grad_norm': 0.5924230813980103, 'learning_rate': 4.096610169491525e-05, 'epoch': 0.58}


 19%|█▉        | 5840/30000 [32:08<2:10:06,  3.09it/s]

{'loss': 0.2669, 'grad_norm': 0.8007706999778748, 'learning_rate': 4.094915254237288e-05, 'epoch': 0.58}


 20%|█▉        | 5850/30000 [32:11<2:07:28,  3.16it/s]

{'loss': 0.2645, 'grad_norm': 0.570662796497345, 'learning_rate': 4.093220338983051e-05, 'epoch': 0.58}


 20%|█▉        | 5860/30000 [32:14<2:09:11,  3.11it/s]

{'loss': 0.2718, 'grad_norm': 0.8477504849433899, 'learning_rate': 4.0915254237288134e-05, 'epoch': 0.59}


 20%|█▉        | 5870/30000 [32:18<2:07:49,  3.15it/s]

{'loss': 0.2674, 'grad_norm': 0.5611673593521118, 'learning_rate': 4.089830508474576e-05, 'epoch': 0.59}


 20%|█▉        | 5880/30000 [32:21<2:11:56,  3.05it/s]

{'loss': 0.2699, 'grad_norm': 1.056578516960144, 'learning_rate': 4.088135593220339e-05, 'epoch': 0.59}


 20%|█▉        | 5890/30000 [32:24<2:10:39,  3.08it/s]

{'loss': 0.2808, 'grad_norm': 0.5683829188346863, 'learning_rate': 4.086440677966102e-05, 'epoch': 0.59}


 20%|█▉        | 5900/30000 [32:27<2:08:54,  3.12it/s]

{'loss': 0.2793, 'grad_norm': 0.6242116689682007, 'learning_rate': 4.0847457627118644e-05, 'epoch': 0.59}


 20%|█▉        | 5910/30000 [32:31<2:08:02,  3.14it/s]

{'loss': 0.284, 'grad_norm': 0.5239436030387878, 'learning_rate': 4.0830508474576274e-05, 'epoch': 0.59}


 20%|█▉        | 5920/30000 [32:34<2:08:16,  3.13it/s]

{'loss': 0.285, 'grad_norm': 0.6534799337387085, 'learning_rate': 4.08135593220339e-05, 'epoch': 0.59}


 20%|█▉        | 5930/30000 [32:37<2:07:18,  3.15it/s]

{'loss': 0.2658, 'grad_norm': 0.5293242335319519, 'learning_rate': 4.0796610169491526e-05, 'epoch': 0.59}


 20%|█▉        | 5940/30000 [32:40<2:07:09,  3.15it/s]

{'loss': 0.293, 'grad_norm': 0.6221745014190674, 'learning_rate': 4.0779661016949155e-05, 'epoch': 0.59}


 20%|█▉        | 5946/30000 [32:42<2:07:24,  3.15it/s]

# EVALUATE MODEL

In [None]:
import torch
device = torch.device("cpu")
model.to(device)

In [None]:
class ChessSimulator:
    """
    ChessSimulator simulates a chess game between the pre-trained model and the Stockfish chess engine.
    """
    def __init__(self, model, tokenizer, stockfish_filepath):
        """
        Initialize a chess simulation between the pretrained the Stockfish chess engine.
        This involves initializing a Board state from py-chess to keep track of moves made 
        and an engine instance from Stockfish that can respond accordingly.
        Additionally, this keeps track of the move number and whether or not it is the model's turn in 
        order to construct the prompt the model is expecting in order to generate the next move.
        Note: there are two sequences in a move number; one for white and for black. 
        White's move number is formatted as '{move_number}. ' and black's move number is formatted as '{move_number}. ..'.

        
        Parameters:
            - model (transformers.PreTrainedModel): A Hugging Face transformer model
            such as BERT, GPT-2, T5, etc., pretrained on PGN formatted data.
            - tokenizer (transformers.PreTrainedTokenizer): The corresponding tokenizer
            for the model, used to encode PGN formatted games.
            stockfish_filepath (str, optional): Filepath to Stockfish binary.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.board = chess.Board()
        self.engine = chess.engine.SimpleEngine.popen_uci(stockfish_filepath)
        self.prompt = ''
        self.move_number = 1
        self.modelTurn = True
        self.currentMove = None
    
    def simulateGame(self):
        """
        This function simulates a chess game where the model starts and the engine responds. 
        Each move is processed accordingly to be added into the board and again to update the prompt for the model.
        It returns whether or not the model failed, won/drew a tie, or lost.
        """
        while not self.board.is_checkmate():
            try:
                if self.modelTurn:
                    self.generateModelMove()
                    print(self.prompt)
                else:
                    self.generateChessMove()
                    self.move_number += 1
                self.modelTurn = not self.modelTurn

            except:
                return -1 # signify model error or hallucination
            
        self.engine.quit()

        if self.board.is_checkmate():
            if self.board.turn == chess.WHITE:
                return 0
            else: # counts draws as a win
                return 1
            
    def extract_white_move(self,text):
        """
        Takes in model response and parses relevant move as it pertains to model number.

        Parameters:
            - text (str): model response to prompt
        """
        # Meant to extract white move from generated model string, based on the current move number
        # Normalize spaces
        cleaned_text = " ".join(text.split())

        # Define start and end markers
        start_marker = f"{self.move_number}."
        end_marker = f"{self.move_number}. .."

        # Find the position of the move number
        start_index = cleaned_text.find(start_marker)
        if start_index == -1:
            return None  # Move not found

        # Find the position of the next black move
        end_index = cleaned_text.find(end_marker, start_index)
        
        # Extract substring
        if end_index != -1:
            move_text = cleaned_text[start_index + len(start_marker):end_index].strip()
        else:
            # If no black move is found, take everything after the move number
            move_text = cleaned_text[start_index + len(start_marker):].strip().split()[0]

        return move_text

    def parseModelMove(self, move):
        """
        Takes in model response, extracts relevant to current move number.
        Processes parsed response accordingly to get pushed to chess board by converting to SAN format.
        Processes parsed response accordingly for model prompt by separating move, special pieces, special moves, etc by white spaces
        as is consistent with the preprocessing/tokenization process.

        Parameters:
            - text (str): model response to prompt
        """
        # Extract move using regex
        move = self.extract_white_move(move)
        # Need to strip whitespace, this should accept special pieces and special moves
        # as they're in "algebraic notation"
        self.board.push_san(move.replace(' ',''))
        self.prompt += move

    
    def parseChessMove(self, move):
        """
        Pushes engine response to chess board. Parses engine response so it's consistent with format model is expecting for addition to the prompt by
        separating out special pieces, special moves, moves by white spaces.

        Parameters:
            - move (str): Response returned by chess engine
        """
        # converts move in UCI notation to SAN notation, that model expects
        self.prompt += f' {self.move_number}. .. ' # note: black's move number is always denoted afterwards with a '. ..' as per model standards
        san_move = self.board.san(move)
        # Define patterns for different components
        move_number_pattern = re.compile(r'(\d+\.)')  # Move numbers (e.g., "1.")
        piece_pattern = re.compile(r'([KQRBN])')  # Chess pieces (e.g., "N", "K")
        square_pattern = re.compile(r'([a-h][1-8])')  # Board squares (e4, d5, etc.)
        special_move_pattern = re.compile(r'(O-O|O-O-O|\+|#|x|=Q|=R|=B|=N)')  # Castling, check, capture, promotions
            
        # Ensure move numbers, pieces, and special moves are space-separated
        san_move = move_number_pattern.sub(r'\1 ', san_move)  # Move number spacing
        san_move = piece_pattern.sub(r'\1 ', san_move)  # Piece spacing
        san_move = special_move_pattern.sub(r' \1 ', san_move)  # Special moves spacing
        self.prompt += san_move +' '


    def generateModelMove(self):
        """
        Generates model response and updates prompt.
        """
        self.prompt += f'{self.move_number}. ' # note: white's move number is always denoted afterwards with a singular '.' as per model standards
        inputs = tokenizer(self.prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=15, num_return_sequences=1) # generate lots of tokens, and parse for information relevant to current move only
        move = tokenizer.decode(outputs[0], skip_special_tokens=True)
        self.currentMove = self.parseModelMove(move)


    def generateChessMove(self):
        """
        Calls chess engine's API to respond to board, parse the move to update the model prompt, and then get pushed to the board.
        """
        result = self.engine.play(self.board, chess.engine.Limit(time=2.0))  # Time limit for the move
        # move needs to parsed into SAN notation before being pushed to board
        self.parseChessMove(result.move)
        self.board.push(result.move)
        


In [None]:

game_outcomes = {0: 0, -1: 0, 1: 0}

for _ in range(100):
    game = ChessSimulator(model, tokenizer, "PATH_TO_STOCKFISH_BINARY")
    game_outcomes[game.simulateGame()] += 1

game_outcomes