In [6]:
import torch

In [7]:
torch.cuda.is_available()

True

In [8]:
import torch
from gpt import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

In [9]:
import tiktoken
from gpt import generate_text_simple, create_dataloader_v1

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [10]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")


In [11]:
dataset['text']


['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving

In [12]:
txt_data = " ".join(dataset["text"])
# train_loader = create_dataloader_v1(txt_data, batch_size=4, max_length=256, stride=128)

train_ratio = 0.90
split_idx = int(train_ratio * len(txt_data))
train_data = txt_data[:split_idx]
val_data = txt_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [13]:
for batch_idx, (input_batch, target_batch) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print("Input batch (tokens):", input_batch)
    print("Target batch (tokens):", target_batch)
    break  # Only printing the first batch as an example

Batch 1
Input batch (tokens): tensor([[ 4870,   286,   257,  ...,   340,   373,   257],
        [  803,  5755,  3212,  ...,   383,  2537,   746],
        [  262,  1152,    64,  ..., 40958,  3417,   262],
        [  764,  6023,  3261,  ...,  4849,   500,   837]])
Target batch (tokens): tensor([[  286,   257,  3996,  ...,   373,   257,  2270],
        [ 5755,  3212,   764,  ...,  2537,   746,  3609],
        [ 1152,    64,  1279,  ...,  3417,   262, 43447],
        [ 6023,  3261,   547,  ...,   500,   837,   287]])


In [14]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [None]:
import math

def compute_perplexity(model, data_loader, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_batch, target_batch in data_loader:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            logits = model(input_batch)
            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, logits.size(-1)),
                target_batch.view(-1),
                reduction='sum'
            )
            total_loss += loss.item()
            total_tokens += target_batch.numel()

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [16]:

# # Note:
# # Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# # which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# # However, the resulting loss values may be slightly different.

# #if torch.cuda.is_available():
# #    device = torch.device("cuda")
# #elif torch.backends.mps.is_available():
# #    device = torch.device("mps")
# #else:
# #    device = torch.device("cpu")
# #
# # print(f"Using {device} device.")


# model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


# torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

# with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
#     train_loss = calc_loss_loader(train_loader, model, device)
#     val_loss = calc_loss_loader(val_loader, model, device)

# print("Training loss:", train_loss)
# print("Validation loss:", val_loss)


In [17]:
import logging
import deepspeed
from tqdm import tqdm  # tqdm을 함수처럼 사용

# Setup logging configuration
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s", 
    level=logging.INFO,  # You can adjust the level to DEBUG, INFO, WARNING, etc.
    handlers=[
        logging.FileHandler("training_log.txt"),  # Logs will be saved to this file
        logging.StreamHandler()  # Also log to console
    ]
)

# Now use logging instead of print
logger = logging.getLogger(__name__)

[2025-04-08 22:10:27,298] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/jiyoon/miniconda3/envs/LLMs/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/jiyoon/miniconda3/envs/LLMs/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/jiyoon/miniconda3/envs/LLMs/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/jiyoon/miniconda3/envs/LLMs/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/jiyoon/miniconda3/envs/LLMs/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/jiyoon/miniconda3/envs/LLMs/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/jiyoon/miniconda3/

In [None]:
import deepspeed
import os

def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer, deepspeed_config, save_dir="./checkpoints"):

    # Initialize DeepSpeed
    model, optimizer, _, _ = deepspeed.initialize(args=None, model=model, optimizer=optimizer, config_params=deepspeed_config)

    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    os.makedirs(save_dir, exist_ok=True)

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        logger.info(f"Starting Epoch {epoch+1}...")

        for input_batch, target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch"):
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration

            # Compute the loss
            loss = calc_loss_batch(input_batch, target_batch, model, device)

            # print(loss)
            # print(f"Loss type: {loss.dtype}")
            # print(f"Loss scale type: {model.loss_scale.dtype}")
            # if not isinstance(loss, torch.Tensor):
            #     loss = torch.tensor(loss).to(device)


            # Backward pass and step the optimizer
            model.backward(loss)  # Backward pass with DeepSpeed
            model.step()  # Step optimizer using DeepSpeed

            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                ppl = compute_perplexity(model, val_loader, device)
                track_tokens_seen.append(tokens_seen)
                logger.info(f"Epoch {epoch+1} (Step {global_step:06d}): "
                            f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}"
                            f"Validation Perplexity: {ppl:.2f}")
                
                
        logger.info(f"Epoch {epoch+1} completed. Generating a sample...")
        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
        # Save checkpoint
        epoch_ckpt_dir = os.path.join(save_dir, f"checkpoint-epoch{epoch+1}")
        model.save_checkpoint(epoch_ckpt_dir)
        logger.info(f"Checkpoint saved at: {epoch_ckpt_dir}")

    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    logger.info(f"Generated Text: {decoded_text}")  # Log the generated text
    model.train()

In [19]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

train_losses, track_tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=3,
    eval_freq=1000,
    eval_iter=10,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
    deepspeed_config="./ds_config.json"
)

[2025-04-08 22:10:33,811] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.16.5, git-hash=unknown, git-branch=unknown
[2025-04-08 22:10:33,812] [INFO] [comm.py:658:init_distributed] cdb=None
[2025-04-08 22:10:33,812] [INFO] [comm.py:673:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[1744117833.963829] [gpusystem:364203:0]    cuda_copy_md.c:348  UCX  WARN                cannot set sync_memops on CUDA VMM without cuCtxSetFlags() (address=0x7fb10d400000)
[1744117833.967401] [gpusystem:364203:0]    cuda_copy_md.c:348  UCX  WARN              cannot set sync_memops on CUDA VMM without cuCtxSetFlags() (address=0x7fb10d400000)
[2025-04-08 22:10:33,972] [INFO] [comm.py:728:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=10.125.70.48, master_port=29500
[2025-04-08 22:10:33,972] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[20

2025-04-08 22:10:35,940 - INFO - Starting Epoch 1...
Epoch 1:   0%|          | 0/105122 [00:00<?, ?batch/s]2025-04-08 22:10:36,570 - INFO - Epoch 1 (Step 000000): Train loss 10.998, Val loss 10.991
Epoch 1:   0%|          | 1/105122 [00:00<18:24:20,  1.59batch/s]

[2025-04-08 22:10:36,603] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4294967296, reducing to 2147483648
[2025-04-08 22:10:36,657] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2147483648, reducing to 1073741824


Epoch 1:   0%|          | 5/105122 [00:00<3:24:36,  8.56batch/s] 

[2025-04-08 22:10:36,710] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1073741824, reducing to 536870912
[2025-04-08 22:10:36,761] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 536870912, reducing to 268435456


Epoch 1:   0%|          | 9/105122 [00:00<1:57:26, 14.92batch/s]

[2025-04-08 22:10:36,813] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 268435456, reducing to 134217728
[2025-04-08 22:10:36,863] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 134217728, reducing to 67108864


Epoch 1:   0%|          | 13/105122 [00:00<1:25:25, 20.51batch/s]

[2025-04-08 22:10:36,916] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 67108864, reducing to 33554432
[2025-04-08 22:10:36,967] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 33554432, reducing to 16777216


Epoch 1:   0%|          | 17/105122 [00:01<1:09:57, 25.04batch/s]

[2025-04-08 22:10:37,019] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16777216, reducing to 8388608
[2025-04-08 22:10:37,069] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8388608, reducing to 4194304


Epoch 1:   0%|          | 21/105122 [00:01<1:01:15, 28.60batch/s]

[2025-04-08 22:10:37,121] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4194304, reducing to 2097152
[2025-04-08 22:10:37,173] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576


Epoch 1:   0%|          | 25/105122 [00:01<55:31, 31.54batch/s]  

[2025-04-08 22:10:37,224] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1048576, reducing to 524288
[2025-04-08 22:10:37,275] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 524288, reducing to 262144


Epoch 1:   0%|          | 29/105122 [00:01<52:17, 33.50batch/s]

[2025-04-08 22:10:37,328] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 131072
[2025-04-08 22:10:37,379] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   1%|          | 998/105122 [00:38<1:03:23, 27.38batch/s]2025-04-08 22:11:14,542 - INFO - Epoch 1 (Step 001000): Train loss 6.242, Val loss 6.309
Epoch 1:   2%|▏         | 1998/105122 [01:16<1:03:13, 27.18batch/s]2025-04-08 22:11:52,803 - INFO - Epoch 1 (Step 002000): Train loss 5.879, Val loss 5.917
Epoch 1:   2%|▏         | 2032/105122 [01:18<1:04:14, 26.75batch/s]

[2025-04-08 22:11:54,014] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   3%|▎         | 2998/105122 [01:55<1:02:40, 27.15batch/s]2025-04-08 22:12:31,307 - INFO - Epoch 1 (Step 003000): Train loss 5.668, Val loss 5.625
Epoch 1:   4%|▍         | 3998/105122 [02:33<1:02:27, 26.99batch/s]2025-04-08 22:13:09,781 - INFO - Epoch 1 (Step 004000): Train loss 5.325, Val loss 5.432
Epoch 1:   4%|▍         | 4034/105122 [02:35<1:03:14, 26.64batch/s]

[2025-04-08 22:13:11,089] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   5%|▍         | 5000/105122 [03:12<1:01:55, 26.95batch/s]2025-04-08 22:13:48,307 - INFO - Epoch 1 (Step 005000): Train loss 5.531, Val loss 5.296
Epoch 1:   6%|▌         | 5998/105122 [03:51<1:00:51, 27.14batch/s]2025-04-08 22:14:28,077 - INFO - Epoch 1 (Step 006000): Train loss 5.189, Val loss 5.213
Epoch 1:   6%|▌         | 6036/105122 [03:53<1:01:33, 26.83batch/s]

[2025-04-08 22:14:29,438] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   7%|▋         | 6998/105122 [04:30<1:00:34, 27.00batch/s]2025-04-08 22:15:06,566 - INFO - Epoch 1 (Step 007000): Train loss 5.222, Val loss 5.077
Epoch 1:   8%|▊         | 8000/105122 [05:09<1:00:11, 26.90batch/s]2025-04-08 22:15:45,089 - INFO - Epoch 1 (Step 008000): Train loss 5.052, Val loss 5.035
Epoch 1:   8%|▊         | 8037/105122 [05:10<59:01, 27.41batch/s]  

[2025-04-08 22:15:46,536] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   9%|▊         | 8999/105122 [05:47<58:30, 27.38batch/s]  2025-04-08 22:16:23,596 - INFO - Epoch 1 (Step 009000): Train loss 5.049, Val loss 4.846
Epoch 1:  10%|▉         | 10000/105122 [06:26<59:05, 26.83batch/s] 2025-04-08 22:17:02,119 - INFO - Epoch 1 (Step 010000): Train loss 4.793, Val loss 4.779
Epoch 1:  10%|▉         | 10040/105122 [06:27<1:00:00, 26.40batch/s]

[2025-04-08 22:17:03,636] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  10%|█         | 11000/105122 [07:04<58:18, 26.90batch/s]  2025-04-08 22:17:40,585 - INFO - Epoch 1 (Step 011000): Train loss 4.915, Val loss 4.721
Epoch 1:  11%|█▏        | 11998/105122 [07:42<57:27, 27.01batch/s]  2025-04-08 22:18:20,326 - INFO - Epoch 1 (Step 012000): Train loss 4.829, Val loss 4.588
Epoch 1:  11%|█▏        | 12040/105122 [07:45<1:02:25, 24.85batch/s]

[2025-04-08 22:18:21,921] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  12%|█▏        | 13000/105122 [08:22<56:53, 26.98batch/s]  2025-04-08 22:18:58,737 - INFO - Epoch 1 (Step 013000): Train loss 4.566, Val loss 4.525
Epoch 1:  13%|█▎        | 13998/105122 [09:01<55:58, 27.13batch/s]  2025-04-08 22:19:37,151 - INFO - Epoch 1 (Step 014000): Train loss 4.804, Val loss 4.520
Epoch 1:  13%|█▎        | 14044/105122 [09:02<56:38, 26.80batch/s]  

[2025-04-08 22:19:38,812] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  14%|█▍        | 15000/105122 [09:39<55:41, 26.97batch/s]  2025-04-08 22:20:15,602 - INFO - Epoch 1 (Step 015000): Train loss 4.807, Val loss 4.438
Epoch 1:  15%|█▌        | 16000/105122 [10:17<55:10, 26.92batch/s]  2025-04-08 22:20:54,036 - INFO - Epoch 1 (Step 016000): Train loss 4.519, Val loss 4.410
Epoch 1:  15%|█▌        | 16044/105122 [10:19<55:05, 26.95batch/s]  

[2025-04-08 22:20:55,776] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  16%|█▌        | 17000/105122 [10:56<54:22, 27.01batch/s]  2025-04-08 22:21:32,628 - INFO - Epoch 1 (Step 017000): Train loss 4.509, Val loss 4.338
Epoch 1:  17%|█▋        | 18000/105122 [11:34<53:37, 27.08batch/s]  2025-04-08 22:22:11,067 - INFO - Epoch 1 (Step 018000): Train loss 4.593, Val loss 4.300
Epoch 1:  17%|█▋        | 18046/105122 [11:36<53:48, 26.97batch/s]  

[2025-04-08 22:22:12,871] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  18%|█▊        | 18998/105122 [12:14<57:06, 25.14batch/s]  2025-04-08 22:22:50,889 - INFO - Epoch 1 (Step 019000): Train loss 4.596, Val loss 4.270
Epoch 1:  19%|█▉        | 20000/105122 [12:53<52:52, 26.83batch/s]  2025-04-08 22:23:29,338 - INFO - Epoch 1 (Step 020000): Train loss 4.261, Val loss 4.219
Epoch 1:  19%|█▉        | 20048/105122 [12:55<53:45, 26.38batch/s]  

[2025-04-08 22:23:31,222] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  20%|█▉        | 21000/105122 [13:31<51:24, 27.27batch/s]  2025-04-08 22:24:07,814 - INFO - Epoch 1 (Step 021000): Train loss 4.440, Val loss 4.198
Epoch 1:  21%|██        | 22000/105122 [14:10<50:59, 27.17batch/s]  2025-04-08 22:24:46,263 - INFO - Epoch 1 (Step 022000): Train loss 4.487, Val loss 4.208
Epoch 1:  21%|██        | 22050/105122 [14:12<51:19, 26.98batch/s]  

[2025-04-08 22:24:48,221] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  22%|██▏       | 22998/105122 [14:48<54:45, 24.99batch/s]  2025-04-08 22:25:24,878 - INFO - Epoch 1 (Step 023000): Train loss 4.309, Val loss 4.182
Epoch 1:  23%|██▎       | 23998/105122 [15:27<49:48, 27.15batch/s]  2025-04-08 22:26:03,625 - INFO - Epoch 1 (Step 024000): Train loss 4.275, Val loss 4.140
Epoch 1:  23%|██▎       | 24052/105122 [15:29<49:59, 27.03batch/s]

[2025-04-08 22:26:05,657] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  24%|██▍       | 25000/105122 [16:06<49:07, 27.18batch/s]  2025-04-08 22:26:42,145 - INFO - Epoch 1 (Step 025000): Train loss 4.302, Val loss 4.115
Epoch 1:  25%|██▍       | 26000/105122 [16:45<50:45, 25.98batch/s]  2025-04-08 22:27:22,006 - INFO - Epoch 1 (Step 026000): Train loss 4.134, Val loss 4.140
Epoch 1:  25%|██▍       | 26056/105122 [16:48<48:47, 27.01batch/s]  

[2025-04-08 22:27:24,126] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  26%|██▌       | 27000/105122 [17:24<49:13, 26.45batch/s]  2025-04-08 22:28:00,558 - INFO - Epoch 1 (Step 027000): Train loss 4.180, Val loss 4.102
Epoch 1:  27%|██▋       | 27998/105122 [18:02<47:38, 26.98batch/s]  2025-04-08 22:28:39,013 - INFO - Epoch 1 (Step 028000): Train loss 4.232, Val loss 4.059
Epoch 1:  27%|██▋       | 28056/105122 [18:05<47:42, 26.93batch/s]

[2025-04-08 22:28:41,199] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  28%|██▊       | 29000/105122 [18:41<46:43, 27.15batch/s]  2025-04-08 22:29:17,671 - INFO - Epoch 1 (Step 029000): Train loss 4.268, Val loss 4.014
Epoch 1:  29%|██▊       | 29998/105122 [19:20<47:44, 26.23batch/s]  2025-04-08 22:29:56,755 - INFO - Epoch 1 (Step 030000): Train loss 4.241, Val loss 4.018
Epoch 1:  29%|██▊       | 30058/105122 [19:23<47:59, 26.06batch/s]

[2025-04-08 22:29:59,070] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  29%|██▉       | 30998/105122 [19:59<45:48, 26.97batch/s]  2025-04-08 22:30:35,569 - INFO - Epoch 1 (Step 031000): Train loss 4.127, Val loss 4.007
Epoch 1:  30%|███       | 32000/105122 [20:39<45:44, 26.64batch/s]  2025-04-08 22:31:15,843 - INFO - Epoch 1 (Step 032000): Train loss 4.024, Val loss 3.954
Epoch 1:  30%|███       | 32060/105122 [20:42<45:31, 26.75batch/s]

[2025-04-08 22:31:18,233] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  31%|███▏      | 32998/105122 [21:18<44:27, 27.04batch/s]  2025-04-08 22:31:54,823 - INFO - Epoch 1 (Step 033000): Train loss 4.145, Val loss 3.981
Epoch 1:  32%|███▏      | 33998/105122 [21:57<43:38, 27.16batch/s]  2025-04-08 22:32:33,411 - INFO - Epoch 1 (Step 034000): Train loss 4.076, Val loss 3.967
Epoch 1:  32%|███▏      | 34064/105122 [21:59<45:14, 26.17batch/s]

[2025-04-08 22:32:35,838] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  33%|███▎      | 35000/105122 [22:36<43:17, 26.99batch/s]  2025-04-08 22:33:12,106 - INFO - Epoch 1 (Step 035000): Train loss 4.097, Val loss 3.939
Epoch 1:  34%|███▍      | 35999/105122 [23:14<41:45, 27.59batch/s]  2025-04-08 22:33:50,594 - INFO - Epoch 1 (Step 036000): Train loss 4.125, Val loss 3.919
Epoch 1:  34%|███▍      | 36066/105122 [23:17<42:28, 27.09batch/s]

[2025-04-08 22:33:53,070] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  35%|███▌      | 36999/105122 [23:53<41:38, 27.26batch/s]  2025-04-08 22:34:29,236 - INFO - Epoch 1 (Step 037000): Train loss 3.981, Val loss 3.894
Epoch 1:  36%|███▌      | 37999/105122 [24:33<2:42:52,  6.87batch/s]2025-04-08 22:35:09,239 - INFO - Epoch 1 (Step 038000): Train loss 3.939, Val loss 3.906
Epoch 1:  36%|███▌      | 38066/105122 [24:35<41:25, 26.98batch/s]  

[2025-04-08 22:35:11,786] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  37%|███▋      | 39000/105122 [25:11<40:29, 27.22batch/s]  2025-04-08 22:35:47,691 - INFO - Epoch 1 (Step 039000): Train loss 3.952, Val loss 3.885
Epoch 1:  38%|███▊      | 39998/105122 [25:50<40:29, 26.81batch/s]  2025-04-08 22:36:26,259 - INFO - Epoch 1 (Step 040000): Train loss 3.874, Val loss 3.880
Epoch 1:  38%|███▊      | 40068/105122 [25:52<40:05, 27.05batch/s]

[2025-04-08 22:36:28,913] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  39%|███▉      | 41000/105122 [26:28<39:18, 27.19batch/s]  2025-04-08 22:37:04,849 - INFO - Epoch 1 (Step 041000): Train loss 4.058, Val loss 3.880
Epoch 1:  40%|███▉      | 42000/105122 [27:07<38:58, 27.00batch/s]  2025-04-08 22:37:43,421 - INFO - Epoch 1 (Step 042000): Train loss 3.816, Val loss 3.869
Epoch 1:  40%|████      | 42072/105122 [27:10<39:35, 26.54batch/s]

[2025-04-08 22:37:46,118] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  41%|████      | 42998/105122 [27:45<38:06, 27.17batch/s]  2025-04-08 22:38:21,936 - INFO - Epoch 1 (Step 043000): Train loss 3.907, Val loss 3.834
Epoch 1:  42%|████▏     | 44000/105122 [28:24<37:29, 27.18batch/s]  2025-04-08 22:39:00,503 - INFO - Epoch 1 (Step 044000): Train loss 3.886, Val loss 3.841
Epoch 1:  42%|████▏     | 44074/105122 [28:28<1:49:27,  9.30batch/s]

[2025-04-08 22:39:04,625] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  43%|████▎     | 44998/105122 [29:04<37:54, 26.43batch/s]  2025-04-08 22:39:40,354 - INFO - Epoch 1 (Step 045000): Train loss 3.878, Val loss 3.797
Epoch 1:  44%|████▍     | 46000/105122 [29:42<36:31, 26.98batch/s]  2025-04-08 22:40:18,831 - INFO - Epoch 1 (Step 046000): Train loss 3.913, Val loss 3.801
Epoch 1:  44%|████▍     | 46074/105122 [29:45<36:28, 26.98batch/s]

[2025-04-08 22:40:21,686] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  45%|████▍     | 46998/105122 [30:21<36:26, 26.58batch/s]  2025-04-08 22:40:57,432 - INFO - Epoch 1 (Step 047000): Train loss 3.853, Val loss 3.754
Epoch 1:  46%|████▌     | 47998/105122 [30:59<35:18, 26.97batch/s]  2025-04-08 22:41:36,123 - INFO - Epoch 1 (Step 048000): Train loss 3.928, Val loss 3.772
Epoch 1:  46%|████▌     | 48078/105122 [31:03<35:02, 27.13batch/s]

[2025-04-08 22:41:39,048] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  47%|████▋     | 49000/105122 [31:38<34:26, 27.15batch/s]  2025-04-08 22:42:14,631 - INFO - Epoch 1 (Step 049000): Train loss 3.822, Val loss 3.757
Epoch 1:  48%|████▊     | 50000/105122 [32:17<33:59, 27.03batch/s]  2025-04-08 22:42:53,145 - INFO - Epoch 1 (Step 050000): Train loss 3.871, Val loss 3.754
Epoch 1:  48%|████▊     | 50080/105122 [32:20<33:54, 27.05batch/s]

[2025-04-08 22:42:56,143] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  49%|████▊     | 51000/105122 [32:55<33:38, 26.82batch/s]  2025-04-08 22:43:31,671 - INFO - Epoch 1 (Step 051000): Train loss 3.889, Val loss 3.746
Epoch 1:  49%|████▉     | 51998/105122 [33:35<32:43, 27.06batch/s]  2025-04-08 22:44:11,496 - INFO - Epoch 1 (Step 052000): Train loss 3.929, Val loss 3.740
Epoch 1:  50%|████▉     | 52082/105122 [33:38<32:53, 26.87batch/s]

[2025-04-08 22:44:14,586] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  50%|█████     | 52998/105122 [34:13<32:41, 26.57batch/s]  2025-04-08 22:44:50,113 - INFO - Epoch 1 (Step 053000): Train loss 3.846, Val loss 3.728
Epoch 1:  51%|█████▏    | 54000/105122 [34:52<31:37, 26.94batch/s]  2025-04-08 22:45:28,491 - INFO - Epoch 1 (Step 054000): Train loss 3.807, Val loss 3.714
Epoch 1:  51%|█████▏    | 54084/105122 [34:55<31:22, 27.11batch/s]

[2025-04-08 22:45:31,629] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  52%|█████▏    | 54998/105122 [35:30<30:58, 26.96batch/s]  2025-04-08 22:46:07,012 - INFO - Epoch 1 (Step 055000): Train loss 3.800, Val loss 3.700
Epoch 1:  53%|█████▎    | 56000/105122 [36:09<30:30, 26.83batch/s]  2025-04-08 22:46:45,564 - INFO - Epoch 1 (Step 056000): Train loss 3.835, Val loss 3.715
Epoch 1:  53%|█████▎    | 56086/105122 [36:12<30:12, 27.05batch/s]

[2025-04-08 22:46:48,778] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  54%|█████▍    | 56998/105122 [36:47<29:30, 27.19batch/s]  2025-04-08 22:47:24,005 - INFO - Epoch 1 (Step 057000): Train loss 3.883, Val loss 3.701
Epoch 1:  55%|█████▌    | 58000/105122 [37:27<32:17, 24.32batch/s]  2025-04-08 22:48:03,929 - INFO - Epoch 1 (Step 058000): Train loss 3.815, Val loss 3.698
Epoch 1:  55%|█████▌    | 58088/105122 [37:31<30:00, 26.12batch/s]

[2025-04-08 22:48:07,236] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  56%|█████▌    | 59000/105122 [38:06<28:26, 27.02batch/s]  2025-04-08 22:48:42,530 - INFO - Epoch 1 (Step 059000): Train loss 3.720, Val loss 3.686
Epoch 1:  57%|█████▋    | 59998/105122 [38:44<27:55, 26.93batch/s]  2025-04-08 22:49:21,093 - INFO - Epoch 1 (Step 060000): Train loss 3.709, Val loss 3.673
Epoch 1:  57%|█████▋    | 60090/105122 [38:48<27:48, 26.99batch/s]

[2025-04-08 22:49:24,521] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  58%|█████▊    | 60999/105122 [39:23<26:46, 27.47batch/s]  2025-04-08 22:49:59,706 - INFO - Epoch 1 (Step 061000): Train loss 3.636, Val loss 3.683
Epoch 1:  59%|█████▉    | 61998/105122 [40:02<26:33, 27.06batch/s]  2025-04-08 22:50:38,185 - INFO - Epoch 1 (Step 062000): Train loss 3.747, Val loss 3.675
Epoch 1:  59%|█████▉    | 62092/105122 [40:05<26:29, 27.08batch/s]

[2025-04-08 22:50:41,654] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  60%|█████▉    | 62998/105122 [40:40<26:32, 26.45batch/s]  2025-04-08 22:51:16,806 - INFO - Epoch 1 (Step 063000): Train loss 3.808, Val loss 3.654
Epoch 1:  61%|██████    | 64000/105122 [41:19<25:24, 26.97batch/s]  2025-04-08 22:51:55,333 - INFO - Epoch 1 (Step 064000): Train loss 3.634, Val loss 3.675
Epoch 1:  61%|██████    | 64092/105122 [41:24<25:43, 26.58batch/s]  

[2025-04-08 22:52:00,199] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  62%|██████▏   | 64998/105122 [41:59<24:45, 27.02batch/s]  2025-04-08 22:52:35,201 - INFO - Epoch 1 (Step 065000): Train loss 3.899, Val loss 3.636
Epoch 1:  63%|██████▎   | 66000/105122 [42:37<24:22, 26.75batch/s]  2025-04-08 22:53:13,724 - INFO - Epoch 1 (Step 066000): Train loss 3.736, Val loss 3.617
Epoch 1:  63%|██████▎   | 66094/105122 [42:41<24:01, 27.08batch/s]

[2025-04-08 22:53:17,302] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  64%|██████▎   | 67000/105122 [43:16<23:32, 26.99batch/s]  2025-04-08 22:53:52,149 - INFO - Epoch 1 (Step 067000): Train loss 3.648, Val loss 3.651
Epoch 1:  65%|██████▍   | 68000/105122 [43:54<22:53, 27.02batch/s]  2025-04-08 22:54:30,691 - INFO - Epoch 1 (Step 068000): Train loss 3.669, Val loss 3.644
Epoch 1:  65%|██████▍   | 68098/105122 [43:58<22:47, 27.08batch/s]

[2025-04-08 22:54:34,356] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  66%|██████▌   | 68998/105122 [44:33<22:20, 26.95batch/s]  2025-04-08 22:55:09,276 - INFO - Epoch 1 (Step 069000): Train loss 3.743, Val loss 3.603
Epoch 1:  67%|██████▋   | 69998/105122 [45:11<21:38, 27.04batch/s]  2025-04-08 22:55:47,808 - INFO - Epoch 1 (Step 070000): Train loss 3.645, Val loss 3.630
Epoch 1:  67%|██████▋   | 70098/105122 [45:15<22:07, 26.39batch/s]

[2025-04-08 22:55:51,566] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  68%|██████▊   | 70998/105122 [45:51<34:52, 16.30batch/s]  2025-04-08 22:56:27,707 - INFO - Epoch 1 (Step 071000): Train loss 3.628, Val loss 3.626
Epoch 1:  68%|██████▊   | 72000/105122 [46:30<20:32, 26.88batch/s]  2025-04-08 22:57:06,289 - INFO - Epoch 1 (Step 072000): Train loss 3.608, Val loss 3.629
Epoch 1:  69%|██████▊   | 72102/105122 [46:34<20:21, 27.03batch/s]

[2025-04-08 22:57:10,103] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  69%|██████▉   | 72998/105122 [47:08<19:54, 26.89batch/s]  2025-04-08 22:57:44,760 - INFO - Epoch 1 (Step 073000): Train loss 3.666, Val loss 3.611
Epoch 1:  70%|███████   | 73998/105122 [47:47<19:06, 27.14batch/s]  2025-04-08 22:58:23,253 - INFO - Epoch 1 (Step 074000): Train loss 3.455, Val loss 3.619
Epoch 1:  70%|███████   | 74102/105122 [47:51<19:05, 27.08batch/s]

[2025-04-08 22:58:27,138] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  71%|███████▏  | 75000/105122 [48:25<18:30, 27.12batch/s]  2025-04-08 22:59:01,785 - INFO - Epoch 1 (Step 075000): Train loss 3.721, Val loss 3.600
Epoch 1:  72%|███████▏  | 76000/105122 [49:04<17:53, 27.12batch/s]  2025-04-08 22:59:40,325 - INFO - Epoch 1 (Step 076000): Train loss 3.764, Val loss 3.618
Epoch 1:  72%|███████▏  | 76104/105122 [49:08<17:53, 27.02batch/s]

[2025-04-08 22:59:44,331] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  73%|███████▎  | 77000/105122 [49:42<17:23, 26.94batch/s]  2025-04-08 23:00:18,890 - INFO - Epoch 1 (Step 077000): Train loss 3.671, Val loss 3.609
Epoch 1:  74%|███████▍  | 77998/105122 [50:22<16:55, 26.70batch/s]  2025-04-08 23:00:58,822 - INFO - Epoch 1 (Step 078000): Train loss 3.719, Val loss 3.592
Epoch 1:  74%|███████▍  | 78106/105122 [50:26<16:33, 27.19batch/s]

[2025-04-08 23:01:02,856] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  75%|███████▌  | 78998/105122 [51:01<15:59, 27.22batch/s]  2025-04-08 23:01:37,321 - INFO - Epoch 1 (Step 079000): Train loss 3.787, Val loss 3.590
Epoch 1:  75%|███████▌  | 79302/105122 [51:12<16:06, 26.72batch/s]

[2025-04-08 23:01:48,629] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  76%|███████▌  | 80000/105122 [51:39<15:34, 26.88batch/s]  2025-04-08 23:02:15,798 - INFO - Epoch 1 (Step 080000): Train loss 3.726, Val loss 3.579
Epoch 1:  77%|███████▋  | 80998/105122 [52:18<15:01, 26.75batch/s]2025-04-08 23:02:54,394 - INFO - Epoch 1 (Step 081000): Train loss 3.599, Val loss 3.585
Epoch 1:  78%|███████▊  | 81999/105122 [52:56<13:55, 27.67batch/s]2025-04-08 23:03:33,079 - INFO - Epoch 1 (Step 082000): Train loss 3.720, Val loss 3.545
Epoch 1:  79%|███████▉  | 83000/105122 [53:35<13:44, 26.82batch/s]2025-04-08 23:04:11,553 - INFO - Epoch 1 (Step 083000): Train loss 3.696, Val loss 3.569
Epoch 1:  79%|███████▉  | 83304/105122 [53:48<13:30, 26.93batch/s]

[2025-04-08 23:04:24,293] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  80%|███████▉  | 83999/105122 [54:15<50:53,  6.92batch/s]2025-04-08 23:04:51,480 - INFO - Epoch 1 (Step 084000): Train loss 3.600, Val loss 3.586
Epoch 1:  81%|████████  | 84998/105122 [54:53<12:28, 26.88batch/s]2025-04-08 23:05:30,043 - INFO - Epoch 1 (Step 085000): Train loss 3.699, Val loss 3.566
Epoch 1:  81%|████████  | 85308/105122 [55:05<12:14, 26.99batch/s]

[2025-04-08 23:05:41,504] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  82%|████████▏ | 85998/105122 [55:32<11:49, 26.94batch/s]2025-04-08 23:06:08,574 - INFO - Epoch 1 (Step 086000): Train loss 3.618, Val loss 3.587
Epoch 1:  83%|████████▎ | 87000/105122 [56:11<11:20, 26.62batch/s]2025-04-08 23:06:47,169 - INFO - Epoch 1 (Step 087000): Train loss 3.634, Val loss 3.553
Epoch 1:  83%|████████▎ | 87308/105122 [56:22<10:52, 27.32batch/s]

[2025-04-08 23:06:58,682] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  84%|████████▎ | 87998/105122 [56:49<10:31, 27.13batch/s]2025-04-08 23:07:25,751 - INFO - Epoch 1 (Step 088000): Train loss 3.519, Val loss 3.540
Epoch 1:  85%|████████▍ | 88914/105122 [57:25<09:59, 27.06batch/s]

[2025-04-08 23:08:00,999] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  85%|████████▍ | 88998/105122 [57:28<09:51, 27.27batch/s]2025-04-08 23:08:04,226 - INFO - Epoch 1 (Step 089000): Train loss 3.589, Val loss 3.563
Epoch 1:  86%|████████▌ | 89998/105122 [58:06<09:21, 26.94batch/s]2025-04-08 23:08:42,880 - INFO - Epoch 1 (Step 090000): Train loss 3.689, Val loss 3.556
Epoch 1:  87%|████████▋ | 90998/105122 [58:46<08:54, 26.45batch/s]2025-04-08 23:09:22,806 - INFO - Epoch 1 (Step 091000): Train loss 3.584, Val loss 3.545
Epoch 1:  88%|████████▊ | 92000/105122 [59:25<08:14, 26.52batch/s]2025-04-08 23:10:01,336 - INFO - Epoch 1 (Step 092000): Train loss 3.643, Val loss 3.546
Epoch 1:  88%|████████▊ | 92914/105122 [1:00:00<07:29, 27.15batch/s]

[2025-04-08 23:10:36,807] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  88%|████████▊ | 92998/105122 [1:00:03<07:29, 26.98batch/s]2025-04-08 23:10:39,966 - INFO - Epoch 1 (Step 093000): Train loss 3.559, Val loss 3.524
Epoch 1:  89%|████████▉ | 93998/105122 [1:00:42<06:52, 26.94batch/s]2025-04-08 23:11:18,552 - INFO - Epoch 1 (Step 094000): Train loss 3.666, Val loss 3.562
Epoch 1:  90%|████████▉ | 94589/105122 [1:01:05<06:26, 27.22batch/s]

[2025-04-08 23:11:41,854] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  90%|█████████ | 94998/105122 [1:01:21<06:22, 26.45batch/s]2025-04-08 23:11:57,155 - INFO - Epoch 1 (Step 095000): Train loss 3.488, Val loss 3.528
Epoch 1:  91%|█████████▏| 95998/105122 [1:01:59<05:38, 26.97batch/s]2025-04-08 23:12:35,787 - INFO - Epoch 1 (Step 096000): Train loss 3.636, Val loss 3.545
Epoch 1:  92%|█████████▏| 97000/105122 [1:02:38<05:00, 27.06batch/s]2025-04-08 23:13:14,348 - INFO - Epoch 1 (Step 097000): Train loss 3.546, Val loss 3.553
Epoch 1:  93%|█████████▎| 97998/105122 [1:03:18<04:23, 27.01batch/s]2025-04-08 23:13:54,289 - INFO - Epoch 1 (Step 098000): Train loss 3.651, Val loss 3.527
Epoch 1:  94%|█████████▍| 98590/105122 [1:03:40<04:00, 27.11batch/s]

[2025-04-08 23:14:16,298] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  94%|█████████▍| 98998/105122 [1:03:56<03:47, 26.93batch/s]2025-04-08 23:14:32,850 - INFO - Epoch 1 (Step 099000): Train loss 3.713, Val loss 3.503
Epoch 1:  95%|█████████▍| 99798/105122 [1:04:27<03:18, 26.80batch/s]

[2025-04-08 23:15:03,803] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  95%|█████████▌| 99998/105122 [1:04:35<03:11, 26.72batch/s]2025-04-08 23:15:11,358 - INFO - Epoch 1 (Step 100000): Train loss 3.729, Val loss 3.540
Epoch 1:  95%|█████████▌| 100004/105122 [1:04:35<03:47, 22.45batch/s]

[2025-04-08 23:15:11,613] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  96%|█████████▌| 100998/105122 [1:05:13<02:36, 26.33batch/s]2025-04-08 23:15:49,900 - INFO - Epoch 1 (Step 101000): Train loss 3.620, Val loss 3.542
Epoch 1:  97%|█████████▋| 102000/105122 [1:05:52<01:54, 27.22batch/s]2025-04-08 23:16:28,530 - INFO - Epoch 1 (Step 102000): Train loss 3.434, Val loss 3.517
Epoch 1:  98%|█████████▊| 102998/105122 [1:06:30<01:20, 26.36batch/s]2025-04-08 23:17:07,069 - INFO - Epoch 1 (Step 103000): Train loss 3.612, Val loss 3.517
Epoch 1:  98%|█████████▊| 103098/105122 [1:06:36<02:06, 16.03batch/s]

[2025-04-08 23:17:12,076] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  99%|█████████▉| 103998/105122 [1:07:10<00:43, 25.79batch/s]2025-04-08 23:17:46,999 - INFO - Epoch 1 (Step 104000): Train loss 3.577, Val loss 3.531
Epoch 1: 100%|█████████▉| 104998/105122 [1:07:49<00:04, 26.66batch/s]2025-04-08 23:18:25,642 - INFO - Epoch 1 (Step 105000): Train loss 3.502, Val loss 3.511
Epoch 1: 100%|██████████| 105122/105122 [1:07:54<00:00, 25.80batch/s]
2025-04-08 23:18:30,141 - INFO - Epoch 1 completed. Generating a sample...
2025-04-08 23:18:30,345 - INFO - Generated Text: Every effort moves you 're going to the world . " 
   = = Production = = 
   The episode was written by series co @-@ creator Trey Parker , who had previously worked on the series . Parker was the first to write the


Every effort moves you 're going to the world . "     = = Production = =     The episode was written by series co @-@ creator Trey Parker , who had previously worked on the series . Parker was the first to write the
[2025-04-08 23:18:30,368] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step52561 is about to be saved!
[2025-04-08 23:18:30,370] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ./checkpoints/checkpoint-epoch1/global_step52561/mp_rank_00_model_states.pt
[2025-04-08 23:18:30,371] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./checkpoints/checkpoint-epoch1/global_step52561/mp_rank_00_model_states.pt...


[rank0]:[W408 23:18:30.580142724 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.


[2025-04-08 23:18:30,707] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./checkpoints/checkpoint-epoch1/global_step52561/mp_rank_00_model_states.pt.
[2025-04-08 23:18:30,708] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./checkpoints/checkpoint-epoch1/global_step52561/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2025-04-08 23:18:32,642] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./checkpoints/checkpoint-epoch1/global_step52561/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[2025-04-08 23:18:32,652] [INFO] [engine.py:3672:_save_zero_checkpoint] zero checkpoint saved ./checkpoints/checkpoint-epoch1/global_step52561/zero_pp_rank_0_mp_rank_00_optim_states.pt
[2025-04-08 23:18:32,653] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step52561 is ready now!


2025-04-08 23:18:32,654 - INFO - Checkpoint saved at: ./checkpoints/checkpoint-epoch1
2025-04-08 23:18:32,654 - INFO - Starting Epoch 2...
Epoch 2:   1%|          | 878/105122 [00:34<1:04:43, 26.84batch/s]2025-04-08 23:19:07,242 - INFO - Epoch 2 (Step 106000): Train loss 3.546, Val loss 3.476
Epoch 2:   2%|▏         | 1876/105122 [01:12<1:03:55, 26.92batch/s]2025-04-08 23:19:45,787 - INFO - Epoch 2 (Step 107000): Train loss 3.588, Val loss 3.520
Epoch 2:   3%|▎         | 2878/105122 [01:51<1:02:56, 27.07batch/s]2025-04-08 23:20:24,361 - INFO - Epoch 2 (Step 108000): Train loss 3.582, Val loss 3.513
Epoch 2:   3%|▎         | 3098/105122 [02:01<1:23:46, 20.30batch/s]

[2025-04-08 23:20:33,986] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 2:   4%|▎         | 3877/105122 [02:30<1:01:26, 27.47batch/s]2025-04-08 23:21:02,929 - INFO - Epoch 2 (Step 109000): Train loss 3.602, Val loss 3.506
Epoch 2:   4%|▍         | 4466/105122 [02:53<1:01:58, 27.07batch/s]

[2025-04-08 23:21:26,149] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:   5%|▍         | 4878/105122 [03:10<1:01:47, 27.04batch/s]2025-04-08 23:21:42,893 - INFO - Epoch 2 (Step 110000): Train loss 3.471, Val loss 3.481
Epoch 2:   6%|▌         | 5876/105122 [03:48<1:00:48, 27.20batch/s]2025-04-08 23:22:21,415 - INFO - Epoch 2 (Step 111000): Train loss 3.627, Val loss 3.505
Epoch 2:   7%|▋         | 6876/105122 [04:27<1:00:25, 27.10batch/s]2025-04-08 23:22:59,907 - INFO - Epoch 2 (Step 112000): Train loss 3.510, Val loss 3.477
Epoch 2:   7%|▋         | 7878/105122 [05:05<1:00:07, 26.96batch/s]2025-04-08 23:23:38,578 - INFO - Epoch 2 (Step 113000): Train loss 3.550, Val loss 3.503
Epoch 2:   8%|▊         | 8878/105122 [05:44<59:36, 26.91batch/s]  2025-04-08 23:24:17,177 - INFO - Epoch 2 (Step 114000): Train loss 3.481, Val loss 3.499
Epoch 2:   9%|▊         | 9138/105122 [05:54<59:54, 26.70batch/s]  

[2025-04-08 23:24:26,874] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 2:   9%|▉         | 9878/105122 [06:23<58:45, 27.01batch/s]  2025-04-08 23:24:55,822 - INFO - Epoch 2 (Step 115000): Train loss 3.571, Val loss 3.508
Epoch 2:  10%|█         | 10878/105122 [07:01<1:00:05, 26.14batch/s]2025-04-08 23:25:34,436 - INFO - Epoch 2 (Step 116000): Train loss 3.561, Val loss 3.487
Epoch 2:  11%|█         | 11521/105122 [07:27<56:24, 27.65batch/s]  

[2025-04-08 23:25:59,727] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 2:  11%|█▏        | 11878/105122 [07:41<57:48, 26.89batch/s]  2025-04-08 23:26:14,422 - INFO - Epoch 2 (Step 117000): Train loss 3.469, Val loss 3.491
Epoch 2:  12%|█▏        | 12286/105122 [07:56<58:45, 26.33batch/s]  

[2025-04-08 23:26:29,658] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:  12%|█▏        | 12876/105122 [08:20<56:38, 27.14batch/s]  2025-04-08 23:26:52,876 - INFO - Epoch 2 (Step 118000): Train loss 3.548, Val loss 3.505
Epoch 2:  13%|█▎        | 13878/105122 [08:58<57:41, 26.36batch/s]  2025-04-08 23:27:31,555 - INFO - Epoch 2 (Step 119000): Train loss 3.544, Val loss 3.482
Epoch 2:  14%|█▍        | 14878/105122 [09:37<55:58, 26.87batch/s]  2025-04-08 23:28:10,213 - INFO - Epoch 2 (Step 120000): Train loss 3.503, Val loss 3.481
Epoch 2:  15%|█▍        | 15338/105122 [09:56<54:58, 27.22batch/s]  

[2025-04-08 23:28:28,785] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:  15%|█▌        | 15876/105122 [10:15<56:13, 26.46batch/s]2025-04-08 23:28:48,829 - INFO - Epoch 2 (Step 121000): Train loss 3.474, Val loss 3.485
Epoch 2:  16%|█▌        | 16878/105122 [10:54<54:33, 26.96batch/s]  2025-04-08 23:29:27,434 - INFO - Epoch 2 (Step 122000): Train loss 3.514, Val loss 3.460
Epoch 2:  17%|█▋        | 17564/105122 [11:21<54:18, 26.87batch/s]  

[2025-04-08 23:29:54,409] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:  17%|█▋        | 17876/105122 [11:34<54:42, 26.58batch/s]  2025-04-08 23:30:07,377 - INFO - Epoch 2 (Step 123000): Train loss 3.488, Val loss 3.442
Epoch 2:  18%|█▊        | 18878/105122 [12:13<53:12, 27.02batch/s]  2025-04-08 23:30:46,129 - INFO - Epoch 2 (Step 124000): Train loss 3.370, Val loss 3.455
Epoch 2:  19%|█▉        | 19876/105122 [12:51<52:52, 26.87batch/s]  2025-04-08 23:31:24,814 - INFO - Epoch 2 (Step 125000): Train loss 3.664, Val loss 3.486
Epoch 2:  20%|█▉        | 20878/105122 [13:30<51:52, 27.06batch/s]  2025-04-08 23:32:03,483 - INFO - Epoch 2 (Step 126000): Train loss 3.603, Val loss 3.441
Epoch 2:  20%|██        | 21056/105122 [13:37<53:04, 26.40batch/s]  

[2025-04-08 23:32:10,191] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:  21%|██        | 21680/105122 [14:01<51:50, 26.83batch/s]  

[2025-04-08 23:32:34,703] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  21%|██        | 21878/105122 [14:09<51:26, 26.97batch/s]2025-04-08 23:32:42,104 - INFO - Epoch 2 (Step 127000): Train loss 3.488, Val loss 3.446
Epoch 2:  22%|██▏       | 22878/105122 [14:48<51:44, 26.49batch/s]  2025-04-08 23:33:20,851 - INFO - Epoch 2 (Step 128000): Train loss 3.457, Val loss 3.441
Epoch 2:  23%|██▎       | 23876/105122 [15:26<50:28, 26.82batch/s]  2025-04-08 23:34:00,849 - INFO - Epoch 2 (Step 129000): Train loss 3.660, Val loss 3.457
Epoch 2:  24%|██▎       | 24878/105122 [16:06<50:28, 26.50batch/s]  2025-04-08 23:34:39,483 - INFO - Epoch 2 (Step 130000): Train loss 3.513, Val loss 3.458
Epoch 2:  25%|██▍       | 25876/105122 [16:45<48:51, 27.04batch/s]  2025-04-08 23:35:18,099 - INFO - Epoch 2 (Step 131000): Train loss 3.430, Val loss 3.450
Epoch 2:  25%|██▍       | 26074/105122 [16:52<49:49, 26.44batch/s]

[2025-04-08 23:35:25,403] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:  26%|██▌       | 26876/105122 [17:23<48:15, 27.02batch/s]  2025-04-08 23:35:56,694 - INFO - Epoch 2 (Step 132000): Train loss 3.488, Val loss 3.465
Epoch 2:  26%|██▌       | 27382/105122 [17:44<51:09, 25.32batch/s]  

[2025-04-08 23:36:16,847] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  27%|██▋       | 27876/105122 [18:02<47:44, 26.97batch/s]2025-04-08 23:36:35,362 - INFO - Epoch 2 (Step 133000): Train loss 3.552, Val loss 3.455
Epoch 2:  27%|██▋       | 28876/105122 [18:41<47:12, 26.92batch/s]  2025-04-08 23:37:14,076 - INFO - Epoch 2 (Step 134000): Train loss 3.466, Val loss 3.457
Epoch 2:  28%|██▊       | 29878/105122 [19:19<46:11, 27.15batch/s]  2025-04-08 23:37:52,707 - INFO - Epoch 2 (Step 135000): Train loss 3.527, Val loss 3.453
Epoch 2:  29%|██▉       | 30878/105122 [19:59<46:05, 26.85batch/s]  2025-04-08 23:38:32,775 - INFO - Epoch 2 (Step 136000): Train loss 3.454, Val loss 3.455
Epoch 2:  30%|███       | 31619/105122 [20:27<44:38, 27.44batch/s]

[2025-04-08 23:39:00,445] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:  30%|███       | 31878/105122 [20:38<45:37, 26.76batch/s]  2025-04-08 23:39:11,438 - INFO - Epoch 2 (Step 137000): Train loss 3.401, Val loss 3.435
Epoch 2:  31%|███       | 32526/105122 [21:02<44:45, 27.04batch/s]

[2025-04-08 23:39:35,573] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  31%|███▏      | 32878/105122 [21:17<44:39, 26.96batch/s]  2025-04-08 23:39:50,103 - INFO - Epoch 2 (Step 138000): Train loss 3.434, Val loss 3.443
Epoch 2:  32%|███▏      | 33878/105122 [21:55<44:11, 26.87batch/s]  2025-04-08 23:40:28,741 - INFO - Epoch 2 (Step 139000): Train loss 3.548, Val loss 3.431
Epoch 2:  33%|███▎      | 34876/105122 [22:34<43:55, 26.65batch/s]  2025-04-08 23:41:07,367 - INFO - Epoch 2 (Step 140000): Train loss 3.506, Val loss 3.435
Epoch 2:  33%|███▎      | 35144/105122 [22:45<1:47:00, 10.90batch/s]

[2025-04-08 23:41:18,726] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  34%|███▍      | 35878/105122 [23:13<42:56, 26.87batch/s]  2025-04-08 23:41:45,957 - INFO - Epoch 2 (Step 141000): Train loss 3.506, Val loss 3.445
Epoch 2:  35%|███▌      | 36876/105122 [23:53<1:24:24, 13.48batch/s]2025-04-08 23:42:26,114 - INFO - Epoch 2 (Step 142000): Train loss 3.529, Val loss 3.454
Epoch 2:  36%|███▌      | 37878/105122 [24:31<43:40, 25.66batch/s]  2025-04-08 23:43:04,773 - INFO - Epoch 2 (Step 143000): Train loss 3.510, Val loss 3.441
Epoch 2:  37%|███▋      | 38490/105122 [24:54<41:14, 26.93batch/s]

[2025-04-08 23:43:27,618] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  37%|███▋      | 38878/105122 [25:10<41:21, 26.70batch/s]  2025-04-08 23:43:43,454 - INFO - Epoch 2 (Step 144000): Train loss 3.442, Val loss 3.429
Epoch 2:  38%|███▊      | 39878/105122 [25:49<40:08, 27.09batch/s]  2025-04-08 23:44:22,136 - INFO - Epoch 2 (Step 145000): Train loss 3.505, Val loss 3.431
Epoch 2:  39%|███▉      | 40876/105122 [26:27<40:18, 26.56batch/s]  2025-04-08 23:45:00,811 - INFO - Epoch 2 (Step 146000): Train loss 3.468, Val loss 3.452
Epoch 2:  39%|███▉      | 41458/105122 [26:51<39:34, 26.82batch/s]  

[2025-04-08 23:45:23,848] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  40%|███▉      | 41878/105122 [27:06<39:11, 26.90batch/s]2025-04-08 23:45:39,528 - INFO - Epoch 2 (Step 147000): Train loss 3.579, Val loss 3.447
Epoch 2:  41%|████      | 42876/105122 [27:45<38:02, 27.27batch/s]  2025-04-08 23:46:18,169 - INFO - Epoch 2 (Step 148000): Train loss 3.437, Val loss 3.451
Epoch 2:  42%|████▏     | 43878/105122 [28:25<37:53, 26.94batch/s]  2025-04-08 23:46:58,202 - INFO - Epoch 2 (Step 149000): Train loss 3.530, Val loss 3.446
Epoch 2:  42%|████▏     | 44468/105122 [28:47<37:45, 26.77batch/s]

[2025-04-08 23:47:20,242] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  43%|████▎     | 44878/105122 [29:04<37:06, 27.06batch/s]  2025-04-08 23:47:36,893 - INFO - Epoch 2 (Step 150000): Train loss 3.469, Val loss 3.442
Epoch 2:  44%|████▎     | 45878/105122 [29:42<36:47, 26.83batch/s]  2025-04-08 23:48:15,538 - INFO - Epoch 2 (Step 151000): Train loss 3.426, Val loss 3.443
Epoch 2:  44%|████▍     | 46739/105122 [30:16<35:49, 27.17batch/s]  

[2025-04-08 23:48:48,983] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  45%|████▍     | 46876/105122 [30:21<35:49, 27.10batch/s]2025-04-08 23:48:54,144 - INFO - Epoch 2 (Step 152000): Train loss 3.470, Val loss 3.411
Epoch 2:  46%|████▌     | 47862/105122 [30:59<36:03, 26.46batch/s]  

[2025-04-08 23:49:32,084] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  46%|████▌     | 47878/105122 [30:59<35:35, 26.80batch/s]2025-04-08 23:49:32,747 - INFO - Epoch 2 (Step 153000): Train loss 3.569, Val loss 3.403
Epoch 2:  46%|████▋     | 48878/105122 [31:38<34:49, 26.92batch/s]  2025-04-08 23:50:11,341 - INFO - Epoch 2 (Step 154000): Train loss 3.513, Val loss 3.416
Epoch 2:  47%|████▋     | 49878/105122 [32:18<38:06, 24.16batch/s]  2025-04-08 23:50:51,401 - INFO - Epoch 2 (Step 155000): Train loss 3.513, Val loss 3.411
Epoch 2:  48%|████▊     | 50876/105122 [32:57<33:31, 26.97batch/s]  2025-04-08 23:51:30,137 - INFO - Epoch 2 (Step 156000): Train loss 3.360, Val loss 3.387
Epoch 2:  49%|████▉     | 51878/105122 [33:35<32:55, 26.95batch/s]  2025-04-08 23:52:08,774 - INFO - Epoch 2 (Step 157000): Train loss 3.547, Val loss 3.405
Epoch 2:  50%|█████     | 52876/105122 [34:14<32:33, 26.74batch/s]  2025-04-08 23:52:47,421 - INFO - Epoch 2 (Step 158000): Train loss 3.429, Val loss 3.412
Epoch 2:  51%|█████▏    | 53876/105122 [34:53<31:41, 26.94ba

[2025-04-08 23:53:27,582] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 2:  52%|█████▏    | 54160/105122 [35:03<31:36, 26.88batch/s]

[2025-04-08 23:53:36,571] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  52%|█████▏    | 54804/105122 [35:29<31:12, 26.87batch/s]  

[2025-04-08 23:54:01,873] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  52%|█████▏    | 54876/105122 [35:31<31:07, 26.91batch/s]2025-04-08 23:54:04,685 - INFO - Epoch 2 (Step 160000): Train loss 3.305, Val loss 3.406
Epoch 2:  53%|█████▎    | 55876/105122 [36:10<30:30, 26.91batch/s]  2025-04-08 23:54:43,382 - INFO - Epoch 2 (Step 161000): Train loss 3.531, Val loss 3.438
Epoch 2:  54%|█████▍    | 56876/105122 [36:50<30:30, 26.36batch/s]  2025-04-08 23:55:23,460 - INFO - Epoch 2 (Step 162000): Train loss 3.486, Val loss 3.399
Epoch 2:  55%|█████▍    | 57301/105122 [37:06<29:11, 27.31batch/s]

[2025-04-08 23:55:39,216] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  55%|█████▌    | 57878/105122 [37:29<29:55, 26.32batch/s]  2025-04-08 23:56:02,152 - INFO - Epoch 2 (Step 163000): Train loss 3.553, Val loss 3.411
Epoch 2:  56%|█████▌    | 58878/105122 [38:08<28:43, 26.84batch/s]  2025-04-08 23:56:40,814 - INFO - Epoch 2 (Step 164000): Train loss 3.402, Val loss 3.413
Epoch 2:  57%|█████▋    | 59878/105122 [38:46<28:00, 26.93batch/s]  2025-04-08 23:57:19,388 - INFO - Epoch 2 (Step 165000): Train loss 3.440, Val loss 3.432
Epoch 2:  57%|█████▋    | 60282/105122 [39:03<28:45, 25.98batch/s]  

[2025-04-08 23:57:35,854] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  58%|█████▊    | 60878/105122 [39:25<27:23, 26.92batch/s]2025-04-08 23:57:58,011 - INFO - Epoch 2 (Step 166000): Train loss 3.401, Val loss 3.416
Epoch 2:  59%|█████▊    | 61542/105122 [39:51<27:03, 26.85batch/s]  

[2025-04-08 23:58:24,139] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  59%|█████▉    | 61876/105122 [40:03<26:48, 26.88batch/s]2025-04-08 23:58:36,702 - INFO - Epoch 2 (Step 167000): Train loss 3.472, Val loss 3.410
Epoch 2:  60%|█████▉    | 62876/105122 [40:43<26:45, 26.31batch/s]  2025-04-08 23:59:16,696 - INFO - Epoch 2 (Step 168000): Train loss 3.298, Val loss 3.410
Epoch 2:  60%|██████    | 63322/105122 [41:00<26:15, 26.54batch/s]

[2025-04-08 23:59:33,229] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 2:  61%|██████    | 63878/105122 [41:22<25:34, 26.88batch/s]  2025-04-08 23:59:55,406 - INFO - Epoch 2 (Step 169000): Train loss 3.289, Val loss 3.408
Epoch 2:  62%|██████▏   | 64878/105122 [42:01<24:56, 26.89batch/s]  2025-04-09 00:00:34,099 - INFO - Epoch 2 (Step 170000): Train loss 3.331, Val loss 3.412
Epoch 2:  62%|██████▏   | 65476/105122 [42:25<24:58, 26.46batch/s]  

[2025-04-09 00:00:57,731] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 2:  63%|██████▎   | 65876/105122 [42:39<24:21, 26.85batch/s]2025-04-09 00:01:12,740 - INFO - Epoch 2 (Step 171000): Train loss 3.522, Val loss 3.396
Epoch 2:  64%|██████▎   | 66876/105122 [43:18<23:41, 26.90batch/s]  2025-04-09 00:01:51,380 - INFO - Epoch 2 (Step 172000): Train loss 3.415, Val loss 3.387
Epoch 2:  65%|██████▍   | 67878/105122 [43:57<22:49, 27.19batch/s]  2025-04-09 00:02:29,998 - INFO - Epoch 2 (Step 173000): Train loss 3.339, Val loss 3.410
Epoch 2:  66%|██████▌   | 68877/105122 [44:37<1:32:44,  6.51batch/s]2025-04-09 00:03:09,975 - INFO - Epoch 2 (Step 174000): Train loss 3.271, Val loss 3.424
Epoch 2:  66%|██████▋   | 69878/105122 [45:15<22:01, 26.67batch/s]  2025-04-09 00:03:48,701 - INFO - Epoch 2 (Step 175000): Train loss 3.496, Val loss 3.408
Epoch 2:  67%|██████▋   | 70876/105122 [45:54<21:20, 26.74batch/s]  2025-04-09 00:04:27,394 - INFO - Epoch 2 (Step 176000): Train loss 3.529, Val loss 3.396
Epoch 2:  68%|██████▊   | 71876/105122 [46:33<20:30, 27.01ba

[2025-04-09 00:05:50,669] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  70%|███████   | 73876/105122 [47:50<19:18, 26.98batch/s]  2025-04-09 00:06:23,543 - INFO - Epoch 2 (Step 179000): Train loss 3.316, Val loss 3.402
Epoch 2:  71%|███████   | 74878/105122 [48:29<18:46, 26.86batch/s]  2025-04-09 00:07:02,145 - INFO - Epoch 2 (Step 180000): Train loss 3.386, Val loss 3.380
Epoch 2:  71%|███████▏  | 75082/105122 [48:38<19:28, 25.70batch/s]  

[2025-04-09 00:07:11,192] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  72%|███████▏  | 75876/105122 [49:09<18:02, 27.03batch/s]  2025-04-09 00:07:42,155 - INFO - Epoch 2 (Step 181000): Train loss 3.583, Val loss 3.376
Epoch 2:  73%|███████▎  | 76876/105122 [49:47<17:27, 26.97batch/s]  2025-04-09 00:08:20,836 - INFO - Epoch 2 (Step 182000): Train loss 3.480, Val loss 3.371
Epoch 2:  74%|███████▍  | 77878/105122 [50:26<17:08, 26.49batch/s]  2025-04-09 00:08:59,535 - INFO - Epoch 2 (Step 183000): Train loss 3.410, Val loss 3.399
Epoch 2:  75%|███████▌  | 78876/105122 [51:05<16:20, 26.76batch/s]  2025-04-09 00:09:38,329 - INFO - Epoch 2 (Step 184000): Train loss 3.482, Val loss 3.382
Epoch 2:  76%|███████▌  | 79878/105122 [51:44<15:38, 26.89batch/s]  2025-04-09 00:10:16,867 - INFO - Epoch 2 (Step 185000): Train loss 3.479, Val loss 3.409
Epoch 2:  76%|███████▌  | 79992/105122 [51:48<15:45, 26.59batch/s]

[2025-04-09 00:10:21,152] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  76%|███████▋  | 80208/105122 [51:57<15:28, 26.84batch/s]  

[2025-04-09 00:10:30,597] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  77%|███████▋  | 80878/105122 [52:22<15:17, 26.44batch/s]2025-04-09 00:10:55,581 - INFO - Epoch 2 (Step 186000): Train loss 3.338, Val loss 3.389
Epoch 2:  78%|███████▊  | 81878/105122 [53:02<24:08, 16.04batch/s]2025-04-09 00:11:35,698 - INFO - Epoch 2 (Step 187000): Train loss 3.397, Val loss 3.388
Epoch 2:  78%|███████▊  | 82504/105122 [53:26<14:04, 26.78batch/s]

[2025-04-09 00:11:59,108] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  79%|███████▉  | 82876/105122 [53:41<13:48, 26.86batch/s]2025-04-09 00:12:14,343 - INFO - Epoch 2 (Step 188000): Train loss 3.390, Val loss 3.406
Epoch 2:  80%|███████▉  | 83876/105122 [54:20<13:10, 26.87batch/s]2025-04-09 00:12:52,948 - INFO - Epoch 2 (Step 189000): Train loss 3.418, Val loss 3.388
Epoch 2:  81%|████████  | 84876/105122 [54:58<12:36, 26.77batch/s]2025-04-09 00:13:31,562 - INFO - Epoch 2 (Step 190000): Train loss 3.439, Val loss 3.392
Epoch 2:  81%|████████▏ | 85650/105122 [55:28<12:00, 27.01batch/s]

[2025-04-09 00:14:01,646] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  82%|████████▏ | 85878/105122 [55:37<12:02, 26.64batch/s]2025-04-09 00:14:10,185 - INFO - Epoch 2 (Step 191000): Train loss 3.381, Val loss 3.396
Epoch 2:  83%|████████▎ | 86878/105122 [56:16<11:22, 26.72batch/s]2025-04-09 00:14:48,847 - INFO - Epoch 2 (Step 192000): Train loss 3.511, Val loss 3.403
Epoch 2:  84%|████████▎ | 87876/105122 [56:54<10:38, 27.00batch/s]2025-04-09 00:15:27,525 - INFO - Epoch 2 (Step 193000): Train loss 3.529, Val loss 3.387
Epoch 2:  85%|████████▍ | 88878/105122 [57:34<10:11, 26.57batch/s]2025-04-09 00:16:07,512 - INFO - Epoch 2 (Step 194000): Train loss 3.521, Val loss 3.390
Epoch 2:  85%|████████▍ | 88960/105122 [57:37<10:00, 26.92batch/s]

[2025-04-09 00:16:10,596] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  85%|████████▌ | 89878/105122 [58:13<09:28, 26.83batch/s]2025-04-09 00:16:46,161 - INFO - Epoch 2 (Step 195000): Train loss 3.362, Val loss 3.409
Epoch 2:  86%|████████▋ | 90878/105122 [58:52<08:47, 27.01batch/s]2025-04-09 00:17:24,823 - INFO - Epoch 2 (Step 196000): Train loss 3.259, Val loss 3.388
Epoch 2:  87%|████████▋ | 91878/105122 [59:30<08:11, 26.97batch/s]2025-04-09 00:18:03,572 - INFO - Epoch 2 (Step 197000): Train loss 3.466, Val loss 3.386
Epoch 2:  88%|████████▊ | 92186/105122 [59:42<08:01, 26.86batch/s]

[2025-04-09 00:18:15,143] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  88%|████████▊ | 92738/105122 [1:00:04<07:45, 26.62batch/s]

[2025-04-09 00:18:37,028] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  88%|████████▊ | 92876/105122 [1:00:09<07:37, 26.78batch/s]2025-04-09 00:18:42,213 - INFO - Epoch 2 (Step 198000): Train loss 3.442, Val loss 3.368
Epoch 2:  89%|████████▉ | 93878/105122 [1:00:48<06:58, 26.89batch/s]2025-04-09 00:19:20,821 - INFO - Epoch 2 (Step 199000): Train loss 3.390, Val loss 3.361
Epoch 2:  89%|████████▉ | 94002/105122 [1:00:54<10:05, 18.36batch/s]

[2025-04-09 00:19:26,887] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 2:  90%|█████████ | 94876/105122 [1:01:27<07:45, 22.02batch/s]2025-04-09 00:20:00,843 - INFO - Epoch 2 (Step 200000): Train loss 3.377, Val loss 3.390
Epoch 2:  91%|█████████ | 95878/105122 [1:02:06<05:49, 26.43batch/s]2025-04-09 00:20:39,703 - INFO - Epoch 2 (Step 201000): Train loss 3.313, Val loss 3.405
Epoch 2:  92%|█████████▏| 96878/105122 [1:02:45<05:04, 27.05batch/s]2025-04-09 00:21:18,342 - INFO - Epoch 2 (Step 202000): Train loss 3.455, Val loss 3.411
Epoch 2:  93%|█████████▎| 97876/105122 [1:03:24<04:26, 27.20batch/s]2025-04-09 00:21:57,058 - INFO - Epoch 2 (Step 203000): Train loss 3.329, Val loss 3.388
Epoch 2:  94%|█████████▍| 98876/105122 [1:04:02<04:05, 25.42batch/s]2025-04-09 00:22:35,856 - INFO - Epoch 2 (Step 204000): Train loss 3.520, Val loss 3.380
Epoch 2:  95%|█████████▌| 99876/105122 [1:04:41<03:14, 26.96batch/s]2025-04-09 00:23:14,611 - INFO - Epoch 2 (Step 205000): Train loss 3.586, Val loss 3.373
Epoch 2:  96%|█████████▌| 100878/105122 [1:05:20<02:38, 26

[2025-04-09 00:24:12,220] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  97%|█████████▋| 101560/105122 [1:05:47<02:13, 26.77batch/s]

[2025-04-09 00:24:20,096] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  97%|█████████▋| 101877/105122 [1:06:00<01:58, 27.34batch/s]2025-04-09 00:24:33,252 - INFO - Epoch 2 (Step 207000): Train loss 3.415, Val loss 3.381
Epoch 2:  97%|█████████▋| 102050/105122 [1:06:06<01:53, 26.96batch/s]

[2025-04-09 00:24:39,694] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 2:  98%|█████████▊| 102876/105122 [1:06:39<01:23, 27.04batch/s]2025-04-09 00:25:11,930 - INFO - Epoch 2 (Step 208000): Train loss 3.549, Val loss 3.390
Epoch 2:  99%|█████████▉| 103878/105122 [1:07:17<00:46, 26.98batch/s]2025-04-09 00:25:50,715 - INFO - Epoch 2 (Step 209000): Train loss 3.418, Val loss 3.390
Epoch 2: 100%|█████████▉| 104876/105122 [1:07:56<00:09, 26.81batch/s]2025-04-09 00:26:29,430 - INFO - Epoch 2 (Step 210000): Train loss 3.326, Val loss 3.392
Epoch 2: 100%|██████████| 105122/105122 [1:08:05<00:00, 25.73batch/s]
2025-04-09 00:26:38,523 - INFO - Epoch 2 completed. Generating a sample...
2025-04-09 00:26:38,712 - INFO - Generated Text: Every effort moves you up to the point where you 're going to get a lot of things . " 
   = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


Every effort moves you up to the point where you 're going to get a lot of things . "     = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
[2025-04-09 00:26:38,714] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step105122 is about to be saved!
[2025-04-09 00:26:38,717] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ./checkpoints/checkpoint-epoch2/global_step105122/mp_rank_00_model_states.pt
[2025-04-09 00:26:38,717] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./checkpoints/checkpoint-epoch2/global_step105122/mp_rank_00_model_states.pt...
[2025-04-09 00:26:39,057] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./checkpoints/checkpoint-epoch2/global_step105122/mp_rank_00_model_states.pt.
[2025-04-09 00:26:39,059] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./checkpoints/checkpoint-epoch2/global_step105122/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2025-04-09 00:26:40,990] [INFO] [torch_checkpoi

2025-04-09 00:26:40,992 - INFO - Checkpoint saved at: ./checkpoints/checkpoint-epoch2
2025-04-09 00:26:40,993 - INFO - Starting Epoch 3...
Epoch 3:   1%|          | 754/105122 [00:29<1:04:33, 26.95batch/s]2025-04-09 00:27:11,045 - INFO - Epoch 3 (Step 211000): Train loss 3.490, Val loss 3.383
Epoch 3:   2%|▏         | 1756/105122 [01:10<1:46:27, 16.18batch/s]2025-04-09 00:27:51,218 - INFO - Epoch 3 (Step 212000): Train loss 3.350, Val loss 3.377
Epoch 3:   3%|▎         | 2754/105122 [01:48<1:03:17, 26.95batch/s]2025-04-09 00:28:30,038 - INFO - Epoch 3 (Step 213000): Train loss 3.381, Val loss 3.399
Epoch 3:   4%|▎         | 3756/105122 [02:27<1:02:57, 26.83batch/s]2025-04-09 00:29:08,698 - INFO - Epoch 3 (Step 214000): Train loss 3.446, Val loss 3.395
Epoch 3:   4%|▍         | 4014/105122 [02:37<1:02:03, 27.15batch/s]

[2025-04-09 00:29:18,487] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:   5%|▍         | 4756/105122 [03:06<1:02:20, 26.83batch/s]2025-04-09 00:29:47,385 - INFO - Epoch 3 (Step 215000): Train loss 3.217, Val loss 3.358
Epoch 3:   5%|▌         | 5754/105122 [03:44<1:01:28, 26.94batch/s]2025-04-09 00:30:26,091 - INFO - Epoch 3 (Step 216000): Train loss 3.512, Val loss 3.341
Epoch 3:   6%|▋         | 6756/105122 [04:23<1:00:36, 27.05batch/s]2025-04-09 00:31:04,803 - INFO - Epoch 3 (Step 217000): Train loss 3.467, Val loss 3.381
Epoch 3:   7%|▋         | 7754/105122 [05:02<1:00:58, 26.61batch/s]2025-04-09 00:31:43,491 - INFO - Epoch 3 (Step 218000): Train loss 3.551, Val loss 3.358
Epoch 3:   8%|▊         | 7908/105122 [05:09<1:00:23, 26.83batch/s]

[2025-04-09 00:31:50,523] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:   8%|▊         | 8190/105122 [05:19<59:52, 26.98batch/s]  

[2025-04-09 00:32:01,061] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:   8%|▊         | 8754/105122 [05:42<59:48, 26.86batch/s]  2025-04-09 00:32:23,483 - INFO - Epoch 3 (Step 219000): Train loss 3.392, Val loss 3.364
Epoch 3:   9%|▉         | 9754/105122 [06:20<59:01, 26.93batch/s]  2025-04-09 00:33:02,085 - INFO - Epoch 3 (Step 220000): Train loss 3.510, Val loss 3.353
Epoch 3:  10%|█         | 10754/105122 [06:59<58:46, 26.76batch/s]  2025-04-09 00:33:40,827 - INFO - Epoch 3 (Step 221000): Train loss 3.406, Val loss 3.345
Epoch 3:  11%|█         | 11756/105122 [07:38<57:57, 26.85batch/s]  2025-04-09 00:34:19,496 - INFO - Epoch 3 (Step 222000): Train loss 3.520, Val loss 3.368
Epoch 3:  12%|█▏        | 12756/105122 [08:17<56:57, 27.03batch/s]  2025-04-09 00:34:58,224 - INFO - Epoch 3 (Step 223000): Train loss 3.358, Val loss 3.368
Epoch 3:  13%|█▎        | 13280/105122 [08:38<56:39, 27.02batch/s]  

[2025-04-09 00:35:19,137] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  13%|█▎        | 13754/105122 [08:55<56:45, 26.83batch/s]2025-04-09 00:35:36,976 - INFO - Epoch 3 (Step 224000): Train loss 3.504, Val loss 3.371
Epoch 3:  14%|█▍        | 14754/105122 [09:35<58:41, 25.66batch/s]  2025-04-09 00:36:16,959 - INFO - Epoch 3 (Step 225000): Train loss 3.455, Val loss 3.381
Epoch 3:  15%|█▍        | 15460/105122 [10:02<55:41, 26.84batch/s]  

[2025-04-09 00:36:43,266] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  15%|█▍        | 15754/105122 [10:14<54:51, 27.15batch/s]  2025-04-09 00:36:55,576 - INFO - Epoch 3 (Step 226000): Train loss 3.409, Val loss 3.391
Epoch 3:  16%|█▌        | 16756/105122 [10:53<55:30, 26.53batch/s]  2025-04-09 00:37:34,238 - INFO - Epoch 3 (Step 227000): Train loss 3.349, Val loss 3.371
Epoch 3:  16%|█▋        | 17098/105122 [11:05<54:21, 26.99batch/s]  

[2025-04-09 00:37:47,073] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  17%|█▋        | 17756/105122 [11:31<55:24, 26.28batch/s]  2025-04-09 00:38:12,960 - INFO - Epoch 3 (Step 228000): Train loss 3.391, Val loss 3.355
Epoch 3:  18%|█▊        | 18754/105122 [12:10<53:52, 26.72batch/s]  2025-04-09 00:38:51,610 - INFO - Epoch 3 (Step 229000): Train loss 3.388, Val loss 3.347
Epoch 3:  18%|█▊        | 19342/105122 [12:33<52:48, 27.07batch/s]  

[2025-04-09 00:39:14,846] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  19%|█▉        | 19754/105122 [12:49<52:58, 26.86batch/s]2025-04-09 00:39:30,288 - INFO - Epoch 3 (Step 230000): Train loss 3.440, Val loss 3.368
Epoch 3:  20%|█▉        | 20754/105122 [13:27<52:14, 26.92batch/s]  2025-04-09 00:40:08,863 - INFO - Epoch 3 (Step 231000): Train loss 3.485, Val loss 3.377
Epoch 3:  21%|██        | 21756/105122 [14:07<51:38, 26.90batch/s]  2025-04-09 00:40:48,873 - INFO - Epoch 3 (Step 232000): Train loss 3.399, Val loss 3.362
Epoch 3:  22%|██▏       | 22756/105122 [14:46<50:45, 27.04batch/s]  2025-04-09 00:41:27,556 - INFO - Epoch 3 (Step 233000): Train loss 3.386, Val loss 3.370
Epoch 3:  22%|██▏       | 23334/105122 [15:08<51:28, 26.48batch/s]  

[2025-04-09 00:41:49,212] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  23%|██▎       | 23754/105122 [15:25<50:11, 27.02batch/s]  2025-04-09 00:42:06,280 - INFO - Epoch 3 (Step 234000): Train loss 3.516, Val loss 3.355
Epoch 3:  24%|██▎       | 24754/105122 [16:03<49:58, 26.80batch/s]  2025-04-09 00:42:44,906 - INFO - Epoch 3 (Step 235000): Train loss 3.444, Val loss 3.352
Epoch 3:  24%|██▍       | 25754/105122 [16:42<49:10, 26.90batch/s]  2025-04-09 00:43:23,536 - INFO - Epoch 3 (Step 236000): Train loss 3.402, Val loss 3.367
Epoch 3:  25%|██▍       | 26125/105122 [16:57<47:54, 27.49batch/s]  

[2025-04-09 00:43:38,671] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  25%|██▌       | 26754/105122 [17:21<49:13, 26.54batch/s]2025-04-09 00:44:02,214 - INFO - Epoch 3 (Step 237000): Train loss 3.395, Val loss 3.371
Epoch 3:  26%|██▋       | 27754/105122 [18:01<47:54, 26.91batch/s]  2025-04-09 00:44:42,296 - INFO - Epoch 3 (Step 238000): Train loss 3.544, Val loss 3.402
Epoch 3:  27%|██▋       | 28456/105122 [18:27<47:22, 26.98batch/s]

[2025-04-09 00:45:08,504] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  27%|██▋       | 28754/105122 [18:39<47:05, 27.03batch/s]  2025-04-09 00:45:21,028 - INFO - Epoch 3 (Step 239000): Train loss 3.310, Val loss 3.373
Epoch 3:  28%|██▊       | 29756/105122 [19:18<46:52, 26.80batch/s]  2025-04-09 00:45:59,779 - INFO - Epoch 3 (Step 240000): Train loss 3.473, Val loss 3.375
Epoch 3:  29%|██▉       | 30754/105122 [19:57<45:54, 27.00batch/s]  2025-04-09 00:46:38,496 - INFO - Epoch 3 (Step 241000): Train loss 3.417, Val loss 3.374
Epoch 3:  30%|███       | 31556/105122 [20:28<46:23, 26.43batch/s]  

[2025-04-09 00:47:09,749] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  30%|███       | 31754/105122 [20:36<45:17, 27.00batch/s]2025-04-09 00:47:17,264 - INFO - Epoch 3 (Step 242000): Train loss 3.343, Val loss 3.387
Epoch 3:  31%|███       | 32754/105122 [21:14<45:10, 26.70batch/s]  2025-04-09 00:47:55,923 - INFO - Epoch 3 (Step 243000): Train loss 3.274, Val loss 3.376
Epoch 3:  31%|███       | 32836/105122 [21:17<44:56, 26.81batch/s]

[2025-04-09 00:47:58,935] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 3:  32%|███▏      | 33754/105122 [21:54<54:12, 21.95batch/s]  2025-04-09 00:48:35,979 - INFO - Epoch 3 (Step 244000): Train loss 3.326, Val loss 3.369
Epoch 3:  33%|███▎      | 34756/105122 [22:33<43:24, 27.01batch/s]  2025-04-09 00:49:14,665 - INFO - Epoch 3 (Step 245000): Train loss 3.486, Val loss 3.379
Epoch 3:  33%|███▎      | 35188/105122 [22:49<43:14, 26.96batch/s]

[2025-04-09 00:49:30,799] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 3:  34%|███▍      | 35756/105122 [23:12<42:54, 26.95batch/s]  2025-04-09 00:49:53,252 - INFO - Epoch 3 (Step 246000): Train loss 3.410, Val loss 3.348
Epoch 3:  35%|███▍      | 36754/105122 [23:50<42:14, 26.98batch/s]  2025-04-09 00:50:31,863 - INFO - Epoch 3 (Step 247000): Train loss 3.459, Val loss 3.356
Epoch 3:  36%|███▌      | 37754/105122 [24:29<42:18, 26.54batch/s]  2025-04-09 00:51:10,666 - INFO - Epoch 3 (Step 248000): Train loss 3.283, Val loss 3.364
Epoch 3:  37%|███▋      | 38756/105122 [25:08<41:18, 26.78batch/s]  2025-04-09 00:51:49,386 - INFO - Epoch 3 (Step 249000): Train loss 3.300, Val loss 3.361
Epoch 3:  38%|███▊      | 39754/105122 [25:46<41:23, 26.32batch/s]  2025-04-09 00:52:28,118 - INFO - Epoch 3 (Step 250000): Train loss 3.469, Val loss 3.350
Epoch 3:  38%|███▊      | 39837/105122 [25:51<41:40, 26.11batch/s]  

[2025-04-09 00:52:32,496] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  39%|███▉      | 40754/105122 [26:26<39:46, 26.97batch/s]  2025-04-09 00:53:08,119 - INFO - Epoch 3 (Step 251000): Train loss 3.363, Val loss 3.352
Epoch 3:  40%|███▉      | 41536/105122 [26:57<52:38, 20.13batch/s]  

[2025-04-09 00:53:38,649] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 3:  40%|███▉      | 41754/105122 [27:05<39:04, 27.03batch/s]2025-04-09 00:53:46,823 - INFO - Epoch 3 (Step 252000): Train loss 3.395, Val loss 3.354
Epoch 3:  40%|████      | 42140/105122 [27:20<39:25, 26.62batch/s]

[2025-04-09 00:54:01,193] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1024, reducing to 512


Epoch 3:  41%|████      | 42756/105122 [27:44<38:39, 26.88batch/s]  2025-04-09 00:54:25,477 - INFO - Epoch 3 (Step 253000): Train loss 3.386, Val loss 3.373
Epoch 3:  42%|████▏     | 43754/105122 [28:23<38:11, 26.79batch/s]  2025-04-09 00:55:04,199 - INFO - Epoch 3 (Step 254000): Train loss 3.397, Val loss 3.377
Epoch 3:  43%|████▎     | 44756/105122 [29:01<37:09, 27.07batch/s]  2025-04-09 00:55:42,878 - INFO - Epoch 3 (Step 255000): Train loss 3.302, Val loss 3.388
Epoch 3:  44%|████▎     | 45756/105122 [29:40<36:47, 26.89batch/s]  2025-04-09 00:56:21,555 - INFO - Epoch 3 (Step 256000): Train loss 3.350, Val loss 3.339
Epoch 3:  44%|████▍     | 46756/105122 [30:20<36:54, 26.35batch/s]  2025-04-09 00:57:01,646 - INFO - Epoch 3 (Step 257000): Train loss 3.359, Val loss 3.345
Epoch 3:  45%|████▌     | 47754/105122 [30:59<36:22, 26.29batch/s]  2025-04-09 00:57:40,406 - INFO - Epoch 3 (Step 258000): Train loss 3.467, Val loss 3.366
Epoch 3:  46%|████▌     | 48470/105122 [31:27<37:43, 25.03

[2025-04-09 00:58:08,355] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  46%|████▋     | 48755/105122 [31:37<34:29, 27.24batch/s]2025-04-09 00:58:19,119 - INFO - Epoch 3 (Step 259000): Train loss 3.347, Val loss 3.381
Epoch 3:  47%|████▋     | 49160/105122 [31:53<34:23, 27.12batch/s]

[2025-04-09 00:58:34,146] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 3:  47%|████▋     | 49756/105122 [32:16<34:02, 27.11batch/s]  2025-04-09 00:58:57,793 - INFO - Epoch 3 (Step 260000): Train loss 3.385, Val loss 3.373
Epoch 3:  48%|████▊     | 50756/105122 [32:55<33:41, 26.90batch/s]  2025-04-09 00:59:36,542 - INFO - Epoch 3 (Step 261000): Train loss 3.396, Val loss 3.364
Epoch 3:  49%|████▉     | 51756/105122 [33:34<33:09, 26.83batch/s]  2025-04-09 01:00:15,188 - INFO - Epoch 3 (Step 262000): Train loss 3.378, Val loss 3.365
Epoch 3:  50%|█████     | 52756/105122 [34:14<2:00:13,  7.26batch/s]2025-04-09 01:00:55,220 - INFO - Epoch 3 (Step 263000): Train loss 3.197, Val loss 3.365
Epoch 3:  51%|█████     | 53399/105122 [34:38<31:38, 27.24batch/s]  

[2025-04-09 01:01:19,282] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  51%|█████     | 53754/105122 [34:52<31:56, 26.81batch/s]  2025-04-09 01:01:33,908 - INFO - Epoch 3 (Step 264000): Train loss 3.369, Val loss 3.358
Epoch 3:  52%|█████▏    | 54754/105122 [35:31<31:08, 26.95batch/s]  2025-04-09 01:02:12,695 - INFO - Epoch 3 (Step 265000): Train loss 3.504, Val loss 3.366
Epoch 3:  53%|█████▎    | 55458/105122 [35:59<30:55, 26.77batch/s]  

[2025-04-09 01:02:40,239] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  53%|█████▎    | 55754/105122 [36:10<30:24, 27.05batch/s]2025-04-09 01:02:51,392 - INFO - Epoch 3 (Step 266000): Train loss 3.355, Val loss 3.342
Epoch 3:  54%|█████▍    | 56754/105122 [36:48<29:53, 26.97batch/s]  2025-04-09 01:03:30,166 - INFO - Epoch 3 (Step 267000): Train loss 3.423, Val loss 3.344
Epoch 3:  55%|█████▍    | 57754/105122 [37:27<29:19, 26.93batch/s]  2025-04-09 01:04:08,822 - INFO - Epoch 3 (Step 268000): Train loss 3.430, Val loss 3.376
Epoch 3:  56%|█████▌    | 58756/105122 [38:06<28:37, 26.99batch/s]  2025-04-09 01:04:47,474 - INFO - Epoch 3 (Step 269000): Train loss 3.354, Val loss 3.342
Epoch 3:  57%|█████▋    | 59754/105122 [38:46<28:01, 26.98batch/s]  2025-04-09 01:05:27,454 - INFO - Epoch 3 (Step 270000): Train loss 3.239, Val loss 3.343
Epoch 3:  57%|█████▋    | 59802/105122 [38:48<28:31, 26.48batch/s]

[2025-04-09 01:05:29,287] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  58%|█████▊    | 60754/105122 [39:24<27:29, 26.89batch/s]  2025-04-09 01:06:06,177 - INFO - Epoch 3 (Step 271000): Train loss 3.390, Val loss 3.356
Epoch 3:  59%|█████▊    | 61754/105122 [40:03<26:59, 26.78batch/s]  2025-04-09 01:06:44,855 - INFO - Epoch 3 (Step 272000): Train loss 3.272, Val loss 3.370
Epoch 3:  60%|█████▉    | 62754/105122 [40:42<26:08, 27.01batch/s]  2025-04-09 01:07:23,486 - INFO - Epoch 3 (Step 273000): Train loss 3.395, Val loss 3.365
Epoch 3:  60%|█████▉    | 63052/105122 [40:53<26:11, 26.78batch/s]

[2025-04-09 01:07:34,507] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  61%|██████    | 63754/105122 [41:20<25:55, 26.60batch/s]  2025-04-09 01:08:02,163 - INFO - Epoch 3 (Step 274000): Train loss 3.414, Val loss 3.362
Epoch 3:  62%|██████▏   | 64756/105122 [41:59<25:18, 26.59batch/s]  2025-04-09 01:08:40,832 - INFO - Epoch 3 (Step 275000): Train loss 3.240, Val loss 3.345
Epoch 3:  63%|██████▎   | 65756/105122 [42:39<49:02, 13.38batch/s]  2025-04-09 01:09:20,866 - INFO - Epoch 3 (Step 276000): Train loss 3.407, Val loss 3.345
Epoch 3:  63%|██████▎   | 66628/105122 [43:13<34:35, 18.54batch/s]  

[2025-04-09 01:09:54,750] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  64%|██████▎   | 66756/105122 [43:18<23:42, 26.97batch/s]2025-04-09 01:09:59,489 - INFO - Epoch 3 (Step 277000): Train loss 3.400, Val loss 3.364
Epoch 3:  64%|██████▎   | 66946/105122 [43:25<23:33, 27.01batch/s]

[2025-04-09 01:10:06,655] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  64%|██████▍   | 67756/105122 [43:56<23:05, 26.98batch/s]  2025-04-09 01:10:38,045 - INFO - Epoch 3 (Step 278000): Train loss 3.323, Val loss 3.354
Epoch 3:  65%|██████▌   | 68754/105122 [44:35<22:27, 26.99batch/s]  2025-04-09 01:11:16,831 - INFO - Epoch 3 (Step 279000): Train loss 3.464, Val loss 3.353
Epoch 3:  66%|██████▋   | 69756/105122 [45:14<21:52, 26.95batch/s]  2025-04-09 01:11:55,549 - INFO - Epoch 3 (Step 280000): Train loss 3.309, Val loss 3.346
Epoch 3:  67%|██████▋   | 69998/105122 [45:23<21:45, 26.90batch/s]

[2025-04-09 01:12:04,590] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  67%|██████▋   | 70756/105122 [45:53<21:08, 27.10batch/s]  2025-04-09 01:12:34,146 - INFO - Epoch 3 (Step 281000): Train loss 3.306, Val loss 3.370
Epoch 3:  68%|██████▊   | 71274/105122 [46:13<21:12, 26.60batch/s]  

[2025-04-09 01:12:54,840] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 3:  68%|██████▊   | 71754/105122 [46:31<20:46, 26.76batch/s]2025-04-09 01:13:12,859 - INFO - Epoch 3 (Step 282000): Train loss 3.556, Val loss 3.358
Epoch 3:  69%|██████▉   | 72664/105122 [47:08<49:59, 10.82batch/s]  

[2025-04-09 01:13:49,443] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1024, reducing to 512


Epoch 3:  69%|██████▉   | 72756/105122 [47:11<20:02, 26.91batch/s]2025-04-09 01:13:52,907 - INFO - Epoch 3 (Step 283000): Train loss 3.153, Val loss 3.366
Epoch 3:  70%|███████   | 73756/105122 [47:50<19:26, 26.89batch/s]  2025-04-09 01:14:31,635 - INFO - Epoch 3 (Step 284000): Train loss 3.389, Val loss 3.351
Epoch 3:  71%|███████   | 74754/105122 [48:29<18:48, 26.91batch/s]  2025-04-09 01:15:10,280 - INFO - Epoch 3 (Step 285000): Train loss 3.465, Val loss 3.379
Epoch 3:  72%|███████▏  | 75754/105122 [49:07<18:14, 26.84batch/s]  2025-04-09 01:15:48,986 - INFO - Epoch 3 (Step 286000): Train loss 3.489, Val loss 3.369
Epoch 3:  72%|███████▏  | 76126/105122 [49:23<34:37, 13.95batch/s]  

[2025-04-09 01:16:04,158] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1024, reducing to 512


Epoch 3:  73%|███████▎  | 76754/105122 [49:46<17:35, 26.88batch/s]2025-04-09 01:16:27,692 - INFO - Epoch 3 (Step 287000): Train loss 3.364, Val loss 3.379
Epoch 3:  74%|███████▍  | 77754/105122 [50:25<17:02, 26.78batch/s]  2025-04-09 01:17:06,415 - INFO - Epoch 3 (Step 288000): Train loss 3.561, Val loss 3.366
Epoch 3:  74%|███████▍  | 77924/105122 [50:33<17:15, 26.26batch/s]  

[2025-04-09 01:17:14,182] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 512, reducing to 256


Epoch 3:  75%|███████▍  | 78756/105122 [51:05<16:42, 26.29batch/s]  2025-04-09 01:17:46,475 - INFO - Epoch 3 (Step 289000): Train loss 3.322, Val loss 3.362
Epoch 3:  76%|███████▌  | 79756/105122 [51:43<15:41, 26.94batch/s]  2025-04-09 01:18:25,081 - INFO - Epoch 3 (Step 290000): Train loss 3.419, Val loss 3.355
Epoch 3:  77%|███████▋  | 80756/105122 [52:22<15:06, 26.89batch/s]2025-04-09 01:19:03,753 - INFO - Epoch 3 (Step 291000): Train loss 3.404, Val loss 3.335
Epoch 3:  78%|███████▊  | 81754/105122 [53:01<14:23, 27.05batch/s]  2025-04-09 01:19:42,479 - INFO - Epoch 3 (Step 292000): Train loss 3.382, Val loss 3.342
Epoch 3:  79%|███████▊  | 82756/105122 [53:40<13:41, 27.21batch/s]2025-04-09 01:20:21,146 - INFO - Epoch 3 (Step 293000): Train loss 3.303, Val loss 3.368
Epoch 3:  79%|███████▉  | 83260/105122 [54:00<13:34, 26.84batch/s]

[2025-04-09 01:20:41,456] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1024, reducing to 512


Epoch 3:  80%|███████▉  | 83756/105122 [54:18<13:14, 26.90batch/s]2025-04-09 01:20:59,921 - INFO - Epoch 3 (Step 294000): Train loss 3.465, Val loss 3.339
Epoch 3:  80%|████████  | 84270/105122 [54:39<12:54, 26.91batch/s]

[2025-04-09 01:21:20,594] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 512, reducing to 256


Epoch 3:  81%|████████  | 84756/105122 [54:57<12:31, 27.09batch/s]2025-04-09 01:21:38,678 - INFO - Epoch 3 (Step 295000): Train loss 3.456, Val loss 3.338
Epoch 3:  82%|████████▏ | 85756/105122 [55:37<12:03, 26.76batch/s]2025-04-09 01:22:18,636 - INFO - Epoch 3 (Step 296000): Train loss 3.312, Val loss 3.329
Epoch 3:  83%|████████▎ | 86754/105122 [56:16<11:24, 26.84batch/s]2025-04-09 01:22:57,287 - INFO - Epoch 3 (Step 297000): Train loss 3.399, Val loss 3.352
Epoch 3:  83%|████████▎ | 87756/105122 [56:54<10:51, 26.67batch/s]2025-04-09 01:23:35,987 - INFO - Epoch 3 (Step 298000): Train loss 3.285, Val loss 3.333
Epoch 3:  84%|████████▍ | 88756/105122 [57:33<10:06, 26.98batch/s]2025-04-09 01:24:14,692 - INFO - Epoch 3 (Step 299000): Train loss 3.322, Val loss 3.333
Epoch 3:  85%|████████▌ | 89756/105122 [58:12<09:40, 26.47batch/s]2025-04-09 01:24:53,337 - INFO - Epoch 3 (Step 300000): Train loss 3.489, Val loss 3.334
Epoch 3:  86%|████████▋ | 90756/105122 [58:50<08:53, 26.91batch/s]2025

[2025-04-09 01:27:10,771] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  89%|████████▉ | 93370/105122 [1:00:32<07:16, 26.92batch/s]

[2025-04-09 01:27:13,733] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 3:  89%|████████▉ | 93756/105122 [1:00:48<07:08, 26.50batch/s]2025-04-09 01:27:29,431 - INFO - Epoch 3 (Step 304000): Train loss 3.590, Val loss 3.349
Epoch 3:  90%|█████████ | 94754/105122 [1:01:26<06:26, 26.81batch/s]2025-04-09 01:28:08,141 - INFO - Epoch 3 (Step 305000): Train loss 3.355, Val loss 3.332
Epoch 3:  91%|█████████ | 95754/105122 [1:02:05<05:50, 26.73batch/s]2025-04-09 01:28:46,815 - INFO - Epoch 3 (Step 306000): Train loss 3.465, Val loss 3.343
Epoch 3:  92%|█████████▏| 96756/105122 [1:02:44<05:12, 26.79batch/s]2025-04-09 01:29:25,547 - INFO - Epoch 3 (Step 307000): Train loss 3.248, Val loss 3.350
Epoch 3:  93%|█████████▎| 97755/105122 [1:03:24<17:47,  6.90batch/s]2025-04-09 01:30:05,514 - INFO - Epoch 3 (Step 308000): Train loss 3.377, Val loss 3.362
Epoch 3:  94%|█████████▍| 98754/105122 [1:04:03<03:58, 26.71batch/s]2025-04-09 01:30:44,256 - INFO - Epoch 3 (Step 309000): Train loss 3.340, Val loss 3.353
Epoch 3:  95%|█████████▍| 99610/105122 [1:04:36<03:23, 27.

[2025-04-09 01:31:17,390] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  95%|█████████▍| 99754/105122 [1:04:41<03:21, 26.64batch/s]2025-04-09 01:31:22,802 - INFO - Epoch 3 (Step 310000): Train loss 3.450, Val loss 3.334
Epoch 3:  96%|█████████▌| 100724/105122 [1:05:19<02:44, 26.66batch/s]

[2025-04-09 01:32:00,181] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  96%|█████████▌| 100756/105122 [1:05:20<02:42, 26.93batch/s]2025-04-09 01:32:01,432 - INFO - Epoch 3 (Step 311000): Train loss 3.472, Val loss 3.316
Epoch 3:  97%|█████████▋| 101756/105122 [1:05:59<02:04, 26.99batch/s]2025-04-09 01:32:40,187 - INFO - Epoch 3 (Step 312000): Train loss 3.286, Val loss 3.346
Epoch 3:  98%|█████████▊| 102756/105122 [1:06:37<01:28, 26.88batch/s]2025-04-09 01:33:18,876 - INFO - Epoch 3 (Step 313000): Train loss 3.354, Val loss 3.340
Epoch 3:  98%|█████████▊| 103259/105122 [1:06:58<01:07, 27.54batch/s]

[2025-04-09 01:33:39,039] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  99%|█████████▊| 103718/105122 [1:07:15<00:52, 26.51batch/s]

[2025-04-09 01:33:56,123] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048, reducing to 1024


Epoch 3:  99%|█████████▊| 103754/105122 [1:07:16<00:50, 27.07batch/s]2025-04-09 01:33:57,582 - INFO - Epoch 3 (Step 314000): Train loss 3.525, Val loss 3.321
Epoch 3:  99%|█████████▉| 103858/105122 [1:07:21<00:47, 26.66batch/s]

[2025-04-09 01:34:02,823] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1024, reducing to 512


Epoch 3: 100%|█████████▉| 104756/105122 [1:07:56<00:13, 26.70batch/s]2025-04-09 01:34:37,735 - INFO - Epoch 3 (Step 315000): Train loss 3.338, Val loss 3.310
Epoch 3: 100%|██████████| 105122/105122 [1:08:10<00:00, 25.70batch/s]
2025-04-09 01:34:51,312 - INFO - Epoch 3 completed. Generating a sample...
2025-04-09 01:34:51,500 - INFO - Generated Text: Every effort moves you to the next level . " 
   = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


Every effort moves you to the next level . "     = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
[2025-04-09 01:34:51,502] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step157683 is about to be saved!
[2025-04-09 01:34:51,505] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ./checkpoints/checkpoint-epoch3/global_step157683/mp_rank_00_model_states.pt
[2025-04-09 01:34:51,505] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./checkpoints/checkpoint-epoch3/global_step157683/mp_rank_00_model_states.pt...
[2025-04-09 01:34:51,861] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./checkpoints/checkpoint-epoch3/global_step157683/mp_rank_00_model_states.pt.
[2025-04-09 01:34:51,863] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./checkpoints/checkpoint-epoch3/global_step157683/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2025-04-09 01:34:53,797] [INFO] [torch_checkpoint_engine.py:23:sav

2025-04-09 01:34:53,800 - INFO - Checkpoint saved at: ./checkpoints/checkpoint-epoch3


ValueError: too many values to unpack (expected 2)