In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import torch
from gpt import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 128, # Shortened context length (orig: 1024)
    "emb_dim": 256,        # Embedding dimension
    "n_heads": 4,         # Number of attention heads
    "n_layers": 4,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

In [4]:
import tiktoken
from gpt import generate_text_simple, create_dataloader_v1

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you mosquitoes popcorn appeaseAMES Lucy decide Pittsburgh Stainless utilizationRegistration


In [5]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")


In [6]:
dataset['text']


['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving

In [7]:
txt_data = " ".join(dataset["text"])
# train_loader = create_dataloader_v1(txt_data, batch_size=4, max_length=256, stride=128)

train_ratio = 0.90
split_idx = int(train_ratio * len(txt_data))
train_data = txt_data[:split_idx]
val_data = txt_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [8]:
for batch_idx, (input_batch, target_batch) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print("Input batch (tokens):", input_batch)
    print("Target batch (tokens):", target_batch)
    break  # Only printing the first batch as an example

Batch 1
Input batch (tokens): tensor([[ 2422,  9964,  1028,  4849, 17072,   772,   981,   991,   739,  1886,
          2162,  3977, 12444,   284, 21194,   625,  9964,   810, 26402,   373,
           407,  1682,   287,  3877,   837, 41109,   284,  1277,   465, 13463,
          3371,   262, 39785,  5822,  2138,   621, 34618, 22576,   764,  3977,
           705,    82,  2106,   460,   307,  1775,   355,   281,  8453,   544,
           837,   257, 16716,  3761,   837,   329,   262, 13239,   837,   290,
           517,  5734,   329, 26402,   705,    82,  3896,   764,  2750,   262,
          1367,  2154,    82,   290,  1367,  1795,    82,   837,  8830, 20006,
           547, 18416,   284,  1104,   262, 13239,   837, 11476,   780,   340,
           373,  1290,  1497,   290,   612,   547,   517, 12273,  4786,   287,
          2031,   837,   475,   635,   780,   443,  1676,  1837,   373,  3221,
          3177, 11871,  9837,   764,   220,   198,   220,  3977,   373, 20524,
         21925,  1028,

In [28]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [11]:

# # Note:
# # Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# # which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# # However, the resulting loss values may be slightly different.

# #if torch.cuda.is_available():
# #    device = torch.device("cuda")
# #elif torch.backends.mps.is_available():
# #    device = torch.device("mps")
# #else:
# #    device = torch.device("cpu")
# #
# # print(f"Using {device} device.")


# model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


# torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

# with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
#     train_loss = calc_loss_loader(train_loader, model, device)
#     val_loss = calc_loss_loader(val_loader, model, device)

# print("Training loss:", train_loss)
# print("Validation loss:", val_loss)


In [36]:
import logging
import deepspeed
from tqdm import tqdm  # tqdm을 함수처럼 사용

# Setup logging configuration
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s", 
    level=logging.INFO,  # You can adjust the level to DEBUG, INFO, WARNING, etc.
    handlers=[
        logging.FileHandler("training_log.txt"),  # Logs will be saved to this file
        logging.StreamHandler()  # Also log to console
    ]
)

# Now use logging instead of print
logger = logging.getLogger(__name__)

In [45]:
import deepspeed


def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer, deepspeed_config):

    # Initialize DeepSpeed
    model, optimizer, _, _ = deepspeed.initialize(args=None, model=model, optimizer=optimizer, config_params=deepspeed_config)

    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        logger.info(f"Starting Epoch {epoch+1}...")

        for input_batch, target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch"):
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration

            # Compute the loss
            loss = calc_loss_batch(input_batch, target_batch, model, device)

            # print(loss)
            # print(f"Loss type: {loss.dtype}")
            # print(f"Loss scale type: {model.loss_scale.dtype}")
            # if not isinstance(loss, torch.Tensor):
            #     loss = torch.tensor(loss).to(device)


            # Backward pass and step the optimizer
            model.backward(loss)  # Backward pass with DeepSpeed
            model.step()  # Step optimizer using DeepSpeed

            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                logger.info(f"Epoch {epoch+1} (Step {global_step:06d}): "
                            f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")
                
                
        logger.info(f"Epoch {epoch+1} completed. Generating a sample...")
        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    logger.info(f"Generated Text: {decoded_text}")  # Log the generated text
    model.train()

In [46]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

train_losses, track_tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=3,
    eval_freq=1000,
    eval_iter=10,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
    deepspeed_config="./ds_config.json"
)

[2025-04-02 19:35:32,917] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed info: version=0.16.5, git-hash=unknown, git-branch=unknown
[2025-04-02 19:35:32,919] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 1
[2025-04-02 19:35:32,924] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2025-04-02 19:35:32,925] [INFO] [logging.py:107:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2025-04-02 19:35:32,925] [INFO] [logging.py:107:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-04-02 19:35:32,926] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
[2025-04-02 19:35:32,926] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class 'torch.optim.adamw.AdamW'>
[2025-04-02 19:35:32,926] [INFO] [logging.py:107:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 2 optimizer
[2025-04-02 19:35:32,926] [INFO] [stage_1_a

2025-04-02 19:35:35,634 - INFO - Starting Epoch 1...
Epoch 1:   0%|          | 0/210244 [00:00<?, ?batch/s]2025-04-02 19:35:35,733 - INFO - Epoch 1 (Step 000000): Train loss 10.962, Val loss 10.964


[2025-04-02 19:35:35,746] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4294967296, reducing to 2147483648


Epoch 1:   0%|          | 2/210244 [00:00<3:15:37, 17.91batch/s]

[2025-04-02 19:35:35,775] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2147483648, reducing to 1073741824
[2025-04-02 19:35:35,794] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1073741824, reducing to 536870912
[2025-04-02 19:35:35,814] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 536870912, reducing to 268435456
[2025-04-02 19:35:35,833] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 268435456, reducing to 134217728
[2025-04-02 19:35:35,852] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 134217728, reducing to 67108864


Epoch 1:   0%|          | 12/210244 [00:00<56:20, 62.19batch/s] 

[2025-04-02 19:35:35,871] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 67108864, reducing to 33554432
[2025-04-02 19:35:35,889] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 33554432, reducing to 16777216
[2025-04-02 19:35:35,908] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16777216, reducing to 8388608
[2025-04-02 19:35:35,927] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8388608, reducing to 4194304
[2025-04-02 19:35:35,947] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4194304, reducing to 2097152


Epoch 1:   0%|          | 23/210244 [00:00<42:48, 81.86batch/s]

[2025-04-02 19:35:35,967] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
[2025-04-02 19:35:35,986] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1048576, reducing to 524288
[2025-04-02 19:35:36,005] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 524288, reducing to 262144
[2025-04-02 19:35:36,024] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 131072
[2025-04-02 19:35:36,043] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   0%|          | 998/210244 [00:12<43:10, 80.78batch/s]2025-04-02 19:35:48,115 - INFO - Epoch 1 (Step 001000): Train loss 6.612, Val loss 6.774
Epoch 1:   1%|          | 1993/210244 [00:26<43:04, 80.57batch/s]  2025-04-02 19:36:01,885 - INFO - Epoch 1 (Step 002000): Train loss 6.179, Val loss 6.484
Epoch 1:   1%|          | 2028/210244 [00:26<45:48, 75.76batch/s]

[2025-04-02 19:36:02,292] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   1%|▏         | 2995/210244 [00:38<42:37, 81.03batch/s]2025-04-02 19:36:14,347 - INFO - Epoch 1 (Step 003000): Train loss 5.915, Val loss 6.286
Epoch 1:   2%|▏         | 3996/210244 [00:51<43:20, 79.30batch/s]2025-04-02 19:36:27,038 - INFO - Epoch 1 (Step 004000): Train loss 5.982, Val loss 6.175
Epoch 1:   2%|▏         | 4030/210244 [00:51<47:24, 72.50batch/s]

[2025-04-02 19:36:27,491] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   2%|▏         | 4996/210244 [01:05<42:18, 80.84batch/s]  2025-04-02 19:36:40,880 - INFO - Epoch 1 (Step 005000): Train loss 5.952, Val loss 6.035
Epoch 1:   3%|▎         | 5992/210244 [01:17<44:24, 76.66batch/s]2025-04-02 19:36:53,375 - INFO - Epoch 1 (Step 006000): Train loss 5.710, Val loss 5.974
Epoch 1:   3%|▎         | 6034/210244 [01:18<45:04, 75.49batch/s]

[2025-04-02 19:36:53,841] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   3%|▎         | 7000/210244 [01:31<44:01, 76.93batch/s]  2025-04-02 19:37:07,207 - INFO - Epoch 1 (Step 007000): Train loss 5.779, Val loss 5.883
Epoch 1:   4%|▍         | 7995/210244 [01:43<42:01, 80.22batch/s]2025-04-02 19:37:19,767 - INFO - Epoch 1 (Step 008000): Train loss 5.609, Val loss 5.805
Epoch 1:   4%|▍         | 8031/210244 [01:44<43:26, 77.59batch/s]

[2025-04-02 19:37:20,243] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   4%|▍         | 8993/210244 [01:56<41:10, 81.45batch/s]2025-04-02 19:37:32,170 - INFO - Epoch 1 (Step 009000): Train loss 5.456, Val loss 5.731
Epoch 1:   5%|▍         | 9997/210244 [02:10<39:58, 83.49batch/s]  2025-04-02 19:37:45,995 - INFO - Epoch 1 (Step 010000): Train loss 5.433, Val loss 5.659
Epoch 1:   5%|▍         | 10041/210244 [02:10<42:38, 78.26batch/s]

[2025-04-02 19:37:46,503] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   5%|▌         | 10998/210244 [02:22<42:31, 78.10batch/s]2025-04-02 19:37:58,436 - INFO - Epoch 1 (Step 011000): Train loss 5.447, Val loss 5.661
Epoch 1:   6%|▌         | 11999/210244 [02:35<41:33, 79.52batch/s]2025-04-02 19:38:10,886 - INFO - Epoch 1 (Step 012000): Train loss 5.377, Val loss 5.597
Epoch 1:   6%|▌         | 12039/210244 [02:35<44:19, 74.53batch/s]

[2025-04-02 19:38:11,442] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   6%|▌         | 12997/210244 [02:49<40:41, 80.80batch/s]  2025-04-02 19:38:24,806 - INFO - Epoch 1 (Step 013000): Train loss 5.536, Val loss 5.524
Epoch 1:   7%|▋         | 14000/210244 [03:01<40:33, 80.64batch/s]2025-04-02 19:38:37,254 - INFO - Epoch 1 (Step 014000): Train loss 5.370, Val loss 5.480
Epoch 1:   7%|▋         | 14043/210244 [03:02<42:19, 77.26batch/s]

[2025-04-02 19:38:37,817] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   7%|▋         | 14994/210244 [03:15<40:19, 80.69batch/s]  2025-04-02 19:38:50,961 - INFO - Epoch 1 (Step 015000): Train loss 5.252, Val loss 5.414
Epoch 1:   8%|▊         | 15995/210244 [03:27<39:17, 82.39batch/s]2025-04-02 19:39:03,467 - INFO - Epoch 1 (Step 016000): Train loss 5.098, Val loss 5.375
Epoch 1:   8%|▊         | 16039/210244 [03:28<41:40, 77.67batch/s]

[2025-04-02 19:39:04,050] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   8%|▊         | 16996/210244 [03:40<39:57, 80.61batch/s]2025-04-02 19:39:15,946 - INFO - Epoch 1 (Step 017000): Train loss 5.140, Val loss 5.380
Epoch 1:   9%|▊         | 17993/210244 [03:53<39:19, 81.49batch/s]  2025-04-02 19:39:29,738 - INFO - Epoch 1 (Step 018000): Train loss 5.149, Val loss 5.352
Epoch 1:   9%|▊         | 18044/210244 [03:54<41:26, 77.31batch/s]

[2025-04-02 19:39:30,350] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:   9%|▉         | 18997/210244 [04:06<38:25, 82.95batch/s]2025-04-02 19:39:42,219 - INFO - Epoch 1 (Step 019000): Train loss 5.005, Val loss 5.326
Epoch 1:  10%|▉         | 19993/210244 [04:20<39:16, 80.74batch/s]  2025-04-02 19:39:56,151 - INFO - Epoch 1 (Step 020000): Train loss 4.980, Val loss 5.261
Epoch 1:  10%|▉         | 20051/210244 [04:21<40:48, 77.69batch/s]

[2025-04-02 19:39:56,804] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  10%|▉         | 20992/210244 [04:32<38:38, 81.63batch/s]2025-04-02 19:40:08,640 - INFO - Epoch 1 (Step 021000): Train loss 5.099, Val loss 5.248
Epoch 1:  10%|█         | 21998/210244 [04:45<38:38, 81.20batch/s]2025-04-02 19:40:21,090 - INFO - Epoch 1 (Step 022000): Train loss 5.113, Val loss 5.227
Epoch 1:  10%|█         | 22049/210244 [04:46<40:09, 78.10batch/s]

[2025-04-02 19:40:21,748] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  11%|█         | 22998/210244 [04:59<38:42, 80.62batch/s]  2025-04-02 19:40:35,064 - INFO - Epoch 1 (Step 023000): Train loss 4.841, Val loss 5.220
Epoch 1:  11%|█▏        | 23994/210244 [05:11<39:37, 78.35batch/s]2025-04-02 19:40:47,508 - INFO - Epoch 1 (Step 024000): Train loss 4.993, Val loss 5.173
Epoch 1:  11%|█▏        | 24052/210244 [05:12<40:10, 77.23batch/s]

[2025-04-02 19:40:48,208] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  12%|█▏        | 25000/210244 [05:24<37:56, 81.39batch/s]2025-04-02 19:40:59,967 - INFO - Epoch 1 (Step 025000): Train loss 5.161, Val loss 5.166
Epoch 1:  12%|█▏        | 26000/210244 [05:38<37:44, 81.37batch/s]  2025-04-02 19:41:13,892 - INFO - Epoch 1 (Step 026000): Train loss 4.877, Val loss 5.150
Epoch 1:  12%|█▏        | 26051/210244 [05:38<39:18, 78.10batch/s]

[2025-04-02 19:41:14,603] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  13%|█▎        | 26994/210244 [05:50<38:07, 80.12batch/s]2025-04-02 19:41:26,317 - INFO - Epoch 1 (Step 027000): Train loss 4.948, Val loss 5.116
Epoch 1:  13%|█▎        | 27998/210244 [06:04<37:25, 81.15batch/s]  2025-04-02 19:41:40,029 - INFO - Epoch 1 (Step 028000): Train loss 4.870, Val loss 5.108
Epoch 1:  13%|█▎        | 28059/210244 [06:05<38:18, 79.26batch/s]

[2025-04-02 19:41:40,762] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  14%|█▍        | 28994/210244 [06:16<37:40, 80.17batch/s]2025-04-02 19:41:52,522 - INFO - Epoch 1 (Step 029000): Train loss 4.804, Val loss 5.073
Epoch 1:  14%|█▍        | 29996/210244 [06:29<37:14, 80.68batch/s]2025-04-02 19:42:04,974 - INFO - Epoch 1 (Step 030000): Train loss 5.028, Val loss 5.061
Epoch 1:  14%|█▍        | 30057/210244 [06:30<37:44, 79.59batch/s]

[2025-04-02 19:42:05,733] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  15%|█▍        | 30997/210244 [06:42<36:36, 81.60batch/s]  2025-04-02 19:42:18,738 - INFO - Epoch 1 (Step 031000): Train loss 5.032, Val loss 5.033
Epoch 1:  15%|█▌        | 31993/210244 [06:55<36:33, 81.26batch/s]2025-04-02 19:42:31,141 - INFO - Epoch 1 (Step 032000): Train loss 4.833, Val loss 5.016
Epoch 1:  15%|█▌        | 32060/210244 [06:56<41:46, 71.10batch/s]

[2025-04-02 19:42:31,988] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  16%|█▌        | 32992/210244 [07:09<39:06, 75.53batch/s]  2025-04-02 19:42:45,064 - INFO - Epoch 1 (Step 033000): Train loss 4.821, Val loss 5.018
Epoch 1:  16%|█▌        | 33996/210244 [07:21<37:01, 79.35batch/s]2025-04-02 19:42:57,562 - INFO - Epoch 1 (Step 034000): Train loss 4.846, Val loss 4.977
Epoch 1:  16%|█▌        | 34063/210244 [07:22<37:22, 78.56batch/s]

[2025-04-02 19:42:58,376] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  17%|█▋        | 35000/210244 [07:34<36:47, 79.37batch/s]2025-04-02 19:43:10,165 - INFO - Epoch 1 (Step 035000): Train loss 4.711, Val loss 4.998
Epoch 1:  17%|█▋        | 35999/210244 [07:48<35:13, 82.46batch/s]  2025-04-02 19:43:23,924 - INFO - Epoch 1 (Step 036000): Train loss 4.912, Val loss 4.968
Epoch 1:  17%|█▋        | 36060/210244 [07:49<36:47, 78.92batch/s]

[2025-04-02 19:43:24,773] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  18%|█▊        | 36993/210244 [08:00<34:58, 82.56batch/s]2025-04-02 19:43:36,383 - INFO - Epoch 1 (Step 037000): Train loss 4.785, Val loss 4.957
Epoch 1:  18%|█▊        | 37999/210244 [08:14<47:55, 59.90batch/s]  2025-04-02 19:43:50,219 - INFO - Epoch 1 (Step 038000): Train loss 4.701, Val loss 4.968
Epoch 1:  18%|█▊        | 38061/210244 [08:15<36:43, 78.16batch/s]

[2025-04-02 19:43:51,067] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  19%|█▊        | 38994/210244 [08:26<35:14, 81.01batch/s]2025-04-02 19:44:02,625 - INFO - Epoch 1 (Step 039000): Train loss 4.842, Val loss 4.950
Epoch 1:  19%|█▉        | 39998/210244 [08:39<35:46, 79.33batch/s]2025-04-02 19:44:15,363 - INFO - Epoch 1 (Step 040000): Train loss 4.646, Val loss 4.935
Epoch 1:  19%|█▉        | 40065/210244 [08:40<36:01, 78.73batch/s]

[2025-04-02 19:44:16,264] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  19%|█▉        | 40995/210244 [08:53<34:28, 81.83batch/s]  2025-04-02 19:44:29,239 - INFO - Epoch 1 (Step 041000): Train loss 4.710, Val loss 4.920
Epoch 1:  20%|█▉        | 42000/210244 [09:06<35:10, 79.72batch/s]2025-04-02 19:44:41,754 - INFO - Epoch 1 (Step 042000): Train loss 4.660, Val loss 4.893
Epoch 1:  20%|██        | 42069/210244 [09:06<35:13, 79.59batch/s]

[2025-04-02 19:44:42,669] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  20%|██        | 42997/210244 [09:18<34:15, 81.36batch/s]2025-04-02 19:44:54,360 - INFO - Epoch 1 (Step 043000): Train loss 4.679, Val loss 4.906
Epoch 1:  21%|██        | 43995/210244 [09:32<33:48, 81.96batch/s]  2025-04-02 19:45:08,171 - INFO - Epoch 1 (Step 044000): Train loss 4.649, Val loss 4.914
Epoch 1:  21%|██        | 44075/210244 [09:33<34:16, 80.79batch/s]

[2025-04-02 19:45:09,095] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  21%|██▏       | 44995/210244 [09:44<33:54, 81.21batch/s]2025-04-02 19:45:20,584 - INFO - Epoch 1 (Step 045000): Train loss 4.527, Val loss 4.896
Epoch 1:  22%|██▏       | 45998/210244 [09:58<34:37, 79.06batch/s]  2025-04-02 19:45:34,309 - INFO - Epoch 1 (Step 046000): Train loss 4.762, Val loss 4.895
Epoch 1:  22%|██▏       | 46069/210244 [09:59<33:54, 80.71batch/s]

[2025-04-02 19:45:35,258] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  22%|██▏       | 46994/210244 [10:11<33:45, 80.60batch/s]2025-04-02 19:45:46,856 - INFO - Epoch 1 (Step 047000): Train loss 4.712, Val loss 4.890
Epoch 1:  23%|██▎       | 47998/210244 [10:23<33:47, 80.03batch/s]2025-04-02 19:45:59,393 - INFO - Epoch 1 (Step 048000): Train loss 4.768, Val loss 4.876
Epoch 1:  23%|██▎       | 48077/210244 [10:24<33:29, 80.70batch/s]

[2025-04-02 19:46:00,370] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  23%|██▎       | 48992/210244 [10:37<33:14, 80.85batch/s]  2025-04-02 19:46:13,250 - INFO - Epoch 1 (Step 049000): Train loss 4.529, Val loss 4.884
Epoch 1:  24%|██▍       | 49996/210244 [10:49<32:59, 80.96batch/s]2025-04-02 19:46:25,764 - INFO - Epoch 1 (Step 050000): Train loss 4.691, Val loss 4.877
Epoch 1:  24%|██▍       | 50079/210244 [10:51<35:24, 75.38batch/s]

[2025-04-02 19:46:26,820] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  24%|██▍       | 50999/210244 [11:03<32:59, 80.46batch/s]  2025-04-02 19:46:39,681 - INFO - Epoch 1 (Step 051000): Train loss 4.652, Val loss 4.830
Epoch 1:  25%|██▍       | 51993/210244 [11:16<32:14, 81.79batch/s]2025-04-02 19:46:52,034 - INFO - Epoch 1 (Step 052000): Train loss 4.330, Val loss 4.842
Epoch 1:  25%|██▍       | 52082/210244 [11:17<34:04, 77.36batch/s]

[2025-04-02 19:46:53,067] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  25%|██▌       | 52996/210244 [11:28<34:35, 75.75batch/s]2025-04-02 19:47:04,660 - INFO - Epoch 1 (Step 053000): Train loss 4.581, Val loss 4.839
Epoch 1:  26%|██▌       | 53992/210244 [11:42<32:03, 81.25batch/s]  2025-04-02 19:47:18,596 - INFO - Epoch 1 (Step 054000): Train loss 4.668, Val loss 4.849
Epoch 1:  26%|██▌       | 54077/210244 [11:43<32:38, 79.73batch/s]

[2025-04-02 19:47:19,669] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  26%|██▌       | 54999/210244 [11:55<31:19, 82.58batch/s]2025-04-02 19:47:31,167 - INFO - Epoch 1 (Step 055000): Train loss 4.582, Val loss 4.838
Epoch 1:  27%|██▋       | 55996/210244 [12:09<1:11:42, 35.85batch/s]2025-04-02 19:47:45,049 - INFO - Epoch 1 (Step 056000): Train loss 4.525, Val loss 4.845
Epoch 1:  27%|██▋       | 56084/210244 [12:10<33:19, 77.10batch/s]  

[2025-04-02 19:47:46,119] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  27%|██▋       | 56996/210244 [12:21<31:16, 81.69batch/s]2025-04-02 19:47:57,500 - INFO - Epoch 1 (Step 057000): Train loss 4.449, Val loss 4.834
Epoch 1:  28%|██▊       | 57998/210244 [12:34<31:50, 79.69batch/s]2025-04-02 19:48:09,947 - INFO - Epoch 1 (Step 058000): Train loss 4.551, Val loss 4.823
Epoch 1:  28%|██▊       | 58084/210244 [12:35<31:54, 79.50batch/s]

[2025-04-02 19:48:11,061] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  28%|██▊       | 58996/210244 [12:48<31:18, 80.50batch/s]  2025-04-02 19:48:23,770 - INFO - Epoch 1 (Step 059000): Train loss 4.794, Val loss 4.815
Epoch 1:  29%|██▊       | 59997/210244 [13:00<30:20, 82.53batch/s]2025-04-02 19:48:36,163 - INFO - Epoch 1 (Step 060000): Train loss 4.634, Val loss 4.814
Epoch 1:  29%|██▊       | 60086/210244 [13:01<30:36, 81.76batch/s]

[2025-04-02 19:48:37,272] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  29%|██▉       | 60999/210244 [13:12<30:39, 81.12batch/s]2025-04-02 19:48:48,576 - INFO - Epoch 1 (Step 061000): Train loss 4.666, Val loss 4.815
Epoch 1:  29%|██▉       | 61992/210244 [13:26<30:22, 81.35batch/s]  2025-04-02 19:49:02,327 - INFO - Epoch 1 (Step 062000): Train loss 4.376, Val loss 4.779
Epoch 1:  30%|██▉       | 62092/210244 [13:27<31:02, 79.56batch/s]

[2025-04-02 19:49:03,514] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  30%|██▉       | 62999/210244 [13:39<30:11, 81.29batch/s]2025-04-02 19:49:14,784 - INFO - Epoch 1 (Step 063000): Train loss 4.773, Val loss 4.751
Epoch 1:  30%|███       | 64000/210244 [13:52<29:38, 82.22batch/s]  2025-04-02 19:49:28,519 - INFO - Epoch 1 (Step 064000): Train loss 4.492, Val loss 4.771
Epoch 1:  30%|███       | 64090/210244 [13:53<30:08, 80.80batch/s]

[2025-04-02 19:49:29,686] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  31%|███       | 64993/210244 [14:05<29:14, 82.81batch/s]2025-04-02 19:49:40,944 - INFO - Epoch 1 (Step 065000): Train loss 4.632, Val loss 4.757
Epoch 1:  31%|███▏      | 65999/210244 [14:17<29:15, 82.17batch/s]2025-04-02 19:49:53,460 - INFO - Epoch 1 (Step 066000): Train loss 4.340, Val loss 4.775
Epoch 1:  31%|███▏      | 66097/210244 [14:19<30:04, 79.88batch/s]

[2025-04-02 19:49:54,669] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  32%|███▏      | 66998/210244 [14:31<30:08, 79.22batch/s]  2025-04-02 19:50:07,306 - INFO - Epoch 1 (Step 067000): Train loss 4.672, Val loss 4.786
Epoch 1:  32%|███▏      | 67997/210244 [14:44<29:15, 81.01batch/s]2025-04-02 19:50:19,828 - INFO - Epoch 1 (Step 068000): Train loss 4.439, Val loss 4.800
Epoch 1:  32%|███▏      | 68091/210244 [14:45<29:31, 80.24batch/s]

[2025-04-02 19:50:21,066] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  33%|███▎      | 68993/210244 [14:57<42:13, 55.74batch/s]  2025-04-02 19:50:33,600 - INFO - Epoch 1 (Step 069000): Train loss 4.607, Val loss 4.786
Epoch 1:  33%|███▎      | 69994/210244 [15:10<28:44, 81.33batch/s]2025-04-02 19:50:46,100 - INFO - Epoch 1 (Step 070000): Train loss 4.379, Val loss 4.755
Epoch 1:  33%|███▎      | 70100/210244 [15:11<29:41, 78.66batch/s]

[2025-04-02 19:50:47,372] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  34%|███▍      | 71000/210244 [15:22<28:39, 81.00batch/s]2025-04-02 19:50:58,604 - INFO - Epoch 1 (Step 071000): Train loss 4.334, Val loss 4.774
Epoch 1:  34%|███▍      | 71992/210244 [15:36<29:01, 79.38batch/s]  2025-04-02 19:51:12,352 - INFO - Epoch 1 (Step 072000): Train loss 4.508, Val loss 4.750
Epoch 1:  34%|███▍      | 72095/210244 [15:37<29:05, 79.16batch/s]

[2025-04-02 19:51:13,653] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  35%|███▍      | 72995/210244 [15:48<27:36, 82.87batch/s]2025-04-02 19:51:24,711 - INFO - Epoch 1 (Step 073000): Train loss 4.604, Val loss 4.755
Epoch 1:  35%|███▌      | 73995/210244 [16:01<28:33, 79.52batch/s]2025-04-02 19:51:37,183 - INFO - Epoch 1 (Step 074000): Train loss 4.479, Val loss 4.765
Epoch 1:  35%|███▌      | 74102/210244 [16:02<28:01, 80.94batch/s]

[2025-04-02 19:51:38,468] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  36%|███▌      | 74998/210244 [16:15<27:25, 82.18batch/s]  2025-04-02 19:51:50,905 - INFO - Epoch 1 (Step 075000): Train loss 4.529, Val loss 4.734
Epoch 1:  36%|███▌      | 75993/210244 [16:27<27:52, 80.27batch/s]2025-04-02 19:52:03,420 - INFO - Epoch 1 (Step 076000): Train loss 4.489, Val loss 4.728
Epoch 1:  36%|███▌      | 76099/210244 [16:29<27:57, 79.99batch/s]

[2025-04-02 19:52:04,749] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  37%|███▋      | 76996/210244 [16:41<27:38, 80.35batch/s]  2025-04-02 19:52:17,224 - INFO - Epoch 1 (Step 077000): Train loss 4.462, Val loss 4.748
Epoch 1:  37%|███▋      | 77994/210244 [16:53<27:13, 80.97batch/s]2025-04-02 19:52:29,686 - INFO - Epoch 1 (Step 078000): Train loss 4.478, Val loss 4.719
Epoch 1:  37%|███▋      | 78102/210244 [16:55<28:09, 78.20batch/s]

[2025-04-02 19:52:31,082] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  38%|███▊      | 78992/210244 [17:06<27:04, 80.81batch/s]2025-04-02 19:52:42,152 - INFO - Epoch 1 (Step 079000): Train loss 4.357, Val loss 4.732
Epoch 1:  38%|███▊      | 79996/210244 [17:20<26:59, 80.41batch/s]  2025-04-02 19:52:55,939 - INFO - Epoch 1 (Step 080000): Train loss 4.399, Val loss 4.722
Epoch 1:  38%|███▊      | 80111/210244 [17:21<27:06, 80.00batch/s]

[2025-04-02 19:52:57,329] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  39%|███▊      | 80997/210244 [17:32<26:27, 81.43batch/s]2025-04-02 19:53:08,311 - INFO - Epoch 1 (Step 081000): Train loss 4.521, Val loss 4.719
Epoch 1:  39%|███▉      | 81997/210244 [17:46<32:13, 66.32batch/s]  2025-04-02 19:53:22,032 - INFO - Epoch 1 (Step 082000): Train loss 4.346, Val loss 4.695
Epoch 1:  39%|███▉      | 82113/210244 [17:47<26:05, 81.85batch/s]

[2025-04-02 19:53:23,415] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  39%|███▉      | 82995/210244 [17:58<25:50, 82.07batch/s]2025-04-02 19:53:34,301 - INFO - Epoch 1 (Step 083000): Train loss 4.348, Val loss 4.694
Epoch 1:  40%|███▉      | 83995/210244 [18:10<25:37, 82.13batch/s]2025-04-02 19:53:46,726 - INFO - Epoch 1 (Step 084000): Train loss 4.499, Val loss 4.695
Epoch 1:  40%|████      | 84112/210244 [18:12<26:33, 79.17batch/s]

[2025-04-02 19:53:48,139] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  40%|████      | 84999/210244 [18:24<25:18, 82.47batch/s]  2025-04-02 19:54:00,569 - INFO - Epoch 1 (Step 085000): Train loss 4.256, Val loss 4.664
Epoch 1:  41%|████      | 85995/210244 [18:37<25:09, 82.32batch/s]2025-04-02 19:54:13,031 - INFO - Epoch 1 (Step 086000): Train loss 4.450, Val loss 4.653
Epoch 1:  41%|████      | 86112/210244 [18:38<25:56, 79.73batch/s]

[2025-04-02 19:54:14,476] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  41%|████▏     | 86995/210244 [18:49<25:16, 81.25batch/s]2025-04-02 19:54:25,455 - INFO - Epoch 1 (Step 087000): Train loss 4.291, Val loss 4.661
Epoch 1:  42%|████▏     | 87995/210244 [19:03<24:54, 81.77batch/s]  2025-04-02 19:54:39,206 - INFO - Epoch 1 (Step 088000): Train loss 4.397, Val loss 4.657
Epoch 1:  42%|████▏     | 88117/210244 [19:05<25:20, 80.31batch/s]

[2025-04-02 19:54:40,691] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  42%|████▏     | 88994/210244 [19:15<25:01, 80.73batch/s]2025-04-02 19:54:51,571 - INFO - Epoch 1 (Step 089000): Train loss 4.351, Val loss 4.645
Epoch 1:  43%|████▎     | 89998/210244 [19:29<25:18, 79.19batch/s]  2025-04-02 19:55:05,316 - INFO - Epoch 1 (Step 090000): Train loss 4.314, Val loss 4.642
Epoch 1:  43%|████▎     | 90113/210244 [19:31<24:51, 80.52batch/s]

[2025-04-02 19:55:06,824] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  43%|████▎     | 90996/210244 [19:42<23:37, 84.12batch/s]2025-04-02 19:55:17,756 - INFO - Epoch 1 (Step 091000): Train loss 4.346, Val loss 4.638
Epoch 1:  44%|████▍     | 91996/210244 [19:54<24:37, 80.01batch/s]2025-04-02 19:55:30,210 - INFO - Epoch 1 (Step 092000): Train loss 4.276, Val loss 4.634
Epoch 1:  44%|████▍     | 92120/210244 [19:56<25:35, 76.92batch/s]

[2025-04-02 19:55:31,753] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  44%|████▍     | 92994/210244 [20:08<24:03, 81.21batch/s]  2025-04-02 19:55:44,050 - INFO - Epoch 1 (Step 093000): Train loss 4.628, Val loss 4.636
Epoch 1:  45%|████▍     | 93995/210244 [20:20<23:44, 81.58batch/s]2025-04-02 19:55:56,492 - INFO - Epoch 1 (Step 094000): Train loss 4.392, Val loss 4.630
Epoch 1:  45%|████▍     | 94119/210244 [20:22<24:03, 80.44batch/s]

[2025-04-02 19:55:58,046] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  45%|████▌     | 94993/210244 [20:34<26:46, 71.75batch/s]  2025-04-02 19:56:10,260 - INFO - Epoch 1 (Step 095000): Train loss 4.410, Val loss 4.621
Epoch 1:  46%|████▌     | 95994/210244 [20:46<23:36, 80.67batch/s]2025-04-02 19:56:22,721 - INFO - Epoch 1 (Step 096000): Train loss 4.362, Val loss 4.629
Epoch 1:  46%|████▌     | 96125/210244 [20:48<23:11, 82.02batch/s]

[2025-04-02 19:56:24,307] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  46%|████▌     | 96993/210244 [20:59<24:20, 77.53batch/s]2025-04-02 19:56:35,220 - INFO - Epoch 1 (Step 097000): Train loss 4.219, Val loss 4.591
Epoch 1:  47%|████▋     | 97994/210244 [21:13<23:03, 81.11batch/s]  2025-04-02 19:56:49,047 - INFO - Epoch 1 (Step 098000): Train loss 4.196, Val loss 4.617
Epoch 1:  47%|████▋     | 98125/210244 [21:14<22:46, 82.07batch/s]

[2025-04-02 19:56:50,677] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  47%|████▋     | 98998/210244 [21:25<23:00, 80.61batch/s]2025-04-02 19:57:01,571 - INFO - Epoch 1 (Step 099000): Train loss 4.350, Val loss 4.574
Epoch 1:  48%|████▊     | 100000/210244 [21:38<23:00, 79.85batch/s]2025-04-02 19:57:14,017 - INFO - Epoch 1 (Step 100000): Train loss 4.281, Val loss 4.594
Epoch 1:  48%|████▊     | 100123/210244 [21:41<43:23, 42.30batch/s]  

[2025-04-02 19:57:16,984] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  48%|████▊     | 100993/210244 [21:51<22:32, 80.80batch/s]2025-04-02 19:57:27,773 - INFO - Epoch 1 (Step 101000): Train loss 4.573, Val loss 4.610
Epoch 1:  49%|████▊     | 101992/210244 [22:04<22:14, 81.10batch/s]2025-04-02 19:57:40,194 - INFO - Epoch 1 (Step 102000): Train loss 4.372, Val loss 4.605
Epoch 1:  49%|████▊     | 102126/210244 [22:06<22:31, 80.00batch/s]

[2025-04-02 19:57:41,838] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  49%|████▉     | 102996/210244 [22:18<22:15, 80.28batch/s]  2025-04-02 19:57:53,962 - INFO - Epoch 1 (Step 103000): Train loss 4.291, Val loss 4.614
Epoch 1:  49%|████▉     | 103996/210244 [22:30<22:20, 79.25batch/s]2025-04-02 19:58:06,500 - INFO - Epoch 1 (Step 104000): Train loss 4.492, Val loss 4.627
Epoch 1:  50%|████▉     | 104128/210244 [22:32<22:06, 79.98batch/s]

[2025-04-02 19:58:08,186] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  50%|████▉     | 104995/210244 [22:43<21:37, 81.11batch/s]2025-04-02 19:58:18,997 - INFO - Epoch 1 (Step 105000): Train loss 4.350, Val loss 4.607
Epoch 1:  50%|█████     | 105998/210244 [22:57<21:16, 81.68batch/s]  2025-04-02 19:58:32,827 - INFO - Epoch 1 (Step 106000): Train loss 4.377, Val loss 4.627
Epoch 1:  50%|█████     | 106131/210244 [22:58<21:31, 80.59batch/s]

[2025-04-02 19:58:34,529] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  51%|█████     | 107000/210244 [23:09<21:14, 81.00batch/s]2025-04-02 19:58:45,298 - INFO - Epoch 1 (Step 107000): Train loss 4.217, Val loss 4.609
Epoch 1:  51%|█████▏    | 107996/210244 [23:23<21:18, 79.97batch/s]  2025-04-02 19:58:59,093 - INFO - Epoch 1 (Step 108000): Train loss 4.256, Val loss 4.615
Epoch 1:  51%|█████▏    | 108139/210244 [23:25<21:06, 80.62batch/s]

[2025-04-02 19:59:00,804] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  52%|█████▏    | 108992/210244 [23:35<20:27, 82.47batch/s]2025-04-02 19:59:11,525 - INFO - Epoch 1 (Step 109000): Train loss 4.303, Val loss 4.590
Epoch 1:  52%|█████▏    | 109998/210244 [23:48<20:54, 79.91batch/s]2025-04-02 19:59:23,894 - INFO - Epoch 1 (Step 110000): Train loss 4.288, Val loss 4.595
Epoch 1:  52%|█████▏    | 110133/210244 [23:49<20:42, 80.55batch/s]

[2025-04-02 19:59:25,624] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  53%|█████▎    | 110998/210244 [24:01<20:01, 82.61batch/s]  2025-04-02 19:59:37,677 - INFO - Epoch 1 (Step 111000): Train loss 4.305, Val loss 4.596
Epoch 1:  53%|█████▎    | 111999/210244 [24:14<20:10, 81.14batch/s]2025-04-02 19:59:50,180 - INFO - Epoch 1 (Step 112000): Train loss 4.257, Val loss 4.573
Epoch 1:  53%|█████▎    | 112140/210244 [24:16<20:23, 80.19batch/s]

[2025-04-02 19:59:51,949] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  54%|█████▎    | 112997/210244 [24:26<19:58, 81.12batch/s]2025-04-02 20:00:02,559 - INFO - Epoch 1 (Step 113000): Train loss 4.343, Val loss 4.576
Epoch 1:  54%|█████▍    | 113481/210244 [24:34<20:02, 80.44batch/s]  

[2025-04-02 20:00:09,860] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  54%|█████▍    | 113999/210244 [24:40<19:31, 82.17batch/s]2025-04-02 20:00:16,347 - INFO - Epoch 1 (Step 114000): Train loss 4.192, Val loss 4.557
Epoch 1:  55%|█████▍    | 114997/210244 [24:53<19:18, 82.21batch/s]2025-04-02 20:00:28,899 - INFO - Epoch 1 (Step 115000): Train loss 4.209, Val loss 4.595
Epoch 1:  55%|█████▌    | 115999/210244 [25:06<19:32, 80.37batch/s]  2025-04-02 20:00:42,704 - INFO - Epoch 1 (Step 116000): Train loss 4.367, Val loss 4.579
Epoch 1:  56%|█████▌    | 116992/210244 [25:19<19:17, 80.54batch/s]2025-04-02 20:00:55,257 - INFO - Epoch 1 (Step 117000): Train loss 4.347, Val loss 4.560
Epoch 1:  56%|█████▌    | 117485/210244 [25:25<18:52, 81.88batch/s]

[2025-04-02 20:01:01,211] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  56%|█████▌    | 117999/210244 [25:31<18:45, 81.93batch/s]2025-04-02 20:01:07,679 - INFO - Epoch 1 (Step 118000): Train loss 4.202, Val loss 4.559
Epoch 1:  57%|█████▋    | 118998/210244 [25:45<18:48, 80.86batch/s]  2025-04-02 20:01:21,470 - INFO - Epoch 1 (Step 119000): Train loss 4.312, Val loss 4.527
Epoch 1:  57%|█████▋    | 119485/210244 [25:51<18:31, 81.68batch/s]

[2025-04-02 20:01:27,502] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  57%|█████▋    | 119994/210244 [25:58<19:02, 79.00batch/s]2025-04-02 20:01:33,912 - INFO - Epoch 1 (Step 120000): Train loss 4.417, Val loss 4.554
Epoch 1:  58%|█████▊    | 120994/210244 [26:12<18:39, 79.73batch/s]  2025-04-02 20:01:47,828 - INFO - Epoch 1 (Step 121000): Train loss 4.340, Val loss 4.549
Epoch 1:  58%|█████▊    | 121485/210244 [26:18<18:29, 80.01batch/s]

[2025-04-02 20:01:53,959] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  58%|█████▊    | 121994/210244 [26:24<18:19, 80.26batch/s]2025-04-02 20:02:00,426 - INFO - Epoch 1 (Step 122000): Train loss 4.344, Val loss 4.544
Epoch 1:  59%|█████▊    | 122997/210244 [26:37<17:53, 81.24batch/s]2025-04-02 20:02:12,975 - INFO - Epoch 1 (Step 123000): Train loss 4.234, Val loss 4.547
Epoch 1:  59%|█████▊    | 123485/210244 [26:44<23:22, 61.86batch/s]  

[2025-04-02 20:02:20,392] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  59%|█████▉    | 123999/210244 [26:51<17:37, 81.57batch/s]2025-04-02 20:02:26,808 - INFO - Epoch 1 (Step 124000): Train loss 4.125, Val loss 4.556
Epoch 1:  59%|█████▉    | 124994/210244 [27:03<17:53, 79.42batch/s]2025-04-02 20:02:39,321 - INFO - Epoch 1 (Step 125000): Train loss 4.280, Val loss 4.530
Epoch 1:  60%|█████▉    | 125490/210244 [27:09<17:57, 78.65batch/s]

[2025-04-02 20:02:45,430] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, reducing to 65536


Epoch 1:  60%|█████▉    | 125891/210244 [27:14<17:12, 81.68batch/s]

[2025-04-02 20:02:50,381] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  60%|█████▉    | 125999/210244 [27:16<17:23, 80.74batch/s]2025-04-02 20:02:53,117 - INFO - Epoch 1 (Step 126000): Train loss 4.283, Val loss 4.529
Epoch 1:  60%|██████    | 126995/210244 [27:29<17:09, 80.84batch/s]  2025-04-02 20:03:05,572 - INFO - Epoch 1 (Step 127000): Train loss 4.380, Val loss 4.510
Epoch 1:  61%|██████    | 127999/210244 [27:42<17:23, 78.80batch/s]2025-04-02 20:03:18,093 - INFO - Epoch 1 (Step 128000): Train loss 4.239, Val loss 4.516
Epoch 1:  61%|██████    | 128305/210244 [27:46<16:58, 80.42batch/s]

[2025-04-02 20:03:21,961] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  61%|██████▏   | 128996/210244 [27:56<17:05, 79.23batch/s]  2025-04-02 20:03:31,901 - INFO - Epoch 1 (Step 129000): Train loss 4.342, Val loss 4.525
Epoch 1:  62%|██████▏   | 129998/210244 [28:08<16:42, 80.08batch/s]2025-04-02 20:03:44,340 - INFO - Epoch 1 (Step 130000): Train loss 4.110, Val loss 4.502
Epoch 1:  62%|██████▏   | 130994/210244 [28:21<16:05, 82.10batch/s]2025-04-02 20:03:56,798 - INFO - Epoch 1 (Step 131000): Train loss 4.219, Val loss 4.517
Epoch 1:  62%|██████▏   | 131353/210244 [28:26<16:05, 81.70batch/s]  

[2025-04-02 20:04:02,565] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  63%|██████▎   | 131996/210244 [28:34<16:31, 78.93batch/s]2025-04-02 20:04:10,577 - INFO - Epoch 1 (Step 132000): Train loss 4.325, Val loss 4.531
Epoch 1:  63%|██████▎   | 132998/210244 [28:47<16:24, 78.46batch/s]2025-04-02 20:04:23,074 - INFO - Epoch 1 (Step 133000): Train loss 4.126, Val loss 4.513
Epoch 1:  63%|██████▎   | 133381/210244 [28:52<15:45, 81.27batch/s]

[2025-04-02 20:04:27,887] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  64%|██████▎   | 133998/210244 [29:01<15:43, 80.80batch/s]  2025-04-02 20:04:36,867 - INFO - Epoch 1 (Step 134000): Train loss 4.285, Val loss 4.522
Epoch 1:  64%|██████▍   | 134998/210244 [29:13<15:36, 80.38batch/s]2025-04-02 20:04:49,351 - INFO - Epoch 1 (Step 135000): Train loss 4.103, Val loss 4.530
Epoch 1:  64%|██████▍   | 135505/210244 [29:19<15:03, 82.70batch/s]

[2025-04-02 20:04:55,598] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  65%|██████▍   | 135994/210244 [29:26<16:07, 76.78batch/s]2025-04-02 20:05:01,816 - INFO - Epoch 1 (Step 136000): Train loss 4.263, Val loss 4.508
Epoch 1:  65%|██████▌   | 136994/210244 [29:39<15:12, 80.28batch/s]  2025-04-02 20:05:15,640 - INFO - Epoch 1 (Step 137000): Train loss 4.292, Val loss 4.542
Epoch 1:  65%|██████▌   | 137679/210244 [29:48<14:52, 81.30batch/s]

[2025-04-02 20:05:24,183] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  66%|██████▌   | 137994/210244 [29:52<14:53, 80.91batch/s]2025-04-02 20:05:28,086 - INFO - Epoch 1 (Step 138000): Train loss 4.199, Val loss 4.551
Epoch 1:  66%|██████▌   | 139000/210244 [30:06<20:04, 59.16batch/s]  2025-04-02 20:05:41,965 - INFO - Epoch 1 (Step 139000): Train loss 4.221, Val loss 4.522
Epoch 1:  67%|██████▋   | 139995/210244 [30:18<14:17, 81.89batch/s]2025-04-02 20:05:54,386 - INFO - Epoch 1 (Step 140000): Train loss 4.259, Val loss 4.525
Epoch 1:  67%|██████▋   | 140336/210244 [30:22<14:35, 79.81batch/s]

[2025-04-02 20:05:58,574] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  67%|██████▋   | 140999/210244 [30:31<14:06, 81.83batch/s]2025-04-02 20:06:06,846 - INFO - Epoch 1 (Step 141000): Train loss 3.837, Val loss 4.480
Epoch 1:  68%|██████▊   | 141995/210244 [30:44<13:52, 81.96batch/s]  2025-04-02 20:06:20,536 - INFO - Epoch 1 (Step 142000): Train loss 4.263, Val loss 4.517
Epoch 1:  68%|██████▊   | 142872/210244 [30:55<13:52, 80.96batch/s]

[2025-04-02 20:06:31,420] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  68%|██████▊   | 142994/210244 [30:57<14:24, 77.80batch/s]2025-04-02 20:06:33,080 - INFO - Epoch 1 (Step 143000): Train loss 4.312, Val loss 4.532
Epoch 1:  68%|██████▊   | 143992/210244 [31:09<13:45, 80.30batch/s]2025-04-02 20:06:45,614 - INFO - Epoch 1 (Step 144000): Train loss 4.257, Val loss 4.493
Epoch 1:  69%|██████▉   | 144981/210244 [31:23<14:06, 77.10batch/s]  

[2025-04-02 20:06:59,184] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  69%|██████▉   | 144997/210244 [31:23<14:11, 76.63batch/s]2025-04-02 20:06:59,494 - INFO - Epoch 1 (Step 145000): Train loss 4.413, Val loss 4.495
Epoch 1:  69%|██████▉   | 145992/210244 [31:36<13:09, 81.34batch/s]2025-04-02 20:07:11,910 - INFO - Epoch 1 (Step 146000): Train loss 4.238, Val loss 4.490
Epoch 1:  70%|██████▉   | 146359/210244 [31:40<13:06, 81.23batch/s]

[2025-04-02 20:07:16,366] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  70%|██████▉   | 146992/210244 [31:49<13:24, 78.66batch/s]  2025-04-02 20:07:25,674 - INFO - Epoch 1 (Step 147000): Train loss 4.103, Val loss 4.485
Epoch 1:  70%|███████   | 147992/210244 [32:02<13:03, 79.48batch/s]2025-04-02 20:07:38,096 - INFO - Epoch 1 (Step 148000): Train loss 4.127, Val loss 4.504
Epoch 1:  71%|███████   | 148520/210244 [32:08<12:31, 82.10batch/s]

[2025-04-02 20:07:44,461] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  71%|███████   | 148992/210244 [32:14<13:00, 78.51batch/s]2025-04-02 20:07:50,462 - INFO - Epoch 1 (Step 149000): Train loss 4.260, Val loss 4.473
Epoch 1:  71%|███████▏  | 149992/210244 [32:28<12:24, 80.95batch/s]2025-04-02 20:08:04,448 - INFO - Epoch 1 (Step 150000): Train loss 4.292, Val loss 4.487
Epoch 1:  72%|███████▏  | 150983/210244 [32:41<13:09, 75.09batch/s]

[2025-04-02 20:08:16,804] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  72%|███████▏  | 150992/210244 [32:41<12:41, 77.78batch/s]2025-04-02 20:08:16,971 - INFO - Epoch 1 (Step 151000): Train loss 4.385, Val loss 4.515
Epoch 1:  72%|███████▏  | 152000/210244 [32:55<12:26, 78.06batch/s]2025-04-02 20:08:30,817 - INFO - Epoch 1 (Step 152000): Train loss 4.270, Val loss 4.503
Epoch 1:  73%|███████▎  | 152993/210244 [33:07<11:51, 80.42batch/s]2025-04-02 20:08:43,310 - INFO - Epoch 1 (Step 153000): Train loss 4.166, Val loss 4.487
Epoch 1:  73%|███████▎  | 153997/210244 [33:20<11:34, 81.00batch/s]2025-04-02 20:08:55,778 - INFO - Epoch 1 (Step 154000): Train loss 4.123, Val loss 4.491
Epoch 1:  74%|███████▎  | 154997/210244 [33:33<11:03, 83.24batch/s]2025-04-02 20:09:09,603 - INFO - Epoch 1 (Step 155000): Train loss 4.220, Val loss 4.491
Epoch 1:  74%|███████▍  | 155078/210244 [33:34<11:49, 77.80batch/s]

[2025-04-02 20:09:10,653] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768


Epoch 1:  74%|███████▍  | 156000/210244 [33:46<11:33, 78.17batch/s]2025-04-02 20:09:22,230 - INFO - Epoch 1 (Step 156000): Train loss 4.143, Val loss 4.486
Epoch 1:  74%|███████▍  | 156142/210244 [33:48<11:21, 79.43batch/s]

[2025-04-02 20:09:23,993] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  75%|███████▍  | 156996/210244 [33:58<11:01, 80.55batch/s]2025-04-02 20:09:34,590 - INFO - Epoch 1 (Step 157000): Train loss 4.221, Val loss 4.498
Epoch 1:  75%|███████▌  | 157995/210244 [34:12<10:41, 81.42batch/s]2025-04-02 20:09:48,371 - INFO - Epoch 1 (Step 158000): Train loss 4.344, Val loss 4.476
Epoch 1:  75%|███████▌  | 158724/210244 [34:21<10:30, 81.67batch/s]

[2025-04-02 20:09:57,194] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  76%|███████▌  | 158994/210244 [34:24<10:33, 80.89batch/s]2025-04-02 20:10:00,641 - INFO - Epoch 1 (Step 159000): Train loss 4.057, Val loss 4.485
Epoch 1:  76%|███████▌  | 159992/210244 [34:38<10:16, 81.45batch/s]2025-04-02 20:10:14,357 - INFO - Epoch 1 (Step 160000): Train loss 4.247, Val loss 4.454
Epoch 1:  77%|███████▋  | 160995/210244 [34:50<09:57, 82.38batch/s]2025-04-02 20:10:26,703 - INFO - Epoch 1 (Step 161000): Train loss 4.317, Val loss 4.481
Epoch 1:  77%|███████▋  | 161021/210244 [34:51<11:00, 74.52batch/s]

[2025-04-02 20:10:26,969] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  77%|███████▋  | 161998/210244 [35:03<10:15, 78.36batch/s]2025-04-02 20:10:39,137 - INFO - Epoch 1 (Step 162000): Train loss 4.191, Val loss 4.476
Epoch 1:  78%|███████▊  | 162995/210244 [35:17<09:40, 81.45batch/s]2025-04-02 20:10:53,034 - INFO - Epoch 1 (Step 163000): Train loss 4.144, Val loss 4.490
Epoch 1:  78%|███████▊  | 164000/210244 [35:29<10:00, 76.95batch/s]2025-04-02 20:11:05,572 - INFO - Epoch 1 (Step 164000): Train loss 4.282, Val loss 4.493
Epoch 1:  78%|███████▊  | 164197/210244 [35:32<09:19, 82.36batch/s]

[2025-04-02 20:11:08,032] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  78%|███████▊  | 164994/210244 [35:43<09:38, 78.16batch/s]2025-04-02 20:11:19,255 - INFO - Epoch 1 (Step 165000): Train loss 4.175, Val loss 4.497
Epoch 1:  79%|███████▉  | 165996/210244 [35:56<09:15, 79.67batch/s]2025-04-02 20:11:31,963 - INFO - Epoch 1 (Step 166000): Train loss 4.234, Val loss 4.481
Epoch 1:  79%|███████▉  | 166253/210244 [35:59<08:52, 82.54batch/s]

[2025-04-02 20:11:35,158] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  79%|███████▉  | 166999/210244 [36:08<09:15, 77.82batch/s]2025-04-02 20:11:44,457 - INFO - Epoch 1 (Step 167000): Train loss 4.142, Val loss 4.456
Epoch 1:  80%|███████▉  | 167993/210244 [36:22<08:32, 82.50batch/s]2025-04-02 20:11:58,196 - INFO - Epoch 1 (Step 168000): Train loss 4.095, Val loss 4.463
Epoch 1:  80%|████████  | 169000/210244 [36:34<08:28, 81.16batch/s]2025-04-02 20:12:10,546 - INFO - Epoch 1 (Step 169000): Train loss 4.263, Val loss 4.461
Epoch 1:  80%|████████  | 169045/210244 [36:35<08:40, 79.14batch/s]

[2025-04-02 20:12:11,096] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  81%|████████  | 169997/210244 [36:47<08:11, 81.96batch/s]2025-04-02 20:12:23,151 - INFO - Epoch 1 (Step 170000): Train loss 4.096, Val loss 4.446
Epoch 1:  81%|████████▏ | 170996/210244 [37:01<08:05, 80.81batch/s]2025-04-02 20:12:37,000 - INFO - Epoch 1 (Step 171000): Train loss 4.281, Val loss 4.463
Epoch 1:  82%|████████▏ | 171530/210244 [37:07<08:01, 80.49batch/s]

[2025-04-02 20:12:43,594] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  82%|████████▏ | 171997/210244 [37:13<07:50, 81.26batch/s]2025-04-02 20:12:49,422 - INFO - Epoch 1 (Step 172000): Train loss 4.245, Val loss 4.477
Epoch 1:  82%|████████▏ | 173000/210244 [37:27<07:41, 80.73batch/s]2025-04-02 20:13:03,359 - INFO - Epoch 1 (Step 173000): Train loss 4.140, Val loss 4.459
Epoch 1:  83%|████████▎ | 173996/210244 [37:40<07:24, 81.46batch/s]2025-04-02 20:13:15,761 - INFO - Epoch 1 (Step 174000): Train loss 4.225, Val loss 4.441
Epoch 1:  83%|████████▎ | 174749/210244 [37:49<07:10, 82.43batch/s]

[2025-04-02 20:13:25,012] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  83%|████████▎ | 174992/210244 [37:52<07:15, 80.95batch/s]2025-04-02 20:13:28,142 - INFO - Epoch 1 (Step 175000): Train loss 4.225, Val loss 4.440
Epoch 1:  84%|████████▎ | 175996/210244 [38:06<07:08, 79.97batch/s]2025-04-02 20:13:41,970 - INFO - Epoch 1 (Step 176000): Train loss 4.300, Val loss 4.456
Epoch 1:  84%|████████▍ | 176999/210244 [38:18<06:59, 79.18batch/s]2025-04-02 20:13:54,477 - INFO - Epoch 1 (Step 177000): Train loss 4.186, Val loss 4.441
Epoch 1:  84%|████████▍ | 177515/210244 [38:25<06:50, 79.76batch/s]

[2025-04-02 20:14:01,085] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  85%|████████▍ | 177999/210244 [38:32<06:35, 81.48batch/s]2025-04-02 20:14:08,368 - INFO - Epoch 1 (Step 178000): Train loss 4.323, Val loss 4.413
Epoch 1:  85%|████████▌ | 178994/210244 [38:44<06:26, 80.75batch/s]2025-04-02 20:14:20,771 - INFO - Epoch 1 (Step 179000): Train loss 4.092, Val loss 4.450
Epoch 1:  85%|████████▌ | 179612/210244 [38:52<06:18, 80.84batch/s]

[2025-04-02 20:14:28,377] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  86%|████████▌ | 179996/210244 [38:57<06:16, 80.37batch/s]2025-04-02 20:14:33,240 - INFO - Epoch 1 (Step 180000): Train loss 4.145, Val loss 4.437
Epoch 1:  86%|████████▌ | 180992/210244 [39:11<06:00, 81.16batch/s]2025-04-02 20:14:47,082 - INFO - Epoch 1 (Step 181000): Train loss 4.102, Val loss 4.442
Epoch 1:  86%|████████▋ | 181667/210244 [39:19<05:53, 80.78batch/s]

[2025-04-02 20:14:55,359] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  87%|████████▋ | 181999/210244 [39:23<05:49, 80.84batch/s]2025-04-02 20:14:59,540 - INFO - Epoch 1 (Step 182000): Train loss 4.214, Val loss 4.463
Epoch 1:  87%|████████▋ | 182994/210244 [39:37<16:34, 27.39batch/s]2025-04-02 20:15:13,360 - INFO - Epoch 1 (Step 183000): Train loss 3.946, Val loss 4.433
Epoch 1:  87%|████████▋ | 183156/210244 [39:39<05:31, 81.76batch/s]

[2025-04-02 20:15:15,346] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 1:  88%|████████▊ | 183995/210244 [39:50<05:23, 81.13batch/s]2025-04-02 20:15:25,814 - INFO - Epoch 1 (Step 184000): Train loss 4.204, Val loss 4.476
Epoch 1:  88%|████████▊ | 184999/210244 [40:02<05:08, 81.91batch/s]2025-04-02 20:15:38,288 - INFO - Epoch 1 (Step 185000): Train loss 4.144, Val loss 4.448
Epoch 1:  88%|████████▊ | 186000/210244 [40:16<05:01, 80.53batch/s]2025-04-02 20:15:52,106 - INFO - Epoch 1 (Step 186000): Train loss 4.035, Val loss 4.482
Epoch 1:  89%|████████▉ | 186997/210244 [40:28<04:40, 82.87batch/s]2025-04-02 20:16:04,613 - INFO - Epoch 1 (Step 187000): Train loss 4.159, Val loss 4.473
Epoch 1:  89%|████████▉ | 187203/210244 [40:31<04:41, 81.82batch/s]

[2025-04-02 20:16:07,192] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  89%|████████▉ | 187996/210244 [40:41<04:36, 80.39batch/s]2025-04-02 20:16:17,104 - INFO - Epoch 1 (Step 188000): Train loss 4.235, Val loss 4.452
Epoch 1:  90%|████████▉ | 188999/210244 [40:55<04:24, 80.47batch/s]2025-04-02 20:16:30,976 - INFO - Epoch 1 (Step 189000): Train loss 4.231, Val loss 4.443
Epoch 1:  90%|████████▉ | 189071/210244 [40:56<04:21, 80.83batch/s]

[2025-04-02 20:16:31,921] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 1:  90%|█████████ | 189999/210244 [41:07<04:06, 82.03batch/s]2025-04-02 20:16:43,509 - INFO - Epoch 1 (Step 190000): Train loss 4.289, Val loss 4.421
Epoch 1:  91%|█████████ | 190994/210244 [41:21<03:55, 81.61batch/s]2025-04-02 20:16:57,350 - INFO - Epoch 1 (Step 191000): Train loss 4.107, Val loss 4.451
Epoch 1:  91%|█████████▏| 191995/210244 [41:34<03:40, 82.59batch/s]2025-04-02 20:17:09,820 - INFO - Epoch 1 (Step 192000): Train loss 4.054, Val loss 4.454
Epoch 1:  92%|█████████▏| 192995/210244 [41:46<03:35, 80.01batch/s]2025-04-02 20:17:22,397 - INFO - Epoch 1 (Step 193000): Train loss 4.228, Val loss 4.429
Epoch 1:  92%|█████████▏| 193173/210244 [41:48<03:27, 82.33batch/s]

[2025-04-02 20:17:24,525] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  92%|█████████▏| 193994/210244 [42:00<03:21, 80.54batch/s]2025-04-02 20:17:36,205 - INFO - Epoch 1 (Step 194000): Train loss 4.147, Val loss 4.430
Epoch 1:  93%|█████████▎| 194995/210244 [42:12<03:05, 82.37batch/s]2025-04-02 20:17:48,662 - INFO - Epoch 1 (Step 195000): Train loss 4.120, Val loss 4.431
Epoch 1:  93%|█████████▎| 195273/210244 [42:16<03:02, 81.96batch/s]

[2025-04-02 20:17:52,078] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  93%|█████████▎| 195994/210244 [42:26<03:08, 75.77batch/s]2025-04-02 20:18:02,432 - INFO - Epoch 1 (Step 196000): Train loss 4.123, Val loss 4.429
Epoch 1:  94%|█████████▎| 196997/210244 [42:39<02:41, 81.85batch/s]2025-04-02 20:18:14,862 - INFO - Epoch 1 (Step 197000): Train loss 4.069, Val loss 4.424
Epoch 1:  94%|█████████▍| 197550/210244 [42:46<02:36, 81.22batch/s]

[2025-04-02 20:18:21,756] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  94%|█████████▍| 198000/210244 [42:51<02:31, 80.92batch/s]2025-04-02 20:18:27,263 - INFO - Epoch 1 (Step 198000): Train loss 4.153, Val loss 4.416
Epoch 1:  95%|█████████▍| 198775/210244 [43:02<02:23, 80.10batch/s]

[2025-04-02 20:18:38,299] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 1:  95%|█████████▍| 199000/210244 [43:05<02:21, 79.47batch/s]2025-04-02 20:18:41,104 - INFO - Epoch 1 (Step 199000): Train loss 4.318, Val loss 4.445
Epoch 1:  95%|█████████▌| 199992/210244 [43:17<02:14, 76.24batch/s]2025-04-02 20:18:53,616 - INFO - Epoch 1 (Step 200000): Train loss 4.015, Val loss 4.442
Epoch 1:  96%|█████████▌| 200999/210244 [43:30<01:52, 81.93batch/s]2025-04-02 20:19:05,959 - INFO - Epoch 1 (Step 201000): Train loss 4.083, Val loss 4.433
Epoch 1:  96%|█████████▌| 201993/210244 [43:43<01:41, 81.32batch/s]2025-04-02 20:19:19,742 - INFO - Epoch 1 (Step 202000): Train loss 4.100, Val loss 4.436
Epoch 1:  96%|█████████▋| 202778/210244 [43:53<01:35, 78.48batch/s]

[2025-04-02 20:19:29,457] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384


Epoch 1:  97%|█████████▋| 203000/210244 [43:56<01:34, 76.85batch/s]2025-04-02 20:19:32,260 - INFO - Epoch 1 (Step 203000): Train loss 4.092, Val loss 4.427
Epoch 1:  97%|█████████▋| 203846/210244 [44:08<01:27, 72.93batch/s]

[2025-04-02 20:19:44,311] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 1:  97%|█████████▋| 203997/210244 [44:10<01:15, 82.22batch/s]2025-04-02 20:19:46,190 - INFO - Epoch 1 (Step 204000): Train loss 3.962, Val loss 4.436
Epoch 1:  98%|█████████▊| 204998/210244 [44:23<01:05, 79.84batch/s]2025-04-02 20:19:58,753 - INFO - Epoch 1 (Step 205000): Train loss 4.196, Val loss 4.444
Epoch 1:  98%|█████████▊| 205996/210244 [44:35<00:52, 80.44batch/s]2025-04-02 20:20:11,227 - INFO - Epoch 1 (Step 206000): Train loss 4.214, Val loss 4.437
Epoch 1:  98%|█████████▊| 206482/210244 [44:42<00:47, 80.01batch/s]

[2025-04-02 20:20:18,640] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 1:  98%|█████████▊| 206995/210244 [44:49<00:39, 82.00batch/s]2025-04-02 20:20:25,128 - INFO - Epoch 1 (Step 207000): Train loss 4.157, Val loss 4.438
Epoch 1:  99%|█████████▉| 207992/210244 [45:01<00:27, 80.97batch/s]2025-04-02 20:20:37,588 - INFO - Epoch 1 (Step 208000): Train loss 4.218, Val loss 4.452
Epoch 1:  99%|█████████▉| 208530/210244 [45:08<00:21, 81.55batch/s]

[2025-04-02 20:20:44,304] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 1:  99%|█████████▉| 208998/210244 [45:15<00:15, 79.76batch/s]2025-04-02 20:20:51,471 - INFO - Epoch 1 (Step 209000): Train loss 4.140, Val loss 4.418
Epoch 1: 100%|█████████▉| 209992/210244 [45:28<00:03, 81.53batch/s]2025-04-02 20:21:03,930 - INFO - Epoch 1 (Step 210000): Train loss 4.112, Val loss 4.425
Epoch 1: 100%|██████████| 210244/210244 [45:31<00:00, 76.98batch/s]
2025-04-02 20:21:06,958 - INFO - Epoch 1 completed. Generating a sample...
2025-04-02 20:21:07,028 - INFO - Generated Text: Every effort moves you to get to the game . " 
   = = = = = = = = = = = 
   The game was released in the United States on November 7 , 2010 , in the United States . The game was released in
2025-04-02 20:21:07,030 - INFO - Starting Epoch 2...


Every effort moves you to get to the game . "     = = = = = = = = = = =     The game was released in the United States on November 7 , 2010 , in the United States . The game was released in


Epoch 2:   0%|          | 755/210244 [00:09<43:05, 81.02batch/s]2025-04-02 20:21:16,538 - INFO - Epoch 2 (Step 211000): Train loss 4.095, Val loss 4.421
Epoch 2:   1%|          | 1420/210244 [00:19<44:23, 78.40batch/s]  

[2025-04-02 20:21:26,311] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:   1%|          | 1754/210244 [00:23<46:45, 74.33batch/s]2025-04-02 20:21:30,545 - INFO - Epoch 2 (Step 212000): Train loss 4.180, Val loss 4.399
Epoch 2:   1%|▏         | 2752/210244 [00:35<42:57, 80.49batch/s]2025-04-02 20:21:43,150 - INFO - Epoch 2 (Step 213000): Train loss 3.975, Val loss 4.445
Epoch 2:   2%|▏         | 3750/210244 [00:49<1:23:22, 41.28batch/s]2025-04-02 20:21:56,950 - INFO - Epoch 2 (Step 214000): Train loss 4.156, Val loss 4.442
Epoch 2:   2%|▏         | 4752/210244 [01:02<43:04, 79.49batch/s]  2025-04-02 20:22:09,626 - INFO - Epoch 2 (Step 215000): Train loss 4.055, Val loss 4.437
Epoch 2:   2%|▏         | 5057/210244 [01:06<42:42, 80.07batch/s]

[2025-04-02 20:22:13,503] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:   3%|▎         | 5748/210244 [01:14<41:05, 82.95batch/s]2025-04-02 20:22:22,103 - INFO - Epoch 2 (Step 216000): Train loss 3.959, Val loss 4.417
Epoch 2:   3%|▎         | 6752/210244 [01:28<42:12, 80.35batch/s]  2025-04-02 20:22:35,889 - INFO - Epoch 2 (Step 217000): Train loss 4.126, Val loss 4.444
Epoch 2:   4%|▎         | 7753/210244 [01:41<41:32, 81.23batch/s]2025-04-02 20:22:48,379 - INFO - Epoch 2 (Step 218000): Train loss 4.084, Val loss 4.409
Epoch 2:   4%|▍         | 8238/210244 [01:47<41:53, 80.36batch/s]

[2025-04-02 20:22:54,327] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:   4%|▍         | 8490/210244 [01:50<41:50, 80.38batch/s]

[2025-04-02 20:22:57,396] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:   4%|▍         | 8750/210244 [01:53<41:29, 80.93batch/s]2025-04-02 20:23:00,694 - INFO - Epoch 2 (Step 219000): Train loss 4.027, Val loss 4.409
Epoch 2:   5%|▍         | 9748/210244 [02:07<40:23, 82.71batch/s]  2025-04-02 20:23:14,462 - INFO - Epoch 2 (Step 220000): Train loss 4.153, Val loss 4.399
Epoch 2:   5%|▌         | 10752/210244 [02:19<41:52, 79.38batch/s]2025-04-02 20:23:26,983 - INFO - Epoch 2 (Step 221000): Train loss 4.124, Val loss 4.412
Epoch 2:   6%|▌         | 11752/210244 [02:33<40:48, 81.07batch/s]  2025-04-02 20:23:40,797 - INFO - Epoch 2 (Step 222000): Train loss 3.998, Val loss 4.428
Epoch 2:   6%|▌         | 12599/210244 [02:44<40:25, 81.48batch/s]

[2025-04-02 20:23:51,217] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:   6%|▌         | 12751/210244 [02:46<40:29, 81.29batch/s]2025-04-02 20:23:53,206 - INFO - Epoch 2 (Step 223000): Train loss 4.074, Val loss 4.425
Epoch 2:   7%|▋         | 13752/210244 [02:58<41:55, 78.11batch/s]2025-04-02 20:24:05,694 - INFO - Epoch 2 (Step 224000): Train loss 4.225, Val loss 4.412
Epoch 2:   7%|▋         | 14697/210244 [03:11<40:29, 80.48batch/s]  

[2025-04-02 20:24:18,860] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:   7%|▋         | 14751/210244 [03:12<40:06, 81.23batch/s]2025-04-02 20:24:19,602 - INFO - Epoch 2 (Step 225000): Train loss 4.201, Val loss 4.392
Epoch 2:   7%|▋         | 15484/210244 [03:21<40:16, 80.59batch/s]

[2025-04-02 20:24:28,699] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:   7%|▋         | 15753/210244 [03:24<39:38, 81.76batch/s]2025-04-02 20:24:32,104 - INFO - Epoch 2 (Step 226000): Train loss 4.052, Val loss 4.408
Epoch 2:   8%|▊         | 16748/210244 [03:38<51:50, 62.21batch/s]  2025-04-02 20:24:45,848 - INFO - Epoch 2 (Step 227000): Train loss 4.249, Val loss 4.424
Epoch 2:   8%|▊         | 17755/210244 [03:51<41:29, 77.31batch/s]2025-04-02 20:24:58,361 - INFO - Epoch 2 (Step 228000): Train loss 4.283, Val loss 4.411
Epoch 2:   9%|▊         | 18184/210244 [03:56<39:19, 81.40batch/s]

[2025-04-02 20:25:03,661] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:   9%|▉         | 18749/210244 [04:03<39:02, 81.74batch/s]2025-04-02 20:25:10,729 - INFO - Epoch 2 (Step 229000): Train loss 4.048, Val loss 4.408
Epoch 2:   9%|▉         | 19750/210244 [04:17<39:14, 80.92batch/s]  2025-04-02 20:25:24,633 - INFO - Epoch 2 (Step 230000): Train loss 3.889, Val loss 4.383
Epoch 2:  10%|▉         | 20755/210244 [04:29<38:46, 81.46batch/s]2025-04-02 20:25:37,091 - INFO - Epoch 2 (Step 231000): Train loss 3.896, Val loss 4.373
Epoch 2:  10%|▉         | 20850/210244 [04:31<39:43, 79.47batch/s]

[2025-04-02 20:25:38,347] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  10%|█         | 21756/210244 [04:42<38:52, 80.80batch/s]2025-04-02 20:25:49,662 - INFO - Epoch 2 (Step 232000): Train loss 4.264, Val loss 4.374
Epoch 2:  11%|█         | 22748/210244 [04:56<38:57, 80.21batch/s]  2025-04-02 20:26:03,463 - INFO - Epoch 2 (Step 233000): Train loss 4.121, Val loss 4.360
Epoch 2:  11%|█▏        | 23751/210244 [05:08<37:01, 83.94batch/s]2025-04-02 20:26:15,804 - INFO - Epoch 2 (Step 234000): Train loss 4.127, Val loss 4.381
Epoch 2:  11%|█▏        | 23847/210244 [05:09<40:17, 77.10batch/s]

[2025-04-02 20:26:17,008] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  12%|█▏        | 24755/210244 [05:22<38:54, 79.45batch/s]  2025-04-02 20:26:29,702 - INFO - Epoch 2 (Step 235000): Train loss 4.089, Val loss 4.359
Epoch 2:  12%|█▏        | 25750/210244 [05:34<38:21, 80.17batch/s]2025-04-02 20:26:42,163 - INFO - Epoch 2 (Step 236000): Train loss 4.027, Val loss 4.391
Epoch 2:  13%|█▎        | 26753/210244 [05:47<37:34, 81.37batch/s]2025-04-02 20:26:54,710 - INFO - Epoch 2 (Step 237000): Train loss 4.149, Val loss 4.376
Epoch 2:  13%|█▎        | 27754/210244 [06:01<37:07, 81.92batch/s]  2025-04-02 20:27:08,508 - INFO - Epoch 2 (Step 238000): Train loss 3.954, Val loss 4.406
Epoch 2:  13%|█▎        | 27903/210244 [06:03<37:17, 81.51batch/s]

[2025-04-02 20:27:10,451] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  14%|█▎        | 28755/210244 [06:13<41:02, 73.69batch/s]2025-04-02 20:27:21,057 - INFO - Epoch 2 (Step 239000): Train loss 4.163, Val loss 4.386
Epoch 2:  14%|█▍        | 29752/210244 [06:27<37:47, 79.61batch/s]  2025-04-02 20:27:34,882 - INFO - Epoch 2 (Step 240000): Train loss 4.123, Val loss 4.379
Epoch 2:  15%|█▍        | 30628/210244 [06:38<36:58, 80.96batch/s]

[2025-04-02 20:27:45,759] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  15%|█▍        | 30754/210244 [06:40<37:00, 80.84batch/s]2025-04-02 20:27:47,332 - INFO - Epoch 2 (Step 241000): Train loss 4.248, Val loss 4.379
Epoch 2:  15%|█▌        | 31755/210244 [06:52<36:23, 81.73batch/s]2025-04-02 20:27:59,834 - INFO - Epoch 2 (Step 242000): Train loss 4.207, Val loss 4.386
Epoch 2:  16%|█▌        | 32750/210244 [07:06<36:33, 80.90batch/s]  2025-04-02 20:28:13,623 - INFO - Epoch 2 (Step 243000): Train loss 4.112, Val loss 4.383
Epoch 2:  16%|█▌        | 33089/210244 [07:10<35:37, 82.90batch/s]

[2025-04-02 20:28:17,784] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  16%|█▌        | 33754/210244 [07:18<36:19, 80.99batch/s]2025-04-02 20:28:26,001 - INFO - Epoch 2 (Step 244000): Train loss 4.073, Val loss 4.393
Epoch 2:  17%|█▋        | 34748/210244 [07:31<36:31, 80.08batch/s]2025-04-02 20:28:38,466 - INFO - Epoch 2 (Step 245000): Train loss 4.093, Val loss 4.357
Epoch 2:  17%|█▋        | 35207/210244 [07:38<35:44, 81.61batch/s]  

[2025-04-02 20:28:45,532] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  17%|█▋        | 35756/210244 [07:45<36:24, 79.87batch/s]2025-04-02 20:28:52,291 - INFO - Epoch 2 (Step 246000): Train loss 4.129, Val loss 4.375
Epoch 2:  17%|█▋        | 36751/210244 [07:57<35:04, 82.44batch/s]2025-04-02 20:29:04,707 - INFO - Epoch 2 (Step 247000): Train loss 4.070, Val loss 4.389
Epoch 2:  18%|█▊        | 37260/210244 [08:03<35:22, 81.52batch/s]

[2025-04-02 20:29:10,945] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  18%|█▊        | 37750/210244 [08:09<35:11, 81.70batch/s]2025-04-02 20:29:17,135 - INFO - Epoch 2 (Step 248000): Train loss 4.032, Val loss 4.380
Epoch 2:  18%|█▊        | 37973/210244 [08:12<34:57, 82.14batch/s]

[2025-04-02 20:29:19,899] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  18%|█▊        | 38753/210244 [08:24<35:51, 79.72batch/s]  2025-04-02 20:29:31,513 - INFO - Epoch 2 (Step 249000): Train loss 4.173, Val loss 4.377
Epoch 2:  19%|█▉        | 39752/210244 [08:36<35:11, 80.75batch/s]2025-04-02 20:29:43,861 - INFO - Epoch 2 (Step 250000): Train loss 4.339, Val loss 4.355
Epoch 2:  19%|█▉        | 40753/210244 [08:49<35:19, 79.97batch/s]2025-04-02 20:29:56,418 - INFO - Epoch 2 (Step 251000): Train loss 4.157, Val loss 4.367
Epoch 2:  19%|█▉        | 40859/210244 [08:50<34:19, 82.26batch/s]

[2025-04-02 20:29:57,707] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  20%|█▉        | 41756/210244 [09:03<34:39, 81.02batch/s]  2025-04-02 20:30:10,150 - INFO - Epoch 2 (Step 252000): Train loss 4.009, Val loss 4.378
Epoch 2:  20%|██        | 42756/210244 [09:15<34:54, 79.98batch/s]2025-04-02 20:30:22,619 - INFO - Epoch 2 (Step 253000): Train loss 4.029, Val loss 4.379
Epoch 2:  21%|██        | 43139/210244 [09:20<33:42, 82.62batch/s]

[2025-04-02 20:30:27,356] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  21%|██        | 43749/210244 [09:27<34:59, 79.29batch/s]2025-04-02 20:30:35,078 - INFO - Epoch 2 (Step 254000): Train loss 4.075, Val loss 4.389
Epoch 2:  21%|██▏       | 44748/210244 [09:41<33:54, 81.33batch/s]  2025-04-02 20:30:48,892 - INFO - Epoch 2 (Step 255000): Train loss 4.143, Val loss 4.405
Epoch 2:  22%|██▏       | 45751/210244 [09:54<35:59, 76.18batch/s]2025-04-02 20:31:01,372 - INFO - Epoch 2 (Step 256000): Train loss 4.041, Val loss 4.420
Epoch 2:  22%|██▏       | 46748/210244 [10:08<33:38, 81.01batch/s]  2025-04-02 20:31:15,254 - INFO - Epoch 2 (Step 257000): Train loss 4.253, Val loss 4.396
Epoch 2:  23%|██▎       | 47658/210244 [10:19<33:45, 80.27batch/s]

[2025-04-02 20:31:26,425] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  23%|██▎       | 47748/210244 [10:20<33:17, 81.33batch/s]2025-04-02 20:31:27,679 - INFO - Epoch 2 (Step 258000): Train loss 4.082, Val loss 4.407
Epoch 2:  23%|██▎       | 48378/210244 [10:28<32:34, 82.83batch/s]

[2025-04-02 20:31:35,405] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  23%|██▎       | 48755/210244 [10:32<33:20, 80.72batch/s]2025-04-02 20:31:40,078 - INFO - Epoch 2 (Step 259000): Train loss 4.097, Val loss 4.415
Epoch 2:  24%|██▎       | 49750/210244 [10:46<33:05, 80.83batch/s]  2025-04-02 20:31:54,006 - INFO - Epoch 2 (Step 260000): Train loss 4.172, Val loss 4.399
Epoch 2:  24%|██▍       | 50754/210244 [10:59<33:50, 78.54batch/s]2025-04-02 20:32:06,438 - INFO - Epoch 2 (Step 261000): Train loss 4.169, Val loss 4.407
Epoch 2:  25%|██▍       | 51752/210244 [11:13<33:19, 79.25batch/s]  2025-04-02 20:32:20,246 - INFO - Epoch 2 (Step 262000): Train loss 4.196, Val loss 4.408
Epoch 2:  25%|██▌       | 52681/210244 [11:24<31:36, 83.09batch/s]

[2025-04-02 20:32:31,755] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  25%|██▌       | 52753/210244 [11:25<33:17, 78.85batch/s]2025-04-02 20:32:32,703 - INFO - Epoch 2 (Step 263000): Train loss 4.053, Val loss 4.408
Epoch 2:  25%|██▌       | 52922/210244 [11:27<32:24, 80.90batch/s]

[2025-04-02 20:32:34,764] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  26%|██▌       | 53752/210244 [11:37<32:35, 80.03batch/s]2025-04-02 20:32:45,148 - INFO - Epoch 2 (Step 264000): Train loss 4.147, Val loss 4.397
Epoch 2:  26%|██▌       | 54756/210244 [11:51<32:24, 79.97batch/s]  2025-04-02 20:32:58,981 - INFO - Epoch 2 (Step 265000): Train loss 4.085, Val loss 4.387
Epoch 2:  26%|██▋       | 55599/210244 [12:02<31:13, 82.55batch/s]

[2025-04-02 20:33:09,348] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  27%|██▋       | 55752/210244 [12:04<31:36, 81.44batch/s]2025-04-02 20:33:11,289 - INFO - Epoch 2 (Step 266000): Train loss 3.942, Val loss 4.369
Epoch 2:  27%|██▋       | 56753/210244 [12:18<1:51:16, 22.99batch/s]2025-04-02 20:33:25,148 - INFO - Epoch 2 (Step 267000): Train loss 3.936, Val loss 4.405
Epoch 2:  27%|██▋       | 57751/210244 [12:30<31:17, 81.21batch/s]  2025-04-02 20:33:37,653 - INFO - Epoch 2 (Step 268000): Train loss 4.047, Val loss 4.405
Epoch 2:  28%|██▊       | 58753/210244 [12:43<31:20, 80.54batch/s]2025-04-02 20:33:50,181 - INFO - Epoch 2 (Step 269000): Train loss 4.010, Val loss 4.402
Epoch 2:  28%|██▊       | 59718/210244 [12:56<31:02, 80.82batch/s]  

[2025-04-02 20:34:03,533] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  28%|██▊       | 59754/210244 [12:56<31:02, 80.79batch/s]2025-04-02 20:34:04,029 - INFO - Epoch 2 (Step 270000): Train loss 4.130, Val loss 4.395
Epoch 2:  28%|██▊       | 59858/210244 [12:58<31:41, 79.09batch/s]

[2025-04-02 20:34:05,347] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  29%|██▉       | 60748/210244 [13:09<30:30, 81.66batch/s]2025-04-02 20:34:16,426 - INFO - Epoch 2 (Step 271000): Train loss 3.924, Val loss 4.402
Epoch 2:  29%|██▉       | 61748/210244 [13:21<31:27, 78.69batch/s]2025-04-02 20:34:28,902 - INFO - Epoch 2 (Step 272000): Train loss 4.106, Val loss 4.398
Epoch 2:  30%|██▉       | 62377/210244 [13:30<30:18, 81.32batch/s]  

[2025-04-02 20:34:38,059] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  30%|██▉       | 62748/210244 [13:35<30:47, 79.84batch/s]2025-04-02 20:34:42,795 - INFO - Epoch 2 (Step 273000): Train loss 4.072, Val loss 4.410
Epoch 2:  30%|███       | 63756/210244 [13:48<30:20, 80.47batch/s]2025-04-02 20:34:55,448 - INFO - Epoch 2 (Step 274000): Train loss 4.069, Val loss 4.416
Epoch 2:  31%|███       | 64390/210244 [13:56<30:38, 79.32batch/s]

[2025-04-02 20:35:03,502] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  31%|███       | 64753/210244 [14:02<30:32, 79.38batch/s]  2025-04-02 20:35:09,507 - INFO - Epoch 2 (Step 275000): Train loss 4.182, Val loss 4.424
Epoch 2:  31%|███▏      | 65750/210244 [14:14<29:43, 81.00batch/s]2025-04-02 20:35:21,884 - INFO - Epoch 2 (Step 276000): Train loss 3.850, Val loss 4.406
Epoch 2:  31%|███▏      | 65973/210244 [14:17<28:49, 83.43batch/s]

[2025-04-02 20:35:24,653] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  32%|███▏      | 66752/210244 [14:27<29:19, 81.53batch/s]2025-04-02 20:35:34,219 - INFO - Epoch 2 (Step 277000): Train loss 3.942, Val loss 4.405
Epoch 2:  32%|███▏      | 67748/210244 [14:40<29:35, 80.27batch/s]  2025-04-02 20:35:48,034 - INFO - Epoch 2 (Step 278000): Train loss 4.127, Val loss 4.415
Epoch 2:  33%|███▎      | 68750/210244 [14:53<29:29, 79.95batch/s]2025-04-02 20:36:00,418 - INFO - Epoch 2 (Step 279000): Train loss 4.119, Val loss 4.418
Epoch 2:  33%|███▎      | 69756/210244 [15:07<33:26, 70.02batch/s]  2025-04-02 20:36:14,202 - INFO - Epoch 2 (Step 280000): Train loss 3.913, Val loss 4.382
Epoch 2:  34%|███▎      | 70756/210244 [15:19<28:44, 80.91batch/s]2025-04-02 20:36:26,623 - INFO - Epoch 2 (Step 281000): Train loss 4.086, Val loss 4.348
Epoch 2:  34%|███▍      | 71070/210244 [15:23<28:59, 80.00batch/s]

[2025-04-02 20:36:30,566] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  34%|███▍      | 71749/210244 [15:31<28:30, 80.96batch/s]2025-04-02 20:36:39,167 - INFO - Epoch 2 (Step 282000): Train loss 4.092, Val loss 4.379
Epoch 2:  35%|███▍      | 72753/210244 [15:45<28:05, 81.57batch/s]  2025-04-02 20:36:52,982 - INFO - Epoch 2 (Step 283000): Train loss 3.989, Val loss 4.403
Epoch 2:  35%|███▍      | 73137/210244 [15:50<27:47, 82.22batch/s]

[2025-04-02 20:36:57,853] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  35%|███▌      | 73753/210244 [15:58<27:32, 82.60batch/s]2025-04-02 20:37:05,503 - INFO - Epoch 2 (Step 284000): Train loss 4.147, Val loss 4.376
Epoch 2:  36%|███▌      | 74752/210244 [16:10<28:13, 79.98batch/s]2025-04-02 20:37:18,080 - INFO - Epoch 2 (Step 285000): Train loss 3.940, Val loss 4.376
Epoch 2:  36%|███▌      | 75708/210244 [16:24<28:08, 79.66batch/s]  

[2025-04-02 20:37:31,286] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  36%|███▌      | 75753/210244 [16:24<27:48, 80.60batch/s]2025-04-02 20:37:31,857 - INFO - Epoch 2 (Step 286000): Train loss 3.969, Val loss 4.412
Epoch 2:  37%|███▋      | 76748/210244 [16:37<28:07, 79.09batch/s]2025-04-02 20:37:44,319 - INFO - Epoch 2 (Step 287000): Train loss 4.166, Val loss 4.413
Epoch 2:  37%|███▋      | 77752/210244 [16:51<28:18, 78.02batch/s]  2025-04-02 20:37:58,159 - INFO - Epoch 2 (Step 288000): Train loss 4.086, Val loss 4.346
Epoch 2:  37%|███▋      | 78017/210244 [16:54<27:28, 80.19batch/s]

[2025-04-02 20:38:01,440] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  37%|███▋      | 78748/210244 [17:03<27:07, 80.82batch/s]2025-04-02 20:38:10,606 - INFO - Epoch 2 (Step 289000): Train loss 4.047, Val loss 4.386
Epoch 2:  38%|███▊      | 79121/210244 [17:08<26:47, 81.55batch/s]

[2025-04-02 20:38:15,313] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  38%|███▊      | 79754/210244 [17:16<26:57, 80.68batch/s]2025-04-02 20:38:23,189 - INFO - Epoch 2 (Step 290000): Train loss 4.254, Val loss 4.374
Epoch 2:  38%|███▊      | 80756/210244 [17:29<26:50, 80.38batch/s]  2025-04-02 20:38:37,030 - INFO - Epoch 2 (Step 291000): Train loss 4.074, Val loss 4.358
Epoch 2:  39%|███▉      | 81755/210244 [17:42<27:02, 79.21batch/s]2025-04-02 20:38:49,457 - INFO - Epoch 2 (Step 292000): Train loss 3.989, Val loss 4.399
Epoch 2:  39%|███▉      | 82749/210244 [17:55<26:52, 79.06batch/s]  2025-04-02 20:39:03,138 - INFO - Epoch 2 (Step 293000): Train loss 4.157, Val loss 4.382
Epoch 2:  40%|███▉      | 83756/210244 [18:08<26:17, 80.17batch/s]2025-04-02 20:39:15,757 - INFO - Epoch 2 (Step 294000): Train loss 4.100, Val loss 4.391
Epoch 2:  40%|████      | 84749/210244 [18:21<26:09, 79.97batch/s]2025-04-02 20:39:28,298 - INFO - Epoch 2 (Step 295000): Train loss 4.121, Val loss 4.362
Epoch 2:  41%|████      | 85163/210244 [18:26<25:27, 81.86batch/s]

[2025-04-02 20:39:33,423] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  41%|████      | 85521/210244 [18:32<25:29, 81.53batch/s]  

[2025-04-02 20:39:39,167] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  41%|████      | 85754/210244 [18:34<25:20, 81.85batch/s]2025-04-02 20:39:42,063 - INFO - Epoch 2 (Step 296000): Train loss 4.011, Val loss 4.367
Epoch 2:  41%|████▏     | 86754/210244 [18:47<27:03, 76.05batch/s]2025-04-02 20:39:54,532 - INFO - Epoch 2 (Step 297000): Train loss 3.959, Val loss 4.371
Epoch 2:  42%|████▏     | 87748/210244 [18:59<25:23, 80.40batch/s]2025-04-02 20:40:06,914 - INFO - Epoch 2 (Step 298000): Train loss 4.206, Val loss 4.375
Epoch 2:  42%|████▏     | 88752/210244 [19:13<24:53, 81.36batch/s]  2025-04-02 20:40:20,655 - INFO - Epoch 2 (Step 299000): Train loss 4.167, Val loss 4.376
Epoch 2:  43%|████▎     | 89754/210244 [19:26<24:55, 80.55batch/s]2025-04-02 20:40:33,189 - INFO - Epoch 2 (Step 300000): Train loss 3.986, Val loss 4.363
Epoch 2:  43%|████▎     | 89825/210244 [19:26<24:45, 81.09batch/s]

[2025-04-02 20:40:34,132] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  43%|████▎     | 90751/210244 [19:39<24:25, 81.55batch/s]  2025-04-02 20:40:46,971 - INFO - Epoch 2 (Step 301000): Train loss 4.005, Val loss 4.358
Epoch 2:  44%|████▎     | 91753/210244 [19:52<24:26, 80.82batch/s]2025-04-02 20:40:59,426 - INFO - Epoch 2 (Step 302000): Train loss 4.168, Val loss 4.376
Epoch 2:  44%|████▎     | 91895/210244 [19:54<24:13, 81.40batch/s]

[2025-04-02 20:41:01,174] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  44%|████▎     | 91931/210244 [19:54<24:15, 81.28batch/s]

[2025-04-02 20:41:01,642] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  44%|████▍     | 92756/210244 [20:04<25:27, 76.93batch/s]2025-04-02 20:41:11,923 - INFO - Epoch 2 (Step 303000): Train loss 4.087, Val loss 4.363
Epoch 2:  45%|████▍     | 93748/210244 [20:18<24:23, 79.62batch/s]  2025-04-02 20:41:25,827 - INFO - Epoch 2 (Step 304000): Train loss 4.174, Val loss 4.402
Epoch 2:  45%|████▍     | 94027/210244 [20:22<23:31, 82.34batch/s]

[2025-04-02 20:41:29,186] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  45%|████▌     | 94754/210244 [20:31<24:14, 79.38batch/s]2025-04-02 20:41:38,284 - INFO - Epoch 2 (Step 305000): Train loss 3.925, Val loss 4.386
Epoch 2:  46%|████▌     | 95751/210244 [20:44<23:31, 81.12batch/s]  2025-04-02 20:41:52,114 - INFO - Epoch 2 (Step 306000): Train loss 4.205, Val loss 4.342
Epoch 2:  46%|████▌     | 96751/210244 [20:57<23:15, 81.32batch/s]2025-04-02 20:42:04,570 - INFO - Epoch 2 (Step 307000): Train loss 4.130, Val loss 4.364
Epoch 2:  46%|████▌     | 96910/210244 [20:59<22:57, 82.29batch/s]

[2025-04-02 20:42:06,504] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  46%|████▋     | 97752/210244 [21:09<23:39, 79.24batch/s]2025-04-02 20:42:17,025 - INFO - Epoch 2 (Step 308000): Train loss 3.993, Val loss 4.396
Epoch 2:  47%|████▋     | 98751/210244 [21:23<22:51, 81.27batch/s]  2025-04-02 20:42:30,903 - INFO - Epoch 2 (Step 309000): Train loss 4.187, Val loss 4.388
Epoch 2:  47%|████▋     | 99539/210244 [21:33<22:59, 80.28batch/s]

[2025-04-02 20:42:40,714] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  47%|████▋     | 99755/210244 [21:36<22:33, 81.64batch/s]2025-04-02 20:42:43,376 - INFO - Epoch 2 (Step 310000): Train loss 4.131, Val loss 4.351
Epoch 2:  48%|████▊     | 100754/210244 [21:48<22:27, 81.24batch/s]2025-04-02 20:42:55,818 - INFO - Epoch 2 (Step 311000): Train loss 3.969, Val loss 4.349
Epoch 2:  48%|████▊     | 101750/210244 [22:02<22:20, 80.94batch/s]  2025-04-02 20:43:09,618 - INFO - Epoch 2 (Step 312000): Train loss 4.130, Val loss 4.359
Epoch 2:  49%|████▉     | 102576/210244 [22:12<21:53, 81.98batch/s]

[2025-04-02 20:43:19,786] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  49%|████▉     | 102756/210244 [22:14<22:09, 80.86batch/s]2025-04-02 20:43:21,979 - INFO - Epoch 2 (Step 313000): Train loss 4.088, Val loss 4.349
Epoch 2:  49%|████▉     | 103756/210244 [22:28<22:24, 79.20batch/s]  2025-04-02 20:43:35,720 - INFO - Epoch 2 (Step 314000): Train loss 4.037, Val loss 4.379
Epoch 2:  50%|████▉     | 104753/210244 [22:41<21:18, 82.51batch/s]2025-04-02 20:43:48,164 - INFO - Epoch 2 (Step 315000): Train loss 3.946, Val loss 4.390
Epoch 2:  50%|█████     | 105750/210244 [22:53<22:35, 77.12batch/s]2025-04-02 20:44:00,605 - INFO - Epoch 2 (Step 316000): Train loss 4.032, Val loss 4.392
Epoch 2:  51%|█████     | 106660/210244 [23:06<21:36, 79.88batch/s]  

[2025-04-02 20:44:13,232] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  51%|█████     | 106722/210244 [23:06<21:50, 79.00batch/s]

[2025-04-02 20:44:13,967] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  51%|█████     | 106755/210244 [23:07<21:49, 79.05batch/s]2025-04-02 20:44:14,456 - INFO - Epoch 2 (Step 317000): Train loss 3.818, Val loss 4.381
Epoch 2:  51%|█████     | 106954/210244 [23:09<21:27, 80.19batch/s]

[2025-04-02 20:44:17,017] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  51%|█████▏    | 107752/210244 [23:19<21:23, 79.84batch/s]2025-04-02 20:44:27,012 - INFO - Epoch 2 (Step 318000): Train loss 4.102, Val loss 4.371
Epoch 2:  52%|█████▏    | 108754/210244 [23:33<20:40, 81.80batch/s]  2025-04-02 20:44:40,741 - INFO - Epoch 2 (Step 319000): Train loss 3.979, Val loss 4.363
Epoch 2:  52%|█████▏    | 109755/210244 [23:46<20:27, 81.88batch/s]2025-04-02 20:44:53,185 - INFO - Epoch 2 (Step 320000): Train loss 4.200, Val loss 4.368
Epoch 2:  53%|█████▎    | 110750/210244 [23:58<20:44, 79.95batch/s]2025-04-02 20:45:05,652 - INFO - Epoch 2 (Step 321000): Train loss 4.203, Val loss 4.354
Epoch 2:  53%|█████▎    | 111665/210244 [24:11<21:50, 75.22batch/s]  

[2025-04-02 20:45:18,474] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  53%|█████▎    | 111753/210244 [24:12<20:19, 80.77batch/s]2025-04-02 20:45:19,575 - INFO - Epoch 2 (Step 322000): Train loss 4.081, Val loss 4.371
Epoch 2:  54%|█████▎    | 112755/210244 [24:24<20:06, 80.79batch/s]2025-04-02 20:45:31,996 - INFO - Epoch 2 (Step 323000): Train loss 4.033, Val loss 4.394
Epoch 2:  54%|█████▍    | 113750/210244 [24:37<19:48, 81.21batch/s]2025-04-02 20:45:44,339 - INFO - Epoch 2 (Step 324000): Train loss 3.963, Val loss 4.366
Epoch 2:  55%|█████▍    | 114749/210244 [24:50<19:57, 79.72batch/s]  2025-04-02 20:45:58,121 - INFO - Epoch 2 (Step 325000): Train loss 4.008, Val loss 4.379
Epoch 2:  55%|█████▌    | 115744/210244 [25:03<19:32, 80.61batch/s]

[2025-04-02 20:46:10,481] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  55%|█████▌    | 115753/210244 [25:03<19:06, 82.45batch/s]2025-04-02 20:46:10,629 - INFO - Epoch 2 (Step 326000): Train loss 4.130, Val loss 4.354
Epoch 2:  56%|█████▌    | 116752/210244 [25:17<19:05, 81.63batch/s]  2025-04-02 20:46:24,440 - INFO - Epoch 2 (Step 327000): Train loss 4.383, Val loss 4.368
Epoch 2:  56%|█████▌    | 116779/210244 [25:17<20:49, 74.82batch/s]

[2025-04-02 20:46:24,725] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  56%|█████▌    | 117754/210244 [25:29<19:34, 78.73batch/s]2025-04-02 20:46:36,825 - INFO - Epoch 2 (Step 328000): Train loss 4.083, Val loss 4.364
Epoch 2:  56%|█████▋    | 118748/210244 [25:42<19:38, 77.66batch/s]2025-04-02 20:46:49,331 - INFO - Epoch 2 (Step 329000): Train loss 4.085, Val loss 4.373
Epoch 2:  57%|█████▋    | 119750/210244 [25:56<18:49, 80.15batch/s]  2025-04-02 20:47:03,209 - INFO - Epoch 2 (Step 330000): Train loss 3.927, Val loss 4.390
Epoch 2:  57%|█████▋    | 120231/210244 [26:02<18:12, 82.40batch/s]

[2025-04-02 20:47:09,112] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  57%|█████▋    | 120753/210244 [26:08<18:16, 81.58batch/s]2025-04-02 20:47:15,574 - INFO - Epoch 2 (Step 331000): Train loss 3.906, Val loss 4.366
Epoch 2:  58%|█████▊    | 121756/210244 [26:22<18:30, 79.72batch/s]  2025-04-02 20:47:29,469 - INFO - Epoch 2 (Step 332000): Train loss 4.113, Val loss 4.375
Epoch 2:  58%|█████▊    | 122753/210244 [26:34<18:09, 80.28batch/s]2025-04-02 20:47:41,965 - INFO - Epoch 2 (Step 333000): Train loss 3.947, Val loss 4.378
Epoch 2:  59%|█████▉    | 123756/210244 [26:47<17:29, 82.43batch/s]2025-04-02 20:47:54,601 - INFO - Epoch 2 (Step 334000): Train loss 4.118, Val loss 4.390
Epoch 2:  59%|█████▉    | 124295/210244 [26:55<17:43, 80.85batch/s]  

[2025-04-02 20:48:02,599] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  59%|█████▉    | 124756/210244 [27:01<17:37, 80.82batch/s]2025-04-02 20:48:08,375 - INFO - Epoch 2 (Step 335000): Train loss 4.096, Val loss 4.377
Epoch 2:  60%|█████▉    | 125749/210244 [27:13<17:15, 81.61batch/s]2025-04-02 20:48:20,941 - INFO - Epoch 2 (Step 336000): Train loss 4.137, Val loss 4.380
Epoch 2:  60%|██████    | 126340/210244 [27:21<17:21, 80.53batch/s]

[2025-04-02 20:48:28,252] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  60%|██████    | 126750/210244 [27:27<39:41, 35.06batch/s]  2025-04-02 20:48:34,739 - INFO - Epoch 2 (Step 337000): Train loss 4.047, Val loss 4.360
Epoch 2:  60%|██████    | 126945/210244 [27:30<17:08, 80.95batch/s]

[2025-04-02 20:48:37,087] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  61%|██████    | 127753/210244 [27:40<17:03, 80.57batch/s]2025-04-02 20:48:47,170 - INFO - Epoch 2 (Step 338000): Train loss 3.992, Val loss 4.360
Epoch 2:  61%|██████    | 128755/210244 [27:52<16:43, 81.21batch/s]2025-04-02 20:48:59,744 - INFO - Epoch 2 (Step 339000): Train loss 4.227, Val loss 4.385
Epoch 2:  62%|██████▏   | 129756/210244 [28:06<16:32, 81.06batch/s]  2025-04-02 20:49:13,530 - INFO - Epoch 2 (Step 340000): Train loss 3.948, Val loss 4.381
Epoch 2:  62%|██████▏   | 130755/210244 [28:18<16:07, 82.17batch/s]2025-04-02 20:49:26,053 - INFO - Epoch 2 (Step 341000): Train loss 4.104, Val loss 4.379
Epoch 2:  62%|██████▏   | 131016/210244 [28:22<16:15, 81.22batch/s]

[2025-04-02 20:49:29,269] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  63%|██████▎   | 131748/210244 [28:31<16:20, 80.09batch/s]2025-04-02 20:49:38,481 - INFO - Epoch 2 (Step 342000): Train loss 4.128, Val loss 4.369
Epoch 2:  63%|██████▎   | 132756/210244 [28:45<16:00, 80.66batch/s]  2025-04-02 20:49:52,153 - INFO - Epoch 2 (Step 343000): Train loss 4.078, Val loss 4.327
Epoch 2:  63%|██████▎   | 133255/210244 [28:51<15:57, 80.41batch/s]

[2025-04-02 20:49:58,429] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  63%|██████▎   | 133273/210244 [28:51<15:56, 80.49batch/s]

[2025-04-02 20:49:58,703] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  64%|██████▎   | 133756/210244 [28:57<15:50, 80.47batch/s]2025-04-02 20:50:04,789 - INFO - Epoch 2 (Step 344000): Train loss 4.043, Val loss 4.345
Epoch 2:  64%|██████▍   | 134755/210244 [29:11<15:29, 81.22batch/s]  2025-04-02 20:50:18,657 - INFO - Epoch 2 (Step 345000): Train loss 3.865, Val loss 4.343
Epoch 2:  65%|██████▍   | 135755/210244 [29:23<15:16, 81.26batch/s]2025-04-02 20:50:31,081 - INFO - Epoch 2 (Step 346000): Train loss 4.046, Val loss 4.357
Epoch 2:  65%|██████▍   | 135961/210244 [29:26<15:21, 80.62batch/s]

[2025-04-02 20:50:33,659] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  65%|██████▍   | 136177/210244 [29:29<15:04, 81.88batch/s]

[2025-04-02 20:50:36,317] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  65%|██████▌   | 136751/210244 [29:36<14:56, 82.00batch/s]2025-04-02 20:50:43,543 - INFO - Epoch 2 (Step 347000): Train loss 4.052, Val loss 4.348
Epoch 2:  66%|██████▌   | 137756/210244 [29:50<15:59, 75.58batch/s]  2025-04-02 20:50:57,357 - INFO - Epoch 2 (Step 348000): Train loss 4.102, Val loss 4.367
Epoch 2:  66%|██████▌   | 138748/210244 [30:02<14:52, 80.13batch/s]2025-04-02 20:51:09,741 - INFO - Epoch 2 (Step 349000): Train loss 4.128, Val loss 4.356
Epoch 2:  66%|██████▋   | 139751/210244 [30:16<16:01, 73.31batch/s]  2025-04-02 20:51:23,522 - INFO - Epoch 2 (Step 350000): Train loss 4.180, Val loss 4.359
Epoch 2:  67%|██████▋   | 140752/210244 [30:28<14:00, 82.71batch/s]2025-04-02 20:51:35,885 - INFO - Epoch 2 (Step 351000): Train loss 4.211, Val loss 4.381
Epoch 2:  67%|██████▋   | 141019/210244 [30:32<15:03, 76.60batch/s]

[2025-04-02 20:51:39,253] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  67%|██████▋   | 141748/210244 [30:41<13:54, 82.08batch/s]2025-04-02 20:51:48,332 - INFO - Epoch 2 (Step 352000): Train loss 3.933, Val loss 4.377
Epoch 2:  68%|██████▊   | 142748/210244 [30:55<13:45, 81.74batch/s]  2025-04-02 20:52:02,244 - INFO - Epoch 2 (Step 353000): Train loss 3.871, Val loss 4.380
Epoch 2:  68%|██████▊   | 143755/210244 [31:07<13:22, 82.88batch/s]2025-04-02 20:52:14,645 - INFO - Epoch 2 (Step 354000): Train loss 4.034, Val loss 4.373
Epoch 2:  68%|██████▊   | 143895/210244 [31:09<13:57, 79.19batch/s]

[2025-04-02 20:52:16,402] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  69%|██████▉   | 144750/210244 [31:19<13:43, 79.57batch/s]2025-04-02 20:52:27,135 - INFO - Epoch 2 (Step 355000): Train loss 4.172, Val loss 4.358
Epoch 2:  69%|██████▉   | 145755/210244 [31:33<13:12, 81.37batch/s]  2025-04-02 20:52:40,963 - INFO - Epoch 2 (Step 356000): Train loss 3.964, Val loss 4.362
Epoch 2:  70%|██████▉   | 146751/210244 [31:46<13:02, 81.18batch/s]2025-04-02 20:52:53,370 - INFO - Epoch 2 (Step 357000): Train loss 3.885, Val loss 4.363
Epoch 2:  70%|███████   | 147752/210244 [32:00<12:53, 80.84batch/s]  2025-04-02 20:53:07,205 - INFO - Epoch 2 (Step 358000): Train loss 4.099, Val loss 4.365
Epoch 2:  71%|███████   | 148386/210244 [32:07<12:58, 79.42batch/s]

[2025-04-02 20:53:15,035] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  71%|███████   | 148751/210244 [32:12<12:38, 81.07batch/s]2025-04-02 20:53:19,638 - INFO - Epoch 2 (Step 359000): Train loss 3.917, Val loss 4.363
Epoch 2:  71%|███████   | 149749/210244 [32:24<12:23, 81.33batch/s]2025-04-02 20:53:32,131 - INFO - Epoch 2 (Step 360000): Train loss 3.963, Val loss 4.360
Epoch 2:  72%|███████▏  | 150703/210244 [32:38<12:09, 81.60batch/s]

[2025-04-02 20:53:45,187] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  72%|███████▏  | 150748/210244 [32:38<12:23, 80.02batch/s]2025-04-02 20:53:45,900 - INFO - Epoch 2 (Step 361000): Train loss 4.081, Val loss 4.354
Epoch 2:  72%|███████▏  | 151082/210244 [32:43<12:31, 78.74batch/s]

[2025-04-02 20:53:50,073] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  72%|███████▏  | 151750/210244 [32:51<12:24, 78.53batch/s]2025-04-02 20:53:58,526 - INFO - Epoch 2 (Step 362000): Train loss 3.836, Val loss 4.372
Epoch 2:  73%|███████▎  | 152754/210244 [33:05<12:42, 75.43batch/s]2025-04-02 20:54:12,395 - INFO - Epoch 2 (Step 363000): Train loss 4.050, Val loss 4.384
Epoch 2:  73%|███████▎  | 153408/210244 [33:13<11:41, 81.03batch/s]

[2025-04-02 20:54:20,513] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  73%|███████▎  | 153750/210244 [33:17<11:33, 81.48batch/s]2025-04-02 20:54:24,796 - INFO - Epoch 2 (Step 364000): Train loss 4.092, Val loss 4.399
Epoch 2:  74%|███████▎  | 154755/210244 [33:30<11:12, 82.47batch/s]2025-04-02 20:54:37,145 - INFO - Epoch 2 (Step 365000): Train loss 3.977, Val loss 4.359
Epoch 2:  74%|███████▍  | 155751/210244 [33:43<11:29, 79.06batch/s]2025-04-02 20:54:51,020 - INFO - Epoch 2 (Step 366000): Train loss 4.195, Val loss 4.377
Epoch 2:  74%|███████▍  | 156363/210244 [33:51<11:02, 81.35batch/s]

[2025-04-02 20:54:58,672] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  74%|███████▍  | 156408/210244 [33:52<11:18, 79.39batch/s]

[2025-04-02 20:54:59,193] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  75%|███████▍  | 156749/210244 [33:56<10:54, 81.74batch/s]2025-04-02 20:55:03,470 - INFO - Epoch 2 (Step 367000): Train loss 3.995, Val loss 4.371
Epoch 2:  75%|███████▌  | 157750/210244 [34:08<11:04, 78.96batch/s]2025-04-02 20:55:15,961 - INFO - Epoch 2 (Step 368000): Train loss 4.120, Val loss 4.383
Epoch 2:  76%|███████▌  | 158748/210244 [34:23<10:36, 80.89batch/s]2025-04-02 20:55:30,215 - INFO - Epoch 2 (Step 369000): Train loss 3.983, Val loss 4.406
Epoch 2:  76%|███████▌  | 159754/210244 [34:35<10:21, 81.24batch/s]2025-04-02 20:55:42,539 - INFO - Epoch 2 (Step 370000): Train loss 4.054, Val loss 4.368
Epoch 2:  76%|███████▋  | 160753/210244 [34:49<10:04, 81.92batch/s]2025-04-02 20:55:56,368 - INFO - Epoch 2 (Step 371000): Train loss 3.939, Val loss 4.398
Epoch 2:  77%|███████▋  | 161748/210244 [35:01<09:54, 81.54batch/s]2025-04-02 20:56:08,790 - INFO - Epoch 2 (Step 372000): Train loss 4.081, Val loss 4.374
Epoch 2:  77%|███████▋  | 162448/210244 [35:10<10:07, 78.64batch

[2025-04-02 20:56:17,409] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  77%|███████▋  | 162754/210244 [35:14<10:09, 77.93batch/s]2025-04-02 20:56:21,270 - INFO - Epoch 2 (Step 373000): Train loss 4.175, Val loss 4.374
Epoch 2:  78%|███████▊  | 163749/210244 [35:27<09:22, 82.63batch/s]2025-04-02 20:56:35,083 - INFO - Epoch 2 (Step 374000): Train loss 4.074, Val loss 4.424
Epoch 2:  78%|███████▊  | 164497/210244 [35:37<09:20, 81.60batch/s]

[2025-04-02 20:56:44,311] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  78%|███████▊  | 164695/210244 [35:39<09:19, 81.42batch/s]

[2025-04-02 20:56:46,746] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  78%|███████▊  | 164749/210244 [35:40<09:21, 81.01batch/s]2025-04-02 20:56:47,531 - INFO - Epoch 2 (Step 375000): Train loss 3.937, Val loss 4.395
Epoch 2:  79%|███████▉  | 165749/210244 [35:54<09:11, 80.64batch/s]2025-04-02 20:57:01,319 - INFO - Epoch 2 (Step 376000): Train loss 4.011, Val loss 4.388
Epoch 2:  79%|███████▉  | 166748/210244 [36:06<08:57, 80.90batch/s]2025-04-02 20:57:13,783 - INFO - Epoch 2 (Step 377000): Train loss 4.005, Val loss 4.362
Epoch 2:  79%|███████▉  | 166810/210244 [36:07<09:10, 78.87batch/s]

[2025-04-02 20:57:14,462] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  80%|███████▉  | 167748/210244 [36:18<08:51, 79.92batch/s]2025-04-02 20:57:26,191 - INFO - Epoch 2 (Step 378000): Train loss 4.130, Val loss 4.387
Epoch 2:  80%|████████  | 168751/210244 [36:32<08:26, 81.97batch/s]2025-04-02 20:57:40,007 - INFO - Epoch 2 (Step 379000): Train loss 4.037, Val loss 4.380
Epoch 2:  81%|████████  | 169748/210244 [36:45<08:16, 81.48batch/s]2025-04-02 20:57:52,577 - INFO - Epoch 2 (Step 380000): Train loss 4.063, Val loss 4.380
Epoch 2:  81%|████████  | 170751/210244 [36:59<22:24, 29.37batch/s]2025-04-02 20:58:06,306 - INFO - Epoch 2 (Step 381000): Train loss 4.138, Val loss 4.375
Epoch 2:  81%|████████  | 170821/210244 [37:00<08:58, 73.18batch/s]

[2025-04-02 20:58:07,208] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  81%|████████▏ | 170943/210244 [37:01<08:02, 81.52batch/s]

[2025-04-02 20:58:08,695] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  82%|████████▏ | 171754/210244 [37:11<08:04, 79.43batch/s]2025-04-02 20:58:18,886 - INFO - Epoch 2 (Step 382000): Train loss 3.969, Val loss 4.373
Epoch 2:  82%|████████▏ | 172751/210244 [37:24<07:51, 79.46batch/s]2025-04-02 20:58:31,424 - INFO - Epoch 2 (Step 383000): Train loss 4.028, Val loss 4.406
Epoch 2:  82%|████████▏ | 173397/210244 [37:33<07:57, 77.13batch/s]

[2025-04-02 20:58:40,713] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  83%|████████▎ | 173756/210244 [37:38<07:33, 80.49batch/s]2025-04-02 20:58:45,117 - INFO - Epoch 2 (Step 384000): Train loss 3.982, Val loss 4.406
Epoch 2:  83%|████████▎ | 174755/210244 [37:50<07:12, 82.08batch/s]2025-04-02 20:58:57,513 - INFO - Epoch 2 (Step 385000): Train loss 4.162, Val loss 4.391
Epoch 2:  84%|████████▎ | 175750/210244 [38:02<07:09, 80.22batch/s]2025-04-02 20:59:09,975 - INFO - Epoch 2 (Step 386000): Train loss 4.045, Val loss 4.400
Epoch 2:  84%|████████▍ | 176750/210244 [38:16<06:54, 80.89batch/s]2025-04-02 20:59:23,803 - INFO - Epoch 2 (Step 387000): Train loss 4.042, Val loss 4.384
Epoch 2:  84%|████████▍ | 177581/210244 [38:26<06:43, 80.92batch/s]

[2025-04-02 20:59:34,070] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  85%|████████▍ | 177752/210244 [38:29<06:40, 81.04batch/s]2025-04-02 20:59:36,198 - INFO - Epoch 2 (Step 388000): Train loss 4.026, Val loss 4.374
Epoch 2:  85%|████████▌ | 178754/210244 [38:42<06:26, 81.49batch/s]2025-04-02 20:59:49,936 - INFO - Epoch 2 (Step 389000): Train loss 4.176, Val loss 4.371
Epoch 2:  85%|████████▌ | 179672/210244 [38:54<06:14, 81.59batch/s]

[2025-04-02 21:00:01,374] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  85%|████████▌ | 179753/210244 [38:55<06:09, 82.60batch/s]2025-04-02 21:00:02,427 - INFO - Epoch 2 (Step 390000): Train loss 4.079, Val loss 4.384
Epoch 2:  86%|████████▌ | 180488/210244 [39:04<06:16, 79.06batch/s]

[2025-04-02 21:00:11,492] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  86%|████████▌ | 180750/210244 [39:07<06:09, 79.88batch/s]2025-04-02 21:00:14,957 - INFO - Epoch 2 (Step 391000): Train loss 4.080, Val loss 4.384
Epoch 2:  86%|████████▋ | 181751/210244 [39:21<05:51, 81.06batch/s]2025-04-02 21:00:28,820 - INFO - Epoch 2 (Step 392000): Train loss 4.076, Val loss 4.361
Epoch 2:  87%|████████▋ | 182742/210244 [39:33<05:39, 81.10batch/s]

[2025-04-02 21:00:41,066] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  87%|████████▋ | 182751/210244 [39:34<05:32, 82.69batch/s]2025-04-02 21:00:41,263 - INFO - Epoch 2 (Step 393000): Train loss 4.108, Val loss 4.379
Epoch 2:  87%|████████▋ | 183754/210244 [39:47<07:53, 55.93batch/s]2025-04-02 21:00:55,104 - INFO - Epoch 2 (Step 394000): Train loss 4.013, Val loss 4.348
Epoch 2:  88%|████████▊ | 184752/210244 [40:00<05:19, 79.71batch/s]2025-04-02 21:01:07,631 - INFO - Epoch 2 (Step 395000): Train loss 4.044, Val loss 4.339
Epoch 2:  88%|████████▊ | 185753/210244 [40:12<05:05, 80.13batch/s]2025-04-02 21:01:20,072 - INFO - Epoch 2 (Step 396000): Train loss 4.024, Val loss 4.371
Epoch 2:  88%|████████▊ | 186040/210244 [40:16<04:55, 81.80batch/s]

[2025-04-02 21:01:23,569] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  89%|████████▊ | 186121/210244 [40:17<04:57, 81.13batch/s]

[2025-04-02 21:01:24,653] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 2:  89%|████████▉ | 186755/210244 [40:26<04:54, 79.73batch/s]2025-04-02 21:01:33,833 - INFO - Epoch 2 (Step 397000): Train loss 4.094, Val loss 4.359
Epoch 2:  89%|████████▉ | 187756/210244 [40:39<04:38, 80.74batch/s]2025-04-02 21:01:46,526 - INFO - Epoch 2 (Step 398000): Train loss 4.104, Val loss 4.363
Epoch 2:  90%|████████▉ | 188754/210244 [40:51<04:28, 80.15batch/s]2025-04-02 21:01:59,090 - INFO - Epoch 2 (Step 399000): Train loss 4.056, Val loss 4.383
Epoch 2:  90%|█████████ | 189755/210244 [41:06<04:30, 75.84batch/s]2025-04-02 21:02:13,147 - INFO - Epoch 2 (Step 400000): Train loss 4.140, Val loss 4.373
Epoch 2:  91%|█████████ | 190750/210244 [41:18<04:01, 80.66batch/s]2025-04-02 21:02:25,598 - INFO - Epoch 2 (Step 401000): Train loss 4.209, Val loss 4.348
Epoch 2:  91%|█████████ | 191756/210244 [41:32<03:46, 81.54batch/s]2025-04-02 21:02:39,499 - INFO - Epoch 2 (Step 402000): Train loss 4.221, Val loss 4.365
Epoch 2:  92%|█████████▏| 192572/210244 [41:42<03:38, 80.72batch

[2025-04-02 21:02:49,687] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  92%|█████████▏| 192753/210244 [41:44<03:33, 81.80batch/s]2025-04-02 21:02:51,957 - INFO - Epoch 2 (Step 403000): Train loss 4.164, Val loss 4.366
Epoch 2:  92%|█████████▏| 193749/210244 [41:57<03:19, 82.56batch/s]2025-04-02 21:03:04,298 - INFO - Epoch 2 (Step 404000): Train loss 4.114, Val loss 4.339
Epoch 2:  93%|█████████▎| 194754/210244 [42:10<03:10, 81.32batch/s]2025-04-02 21:03:18,082 - INFO - Epoch 2 (Step 405000): Train loss 4.097, Val loss 4.358
Epoch 2:  93%|█████████▎| 194815/210244 [42:11<03:11, 80.61batch/s]

[2025-04-02 21:03:18,881] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2:  93%|█████████▎| 195489/210244 [42:20<02:56, 83.53batch/s]

[2025-04-02 21:03:27,109] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  93%|█████████▎| 195753/210244 [42:23<03:01, 80.03batch/s]2025-04-02 21:03:30,479 - INFO - Epoch 2 (Step 406000): Train loss 4.289, Val loss 4.345
Epoch 2:  94%|█████████▎| 196753/210244 [42:37<02:47, 80.48batch/s]2025-04-02 21:03:44,219 - INFO - Epoch 2 (Step 407000): Train loss 4.192, Val loss 4.389
Epoch 2:  94%|█████████▍| 197570/210244 [42:47<02:37, 80.41batch/s]

[2025-04-02 21:03:54,449] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  94%|█████████▍| 197750/210244 [42:49<02:39, 78.54batch/s]2025-04-02 21:03:56,718 - INFO - Epoch 2 (Step 408000): Train loss 4.106, Val loss 4.356
Epoch 2:  95%|█████████▍| 198756/210244 [43:01<02:22, 80.67batch/s]2025-04-02 21:04:09,047 - INFO - Epoch 2 (Step 409000): Train loss 4.190, Val loss 4.366
Epoch 2:  95%|█████████▍| 199621/210244 [43:14<02:10, 81.46batch/s]

[2025-04-02 21:04:21,106] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  95%|█████████▌| 199756/210244 [43:15<02:09, 80.80batch/s]2025-04-02 21:04:22,802 - INFO - Epoch 2 (Step 410000): Train loss 4.167, Val loss 4.346
Epoch 2:  95%|█████████▌| 200752/210244 [43:28<01:58, 80.28batch/s]2025-04-02 21:04:35,337 - INFO - Epoch 2 (Step 411000): Train loss 3.916, Val loss 4.356
Epoch 2:  96%|█████████▌| 201754/210244 [43:40<01:43, 82.07batch/s]2025-04-02 21:04:47,753 - INFO - Epoch 2 (Step 412000): Train loss 3.891, Val loss 4.369
Epoch 2:  96%|█████████▋| 202753/210244 [43:54<01:32, 80.90batch/s]2025-04-02 21:05:01,518 - INFO - Epoch 2 (Step 413000): Train loss 4.165, Val loss 4.357
Epoch 2:  96%|█████████▋| 202798/210244 [43:54<01:35, 77.57batch/s]

[2025-04-02 21:05:02,047] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  97%|█████████▋| 203756/210244 [44:06<01:18, 82.96batch/s]2025-04-02 21:05:13,892 - INFO - Epoch 2 (Step 414000): Train loss 4.184, Val loss 4.362
Epoch 2:  97%|█████████▋| 204753/210244 [44:20<01:07, 81.23batch/s]2025-04-02 21:05:27,771 - INFO - Epoch 2 (Step 415000): Train loss 4.067, Val loss 4.337
Epoch 2:  98%|█████████▊| 205542/210244 [44:30<00:59, 79.44batch/s]

[2025-04-02 21:05:37,731] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 2:  98%|█████████▊| 205756/210244 [44:33<00:55, 81.56batch/s]2025-04-02 21:05:40,372 - INFO - Epoch 2 (Step 416000): Train loss 4.009, Val loss 4.368
Epoch 2:  98%|█████████▊| 206751/210244 [44:45<00:43, 80.42batch/s]2025-04-02 21:05:52,930 - INFO - Epoch 2 (Step 417000): Train loss 4.249, Val loss 4.362
Epoch 2:  99%|█████████▉| 207748/210244 [44:59<00:30, 82.48batch/s]2025-04-02 21:06:06,799 - INFO - Epoch 2 (Step 418000): Train loss 3.993, Val loss 4.386
Epoch 2:  99%|█████████▉| 208755/210244 [45:12<00:18, 80.48batch/s]2025-04-02 21:06:19,343 - INFO - Epoch 2 (Step 419000): Train loss 4.145, Val loss 4.348
Epoch 2: 100%|█████████▉| 209621/210244 [45:24<00:09, 62.53batch/s]

[2025-04-02 21:06:31,454] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 2: 100%|█████████▉| 209754/210244 [45:25<00:06, 80.58batch/s]2025-04-02 21:06:33,104 - INFO - Epoch 2 (Step 420000): Train loss 3.827, Val loss 4.362
Epoch 2: 100%|██████████| 210244/210244 [45:32<00:00, 76.95batch/s]
2025-04-02 21:06:39,136 - INFO - Epoch 2 completed. Generating a sample...
2025-04-02 21:06:39,207 - INFO - Generated Text: Every effort moves you out of the game . " 
   = = = = = = = = = = = = 
   The game was released on October 12 , 2008 , and was released on the Xbox 360 console . It was released on October
2025-04-02 21:06:39,208 - INFO - Starting Epoch 3...


Every effort moves you out of the game . "     = = = = = = = = = = = =     The game was released on October 12 , 2008 , and was released on the Xbox 360 console . It was released on October


Epoch 3:   0%|          | 160/210244 [00:01<43:10, 81.10batch/s]

[2025-04-02 21:06:41,254] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:   0%|          | 509/210244 [00:06<42:31, 82.21batch/s]2025-04-02 21:06:45,566 - INFO - Epoch 3 (Step 421000): Train loss 4.064, Val loss 4.381
Epoch 3:   1%|          | 1508/210244 [00:18<43:29, 79.99batch/s]2025-04-02 21:06:57,892 - INFO - Epoch 3 (Step 422000): Train loss 3.996, Val loss 4.374
Epoch 3:   1%|          | 2510/210244 [00:32<42:50, 80.82batch/s]  2025-04-02 21:07:11,599 - INFO - Epoch 3 (Step 423000): Train loss 4.077, Val loss 4.379
Epoch 3:   2%|▏         | 3507/210244 [00:44<44:11, 77.96batch/s]2025-04-02 21:07:24,024 - INFO - Epoch 3 (Step 424000): Train loss 4.048, Val loss 4.379
Epoch 3:   2%|▏         | 4189/210244 [00:53<41:53, 81.99batch/s]

[2025-04-02 21:07:32,412] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:   2%|▏         | 4504/210244 [00:56<42:33, 80.56batch/s]2025-04-02 21:07:36,363 - INFO - Epoch 3 (Step 425000): Train loss 4.085, Val loss 4.355
Epoch 3:   2%|▏         | 4852/210244 [01:02<43:57, 77.88batch/s]  

[2025-04-02 21:07:42,068] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:   3%|▎         | 5509/210244 [01:11<42:30, 80.27batch/s]2025-04-02 21:07:50,339 - INFO - Epoch 3 (Step 426000): Train loss 4.134, Val loss 4.375
Epoch 3:   3%|▎         | 6508/210244 [01:23<41:33, 81.70batch/s]2025-04-02 21:08:02,778 - INFO - Epoch 3 (Step 427000): Train loss 4.106, Val loss 4.349
Epoch 3:   3%|▎         | 7181/210244 [01:33<1:06:41, 50.74batch/s]

[2025-04-02 21:08:12,501] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:   4%|▎         | 7512/210244 [01:37<42:02, 80.38batch/s]  2025-04-02 21:08:16,595 - INFO - Epoch 3 (Step 428000): Train loss 3.980, Val loss 4.356
Epoch 3:   4%|▍         | 8510/210244 [01:49<40:46, 82.44batch/s]2025-04-02 21:08:28,926 - INFO - Epoch 3 (Step 429000): Train loss 4.018, Val loss 4.377
Epoch 3:   5%|▍         | 9506/210244 [02:02<41:20, 80.94batch/s]2025-04-02 21:08:41,395 - INFO - Epoch 3 (Step 430000): Train loss 4.022, Val loss 4.367
Epoch 3:   5%|▍         | 10504/210244 [02:15<41:05, 81.02batch/s] 2025-04-02 21:08:55,235 - INFO - Epoch 3 (Step 431000): Train loss 4.024, Val loss 4.380
Epoch 3:   5%|▌         | 11358/210244 [02:26<40:44, 81.35batch/s]

[2025-04-02 21:09:05,892] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:   5%|▌         | 11512/210244 [02:28<41:27, 79.89batch/s]2025-04-02 21:09:07,740 - INFO - Epoch 3 (Step 432000): Train loss 4.031, Val loss 4.354
Epoch 3:   6%|▌         | 12511/210244 [02:42<40:32, 81.30batch/s]  2025-04-02 21:09:21,597 - INFO - Epoch 3 (Step 433000): Train loss 4.104, Val loss 4.387
Epoch 3:   6%|▋         | 13187/210244 [02:50<41:57, 78.28batch/s]

[2025-04-02 21:09:29,982] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:   6%|▋         | 13506/210244 [02:54<41:59, 78.08batch/s]2025-04-02 21:09:34,074 - INFO - Epoch 3 (Step 434000): Train loss 4.060, Val loss 4.369
Epoch 3:   7%|▋         | 14507/210244 [03:07<40:12, 81.13batch/s]2025-04-02 21:09:46,775 - INFO - Epoch 3 (Step 435000): Train loss 3.869, Val loss 4.359
Epoch 3:   7%|▋         | 15506/210244 [03:21<40:51, 79.42batch/s]  2025-04-02 21:10:00,661 - INFO - Epoch 3 (Step 436000): Train loss 4.194, Val loss 4.317
Epoch 3:   8%|▊         | 16504/210244 [03:33<40:03, 80.60batch/s]2025-04-02 21:10:13,123 - INFO - Epoch 3 (Step 437000): Train loss 4.182, Val loss 4.366
Epoch 3:   8%|▊         | 16608/210244 [03:35<40:49, 79.04batch/s]

[2025-04-02 21:10:14,332] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:   8%|▊         | 17505/210244 [03:46<39:12, 81.93batch/s]2025-04-02 21:10:26,827 - INFO - Epoch 3 (Step 438000): Train loss 4.223, Val loss 4.380
Epoch 3:   9%|▉         | 18511/210244 [03:59<39:03, 81.81batch/s]  2025-04-02 21:10:39,204 - INFO - Epoch 3 (Step 439000): Train loss 3.961, Val loss 4.335
Epoch 3:   9%|▉         | 19505/210244 [04:12<38:31, 82.53batch/s]2025-04-02 21:10:51,522 - INFO - Epoch 3 (Step 440000): Train loss 4.073, Val loss 4.370
Epoch 3:  10%|▉         | 19993/210244 [04:18<39:10, 80.95batch/s]

[2025-04-02 21:10:57,582] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  10%|▉         | 20181/210244 [04:21<1:06:32, 47.61batch/s]

[2025-04-02 21:11:01,296] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  10%|▉         | 20509/210244 [04:26<38:57, 81.18batch/s]  2025-04-02 21:11:05,413 - INFO - Epoch 3 (Step 441000): Train loss 4.030, Val loss 4.366
Epoch 3:  10%|█         | 21512/210244 [04:38<39:46, 79.09batch/s]2025-04-02 21:11:17,824 - INFO - Epoch 3 (Step 442000): Train loss 4.037, Val loss 4.386
Epoch 3:  11%|█         | 22507/210244 [04:51<38:32, 81.17batch/s]2025-04-02 21:11:30,360 - INFO - Epoch 3 (Step 443000): Train loss 4.094, Val loss 4.361
Epoch 3:  11%|█         | 23505/210244 [05:04<38:11, 81.51batch/s]  2025-04-02 21:11:44,200 - INFO - Epoch 3 (Step 444000): Train loss 4.181, Val loss 4.359
Epoch 3:  12%|█▏        | 24510/210244 [05:17<39:23, 78.59batch/s]2025-04-02 21:11:56,745 - INFO - Epoch 3 (Step 445000): Train loss 4.080, Val loss 4.368
Epoch 3:  12%|█▏        | 25509/210244 [05:31<37:34, 81.95batch/s]  2025-04-02 21:12:10,549 - INFO - Epoch 3 (Step 446000): Train loss 4.037, Val loss 4.366
Epoch 3:  13%|█▎        | 26334/210244 [05:41<39:04, 78.46batch/

[2025-04-02 21:12:20,788] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  13%|█▎        | 26505/210244 [05:43<37:27, 81.76batch/s]2025-04-02 21:12:22,980 - INFO - Epoch 3 (Step 447000): Train loss 4.009, Val loss 4.367
Epoch 3:  13%|█▎        | 27302/210244 [05:53<37:49, 80.62batch/s]

[2025-04-02 21:12:32,733] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  13%|█▎        | 27506/210244 [05:56<38:21, 79.40batch/s]2025-04-02 21:12:35,377 - INFO - Epoch 3 (Step 448000): Train loss 3.903, Val loss 4.342
Epoch 3:  14%|█▎        | 28509/210244 [06:09<37:01, 81.79batch/s]  2025-04-02 21:12:49,177 - INFO - Epoch 3 (Step 449000): Train loss 4.218, Val loss 4.356
Epoch 3:  14%|█▍        | 29508/210244 [06:22<37:37, 80.06batch/s]2025-04-02 21:13:01,646 - INFO - Epoch 3 (Step 450000): Train loss 4.011, Val loss 4.359
Epoch 3:  15%|█▍        | 30509/210244 [06:34<36:40, 81.67batch/s]2025-04-02 21:13:14,099 - INFO - Epoch 3 (Step 451000): Train loss 4.033, Val loss 4.350
Epoch 3:  15%|█▍        | 31218/210244 [06:45<36:59, 80.67batch/s]  

[2025-04-02 21:13:24,260] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  15%|█▍        | 31506/210244 [06:48<36:39, 81.26batch/s]2025-04-02 21:13:27,886 - INFO - Epoch 3 (Step 452000): Train loss 4.098, Val loss 4.389
Epoch 3:  15%|█▌        | 32004/210244 [06:54<36:59, 80.30batch/s]

[2025-04-02 21:13:34,086] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  15%|█▌        | 32512/210244 [07:01<36:39, 80.80batch/s]2025-04-02 21:13:40,372 - INFO - Epoch 3 (Step 453000): Train loss 3.913, Val loss 4.377
Epoch 3:  16%|█▌        | 33508/210244 [07:14<36:45, 80.15batch/s]  2025-04-02 21:13:54,260 - INFO - Epoch 3 (Step 454000): Train loss 4.092, Val loss 4.352
Epoch 3:  16%|█▋        | 34511/210244 [07:27<35:36, 82.25batch/s]2025-04-02 21:14:06,702 - INFO - Epoch 3 (Step 455000): Train loss 3.910, Val loss 4.374
Epoch 3:  17%|█▋        | 35509/210244 [07:39<35:23, 82.27batch/s]2025-04-02 21:14:19,187 - INFO - Epoch 3 (Step 456000): Train loss 3.996, Val loss 4.382
Epoch 3:  17%|█▋        | 36125/210244 [07:48<35:26, 81.89batch/s]  

[2025-04-02 21:14:28,146] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  17%|█▋        | 36512/210244 [07:53<35:50, 80.80batch/s]2025-04-02 21:14:32,993 - INFO - Epoch 3 (Step 457000): Train loss 3.861, Val loss 4.361
Epoch 3:  18%|█▊        | 37445/210244 [08:05<35:55, 80.18batch/s]

[2025-04-02 21:14:44,641] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  18%|█▊        | 37508/210244 [08:06<36:07, 79.70batch/s]2025-04-02 21:14:45,537 - INFO - Epoch 3 (Step 458000): Train loss 4.116, Val loss 4.364
Epoch 3:  18%|█▊        | 38511/210244 [08:20<36:31, 78.37batch/s]  2025-04-02 21:14:59,358 - INFO - Epoch 3 (Step 459000): Train loss 4.014, Val loss 4.366
Epoch 3:  19%|█▉        | 39504/210244 [08:32<35:07, 81.02batch/s]2025-04-02 21:15:11,792 - INFO - Epoch 3 (Step 460000): Train loss 3.903, Val loss 4.347
Epoch 3:  19%|█▉        | 40507/210244 [08:44<35:42, 79.23batch/s]2025-04-02 21:15:24,252 - INFO - Epoch 3 (Step 461000): Train loss 4.188, Val loss 4.353
Epoch 3:  20%|█▉        | 41511/210244 [08:58<34:54, 80.56batch/s]  2025-04-02 21:15:38,012 - INFO - Epoch 3 (Step 462000): Train loss 3.932, Val loss 4.355
Epoch 3:  20%|█▉        | 41726/210244 [09:01<34:30, 81.39batch/s]

[2025-04-02 21:15:40,719] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  20%|██        | 42505/210244 [09:11<34:26, 81.17batch/s]2025-04-02 21:15:50,470 - INFO - Epoch 3 (Step 463000): Train loss 4.167, Val loss 4.364
Epoch 3:  21%|██        | 43510/210244 [09:25<42:45, 64.99batch/s]  2025-04-02 21:16:04,512 - INFO - Epoch 3 (Step 464000): Train loss 4.062, Val loss 4.357
Epoch 3:  21%|██        | 44511/210244 [09:37<33:36, 82.20batch/s]2025-04-02 21:16:16,931 - INFO - Epoch 3 (Step 465000): Train loss 3.967, Val loss 4.369
Epoch 3:  22%|██▏       | 45505/210244 [09:50<34:36, 79.35batch/s]2025-04-02 21:16:29,470 - INFO - Epoch 3 (Step 466000): Train loss 4.131, Val loss 4.380
Epoch 3:  22%|██▏       | 45964/210244 [09:55<33:47, 81.01batch/s]

[2025-04-02 21:16:35,085] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  22%|██▏       | 46504/210244 [10:03<34:06, 79.99batch/s]  2025-04-02 21:16:43,273 - INFO - Epoch 3 (Step 467000): Train loss 3.991, Val loss 4.375
Epoch 3:  23%|██▎       | 47512/210244 [10:16<33:23, 81.24batch/s]2025-04-02 21:16:55,848 - INFO - Epoch 3 (Step 468000): Train loss 4.089, Val loss 4.371
Epoch 3:  23%|██▎       | 47627/210244 [10:18<34:04, 79.52batch/s]

[2025-04-02 21:16:57,352] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  23%|██▎       | 48509/210244 [10:28<33:24, 80.69batch/s]2025-04-02 21:17:08,341 - INFO - Epoch 3 (Step 469000): Train loss 4.112, Val loss 4.374
Epoch 3:  24%|██▎       | 49509/210244 [10:42<32:53, 81.43batch/s]  2025-04-02 21:17:22,174 - INFO - Epoch 3 (Step 470000): Train loss 3.854, Val loss 4.359
Epoch 3:  24%|██▍       | 50504/210244 [10:55<32:56, 80.82batch/s]2025-04-02 21:17:34,611 - INFO - Epoch 3 (Step 471000): Train loss 3.992, Val loss 4.361
Epoch 3:  25%|██▍       | 51510/210244 [11:09<33:04, 79.98batch/s]  2025-04-02 21:17:48,438 - INFO - Epoch 3 (Step 472000): Train loss 4.073, Val loss 4.388
Epoch 3:  25%|██▍       | 51687/210244 [11:11<33:05, 79.88batch/s]

[2025-04-02 21:17:50,648] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  25%|██▍       | 52025/210244 [11:15<32:42, 80.61batch/s]

[2025-04-02 21:17:54,925] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  25%|██▍       | 52512/210244 [11:21<33:27, 78.58batch/s]2025-04-02 21:18:01,065 - INFO - Epoch 3 (Step 473000): Train loss 4.086, Val loss 4.370
Epoch 3:  25%|██▌       | 53504/210244 [11:34<32:49, 79.57batch/s]2025-04-02 21:18:13,585 - INFO - Epoch 3 (Step 474000): Train loss 4.079, Val loss 4.367
Epoch 3:  26%|██▌       | 54506/210244 [11:47<32:14, 80.53batch/s]  2025-04-02 21:18:27,352 - INFO - Epoch 3 (Step 475000): Train loss 4.183, Val loss 4.372
Epoch 3:  26%|██▌       | 54833/210244 [11:52<32:16, 80.27batch/s]

[2025-04-02 21:18:31,486] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  26%|██▋       | 55506/210244 [12:00<32:12, 80.05batch/s]2025-04-02 21:18:39,958 - INFO - Epoch 3 (Step 476000): Train loss 4.109, Val loss 4.375
Epoch 3:  27%|██▋       | 56511/210244 [12:14<32:00, 80.05batch/s]  2025-04-02 21:18:53,872 - INFO - Epoch 3 (Step 477000): Train loss 3.956, Val loss 4.328
Epoch 3:  27%|██▋       | 57509/210244 [12:27<31:18, 81.30batch/s]2025-04-02 21:19:06,333 - INFO - Epoch 3 (Step 478000): Train loss 3.924, Val loss 4.338
Epoch 3:  28%|██▊       | 58508/210244 [12:39<30:43, 82.32batch/s]2025-04-02 21:19:18,742 - INFO - Epoch 3 (Step 479000): Train loss 4.057, Val loss 4.337
Epoch 3:  28%|██▊       | 59042/210244 [12:47<35:45, 70.49batch/s]  

[2025-04-02 21:19:26,757] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  28%|██▊       | 59506/210244 [12:53<32:02, 78.39batch/s]2025-04-02 21:19:32,557 - INFO - Epoch 3 (Step 480000): Train loss 4.079, Val loss 4.351
Epoch 3:  29%|██▊       | 60047/210244 [12:59<30:39, 81.65batch/s]

[2025-04-02 21:19:39,294] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  29%|██▉       | 60504/210244 [13:05<31:13, 79.95batch/s]2025-04-02 21:19:45,048 - INFO - Epoch 3 (Step 481000): Train loss 4.100, Val loss 4.308
Epoch 3:  29%|██▉       | 61512/210244 [13:18<30:44, 80.64batch/s]2025-04-02 21:19:58,930 - INFO - Epoch 3 (Step 482000): Train loss 3.939, Val loss 4.321
Epoch 3:  30%|██▉       | 62504/210244 [13:32<30:35, 80.47batch/s]  2025-04-02 21:20:11,667 - INFO - Epoch 3 (Step 483000): Train loss 3.890, Val loss 4.339
Epoch 3:  30%|██▉       | 63020/210244 [13:38<30:00, 81.77batch/s]

[2025-04-02 21:20:17,985] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  30%|███       | 63506/210244 [13:44<30:24, 80.45batch/s]2025-04-02 21:20:24,212 - INFO - Epoch 3 (Step 484000): Train loss 4.010, Val loss 4.349
Epoch 3:  31%|███       | 64508/210244 [13:58<29:23, 82.62batch/s]  2025-04-02 21:20:38,095 - INFO - Epoch 3 (Step 485000): Train loss 4.034, Val loss 4.365
Epoch 3:  31%|███       | 65080/210244 [14:05<30:47, 78.59batch/s]

[2025-04-02 21:20:45,145] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  31%|███       | 65512/210244 [14:11<29:35, 81.52batch/s]2025-04-02 21:20:50,488 - INFO - Epoch 3 (Step 486000): Train loss 3.869, Val loss 4.339
Epoch 3:  32%|███▏      | 66512/210244 [14:23<30:18, 79.06batch/s]2025-04-02 21:21:02,951 - INFO - Epoch 3 (Step 487000): Train loss 4.118, Val loss 4.365
Epoch 3:  32%|███▏      | 67504/210244 [14:37<29:28, 80.72batch/s]  2025-04-02 21:21:16,766 - INFO - Epoch 3 (Step 488000): Train loss 3.958, Val loss 4.359
Epoch 3:  33%|███▎      | 68509/210244 [14:49<29:13, 80.85batch/s]2025-04-02 21:21:29,225 - INFO - Epoch 3 (Step 489000): Train loss 3.993, Val loss 4.374
Epoch 3:  33%|███▎      | 69462/210244 [15:03<28:55, 81.10batch/s]  

[2025-04-02 21:21:42,378] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  33%|███▎      | 69506/210244 [15:03<29:28, 79.59batch/s]2025-04-02 21:21:42,980 - INFO - Epoch 3 (Step 490000): Train loss 4.083, Val loss 4.383
Epoch 3:  34%|███▎      | 70504/210244 [15:16<29:39, 78.55batch/s]2025-04-02 21:21:55,381 - INFO - Epoch 3 (Step 491000): Train loss 3.965, Val loss 4.371
Epoch 3:  34%|███▍      | 71472/210244 [15:28<28:36, 80.84batch/s]

[2025-04-02 21:22:07,451] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  34%|███▍      | 71505/210244 [15:28<30:15, 76.43batch/s]2025-04-02 21:22:07,974 - INFO - Epoch 3 (Step 492000): Train loss 4.186, Val loss 4.373
Epoch 3:  34%|███▍      | 72509/210244 [15:42<28:11, 81.44batch/s]  2025-04-02 21:22:21,847 - INFO - Epoch 3 (Step 493000): Train loss 3.907, Val loss 4.381
Epoch 3:  35%|███▍      | 73155/210244 [15:50<28:00, 81.57batch/s]

[2025-04-02 21:22:29,890] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  35%|███▍      | 73506/210244 [15:54<28:19, 80.45batch/s]2025-04-02 21:22:34,331 - INFO - Epoch 3 (Step 494000): Train loss 4.203, Val loss 4.371
Epoch 3:  35%|███▌      | 74509/210244 [16:08<29:24, 76.95batch/s]  2025-04-02 21:22:48,162 - INFO - Epoch 3 (Step 495000): Train loss 4.156, Val loss 4.377
Epoch 3:  36%|███▌      | 75512/210244 [16:21<27:37, 81.31batch/s]2025-04-02 21:23:00,666 - INFO - Epoch 3 (Step 496000): Train loss 4.122, Val loss 4.398
Epoch 3:  36%|███▋      | 76505/210244 [16:33<29:03, 76.72batch/s]2025-04-02 21:23:13,192 - INFO - Epoch 3 (Step 497000): Train loss 4.154, Val loss 4.391
Epoch 3:  37%|███▋      | 77400/210244 [16:46<27:35, 80.23batch/s]  

[2025-04-02 21:23:25,552] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  37%|███▋      | 77508/210244 [16:47<27:17, 81.06batch/s]2025-04-02 21:23:26,932 - INFO - Epoch 3 (Step 498000): Train loss 4.046, Val loss 4.383
Epoch 3:  37%|███▋      | 78457/210244 [16:59<26:56, 81.51batch/s]

[2025-04-02 21:23:38,680] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  37%|███▋      | 78510/210244 [17:00<28:18, 77.57batch/s]2025-04-02 21:23:39,444 - INFO - Epoch 3 (Step 499000): Train loss 4.183, Val loss 4.382
Epoch 3:  38%|███▊      | 79512/210244 [17:12<26:56, 80.85batch/s]2025-04-02 21:23:51,812 - INFO - Epoch 3 (Step 500000): Train loss 3.955, Val loss 4.373
Epoch 3:  38%|███▊      | 80511/210244 [17:26<26:10, 82.63batch/s]  2025-04-02 21:24:05,634 - INFO - Epoch 3 (Step 501000): Train loss 4.053, Val loss 4.380
Epoch 3:  39%|███▉      | 81504/210244 [17:38<26:20, 81.45batch/s]2025-04-02 21:24:17,998 - INFO - Epoch 3 (Step 502000): Train loss 4.160, Val loss 4.372
Epoch 3:  39%|███▉      | 82504/210244 [17:52<26:02, 81.74batch/s]  2025-04-02 21:24:31,665 - INFO - Epoch 3 (Step 503000): Train loss 4.086, Val loss 4.371
Epoch 3:  39%|███▉      | 82576/210244 [17:53<26:36, 79.94batch/s]

[2025-04-02 21:24:32,508] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  40%|███▉      | 83508/210244 [18:04<26:18, 80.27batch/s]2025-04-02 21:24:44,014 - INFO - Epoch 3 (Step 504000): Train loss 4.051, Val loss 4.357
Epoch 3:  40%|████      | 84505/210244 [18:17<26:26, 79.25batch/s]2025-04-02 21:24:56,572 - INFO - Epoch 3 (Step 505000): Train loss 3.908, Val loss 4.364
Epoch 3:  40%|████      | 84638/210244 [18:18<25:42, 81.45batch/s]

[2025-04-02 21:24:58,133] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  40%|████      | 84755/210244 [18:20<24:52, 84.08batch/s]

[2025-04-02 21:24:59,563] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  41%|████      | 85505/210244 [18:30<27:20, 76.06batch/s]  2025-04-02 21:25:10,323 - INFO - Epoch 3 (Step 506000): Train loss 4.029, Val loss 4.356
Epoch 3:  41%|████      | 86508/210244 [18:43<25:23, 81.22batch/s]2025-04-02 21:25:22,812 - INFO - Epoch 3 (Step 507000): Train loss 3.966, Val loss 4.362
Epoch 3:  42%|████▏     | 87508/210244 [18:57<26:21, 77.59batch/s]  2025-04-02 21:25:36,540 - INFO - Epoch 3 (Step 508000): Train loss 3.833, Val loss 4.354
Epoch 3:  42%|████▏     | 88348/210244 [19:07<24:56, 81.45batch/s]

[2025-04-02 21:25:46,894] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  42%|████▏     | 88510/210244 [19:09<25:09, 80.63batch/s]2025-04-02 21:25:48,908 - INFO - Epoch 3 (Step 509000): Train loss 4.123, Val loss 4.330
Epoch 3:  43%|████▎     | 89506/210244 [19:21<24:37, 81.69batch/s]2025-04-02 21:26:01,288 - INFO - Epoch 3 (Step 510000): Train loss 4.074, Val loss 4.359
Epoch 3:  43%|████▎     | 90505/210244 [19:35<24:26, 81.65batch/s]  2025-04-02 21:26:15,031 - INFO - Epoch 3 (Step 511000): Train loss 4.224, Val loss 4.374
Epoch 3:  43%|████▎     | 90550/210244 [19:36<25:37, 77.87batch/s]

[2025-04-02 21:26:15,534] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  43%|████▎     | 91099/210244 [19:43<24:25, 81.33batch/s]

[2025-04-02 21:26:22,316] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  44%|████▎     | 91512/210244 [19:48<25:16, 78.28batch/s]2025-04-02 21:26:27,425 - INFO - Epoch 3 (Step 512000): Train loss 4.072, Val loss 4.342
Epoch 3:  44%|████▍     | 92512/210244 [20:00<24:14, 80.95batch/s]2025-04-02 21:26:39,795 - INFO - Epoch 3 (Step 513000): Train loss 4.024, Val loss 4.354
Epoch 3:  44%|████▍     | 93511/210244 [20:14<23:51, 81.53batch/s]  2025-04-02 21:26:53,697 - INFO - Epoch 3 (Step 514000): Train loss 4.066, Val loss 4.363
Epoch 3:  45%|████▍     | 94510/210244 [20:26<24:05, 80.04batch/s]2025-04-02 21:27:06,066 - INFO - Epoch 3 (Step 515000): Train loss 3.993, Val loss 4.365
Epoch 3:  45%|████▌     | 95506/210244 [20:40<24:04, 79.41batch/s]  2025-04-02 21:27:19,857 - INFO - Epoch 3 (Step 516000): Train loss 3.995, Val loss 4.381
Epoch 3:  46%|████▌     | 96511/210244 [20:52<23:11, 81.74batch/s]2025-04-02 21:27:32,272 - INFO - Epoch 3 (Step 517000): Train loss 4.012, Val loss 4.351
Epoch 3:  46%|████▋     | 97506/210244 [21:05<23:15, 80.78batch/s]

[2025-04-02 21:27:57,672] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  47%|████▋     | 98506/210244 [21:19<23:08, 80.50batch/s]2025-04-02 21:27:58,901 - INFO - Epoch 3 (Step 519000): Train loss 4.196, Val loss 4.356
Epoch 3:  47%|████▋     | 99055/210244 [21:26<22:47, 81.33batch/s]

[2025-04-02 21:28:05,770] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  47%|████▋     | 99510/210244 [21:32<23:27, 78.65batch/s]2025-04-02 21:28:11,492 - INFO - Epoch 3 (Step 520000): Train loss 4.016, Val loss 4.344
Epoch 3:  48%|████▊     | 100509/210244 [21:45<22:36, 80.89batch/s]  2025-04-02 21:28:25,274 - INFO - Epoch 3 (Step 521000): Train loss 3.673, Val loss 4.368
Epoch 3:  48%|████▊     | 101508/210244 [21:58<22:27, 80.71batch/s]2025-04-02 21:28:37,755 - INFO - Epoch 3 (Step 522000): Train loss 4.105, Val loss 4.326
Epoch 3:  49%|████▉     | 102504/210244 [22:10<22:16, 80.59batch/s]2025-04-02 21:28:50,096 - INFO - Epoch 3 (Step 523000): Train loss 3.953, Val loss 4.355
Epoch 3:  49%|████▉     | 103200/210244 [22:20<22:14, 80.21batch/s]  

[2025-04-02 21:28:59,936] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  49%|████▉     | 103371/210244 [22:22<21:41, 82.13batch/s]

[2025-04-02 21:29:02,117] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  49%|████▉     | 103512/210244 [22:24<22:29, 79.10batch/s]2025-04-02 21:29:03,850 - INFO - Epoch 3 (Step 524000): Train loss 4.125, Val loss 4.356
Epoch 3:  50%|████▉     | 104509/210244 [22:36<21:38, 81.43batch/s]2025-04-02 21:29:16,293 - INFO - Epoch 3 (Step 525000): Train loss 4.064, Val loss 4.357
Epoch 3:  50%|█████     | 105508/210244 [22:49<22:21, 78.08batch/s]2025-04-02 21:29:28,747 - INFO - Epoch 3 (Step 526000): Train loss 4.220, Val loss 4.352
Epoch 3:  51%|█████     | 106506/210244 [23:03<21:19, 81.09batch/s]  2025-04-02 21:29:42,487 - INFO - Epoch 3 (Step 527000): Train loss 4.120, Val loss 4.361
Epoch 3:  51%|█████     | 107509/210244 [23:15<21:01, 81.46batch/s]2025-04-02 21:29:55,027 - INFO - Epoch 3 (Step 528000): Train loss 3.943, Val loss 4.361
Epoch 3:  51%|█████▏    | 107893/210244 [23:20<20:49, 81.89batch/s]

[2025-04-02 21:29:59,853] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  52%|█████▏    | 108512/210244 [23:29<20:53, 81.13batch/s]  2025-04-02 21:30:08,876 - INFO - Epoch 3 (Step 529000): Train loss 4.206, Val loss 4.341
Epoch 3:  52%|█████▏    | 109317/210244 [23:39<20:39, 81.46batch/s]

[2025-04-02 21:30:18,972] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  52%|█████▏    | 109506/210244 [23:42<20:52, 80.41batch/s]2025-04-02 21:30:21,380 - INFO - Epoch 3 (Step 530000): Train loss 3.996, Val loss 4.350
Epoch 3:  53%|█████▎    | 110510/210244 [23:54<20:38, 80.54batch/s]2025-04-02 21:30:33,751 - INFO - Epoch 3 (Step 531000): Train loss 4.011, Val loss 4.318
Epoch 3:  53%|█████▎    | 111508/210244 [24:08<20:17, 81.07batch/s]  2025-04-02 21:30:47,591 - INFO - Epoch 3 (Step 532000): Train loss 4.062, Val loss 4.316
Epoch 3:  54%|█████▎    | 112505/210244 [24:20<20:00, 81.38batch/s]2025-04-02 21:31:00,058 - INFO - Epoch 3 (Step 533000): Train loss 4.144, Val loss 4.333
Epoch 3:  54%|█████▎    | 112549/210244 [24:21<21:08, 77.01batch/s]

[2025-04-02 21:31:00,614] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  54%|█████▍    | 113509/210244 [24:34<19:34, 82.35batch/s]  2025-04-02 21:31:13,870 - INFO - Epoch 3 (Step 534000): Train loss 4.071, Val loss 4.319
Epoch 3:  54%|█████▍    | 114509/210244 [24:47<19:52, 80.30batch/s]2025-04-02 21:31:26,468 - INFO - Epoch 3 (Step 535000): Train loss 4.016, Val loss 4.337
Epoch 3:  55%|█████▍    | 115097/210244 [24:54<19:24, 81.68batch/s]

[2025-04-02 21:31:33,821] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  55%|█████▍    | 115506/210244 [24:59<19:31, 80.89batch/s]2025-04-02 21:31:38,962 - INFO - Epoch 3 (Step 536000): Train loss 3.892, Val loss 4.343
Epoch 3:  55%|█████▌    | 116506/210244 [25:13<19:29, 80.16batch/s]  2025-04-02 21:31:53,049 - INFO - Epoch 3 (Step 537000): Train loss 4.239, Val loss 4.356
Epoch 3:  56%|█████▌    | 117511/210244 [25:26<19:02, 81.14batch/s]2025-04-02 21:32:05,476 - INFO - Epoch 3 (Step 538000): Train loss 3.946, Val loss 4.346
Epoch 3:  56%|█████▋    | 118511/210244 [25:40<26:36, 57.46batch/s]  2025-04-02 21:32:19,401 - INFO - Epoch 3 (Step 539000): Train loss 4.153, Val loss 4.347
Epoch 3:  57%|█████▋    | 119108/210244 [25:47<18:57, 80.10batch/s]

[2025-04-02 21:32:26,866] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  57%|█████▋    | 119396/210244 [25:51<18:38, 81.21batch/s]

[2025-04-02 21:32:30,411] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  57%|█████▋    | 119512/210244 [25:52<18:39, 81.02batch/s]2025-04-02 21:32:31,879 - INFO - Epoch 3 (Step 540000): Train loss 4.069, Val loss 4.353
Epoch 3:  57%|█████▋    | 120510/210244 [26:04<18:14, 82.00batch/s]2025-04-02 21:32:44,291 - INFO - Epoch 3 (Step 541000): Train loss 4.137, Val loss 4.347
Epoch 3:  58%|█████▊    | 121506/210244 [26:18<18:17, 80.89batch/s]  2025-04-02 21:32:57,973 - INFO - Epoch 3 (Step 542000): Train loss 4.240, Val loss 4.358
Epoch 3:  58%|█████▊    | 122504/210244 [26:31<18:12, 80.33batch/s]2025-04-02 21:33:10,478 - INFO - Epoch 3 (Step 543000): Train loss 4.149, Val loss 4.361
Epoch 3:  59%|█████▊    | 123508/210244 [26:43<18:06, 79.84batch/s]2025-04-02 21:33:22,862 - INFO - Epoch 3 (Step 544000): Train loss 3.984, Val loss 4.332
Epoch 3:  59%|█████▉    | 123931/210244 [26:50<17:48, 80.79batch/s]  

[2025-04-02 21:33:29,513] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  59%|█████▉    | 124506/210244 [26:57<17:59, 79.42batch/s]2025-04-02 21:33:36,653 - INFO - Epoch 3 (Step 545000): Train loss 4.062, Val loss 4.368
Epoch 3:  59%|█████▉    | 124611/210244 [26:58<18:14, 78.25batch/s]

[2025-04-02 21:33:37,918] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  60%|█████▉    | 125194/210244 [27:05<17:25, 81.32batch/s]

[2025-04-02 21:33:45,162] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  60%|█████▉    | 125508/210244 [27:09<17:28, 80.81batch/s]2025-04-02 21:33:49,083 - INFO - Epoch 3 (Step 546000): Train loss 4.053, Val loss 4.362
Epoch 3:  60%|██████    | 126504/210244 [27:23<17:13, 81.03batch/s]  2025-04-02 21:34:02,885 - INFO - Epoch 3 (Step 547000): Train loss 4.021, Val loss 4.381
Epoch 3:  61%|██████    | 127511/210244 [27:36<16:53, 81.65batch/s]2025-04-02 21:34:15,370 - INFO - Epoch 3 (Step 548000): Train loss 4.038, Val loss 4.366
Epoch 3:  61%|██████    | 128507/210244 [27:48<16:45, 81.29batch/s]2025-04-02 21:34:27,901 - INFO - Epoch 3 (Step 549000): Train loss 4.129, Val loss 4.344
Epoch 3:  62%|██████▏   | 129509/210244 [28:02<16:16, 82.65batch/s]  2025-04-02 21:34:41,750 - INFO - Epoch 3 (Step 550000): Train loss 4.071, Val loss 4.345
Epoch 3:  62%|██████▏   | 129552/210244 [28:03<17:53, 75.18batch/s]

[2025-04-02 21:34:42,321] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  62%|██████▏   | 130505/210244 [28:14<16:09, 82.28batch/s]2025-04-02 21:34:54,162 - INFO - Epoch 3 (Step 551000): Train loss 4.102, Val loss 4.350
Epoch 3:  63%|██████▎   | 131512/210244 [28:28<17:07, 76.63batch/s]  2025-04-02 21:35:08,075 - INFO - Epoch 3 (Step 552000): Train loss 4.108, Val loss 4.327
Epoch 3:  63%|██████▎   | 132507/210244 [28:41<15:53, 81.52batch/s]2025-04-02 21:35:20,521 - INFO - Epoch 3 (Step 553000): Train loss 4.082, Val loss 4.351
Epoch 3:  63%|██████▎   | 132748/210244 [28:44<15:58, 80.86batch/s]

[2025-04-02 21:35:23,558] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  64%|██████▎   | 133505/210244 [28:53<15:26, 82.83batch/s]2025-04-02 21:35:32,975 - INFO - Epoch 3 (Step 554000): Train loss 3.994, Val loss 4.328
Epoch 3:  64%|██████▍   | 134511/210244 [29:07<15:30, 81.41batch/s]  2025-04-02 21:35:46,851 - INFO - Epoch 3 (Step 555000): Train loss 4.223, Val loss 4.331
Epoch 3:  64%|██████▍   | 135511/210244 [29:20<15:07, 82.35batch/s]2025-04-02 21:35:59,352 - INFO - Epoch 3 (Step 556000): Train loss 4.074, Val loss 4.356
Epoch 3:  65%|██████▍   | 135837/210244 [29:24<15:25, 80.36batch/s]

[2025-04-02 21:36:03,480] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  65%|██████▍   | 136508/210244 [29:32<15:25, 79.67batch/s]2025-04-02 21:36:11,927 - INFO - Epoch 3 (Step 557000): Train loss 4.175, Val loss 4.340
Epoch 3:  65%|██████▌   | 137510/210244 [29:46<15:21, 78.95batch/s]  2025-04-02 21:36:25,804 - INFO - Epoch 3 (Step 558000): Train loss 4.014, Val loss 4.351
Epoch 3:  66%|██████▌   | 138511/210244 [29:59<14:32, 82.25batch/s]2025-04-02 21:36:38,314 - INFO - Epoch 3 (Step 559000): Train loss 4.156, Val loss 4.352
Epoch 3:  66%|██████▋   | 139508/210244 [30:12<14:55, 79.03batch/s]  2025-04-02 21:36:52,191 - INFO - Epoch 3 (Step 560000): Train loss 4.064, Val loss 4.344
Epoch 3:  67%|██████▋   | 140512/210244 [30:25<14:26, 80.51batch/s]2025-04-02 21:37:04,602 - INFO - Epoch 3 (Step 561000): Train loss 4.132, Val loss 4.365
Epoch 3:  67%|██████▋   | 140735/210244 [30:28<15:04, 76.84batch/s]

[2025-04-02 21:37:07,414] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  67%|██████▋   | 141510/210244 [30:37<14:31, 78.91batch/s]2025-04-02 21:37:17,108 - INFO - Epoch 3 (Step 562000): Train loss 3.924, Val loss 4.321
Epoch 3:  67%|██████▋   | 141714/210244 [30:40<14:30, 78.76batch/s]

[2025-04-02 21:37:21,083] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  68%|██████▊   | 142511/210244 [30:51<13:55, 81.10batch/s]  2025-04-02 21:37:31,042 - INFO - Epoch 3 (Step 563000): Train loss 4.081, Val loss 4.305
Epoch 3:  68%|██████▊   | 143511/210244 [31:04<13:40, 81.33batch/s]2025-04-02 21:37:43,637 - INFO - Epoch 3 (Step 564000): Train loss 4.164, Val loss 4.330
Epoch 3:  69%|██████▊   | 144509/210244 [31:17<13:15, 82.67batch/s]  2025-04-02 21:37:57,301 - INFO - Epoch 3 (Step 565000): Train loss 3.864, Val loss 4.336
Epoch 3:  69%|██████▉   | 145511/210244 [31:30<13:24, 80.41batch/s]2025-04-02 21:38:09,707 - INFO - Epoch 3 (Step 566000): Train loss 3.932, Val loss 4.307
Epoch 3:  70%|██████▉   | 146488/210244 [31:42<13:13, 80.35batch/s]

[2025-04-02 21:38:21,927] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  70%|██████▉   | 146506/210244 [31:42<13:14, 80.25batch/s]2025-04-02 21:38:22,200 - INFO - Epoch 3 (Step 567000): Train loss 3.919, Val loss 4.316
Epoch 3:  70%|███████   | 147506/210244 [31:56<12:38, 82.66batch/s]  2025-04-02 21:38:35,986 - INFO - Epoch 3 (Step 568000): Train loss 4.009, Val loss 4.326
Epoch 3:  71%|███████   | 148266/210244 [32:06<12:53, 80.13batch/s]

[2025-04-02 21:38:45,390] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  71%|███████   | 148506/210244 [32:09<12:44, 80.77batch/s]2025-04-02 21:38:48,476 - INFO - Epoch 3 (Step 569000): Train loss 3.996, Val loss 4.309
Epoch 3:  71%|███████   | 149506/210244 [32:22<34:32, 29.30batch/s]2025-04-02 21:39:02,284 - INFO - Epoch 3 (Step 570000): Train loss 3.894, Val loss 4.318
Epoch 3:  72%|███████▏  | 150512/210244 [32:35<12:23, 80.32batch/s]2025-04-02 21:39:14,734 - INFO - Epoch 3 (Step 571000): Train loss 4.114, Val loss 4.339
Epoch 3:  72%|███████▏  | 151509/210244 [32:47<11:50, 82.64batch/s]2025-04-02 21:39:27,161 - INFO - Epoch 3 (Step 572000): Train loss 4.088, Val loss 4.320
Epoch 3:  72%|███████▏  | 152097/210244 [32:56<34:35, 28.02batch/s]

[2025-04-02 21:39:35,747] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  73%|███████▎  | 152507/210244 [33:01<11:41, 82.29batch/s]2025-04-02 21:39:41,053 - INFO - Epoch 3 (Step 573000): Train loss 4.003, Val loss 4.331
Epoch 3:  73%|███████▎  | 153508/210244 [33:14<11:47, 80.14batch/s]2025-04-02 21:39:53,541 - INFO - Epoch 3 (Step 574000): Train loss 4.170, Val loss 4.309
Epoch 3:  73%|███████▎  | 153811/210244 [33:18<11:44, 80.15batch/s]

[2025-04-02 21:39:57,326] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048


Epoch 3:  73%|███████▎  | 154507/210244 [33:26<11:27, 81.02batch/s]2025-04-02 21:40:06,023 - INFO - Epoch 3 (Step 575000): Train loss 4.000, Val loss 4.324
Epoch 3:  74%|███████▍  | 155507/210244 [33:40<11:07, 82.00batch/s]2025-04-02 21:40:19,783 - INFO - Epoch 3 (Step 576000): Train loss 4.117, Val loss 4.320
Epoch 3:  74%|███████▍  | 156506/210244 [33:52<11:09, 80.27batch/s]2025-04-02 21:40:32,268 - INFO - Epoch 3 (Step 577000): Train loss 4.122, Val loss 4.321
Epoch 3:  75%|███████▍  | 157505/210244 [34:06<10:55, 80.51batch/s]2025-04-02 21:40:46,183 - INFO - Epoch 3 (Step 578000): Train loss 4.112, Val loss 4.315
Epoch 3:  75%|███████▌  | 158509/210244 [34:19<10:28, 82.36batch/s]2025-04-02 21:40:58,600 - INFO - Epoch 3 (Step 579000): Train loss 3.881, Val loss 4.335
Epoch 3:  76%|███████▌  | 159505/210244 [34:31<10:16, 82.28batch/s]2025-04-02 21:41:10,936 - INFO - Epoch 3 (Step 580000): Train loss 4.046, Val loss 4.329
Epoch 3:  76%|███████▋  | 160477/210244 [34:45<10:07, 81.97batch

[2025-04-02 21:41:24,295] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  76%|███████▋  | 160504/210244 [34:45<10:30, 78.93batch/s]2025-04-02 21:41:24,750 - INFO - Epoch 3 (Step 581000): Train loss 3.951, Val loss 4.348
Epoch 3:  76%|███████▋  | 160522/210244 [34:45<11:35, 71.47batch/s]

[2025-04-02 21:41:24,910] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  77%|███████▋  | 161512/210244 [34:58<10:29, 77.38batch/s]2025-04-02 21:41:37,362 - INFO - Epoch 3 (Step 582000): Train loss 3.914, Val loss 4.342
Epoch 3:  77%|███████▋  | 162508/210244 [35:11<21:57, 36.24batch/s]2025-04-02 21:41:51,100 - INFO - Epoch 3 (Step 583000): Train loss 4.118, Val loss 4.341
Epoch 3:  78%|███████▊  | 163512/210244 [35:24<09:40, 80.50batch/s]2025-04-02 21:42:03,459 - INFO - Epoch 3 (Step 584000): Train loss 4.149, Val loss 4.322
Epoch 3:  78%|███████▊  | 164505/210244 [35:36<09:20, 81.65batch/s]2025-04-02 21:42:15,975 - INFO - Epoch 3 (Step 585000): Train loss 4.201, Val loss 4.367
Epoch 3:  79%|███████▊  | 165107/210244 [35:45<17:40, 42.56batch/s]

[2025-04-02 21:42:24,768] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  79%|███████▊  | 165197/210244 [35:46<09:25, 79.70batch/s]

[2025-04-02 21:42:25,821] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  79%|███████▊  | 165512/210244 [35:50<09:10, 81.29batch/s]2025-04-02 21:42:29,833 - INFO - Epoch 3 (Step 586000): Train loss 4.080, Val loss 4.346
Epoch 3:  79%|███████▉  | 166505/210244 [36:02<08:56, 81.46batch/s]2025-04-02 21:42:42,264 - INFO - Epoch 3 (Step 587000): Train loss 4.111, Val loss 4.348
Epoch 3:  80%|███████▉  | 167507/210244 [36:15<08:43, 81.60batch/s]2025-04-02 21:42:54,797 - INFO - Epoch 3 (Step 588000): Train loss 4.096, Val loss 4.337
Epoch 3:  80%|████████  | 168253/210244 [36:26<08:28, 82.62batch/s]

[2025-04-02 21:43:05,377] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  80%|████████  | 168508/210244 [36:29<08:51, 78.56batch/s]2025-04-02 21:43:08,671 - INFO - Epoch 3 (Step 589000): Train loss 4.007, Val loss 4.331
Epoch 3:  81%|████████  | 169504/210244 [36:41<08:18, 81.68batch/s]2025-04-02 21:43:21,220 - INFO - Epoch 3 (Step 590000): Train loss 4.090, Val loss 4.349
Epoch 3:  81%|████████  | 170506/210244 [36:55<08:09, 81.12batch/s]2025-04-02 21:43:34,998 - INFO - Epoch 3 (Step 591000): Train loss 3.898, Val loss 4.346
Epoch 3:  82%|████████▏ | 171510/210244 [37:08<08:05, 79.72batch/s]2025-04-02 21:43:47,574 - INFO - Epoch 3 (Step 592000): Train loss 4.211, Val loss 4.357
Epoch 3:  82%|████████▏ | 172251/210244 [37:17<07:47, 81.24batch/s]

[2025-04-02 21:43:56,854] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  82%|████████▏ | 172508/210244 [37:20<08:06, 77.60batch/s]2025-04-02 21:44:00,168 - INFO - Epoch 3 (Step 593000): Train loss 4.129, Val loss 4.333
Epoch 3:  83%|████████▎ | 173507/210244 [37:34<07:37, 80.28batch/s]2025-04-02 21:44:13,976 - INFO - Epoch 3 (Step 594000): Train loss 3.999, Val loss 4.345
Epoch 3:  83%|████████▎ | 173703/210244 [37:37<07:56, 76.76batch/s]

[2025-04-02 21:44:16,365] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  83%|████████▎ | 174504/210244 [37:46<07:19, 81.39batch/s]2025-04-02 21:44:26,355 - INFO - Epoch 3 (Step 595000): Train loss 4.018, Val loss 4.333
Epoch 3:  83%|████████▎ | 175506/210244 [38:00<07:31, 76.89batch/s]2025-04-02 21:44:40,265 - INFO - Epoch 3 (Step 596000): Train loss 3.970, Val loss 4.340
Epoch 3:  84%|████████▍ | 176505/210244 [38:13<06:52, 81.74batch/s]2025-04-02 21:44:52,694 - INFO - Epoch 3 (Step 597000): Train loss 4.081, Val loss 4.351
Epoch 3:  84%|████████▍ | 177507/210244 [38:25<06:40, 81.64batch/s]2025-04-02 21:45:05,254 - INFO - Epoch 3 (Step 598000): Train loss 4.032, Val loss 4.357
Epoch 3:  85%|████████▍ | 178006/210244 [38:33<18:30, 29.02batch/s]

[2025-04-02 21:45:12,773] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  85%|████████▍ | 178508/210244 [38:39<06:35, 80.18batch/s]2025-04-02 21:45:19,134 - INFO - Epoch 3 (Step 599000): Train loss 3.982, Val loss 4.338
Epoch 3:  85%|████████▌ | 179509/210244 [38:52<06:23, 80.15batch/s]2025-04-02 21:45:31,594 - INFO - Epoch 3 (Step 600000): Train loss 3.909, Val loss 4.353
Epoch 3:  86%|████████▌ | 180044/210244 [38:58<06:21, 79.12batch/s]

[2025-04-02 21:45:38,203] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  86%|████████▌ | 180509/210244 [39:04<06:06, 81.23batch/s]2025-04-02 21:45:44,058 - INFO - Epoch 3 (Step 601000): Train loss 4.037, Val loss 4.372
Epoch 3:  86%|████████▋ | 181504/210244 [39:18<05:57, 80.37batch/s]2025-04-02 21:45:57,826 - INFO - Epoch 3 (Step 602000): Train loss 3.950, Val loss 4.363
Epoch 3:  87%|████████▋ | 182117/210244 [39:26<05:44, 81.62batch/s]

[2025-04-02 21:46:05,442] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  87%|████████▋ | 182510/210244 [39:30<05:39, 81.74batch/s]2025-04-02 21:46:10,274 - INFO - Epoch 3 (Step 603000): Train loss 4.015, Val loss 4.342
Epoch 3:  87%|████████▋ | 183383/210244 [39:43<05:33, 80.66batch/s]

[2025-04-02 21:46:22,512] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  87%|████████▋ | 183512/210244 [39:44<05:33, 80.20batch/s]2025-04-02 21:46:24,189 - INFO - Epoch 3 (Step 604000): Train loss 3.874, Val loss 4.339
Epoch 3:  88%|████████▊ | 184507/210244 [39:57<05:15, 81.48batch/s]2025-04-02 21:46:36,608 - INFO - Epoch 3 (Step 605000): Train loss 3.808, Val loss 4.347
Epoch 3:  88%|████████▊ | 185505/210244 [40:09<05:01, 82.08batch/s]2025-04-02 21:46:49,033 - INFO - Epoch 3 (Step 606000): Train loss 3.988, Val loss 4.372
Epoch 3:  89%|████████▊ | 186507/210244 [40:23<04:50, 81.62batch/s]2025-04-02 21:47:02,836 - INFO - Epoch 3 (Step 607000): Train loss 4.072, Val loss 4.330
Epoch 3:  89%|████████▉ | 187511/210244 [40:35<04:33, 83.15batch/s]2025-04-02 21:47:15,249 - INFO - Epoch 3 (Step 608000): Train loss 4.012, Val loss 4.349
Epoch 3:  89%|████████▉ | 188057/210244 [40:42<04:46, 77.53batch/s]

[2025-04-02 21:47:22,065] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  90%|████████▉ | 188511/210244 [40:49<04:31, 79.90batch/s]2025-04-02 21:47:29,038 - INFO - Epoch 3 (Step 609000): Train loss 3.970, Val loss 4.348
Epoch 3:  90%|█████████ | 189509/210244 [41:02<04:13, 81.83batch/s]2025-04-02 21:47:41,562 - INFO - Epoch 3 (Step 610000): Train loss 4.014, Val loss 4.371
Epoch 3:  90%|█████████ | 189806/210244 [41:05<04:09, 82.08batch/s]

[2025-04-02 21:47:45,215] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  91%|█████████ | 190508/210244 [41:14<04:04, 80.83batch/s]2025-04-02 21:47:53,909 - INFO - Epoch 3 (Step 611000): Train loss 3.904, Val loss 4.362
Epoch 3:  91%|█████████ | 191510/210244 [41:28<03:54, 79.94batch/s]2025-04-02 21:48:07,792 - INFO - Epoch 3 (Step 612000): Train loss 4.102, Val loss 4.345
Epoch 3:  92%|█████████▏| 192507/210244 [41:40<03:38, 81.07batch/s]2025-04-02 21:48:20,324 - INFO - Epoch 3 (Step 613000): Train loss 3.879, Val loss 4.330
Epoch 3:  92%|█████████▏| 193107/210244 [41:48<03:27, 82.76batch/s]

[2025-04-02 21:48:27,717] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  92%|█████████▏| 193510/210244 [41:53<03:24, 81.81batch/s]2025-04-02 21:48:32,753 - INFO - Epoch 3 (Step 614000): Train loss 4.079, Val loss 4.325
Epoch 3:  93%|█████████▎| 194512/210244 [42:07<03:13, 81.25batch/s]2025-04-02 21:48:46,595 - INFO - Epoch 3 (Step 615000): Train loss 4.181, Val loss 4.319
Epoch 3:  93%|█████████▎| 195506/210244 [42:19<03:02, 80.85batch/s]2025-04-02 21:48:59,073 - INFO - Epoch 3 (Step 616000): Train loss 3.993, Val loss 4.331
Epoch 3:  93%|█████████▎| 196504/210244 [42:33<02:50, 80.76batch/s]2025-04-02 21:49:12,946 - INFO - Epoch 3 (Step 617000): Train loss 4.025, Val loss 4.339
Epoch 3:  94%|█████████▍| 197255/210244 [42:42<02:41, 80.19batch/s]

[2025-04-02 21:49:22,188] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  94%|█████████▍| 197505/210244 [42:45<02:35, 82.17batch/s]2025-04-02 21:49:25,342 - INFO - Epoch 3 (Step 618000): Train loss 3.708, Val loss 4.326
Epoch 3:  94%|█████████▍| 198506/210244 [42:58<02:30, 78.01batch/s]2025-04-02 21:49:37,781 - INFO - Epoch 3 (Step 619000): Train loss 3.912, Val loss 4.321
Epoch 3:  95%|█████████▍| 199277/210244 [43:09<02:14, 81.81batch/s]

[2025-04-02 21:49:48,657] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  95%|█████████▍| 199511/210244 [43:12<02:11, 81.39batch/s]2025-04-02 21:49:51,608 - INFO - Epoch 3 (Step 620000): Train loss 4.232, Val loss 4.335
Epoch 3:  95%|█████████▌| 200407/210244 [43:23<01:58, 83.30batch/s]

[2025-04-02 21:50:02,734] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  95%|█████████▌| 200506/210244 [43:24<02:03, 79.07batch/s]2025-04-02 21:50:04,005 - INFO - Epoch 3 (Step 621000): Train loss 4.160, Val loss 4.329
Epoch 3:  96%|█████████▌| 201506/210244 [43:38<01:45, 82.92batch/s]2025-04-02 21:50:17,702 - INFO - Epoch 3 (Step 622000): Train loss 4.148, Val loss 4.322
Epoch 3:  96%|█████████▋| 202509/210244 [43:50<01:34, 81.71batch/s]2025-04-02 21:50:30,072 - INFO - Epoch 3 (Step 623000): Train loss 4.006, Val loss 4.356
Epoch 3:  96%|█████████▋| 202724/210244 [43:53<01:32, 81.45batch/s]

[2025-04-02 21:50:32,714] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3:  97%|█████████▋| 203512/210244 [44:03<01:23, 80.27batch/s]2025-04-02 21:50:42,427 - INFO - Epoch 3 (Step 624000): Train loss 4.058, Val loss 4.348
Epoch 3:  97%|█████████▋| 204507/210244 [44:16<01:09, 82.23batch/s]2025-04-02 21:50:56,092 - INFO - Epoch 3 (Step 625000): Train loss 4.120, Val loss 4.335
Epoch 3:  98%|█████████▊| 205509/210244 [44:29<00:57, 82.18batch/s]2025-04-02 21:51:08,407 - INFO - Epoch 3 (Step 626000): Train loss 3.995, Val loss 4.331
Epoch 3:  98%|█████████▊| 206508/210244 [44:41<00:45, 81.74batch/s]2025-04-02 21:51:20,717 - INFO - Epoch 3 (Step 627000): Train loss 4.317, Val loss 4.317
Epoch 3:  98%|█████████▊| 207072/210244 [44:49<00:40, 79.23batch/s]

[2025-04-02 21:51:28,995] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192


Epoch 3:  99%|█████████▊| 207507/210244 [44:55<00:33, 80.52batch/s]2025-04-02 21:51:34,569 - INFO - Epoch 3 (Step 628000): Train loss 3.948, Val loss 4.361
Epoch 3:  99%|█████████▉| 208505/210244 [45:07<00:21, 81.84batch/s]2025-04-02 21:51:47,018 - INFO - Epoch 3 (Step 629000): Train loss 3.900, Val loss 4.370
Epoch 3:  99%|█████████▉| 208697/210244 [45:10<00:19, 80.22batch/s]

[2025-04-02 21:51:49,367] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096


Epoch 3: 100%|█████████▉| 209511/210244 [45:21<00:09, 81.17batch/s]2025-04-02 21:52:00,805 - INFO - Epoch 3 (Step 630000): Train loss 4.079, Val loss 4.354
Epoch 3: 100%|██████████| 210244/210244 [45:30<00:00, 76.99batch/s]
2025-04-02 21:52:09,905 - INFO - Epoch 3 completed. Generating a sample...
2025-04-02 21:52:09,979 - INFO - Generated Text: Every effort moves you out of the room . " 
   = = = = = = 
   The song was released on the iTunes Store on September 2 , 2008 , and was released on September 2 , 2008 . It was released on September 2 ,


Every effort moves you out of the room . "     = = = = = =     The song was released on the iTunes Store on September 2 , 2008 , and was released on September 2 , 2008 . It was released on September 2 ,


ValueError: too many values to unpack (expected 2)

In [12]:
from transformers import Trainer, TrainingArguments


# 6. Training Arguments
training_args = TrainingArguments(
    output_dir="./sllm_output",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="no",
    save_strategy="no",
    logging_steps=10,
    deepspeed="./ds_config.json",  # <- 여전히 사용 가능
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
    # ddp_backend=None
)

# 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [13]:
# import os
# os.environ["USE_MPI"] = "0"



trainer.train()  # Now, trainer.train() will handle the DataLoader and training.



TypeError: vars() argument must have __dict__ attribute