In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import torch
from gpt import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

In [4]:
import tiktoken
from gpt import generate_text_simple, create_dataloader_v1

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")


token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [5]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")


In [6]:
dataset['text']


['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving

In [7]:
joined_text = "\n".join(dataset['text'])  # 텍스트 하나로 합침
byte_size = len(joined_text.encode('utf-8'))

print(f"UTF-8 저장 시 예상 용량: {byte_size / (1024**2):.2f} MB")


UTF-8 저장 시 예상 용량: 514.73 MB


In [8]:
ds = load_dataset("HuggingFaceFW/fineweb-edu-llama3-annotations")


README.md:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

train-00000-of-00008.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00001-of-00008.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00002-of-00008.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00003-of-00008.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00004-of-00008.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00005-of-00008.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00006-of-00008.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00007-of-00008.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/467424 [00:00<?, ? examples/s]

In [16]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'metadata', 'prompt', 'score'],
        num_rows: 467424
    })
})

In [20]:
ds['train']['text']

["Catch up on all the latest from our staff and partners\nPlacement student Chloe, Cultural Heritage Management student at the University of York, tells us all about her time volunteering for the CBA and YAC\nIn July 2022 we teamed up with Archaeology Scotland to launch the Scotland Online YAC club thanks to funding from Historic Environment Scotland. YAC Leader, Jane Miller, tells us all about the first 6 months and how the club is getting on.\nFind out more from Isobel, a recent work experience student.\nRead all about the author of 'The Secret of the Treasure Keepers' and what inspired her to write her latest book.",
 'Arc Infrastructure was pleased to partner with the City of Swan, Main Roads WA and the Federal government on an upgrade of Stock Road in Bullsbrook, creating an important connection between Tonkin Highway and Great Northern Highway.\nArc’s role in the $71 million landmark project was the relocation and upgrade of the existing level crossing on Stock Road.\nThe project

In [21]:
txt_data = " ".join(ds['train']['text'])
# train_loader = create_dataloader_v1(txt_data, batch_size=4, max_length=256, stride=128)

train_ratio = 0.90
split_idx = int(train_ratio * len(txt_data))
train_data = txt_data[:split_idx]
val_data = txt_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [22]:
for batch_idx, (input_batch, target_batch) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print("Input batch (tokens):", input_batch)
    print("Target batch (tokens):", target_batch)
    break  # Only printing the first batch as an example

Batch 1
Input batch (tokens): tensor([[   11,   749,  6510,  ..., 18447,   355,   880],
        [    8,   290,   257,  ...,  6379, 10572,   447],
        [ 2033,   286,  1802,  ...,    11,   290,   609],
        [ 3725,    11,   345,  ...,   788,  3067,   284]])
Target batch (tokens): tensor([[  749,  6510,    11,  ...,   355,   880,   355],
        [  290,   257,  6838,  ..., 10572,   447,   247],
        [  286,  1802,  1411,  ...,   290,   609,   726],
        [   11,   345,   460,  ...,  3067,   284,   262]])


In [23]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [24]:
import math

def compute_perplexity(model, data_loader, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_batch, target_batch in data_loader:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            logits = model(input_batch)
            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, logits.size(-1)),
                target_batch.view(-1),
                reduction='sum'
            )
            total_loss += loss.item()
            total_tokens += target_batch.numel()

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [15]:

# # Note:
# # Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# # which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# # However, the resulting loss values may be slightly different.

# #if torch.cuda.is_available():
# #    device = torch.device("cuda")
# #elif torch.backends.mps.is_available():
# #    device = torch.device("mps")
# #else:
# #    device = torch.device("cpu")
# #
# # print(f"Using {device} device.")


# model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


# torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

# with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
#     train_loss = calc_loss_loader(train_loader, model, device)
#     val_loss = calc_loss_loader(val_loader, model, device)

# print("Training loss:", train_loss)
# print("Validation loss:", val_loss)


In [27]:
import logging
import deepspeed
from tqdm import tqdm  # tqdm을 함수처럼 사용

# Setup logging configuration
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s", 
    level=logging.INFO,  # You can adjust the level to DEBUG, INFO, WARNING, etc.
    handlers=[
        logging.FileHandler("training_log_hf_ds_general.txt"),  # Logs will be saved to this file
        logging.StreamHandler()  # Also log to console
    ]
)

# Now use logging instead of print
logger = logging.getLogger(__name__)

In [28]:
import deepspeed
import os

def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer, deepspeed_config, save_dir="./checkpoints"):

    # Initialize DeepSpeed
    model, optimizer, _, _ = deepspeed.initialize(args=None, model=model, optimizer=optimizer, config_params=deepspeed_config)

    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    os.makedirs(save_dir, exist_ok=True)

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        logger.info(f"Starting Epoch {epoch+1}...")

        for input_batch, target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch"):
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration

            # Compute the loss
            loss = calc_loss_batch(input_batch, target_batch, model, device)

            # print(loss)
            # print(f"Loss type: {loss.dtype}")
            # print(f"Loss scale type: {model.loss_scale.dtype}")
            # if not isinstance(loss, torch.Tensor):
            #     loss = torch.tensor(loss).to(device)


            # Backward pass and step the optimizer
            model.backward(loss)  # Backward pass with DeepSpeed
            model.step()  # Step optimizer using DeepSpeed

            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                ppl = compute_perplexity(model, val_loader, device)
                track_tokens_seen.append(tokens_seen)
                logger.info(f"Epoch {epoch+1} (Step {global_step:06d}): "
                            f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}"
                            f"Validation Perplexity: {ppl:.2f}")
                
                
        logger.info(f"Epoch {epoch+1} completed. Generating a sample...")
        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
        # Save checkpoint
        epoch_ckpt_dir = os.path.join(save_dir, f"checkpoint_speed-epoch{epoch+1}")
        model.save_checkpoint(epoch_ckpt_dir)
        logger.info(f"Checkpoint saved at: {epoch_ckpt_dir}")

    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    logger.info(f"Generated Text: {decoded_text}")  # Log the generated text
    model.train()

In [29]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

train_losses, track_tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=3,
    eval_freq=1000,
    eval_iter=10,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
    deepspeed_config="./ds_config.json"
)

[2025-05-19 18:47:08,482] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.16.5, git-hash=unknown, git-branch=unknown
[2025-05-19 18:47:08,483] [INFO] [comm.py:658:init_distributed] cdb=None
[2025-05-19 18:47:08,483] [INFO] [comm.py:673:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[1747648028.658751] [gpusystem:326306:0]    cuda_copy_md.c:348  UCX  WARN                cannot set sync_memops on CUDA VMM without cuCtxSetFlags() (address=0x7f21ad400000)
[1747648028.663556] [gpusystem:326306:0]    cuda_copy_md.c:348  UCX  WARN              cannot set sync_memops on CUDA VMM without cuCtxSetFlags() (address=0x7f21ad400000)
[2025-05-19 18:47:08,670] [INFO] [comm.py:728:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=10.125.70.48, master_port=29500
[2025-05-19 18:47:08,671] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[20

2025-05-19 18:47:08,916 - INFO - Starting Epoch 1...
Epoch 1:   0%|          | 0/316861 [00:00<?, ?batch/s]2025-05-19 18:50:00,289 - INFO - Epoch 1 (Step 000000): Train loss 10.981, Val loss 10.981Validation Perplexity: 58929.64
Epoch 1:   0%|          | 996/316861 [03:25<2:08:06, 41.10batch/s]   2025-05-19 18:53:16,435 - INFO - Epoch 1 (Step 001000): Train loss 7.108, Val loss 6.970Validation Perplexity: 1229.92
Epoch 1:   1%|          | 2000/316861 [06:46<2:15:53, 38.62batch/s]  2025-05-19 18:56:35,307 - INFO - Epoch 1 (Step 002000): Train loss 6.904, Val loss 6.707Validation Perplexity: 955.00
Epoch 1:   1%|          | 3000/316861 [10:08<2:32:36, 34.28batch/s]   2025-05-19 18:59:55,678 - INFO - Epoch 1 (Step 003000): Train loss 6.700, Val loss 6.568Validation Perplexity: 845.75
Epoch 1:   1%|▏         | 4000/316861 [13:26<2:25:52, 35.75batch/s]   2025-05-19 19:03:15,609 - INFO - Epoch 1 (Step 004000): Train loss 6.560, Val loss 6.494Validation Perplexity: 775.59
Epoch 1:   2%|▏     

[2025-05-20 15:19:39,447] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 71209
[2025-05-20 15:19:39,449] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 15:19:39,450] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 1:  90%|████████▉ | 284997/316861 [20:32:47<13:38, 38.93batch/s]2025-05-20 15:22:33,938 - INFO - Epoch 1 (Step 285000): Train loss 4.438, Val loss 4.387Validation Perplexity: 96.03
Epoch 1:  90%|█████████ | 285997/316861 [20:36:09<13:27, 38.20batch/s]    2025-05-20 15:25:52,370 - INFO - Epoch 1 (Step 286000): Train loss 4.514, Val loss 4.393Validation Perplexity: 96.31
Epoch 1:  91%|█████████ | 286998/316861 [20:39:23<39:14, 12.68batch/s]    2025-05-20 15:29:11,600 - INFO - Epoch 1 (Step 287000): Train loss 4.546, Val loss 4.388Validation Perplexity: 96.09
Epoch 1:  91%|█████████ | 287998/316861 [20:42:42<12:19, 39.01batch/s]    2025-05-20 15:32:31,675 - INFO - Epoch 1 (Step 288000): Train loss 4.670, Val loss 4.379Validation Perplexity: 96.57
Epoch 1:  91%|█████████ | 288997/316861 [20:45:57<11:02, 42.05batch/s]    2025-05-20 15:35:46,377 - INFO - Epoch 1 (Step 289000): Train loss 4.577, Val loss 4.383Validation Perplexity: 95.76
Epoch 1:  92%|█████████▏| 290000/316861 [20:49:18

Every effort moves you to the next level. - The first step is to create a new strategy that will help you stay ahead of the competition. - The first step is to create a plan that will help you stay ahead of the competition. - The goal
[2025-05-20 17:05:53,547] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step79215 is about to be saved!
[2025-05-20 17:05:53,551] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ./checkpoints/checkpoint_speed-epoch1/global_step79215/mp_rank_00_model_states.pt
[2025-05-20 17:05:53,551] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./checkpoints/checkpoint_speed-epoch1/global_step79215/mp_rank_00_model_states.pt...


[rank0]:[W520 17:05:53.556238033 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.


[2025-05-20 17:05:55,557] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./checkpoints/checkpoint_speed-epoch1/global_step79215/mp_rank_00_model_states.pt.
[2025-05-20 17:05:55,558] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step79215 is ready now!


2025-05-20 17:05:55,559 - INFO - Checkpoint saved at: ./checkpoints/checkpoint_speed-epoch1
2025-05-20 17:05:55,560 - INFO - Starting Epoch 2...
Epoch 2:   0%|          | 139/316861 [00:21<2:40:04, 32.98batch/s]2025-05-20 17:08:51,071 - INFO - Epoch 2 (Step 317000): Train loss 4.612, Val loss 4.389Validation Perplexity: 95.69
Epoch 2:   0%|          | 1139/316861 [03:35<2:17:20, 38.32batch/s]  2025-05-20 17:12:08,639 - INFO - Epoch 2 (Step 318000): Train loss 4.441, Val loss 4.373Validation Perplexity: 93.76
Epoch 2:   1%|          | 2139/316861 [06:53<2:34:42, 33.90batch/s]   2025-05-20 17:15:30,122 - INFO - Epoch 2 (Step 319000): Train loss 4.557, Val loss 4.351Validation Perplexity: 92.81
Epoch 2:   1%|          | 3136/316861 [10:14<2:13:49, 39.07batch/s]   2025-05-20 17:18:49,621 - INFO - Epoch 2 (Step 320000): Train loss 4.434, Val loss 4.350Validation Perplexity: 93.13
Epoch 2:   1%|▏         | 4136/316861 [13:36<2:13:46, 38.96batch/s]   2025-05-20 17:22:08,494 - INFO - Epoch 2 (

[2025-05-20 21:54:31,656] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 100805
[2025-05-20 21:54:31,657] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 21:54:31,658] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  28%|██▊       | 87139/316861 [4:49:14<1:42:27, 37.37batch/s]2025-05-20 21:57:45,581 - INFO - Epoch 2 (Step 404000): Train loss 4.413, Val loss 4.292Validation Perplexity: 87.67
Epoch 2:  28%|██▊       | 88136/316861 [4:52:29<1:44:33, 36.46batch/s]   2025-05-20 22:01:05,385 - INFO - Epoch 2 (Step 405000): Train loss 4.465, Val loss 4.281Validation Perplexity: 88.35
Epoch 2:  28%|██▊       | 88531/316861 [4:55:19<1:32:38, 41.08batch/s]  

[2025-05-20 22:01:14,899] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 101348
[2025-05-20 22:01:14,900] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:01:14,900] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  28%|██▊       | 89137/316861 [4:55:47<1:28:42, 42.78batch/s]2025-05-20 22:04:20,205 - INFO - Epoch 2 (Step 406000): Train loss 4.359, Val loss 4.285Validation Perplexity: 88.08
Epoch 2:  28%|██▊       | 90139/316861 [4:59:08<1:46:08, 35.60batch/s]  2025-05-20 22:07:41,120 - INFO - Epoch 2 (Step 407000): Train loss 4.295, Val loss 4.295Validation Perplexity: 87.68
Epoch 2:  29%|██▉       | 91136/316861 [5:02:30<1:49:35, 34.33batch/s]   2025-05-20 22:11:02,322 - INFO - Epoch 2 (Step 408000): Train loss 4.486, Val loss 4.300Validation Perplexity: 87.70
Epoch 2:  29%|██▉       | 92136/316861 [5:05:51<1:44:47, 35.74batch/s]  2025-05-20 22:14:23,017 - INFO - Epoch 2 (Step 409000): Train loss 4.374, Val loss 4.302Validation Perplexity: 87.78
Epoch 2:  29%|██▉       | 93136/316861 [5:09:07<1:48:19, 34.42batch/s]  2025-05-20 22:17:42,333 - INFO - Epoch 2 (Step 410000): Train loss 4.391, Val loss 4.312Validation Perplexity: 87.44
Epoch 2:  30%|██▉       | 94136/316861 [5:12:28<1:40:38,

[2025-05-20 22:37:57,272] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104148
[2025-05-20 22:37:57,273] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:37:57,273] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99738/316861 [5:32:01<1:39:45, 36.27batch/s]

[2025-05-20 22:37:57,367] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104149
[2025-05-20 22:37:57,369] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:37:57,370] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99751/316861 [5:32:02<1:43:46, 34.87batch/s]

[2025-05-20 22:37:57,812] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104153
[2025-05-20 22:37:57,813] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:37:57,814] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99760/316861 [5:32:02<1:39:22, 36.41batch/s]

[2025-05-20 22:37:58,021] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104155
[2025-05-20 22:37:58,023] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:37:58,023] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99781/316861 [5:32:02<1:42:52, 35.17batch/s]

[2025-05-20 22:37:58,587] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104160
[2025-05-20 22:37:58,589] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:37:58,590] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99786/316861 [5:32:03<1:35:50, 37.75batch/s]

[2025-05-20 22:37:58,686] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104161
[2025-05-20 22:37:58,687] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:37:58,688] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024
[2025-05-20 22:37:58,774] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104162
[2025-05-20 22:37:58,775] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:37:58,777] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99791/316861 [5:32:03<1:33:36, 38.65batch/s]

[2025-05-20 22:38:00,206] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104163
[2025-05-20 22:38:00,207] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,208] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99795/316861 [5:32:04<7:01:01,  8.59batch/s]

[2025-05-20 22:38:00,290] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104164
[2025-05-20 22:38:00,291] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,292] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99800/316861 [5:32:04<5:08:08, 11.74batch/s]

[2025-05-20 22:38:00,378] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104165
[2025-05-20 22:38:00,379] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,380] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99805/316861 [5:32:04<3:54:04, 15.46batch/s]

[2025-05-20 22:38:00,461] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104166
[2025-05-20 22:38:00,462] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,463] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  31%|███▏      | 99810/316861 [5:32:04<3:03:54, 19.67batch/s]

[2025-05-20 22:38:00,651] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104168
[2025-05-20 22:38:00,652] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,653] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  32%|███▏      | 99815/316861 [5:32:05<2:36:46, 23.07batch/s]

[2025-05-20 22:38:00,732] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104169
[2025-05-20 22:38:00,734] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,734] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  32%|███▏      | 99820/316861 [5:32:05<2:11:15, 27.56batch/s]

[2025-05-20 22:38:00,815] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104170
[2025-05-20 22:38:00,815] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,816] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  32%|███▏      | 99826/316861 [5:32:05<1:50:12, 32.82batch/s]

[2025-05-20 22:38:00,894] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104171
[2025-05-20 22:38:00,895] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,895] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024
[2025-05-20 22:38:00,973] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104172
[2025-05-20 22:38:00,974] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:00,975] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  32%|███▏      | 99846/316861 [5:32:05<1:32:16, 39.20batch/s]

[2025-05-20 22:38:01,365] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104176
[2025-05-20 22:38:01,366] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:01,367] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024
[2025-05-20 22:38:01,444] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 104177
[2025-05-20 22:38:01,446] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 22:38:01,446] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  32%|███▏      | 100139/316861 [5:32:24<1:47:24, 33.63batch/s]2025-05-20 22:40:59,497 - INFO - Epoch 2 (Step 417000): Train loss 7.804, Val loss 7.652Validation Perplexity: 2303.00
Epoch 2:  32%|███▏      | 101136/316861 [5:35:46<1:34:31, 38.04batch/s]  2025-05-20 22:44:18,550 - INFO - Epoch 2 (Step 418000): Train loss 7.457, Val loss 7.354Validation Perplexity: 1717.67
Epoch 2:  32%|███▏      | 102136/316861 [5:39:07<1:42:41, 34.85batch/s]  2025-05-20 22:47:38,999 - INFO - Epoch 2 (Step 419000): Train loss 7.029, Val loss 7.000Validation Perplexity: 1193.14
Epoch 2:  33%|███▎      | 103136/316861 [5:42:29<1:37:05, 36.69batch/s]  2025-05-20 22:50:58,998 - INFO - Epoch 2 (Step 420000): Train loss 4.449, Val loss 4.412Validation Perplexity: 97.34
Epoch 2:  33%|███▎      | 104135/316861 [5:45:40<1:27:13, 40.65batch/s]  2025-05-20 22:54:14,512 - INFO - Epoch 2 (Step 421000): Train loss 4.429, Val loss 4.373Validation Perplexity: 93.67
Epoch 2:  33%|███▎      | 105139/316861 [5:49:

[2025-05-20 23:21:02,681] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 107293
[2025-05-20 23:21:02,682] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 23:21:02,682] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  35%|███▌      | 112318/316861 [6:15:07<1:19:31, 42.86batch/s]

[2025-05-20 23:21:02,760] [INFO] [unfused_optimizer.py:294:_update_scale] Grad overflow on iteration 107294
[2025-05-20 23:21:02,761] [INFO] [unfused_optimizer.py:295:_update_scale] Using static loss scale of 1024
[2025-05-20 23:21:02,761] [INFO] [unfused_optimizer.py:208:step] [deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: 1024, reducing to 1024


Epoch 2:  36%|███▌      | 113139/316861 [6:15:44<1:39:37, 34.08batch/s]2025-05-20 23:24:18,791 - INFO - Epoch 2 (Step 430000): Train loss 4.463, Val loss 4.319Validation Perplexity: 90.20
Epoch 2:  36%|███▌      | 114135/316861 [6:18:59<1:23:42, 40.37batch/s]  2025-05-20 23:27:35,246 - INFO - Epoch 2 (Step 431000): Train loss 4.356, Val loss 4.323Validation Perplexity: 88.57
Epoch 2:  36%|███▋      | 115139/316861 [6:22:20<1:34:48, 35.46batch/s]  2025-05-20 23:30:55,532 - INFO - Epoch 2 (Step 432000): Train loss 4.281, Val loss 4.298Validation Perplexity: 88.21
Epoch 2:  37%|███▋      | 116136/316861 [6:25:38<1:26:54, 38.49batch/s]  2025-05-20 23:34:15,068 - INFO - Epoch 2 (Step 433000): Train loss 4.421, Val loss 4.301Validation Perplexity: 87.81
Epoch 2:  37%|███▋      | 117136/316861 [6:29:00<1:26:13, 38.61batch/s]  2025-05-20 23:37:33,781 - INFO - Epoch 2 (Step 434000): Train loss 4.487, Val loss 4.300Validation Perplexity: 87.96
Epoch 2:  37%|███▋      | 118136/316861 [6:32:21<1:2

Every effort moves you, you can use it to create a more effective and effective solution. - It can be used to create a more effective solution for your business. - It can be used to create a more effective solution for your business. - It can


: 

In [31]:
import torch
import torch.nn.functional as F
from tqdm import tqdm
import deepspeed

def distillation_loss(student_logits, teacher_logits, targets, temperature=2.0, alpha=0.7):
    """
    student_logits: (batch, seq_len, vocab_size)
    teacher_logits: (batch, seq_len, vocab_size)
    targets: (batch, seq_len)
    """
    # Soft targets (Teacher's softened prediction)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)

    # KL divergence between softened outputs
    distill_loss = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * (temperature ** 2)

    # Hard label cross-entropy
    ce_loss = F.cross_entropy(student_logits.view(-1, student_logits.size(-1)), targets.view(-1))

    # Total loss
    total_loss = alpha * distill_loss + (1 - alpha) * ce_loss

    return total_loss


def train_distillation(student_model, teacher_model, train_loader, val_loader, optimizer, device, num_epochs,
                       deepspeed_config, tokenizer, start_context,
                       eval_freq=1000, eval_iter=10, temperature=2.0, alpha=0.7,
                       save_dir="./checkpoints_distill"):
    """
    student_model: 학생 모델 (GPT 같은거)
    teacher_model: 선생 모델 (GPT 큰거)
    train_loader: 학습 데이터
    optimizer: 학생 모델 Optimizer
    device: "cuda" or "cpu"
    num_epochs: 전체 에폭 수
    deepspeed_config: DeepSpeed 설정 dict
    tokenizer: 텍스트 생성할 때 쓰는 토크나이저
    start_context: epoch 끝날 때 sample 시작 문장
    eval_freq: evaluation 주기 (step 단위)
    eval_iter: evaluation용 배치 수
    temperature: distillation 온도
    alpha: distillation alpha
    save_dir: 체크포인트 저장 디렉토리
    """

    # Initialize DeepSpeed
    # student_model, optimizer, _, _ = deepspeed.initialize(
    #     model=student_model,
    #     optimizer=optimizer,
    #     config_params=deepspeed_config,
    #     args=None
    # )

    # Teacher 준비
    teacher_model.eval()
    teacher_model.to(device)

    # 디렉토리 준비
    os.makedirs(save_dir, exist_ok=True)

    # Track
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = 0

    context_size = student_model.module.pos_emb.weight.shape[0] if hasattr(student_model, "module") else student_model.pos_emb.weight.shape[0]

    for epoch in range(num_epochs):
        student_model.train()
        print(f"Starting Epoch {epoch+1}...")

        for input_batch, target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch"):
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            optimizer.zero_grad()

            # Forward
            with torch.no_grad():
                teacher_logits = teacher_model(input_batch).logits

            student_logits = student_model(input_batch)

            # Distillation loss
            loss = distillation_loss(
                student_logits=student_logits,
                teacher_logits=teacher_logits,
                targets=target_batch,
                temperature=temperature,
                alpha=alpha
            )

            # Backward + Step
            # student_model.backward(loss)
            # student_model.step()

            # Backward + Step
            loss.backward()
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    student_model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                ppl = compute_perplexity(student_model, val_loader, device)
                track_tokens_seen.append(tokens_seen)
                logger.info(f"Epoch {epoch+1} (Step {global_step:06d}): "
                            f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f} "
                            f"Validation Perplexity: {ppl:.2f}")


        # Epoch 종료 후 샘플 텍스트 출력
        print(f"Epoch {epoch+1} completed. Generating sample...")
        generate_and_print_sample(
            model=student_model,
            tokenizer=tokenizer,
            device=device,
            start_context=start_context
        )

        # Checkpoint 저장
        epoch_ckpt_dir = os.path.join(save_dir, f"checkpoint_distill-epoch{epoch+1}")
        student_model.save_checkpoint(epoch_ckpt_dir)
        print(f"Checkpoint saved at: {epoch_ckpt_dir}")

    print("Training completed!")
    return train_losses, track_tokens_seen



In [32]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

teacher_model = GPT2LMHeadModel.from_pretrained('gpt2-large').to(device)
teacher_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [33]:
# 입력 문장
start_context = "Every effort moves you"
input_ids = tokenizer.encode(start_context)
input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)  # batch 1

# 텍스트 생성
with torch.no_grad():
    output_ids = teacher_model.generate(
        input_tensor,
        max_new_tokens=50,
        temperature=1.0,
        top_p=0.9,
        do_sample=True
    )

# 디코딩
output_text = tokenizer.decode(output_ids[0].tolist())

print("\n===== Generated Output =====")
print(output_text.replace("\n", " "))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



===== Generated Output =====
Every effort moves you forward in the direction of a career in journalism. But we do not think the role is so important that anyone who doesn't find it meaningful should continue to do it."  In addition to the journalism scholarship, the fellows will also attend journalism school


In [34]:
torch.manual_seed(123)
student_model = GPTModel(GPT_CONFIG_124M)
student_model.to(device)
optimizer = torch.optim.AdamW(student_model.parameters(), lr=0.0004)

trained_student = train_distillation(
    student_model=student_model,
    teacher_model=teacher_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    tokenizer=tokenizer,
    start_context="Every effort moves you",
    device="cuda",
    num_epochs=3,
    deepspeed_config="./ds_config.json",
    temperature=2.0,
    alpha=0.7
)


Starting Epoch 1...


Epoch 1:   1%|          | 999/105122 [02:35<4:57:37,  5.83batch/s] 2025-04-30 12:19:48,112 - INFO - Epoch 1 (Step 001000): Train loss 6.689, Val loss 6.885 Validation Perplexity: 847.75
Epoch 1:   2%|▏         | 1999/105122 [08:48<4:10:14,  6.87batch/s]   2025-04-30 12:26:00,163 - INFO - Epoch 1 (Step 002000): Train loss 6.468, Val loss 6.480 Validation Perplexity: 631.10
Epoch 1:   3%|▎         | 2999/105122 [15:00<4:07:03,  6.89batch/s]   2025-04-30 12:32:34,517 - INFO - Epoch 1 (Step 003000): Train loss 6.559, Val loss 6.518 Validation Perplexity: 680.98
Epoch 1:   4%|▍         | 3999/105122 [21:33<4:38:35,  6.05batch/s]   2025-04-30 12:38:44,429 - INFO - Epoch 1 (Step 004000): Train loss 6.490, Val loss 6.709 Validation Perplexity: 782.81
Epoch 1:   5%|▍         | 4999/105122 [27:42<4:43:58,  5.88batch/s]   2025-04-30 12:44:54,250 - INFO - Epoch 1 (Step 005000): Train loss 6.760, Val loss 6.720 Validation Perplexity: 759.84
Epoch 1:   6%|▌         | 5999/105122 [33:52<4:20:19,  6.3

KeyboardInterrupt: 