In [1]:
import os
import sys
# LightLMローカルモジュール
from model import Transformer, ModelConfig
from trainer import Trainer, TrainerConfig, DataLoader


# 外部ライブラリ
from transformers import AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm
import os
import time
import json

print("全てのインポート完了")

  from .autonotebook import tqdm as notebook_tqdm


全てのインポート完了


In [2]:
torch.set_float32_matmul_precision('high')
torch.cuda.empty_cache()

tokenizer_id = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.pad_token = tokenizer.eos_token

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# 設定パラメータ（Colab環境最適化）
print("🛠️ 学習・モデル設定を構成中...")

# Colab環境向けTrainerConfig
train_config = TrainerConfig(
    vocab_size=tokenizer.vocab_size,
    num_epochs=4,  # Colab環境での短時間テスト

    # Colab環境設定（単一GPU）
    use_ddp=False,  # Colab は単一GPU環境
    use_moe=True,
    use_lossfreebalance=False,
    clean_cuda_cache=True,
    use_compile=True,  # PyTorch 2.0 最適化
    use_dtype="bfloat16" if device == 'cuda' else "float32",

    seed=42,
    max_seq_len=512,  # Colab GPU メモリ制限のため短縮
    batch_size=2,     # Colab GPU メモリに適合
    accumulation_steps=64,  # 実効バッチサイズ32を維持

    weight_decay=0.1,
    warmup_ratio=0.1,
    learning_rate=5e-4,
    betas=(0.90, 0.95),
    update_rate=1e-5,

    val_ratio=0.005,
    steps_for_eval=10000,  # より頻繁な評価
    eval_interval=500,   # より短い間隔

    checkpoints_frequency=250,
    path_to_checkpoints="./model_testing",

    # データセットパスをマイドライブ基準に変更
    #tokenized_dataset_path = "HuggingFaceTB/cosmopedia",
    tokenized_dataset_path = "HuggingFaceFW/fineweb-edu",
    sub_target_files = "data/CC-MAIN-2025-26/*.parquet",
    #sub_target_files = "data/CC-MAIN-2013-20/train-00000-of-00014.parquet",
    eval_log_file="./log/eval.txt",

    continue_train = True,
    checkpoint_path = 'model_testing/model.checkpoint.epoch0_step2250_global2250.pt',
)

# Colab環境向けModelConfig
config = ModelConfig(
    vocab_size=tokenizer.vocab_size,

    num_dims=512,      # 効率的なサイズ
    num_heads=16,
    num_kv_heads=4,    # GQA による効率化
    num_layers=24,     # Colab GPU に適したサイズ
    ffn_hidden_dims=512 * 4,

    rmsnorm_eps=1e-6,
    rope_theta=1e5,

    context_len=1024,  # Colab 環境での制限

    use_cache=False,
    use_flash=True,    # 利用可能な場合
    use_moe=True,     # シンプル構成

    # MoE設定（未使用）
    moe_num_experts=4,
    moe_active_experts=1,
    moe_eps=1e-6,
    moe_aux_loss_coef=0.01,
    moe_shared_experts=1,
    use_lossfreebalance=False,
)

# パラメータ数を計算・表示
def estimate_parameters(config):
    # 概算計算
    vocab_emb = config.vocab_size * config.num_dims
    pos_emb = 0  # RoPE使用のため位置埋め込み不要
    attention = config.num_layers * config.num_dims * (
        config.num_dims * (config.num_heads + 2 * config.num_kv_heads) // config.num_heads +
        config.num_dims * config.num_dims
    )
    ffn = config.num_layers * config.num_dims * config.ffn_hidden_dims * 2
    output_head = config.vocab_size * config.num_dims

    total = (vocab_emb + attention + ffn) // 1_000_000  # 出力層は重み共有と仮定
    return total

estimated_params = estimate_parameters(config)
print(f"📊 推定パラメータ数: 約 {estimated_params}M")
print(f"🎯 最大シーケンス長: {train_config.max_seq_len}")
print(f"📦 バッチサイズ: {train_config.batch_size} (実効: {train_config.batch_size * train_config.accumulation_steps})")
print("✅ 設定完了 - Colab環境向けに最適化済み")

🛠️ 学習・モデル設定を構成中...
📊 推定パラメータ数: 約 3306M
🎯 最大シーケンス長: 512
📦 バッチサイズ: 2 (実効: 128)
✅ 設定完了 - Colab環境向けに最適化済み


In [4]:
# 必要なディレクトリを作成
os.makedirs("./model_testing", exist_ok=True)
os.makedirs("./log", exist_ok=True)


# モデル初期化
model = Transformer(config)

# パラメータ数を正確に計算
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"実際のパラメータ数: {total_params:,} ({total_params/1e6:.1f}M)")
print(f"学習可能パラメータ: {trainable_params:,}")


"""if continue_train:
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

    state_dict = checkpoint['model']
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith("_orig_mod."):
            new_state_dict[k[len("_orig_mod."):]] = v
        else:
            new_state_dict[k] = v

    model.load_state_dict(new_state_dict, strict=False)

model.to(device)"""


実際のパラメータ数: 418,456,064 (418.5M)
学習可能パラメータ: 418,456,064


'if continue_train:\n    checkpoint = torch.load(checkpoint_path, map_location=torch.device(\'cpu\'))\n\n    state_dict = checkpoint[\'model\']\n    new_state_dict = {}\n    for k, v in state_dict.items():\n        if k.startswith("_orig_mod."):\n            new_state_dict[k[len("_orig_mod."):]] = v\n        else:\n            new_state_dict[k] = v\n\n    model.load_state_dict(new_state_dict, strict=False)\n\nmodel.to(device)'

In [5]:
# データローダー初期化
data_loader = DataLoader(train_config, tokenizer=tokenizer, hf_split="train", cache = "./cache", use_cache=True)



Initializing DataLoader...
Loading Hugging Face dataset 'HuggingFaceFW/fineweb-edu' split='train' and tokenizing (seq_len=512) ...
Using text column: 'text'
Loading tokenized dataset from cache: ./cache
Dataset loaded.
Total tokens loaded:  7,186,815,488
DataLoader initialized. Dataset size: 14036749, Train size: 13966566, Val size: 70183
Train indices: 0 to 13966566, Val indices: 13966566 to 14036749


In [6]:
# トレーナー初期化
trainer = Trainer(train_config, model, tokenizer)
# Trainerにcheckpointを渡して再開
#if continue_train and checkpoint is not None:
#    print("チェックポイントからトレーナーを再開中...")
#    trainer.load_checkpoint(checkpoint)

print("全コンポーネント初期化完了！")

Checkpoint loaded: epoch=0, step=2250, global_step=2250
Continuing training from checkpoint: model_testing/model.checkpoint.epoch0_step2250_global2250.pt
Restoring model weights from checkpoint...
Device: cuda:0
Model's trainable params: 418.46M
Tokens per step: 65536
use torch.compile(): True
Use MoE: Yes 
Number of experts: 4
Number of used experts during inference: 1
Method of aux_loss: default
Number of parameters will be used during inference: 191.93M
全コンポーネント初期化完了！


In [7]:
trainer.train(data_loader)

Calculating number of training steps...
Preparing for training...
Calculating number of training steps...
data_loader.num_train_steps(): 6983283, accumulation_steps: 64
DDP: False 
Use MoE: True
Use loss-free balancing: False
Batch size per GPU: 2
Max sequence length: 512
Gradient accumulation steps: 64
Effective batch size: 128
Total tokens per step: 65536
Total tokens per epoch: 7150895104
Number of steps per epoch: 109114
Restoring training state from checkpoint...
Training state restored from checkpoint: epoch=0, step=2251, global_step=2251
Starting training...
Total epochs: 4, steps per epoch: 109114, total steps: 436456
Resuming from epoch 0, step 2251 (global step 2251)
Epoch: 0 | Step: 2251/109114 | Global Step: 2251 | loss: 4.7118 | norm: 1.4038 | lr: 4.9530874098e-05 | tok/s: 4397.2 | dataset idx: 288512/13966566
Epoch: 0 | Step: 2252/109114 | Global Step: 2252 | loss: 4.6915 | norm: 0.8477 | lr: 4.9541757361e-05 | tok/s: 5286.4 | dataset idx: 288640/13966566
Epoch: 0 | Step:

100%|██████████| 10000/10000 [11:11<00:00, 14.88it/s]


Evaluation at global step 2500: val_loss = 4.5338
Cleaning CUDA cache
Checkpoint saved: ./model_testing/model.checkpoint.epoch0_step2500_global2500.pt
Epoch: 0 | Step: 2501/109114 | Global Step: 2501 | loss: 4.6605 | norm: 1.3245 | lr: 5.2251689770e-05 | tok/s: 5235.1 | dataset idx: 320512/13966566
Epoch: 0 | Step: 2502/109114 | Global Step: 2502 | loss: 4.4974 | norm: 0.8653 | lr: 5.2262573032e-05 | tok/s: 5247.4 | dataset idx: 320640/13966566
Epoch: 0 | Step: 2503/109114 | Global Step: 2503 | loss: 4.4015 | norm: 1.2748 | lr: 5.2273456295e-05 | tok/s: 5187.5 | dataset idx: 320768/13966566
Epoch: 0 | Step: 2504/109114 | Global Step: 2504 | loss: 4.6469 | norm: 0.9101 | lr: 5.2284339558e-05 | tok/s: 5276.0 | dataset idx: 320896/13966566
Epoch: 0 | Step: 2505/109114 | Global Step: 2505 | loss: 4.4659 | norm: 1.5079 | lr: 5.2295222820e-05 | tok/s: 5267.6 | dataset idx: 321024/13966566
Epoch: 0 | Step: 2506/109114 | Global Step: 2506 | loss: 4.5060 | norm: 1.1883 | lr: 5.2306106083e-05 | 

100%|██████████| 10000/10000 [11:14<00:00, 14.83it/s]


Evaluation at global step 3000: val_loss = 4.3769
Cleaning CUDA cache
Checkpoint saved: ./model_testing/model.checkpoint.epoch0_step3000_global3000.pt
Epoch: 0 | Step: 3001/109114 | Global Step: 3001 | loss: 4.4593 | norm: 1.1635 | lr: 5.7693321114e-05 | tok/s: 5138.1 | dataset idx: 384512/13966566
Epoch: 0 | Step: 3002/109114 | Global Step: 3002 | loss: 4.2851 | norm: 0.9688 | lr: 5.7704204376e-05 | tok/s: 5198.1 | dataset idx: 384640/13966566
Epoch: 0 | Step: 3003/109114 | Global Step: 3003 | loss: 4.5812 | norm: 1.0072 | lr: 5.7715087639e-05 | tok/s: 5545.1 | dataset idx: 384768/13966566
Epoch: 0 | Step: 3004/109114 | Global Step: 3004 | loss: 4.3559 | norm: 1.1176 | lr: 5.7725970902e-05 | tok/s: 5557.7 | dataset idx: 384896/13966566
Epoch: 0 | Step: 3005/109114 | Global Step: 3005 | loss: 4.5473 | norm: 0.8884 | lr: 5.7736854164e-05 | tok/s: 5627.5 | dataset idx: 385024/13966566
Epoch: 0 | Step: 3006/109114 | Global Step: 3006 | loss: 4.3662 | norm: 1.0486 | lr: 5.7747737427e-05 | 

100%|██████████| 10000/10000 [11:11<00:00, 14.89it/s]


Evaluation at global step 3500: val_loss = 4.2353
Cleaning CUDA cache
Checkpoint saved: ./model_testing/model.checkpoint.epoch0_step3500_global3500.pt
Epoch: 0 | Step: 3501/109114 | Global Step: 3501 | loss: 4.4025 | norm: 0.8331 | lr: 6.3134952457e-05 | tok/s: 5244.1 | dataset idx: 448512/13966566
Epoch: 0 | Step: 3502/109114 | Global Step: 3502 | loss: 4.2414 | norm: 0.9116 | lr: 6.3145835720e-05 | tok/s: 5284.0 | dataset idx: 448640/13966566
Epoch: 0 | Step: 3503/109114 | Global Step: 3503 | loss: 4.1069 | norm: 0.7068 | lr: 6.3156718983e-05 | tok/s: 5280.4 | dataset idx: 448768/13966566
Epoch: 0 | Step: 3504/109114 | Global Step: 3504 | loss: 4.4033 | norm: 0.8463 | lr: 6.3167602245e-05 | tok/s: 5283.8 | dataset idx: 448896/13966566
Epoch: 0 | Step: 3505/109114 | Global Step: 3505 | loss: 4.3072 | norm: 0.8198 | lr: 6.3178485508e-05 | tok/s: 5280.3 | dataset idx: 449024/13966566
Epoch: 0 | Step: 3506/109114 | Global Step: 3506 | loss: 4.1474 | norm: 0.8364 | lr: 6.3189368771e-05 | 

  2%|▏         | 182/10000 [00:12<11:01, 14.85it/s]

Epoch finished.
current_idx: 14036748 ,end_idx: 14036749, new_idx: 14036750, len_dataset: 14036749, start_idx: 13966566
Starting new epoch...
Current epoch: 1
Loading Hugging Face dataset 'HuggingFaceFW/fineweb-edu' split='train' and tokenizing (seq_len=512) ...


  2%|▏         | 182/10000 [00:23<11:01, 14.85it/s]

Using text column: 'text'
Loading tokenized dataset from cache: ./cache
Dataset loaded.


100%|██████████| 10000/10000 [11:27<00:00, 14.55it/s]


Evaluation at global step 4000: val_loss = 4.1052
Cleaning CUDA cache
Checkpoint saved: ./model_testing/model.checkpoint.epoch0_step4000_global4000.pt
Epoch: 0 | Step: 4001/109114 | Global Step: 4001 | loss: 4.0164 | norm: 0.7313 | lr: 6.8576583801e-05 | tok/s: 5234.4 | dataset idx: 512512/13966566
Epoch: 0 | Step: 4002/109114 | Global Step: 4002 | loss: 4.0020 | norm: 0.8087 | lr: 6.8587467064e-05 | tok/s: 5236.4 | dataset idx: 512640/13966566
Epoch: 0 | Step: 4003/109114 | Global Step: 4003 | loss: 4.0667 | norm: 0.8872 | lr: 6.8598350326e-05 | tok/s: 5250.8 | dataset idx: 512768/13966566
Epoch: 0 | Step: 4004/109114 | Global Step: 4004 | loss: 4.2029 | norm: 0.7182 | lr: 6.8609233589e-05 | tok/s: 5167.8 | dataset idx: 512896/13966566
Epoch: 0 | Step: 4005/109114 | Global Step: 4005 | loss: 4.1887 | norm: 0.9417 | lr: 6.8620116852e-05 | tok/s: 5247.1 | dataset idx: 513024/13966566
Epoch: 0 | Step: 4006/109114 | Global Step: 4006 | loss: 4.2127 | norm: 0.8588 | lr: 6.8631000115e-05 | 

100%|██████████| 10000/10000 [11:10<00:00, 14.92it/s]


Evaluation at global step 4500: val_loss = 3.9782
Cleaning CUDA cache
Checkpoint saved: ./model_testing/model.checkpoint.epoch0_step4500_global4500.pt
Epoch: 0 | Step: 4501/109114 | Global Step: 4501 | loss: 4.0164 | norm: 0.6952 | lr: 7.4018215145e-05 | tok/s: 5256.5 | dataset idx: 576512/13966566
Epoch: 0 | Step: 4502/109114 | Global Step: 4502 | loss: 3.8805 | norm: 0.6852 | lr: 7.4029098408e-05 | tok/s: 5251.9 | dataset idx: 576640/13966566
Epoch: 0 | Step: 4503/109114 | Global Step: 4503 | loss: 3.9121 | norm: 0.7059 | lr: 7.4039981670e-05 | tok/s: 5284.9 | dataset idx: 576768/13966566
Epoch: 0 | Step: 4504/109114 | Global Step: 4504 | loss: 3.9473 | norm: 0.7958 | lr: 7.4050864933e-05 | tok/s: 5265.4 | dataset idx: 576896/13966566
Epoch: 0 | Step: 4505/109114 | Global Step: 4505 | loss: 4.1306 | norm: 0.6688 | lr: 7.4061748196e-05 | tok/s: 5253.2 | dataset idx: 577024/13966566
Epoch: 0 | Step: 4506/109114 | Global Step: 4506 | loss: 3.9427 | norm: 0.7486 | lr: 7.4072631458e-05 | 

100%|██████████| 10000/10000 [11:10<00:00, 14.92it/s]


Evaluation at global step 5000: val_loss = 3.8549
Cleaning CUDA cache
Checkpoint saved: ./model_testing/model.checkpoint.epoch0_step5000_global5000.pt
Epoch: 0 | Step: 5001/109114 | Global Step: 5001 | loss: 3.9306 | norm: 0.7433 | lr: 7.9459846489e-05 | tok/s: 5156.6 | dataset idx: 640512/13966566
Epoch: 0 | Step: 5002/109114 | Global Step: 5002 | loss: 3.7298 | norm: 0.5968 | lr: 7.9470729751e-05 | tok/s: 5245.1 | dataset idx: 640640/13966566
Epoch: 0 | Step: 5003/109114 | Global Step: 5003 | loss: 3.8267 | norm: 0.6621 | lr: 7.9481613014e-05 | tok/s: 5270.5 | dataset idx: 640768/13966566
Epoch: 0 | Step: 5004/109114 | Global Step: 5004 | loss: 3.7215 | norm: 0.6285 | lr: 7.9492496277e-05 | tok/s: 5250.8 | dataset idx: 640896/13966566
Epoch: 0 | Step: 5005/109114 | Global Step: 5005 | loss: 3.8203 | norm: 0.5980 | lr: 7.9503379539e-05 | tok/s: 5231.4 | dataset idx: 641024/13966566
Epoch: 0 | Step: 5006/109114 | Global Step: 5006 | loss: 3.9257 | norm: 0.6431 | lr: 7.9514262802e-05 | 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 

: 