In [1]:
# Colab-only setup: `drive` exposes Google Drive to this runtime.
from google.colab import drive
import os
import sys

# Mount Google Drive at /content/drive. Later cells load the project code from
# /content/drive/MyDrive/LightLM and write checkpoints and logs beneath it.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the notebook's dependencies into the interpreter that runs this kernel.
#
# pip is invoked through subprocess with an argument list (no shell) because:
#   * an unquoted `transformers>=4.21.0` on a `!pip` line makes the shell treat
#     `>` as an output redirection, so the version constraint is lost and a
#     stray file named `=4.21.0` is created;
#   * a failing `!command` does not raise a Python exception, so wrapping it in
#     try/except can never detect an installation failure.
import subprocess
import sys


def pip_install(*pip_args):
    """Run ``pip install --quiet`` with ``pip_args`` for the running interpreter.

    Args:
        *pip_args: Requirement specifiers and/or extra pip options, passed to
            pip verbatim as separate argv entries.

    Returns:
        bool: True when pip exits with status 0, False otherwise. On failure
        pip's stderr is printed so the cause shows up in the cell output.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "--quiet", *pip_args],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(result.stderr)
    return result.returncode == 0


# Same packages, in the same order and grouping, as the original shell lines.
for requirement_group in (
    ["transformers>=4.21.0"],
    ["torch", "torchvision", "torchaudio"],
    ["tqdm"],
    ["datasets"],
    ["datatrove"],
):
    if not pip_install(*requirement_group):
        # Keep going (the original never aborted either), but make the failure visible.
        print(f"インストールに失敗しました: {' '.join(requirement_group)}")

# flash-attn is optional: when it cannot be built/installed the notebook
# continues and reports that standard attention will be used instead.
if pip_install("flash-attn", "--no-build-isolation"):
    print("Flash Attention がインストールされました")
else:
    print("Flash Attention のインストールに失敗 - 標準Attentionを使用します")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
Flash Attention がインストールされました


In [3]:
# The project's modules live on Google Drive: make that folder the working
# directory and put it at the front of the import search path.
lightlm_path = '/content/drive/MyDrive/LightLM'

os.chdir(lightlm_path)
current_dir = os.getcwd()
print(f"作業ディレクトリ: {current_dir}")

# Guard against adding a duplicate entry when the cell is re-run.
already_on_path = lightlm_path in sys.path
if not already_on_path:
    sys.path.insert(0, lightlm_path)

作業ディレクトリ: /content/drive/MyDrive/LightLM


In [4]:
# Project-local modules; they resolve because an earlier cell changed into the
# LightLM directory on Google Drive and added it to sys.path.
from model import Transformer, ModelConfig
# NOTE(review): `DataLoader` here is the project's own class from `trainer`,
# not torch.utils.data.DataLoader.
from trainer import Trainer, TrainerConfig, DataLoader

# Third-party and standard-library dependencies.
from transformers import AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm
# `os` was already imported in the first cell; importing it again is harmless.
import os
import time
import json

# Reached only when every import above succeeded.
print("全てのインポート完了")

全てのインポート完了


In [5]:
# Select the 'high' float32 matmul precision mode and drop any cached,
# currently unused CUDA memory before the model is built.
torch.set_float32_matmul_precision('high')
torch.cuda.empty_cache()

# Load the tokenizer published with SmolLM-360M on the Hugging Face Hub.
tokenizer_id = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Train on the GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

In [6]:
print("学習・モデル設定を構成中...")

# Values that the trainer config and the model config must agree on are named
# once here and passed to both constructors below.
vocab_size = tokenizer.vocab_size
seq_len = 512
hidden_dims = 512
moe_enabled = True
lossfree_balance = False

# Author's note: the Colab T4 GPU does not support bfloat, so float16 is used
# whenever training runs on CUDA; CPU runs stay in float32.
if device == 'cuda':
    train_dtype = "float16"
else:
    train_dtype = "float32"

# Parquet shards of the dataset to train on. Per the original author's notes
# an empty string selects all data, and a glob such as
# "data/CC-MAIN-2025-26/*.parquet" or a single shard path is also accepted —
# TODO confirm against the project's DataLoader.
dataset_shards = [
    "data/CC-MAIN-2025-26/000_00047.parquet",
    "data/CC-MAIN-2025-26/000_00048.parquet",
    "data/CC-MAIN-2025-26/000_00049.parquet"
]

train_config = TrainerConfig(
    # --- run mode -----------------------------------------------------------
    vocab_size=vocab_size,
    num_epochs=4,
    use_ddp=False,
    use_moe=moe_enabled,
    use_lossfreebalance=lossfree_balance,
    clean_cuda_cache=True,
    use_compile=True,
    use_dtype=train_dtype,

    # --- batching -----------------------------------------------------------
    seed=42,
    max_seq_len=seq_len,
    batch_size=2,
    accumulation_steps=64,

    # --- optimisation -------------------------------------------------------
    weight_decay=0.1,
    warmup_ratio=0.1,
    learning_rate=5e-4,
    betas=(0.90, 0.95),
    update_rate=1e-5,

    # --- evaluation ---------------------------------------------------------
    val_ratio=0.005,
    steps_for_eval=1000,
    eval_interval=250,
    eval_log_file="/content/drive/MyDrive/LightLM/log/eval.txt",

    # --- checkpointing ------------------------------------------------------
    checkpoints_frequency=250,
    path_to_checkpoints="/content/drive/MyDrive/LightLM/model_testing",
    # Author's note: 0 keeps every checkpoint and -1 keeps only the latest one.
    # On Colab, deleted files go to the Drive trash, so storage still fills up
    # whichever value is chosen.
    max_checkpoints_to_keep=4,

    # --- data ---------------------------------------------------------------
    tokenized_dataset_path="HuggingFaceFW/fineweb-edu",
    sub_target_files=dataset_shards,

    # --- resuming -----------------------------------------------------------
    continue_train=False,
    checkpoint_path='model_testing/model.checkpoint.epoch0_step16000_global16000.pt',
)

# Author's note: the model is kept deliberately small (in the spirit of GPT-2)
# because of the very limited storage of free Google Drive and the modest
# compute available.
config = ModelConfig(
    vocab_size=vocab_size,

    num_dims=hidden_dims,
    num_heads=16,
    num_kv_heads=4,  # fewer KV heads than query heads (GQA) for efficiency
    num_layers=12,
    ffn_hidden_dims=hidden_dims * 4,
    rmsnorm_eps=1e-6,
    rope_theta=1e5,

    context_len=seq_len,

    use_cache=False,
    use_flash=True,  # author's note: used when available
    use_moe=moe_enabled,

    moe_num_experts=3,  # author's note: one fewer than before
    moe_active_experts=1,
    moe_eps=1e-6,
    moe_aux_loss_coef=0.01,
    moe_shared_experts=1,
    use_lossfreebalance=lossfree_balance,
)

学習・モデル設定を構成中...


In [7]:
# Make sure the checkpoint and log folders exist on Drive before training.
for required_dir in (
    "/content/drive/MyDrive/LightLM/model_testing",
    "/content/drive/MyDrive/LightLM/log",
):
    os.makedirs(required_dir, exist_ok=True)


# Build the model from the model configuration.
model = Transformer(config)

# Count parameters in one pass: every parameter contributes to the total, and
# those with requires_grad set also count as trainable.
total_params = 0
trainable_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count

print(f"実際のパラメータ数: {total_params:,} ({total_params/1e6:.1f}M)")
print(f"学習可能パラメータ: {trainable_params:,}")

実際のパラメータ数: 184,056,320 (184.1M)
学習可能パラメータ: 184,056,320


In [10]:
# Build the data loader for the "train" split, with the tokenized-dataset
# cache kept on Drive.
dataset_cache_dir = "/content/drive/MyDrive/LightLM/cache"
data_loader = DataLoader(
    train_config,
    tokenizer=tokenizer,
    hf_split="train",
    cache=dataset_cache_dir,
    use_cache=True,
)

# Build the trainer around the model created earlier.
trainer = Trainer(
    train_config,
    model,
    tokenizer,
)

Initializing DataLoader...
Loading tokenized dataset from cache: /content/drive/MyDrive/LightLM/cache
Dataset loaded from cache.
Total tokens loaded:  323,646,976
DataLoader initialized. Dataset size: 632123, Train size: 628963, Val size: 3160
Train indices: 0 to 628963, Val indices: 628963 to 632123
Device: cuda:0
Model's trainable params: 184.06M
Tokens per step: 65536
use torch.compile(): True
Use MoE: Yes 
Number of experts: 3
Number of used experts during inference: 1
Method of aux_loss: default
Number of parameters will be used during inference: 108.55M


In [None]:
# Start training with the data loader built in the previous cell.
# NOTE(review): the output recorded with this cell shows the run diverging:
# val_loss goes 7.10 (step 250) -> 7.70 (500) -> 9.77 (750) -> 11.95 (1000)
# -> 15.95 (1250), with gradient norms around 17-24 just after step 500 while
# the learning rate is still warming up. Re-check learning_rate / use_dtype
# before spending more compute on this configuration.
# NOTE(review): "Epoch finished ... Current epoch: N" is printed in the middle
# of the evaluation progress bars while the training log still reports
# "Epoch: 0" — presumably evaluation runs past the end of the validation
# indices and triggers the loader's epoch roll-over; confirm in the project's
# DataLoader.
# NOTE(review): the log says "keeping max 2 checkpoints" although
# max_checkpoints_to_keep is 4 in the config cell, so this output appears to
# come from a run with different settings.
trainer.train(data_loader)

Calculating number of training steps...
Preparing for training...
Calculating number of training steps...
data_loader.num_train_steps(): 314482, accumulation_steps: 64
DDP: False 
Use MoE: True
Use loss-free balancing: False
Batch size per GPU: 2
Max sequence length: 512
Gradient accumulation steps: 64
Effective batch size: 128
Total tokens per step: 65536
Total tokens per epoch: 322043904
Number of steps per epoch: 4914
Starting training...
Total epochs: 4, steps per epoch: 4914, total steps: 19656


W0920 17:17:58.464000 1216 torch/_inductor/utils.py:1436] [2/0_1] Not enough SMs to use max_autotune_gemm mode
Online softmax is disabled on the fly since Inductor decides to
split the reduction. Cut an issue to PyTorch if this is an
important use case and you want to speed it up with online
softmax.



Epoch: 0 | Step: 0/4914 | Global Step: 0 | loss: 11.0790 | norm: 3.6321 | lr: 2.5241730280e-05 | tok/s: 2090.8 | dataset idx: 128/628963
Epoch: 0 | Step: 1/4914 | Global Step: 1 | loss: 10.9091 | norm: 4.0744 | lr: 2.5483460560e-05 | tok/s: 7027.7 | dataset idx: 256/628963
Epoch: 0 | Step: 2/4914 | Global Step: 2 | loss: 10.7732 | norm: 3.5063 | lr: 2.5725190840e-05 | tok/s: 8407.8 | dataset idx: 384/628963
Epoch: 0 | Step: 3/4914 | Global Step: 3 | loss: 10.6216 | norm: 3.6050 | lr: 2.5966921120e-05 | tok/s: 8046.8 | dataset idx: 512/628963
Epoch: 0 | Step: 4/4914 | Global Step: 4 | loss: 10.4507 | norm: 3.8628 | lr: 2.6208651399e-05 | tok/s: 8224.0 | dataset idx: 640/628963
Epoch: 0 | Step: 5/4914 | Global Step: 5 | loss: 10.1795 | norm: 4.2965 | lr: 2.6450381679e-05 | tok/s: 8315.7 | dataset idx: 768/628963
Epoch: 0 | Step: 6/4914 | Global Step: 6 | loss: 10.3109 | norm: 3.2702 | lr: 2.6692111959e-05 | tok/s: 8200.4 | dataset idx: 896/628963
Epoch: 0 | Step: 7/4914 | Global Step: 7 

Online softmax is disabled on the fly since Inductor decides to
split the reduction. Cut an issue to PyTorch if this is an
important use case and you want to speed it up with online
softmax.

100%|██████████| 1000/1000 [00:50<00:00, 19.65it/s]


Evaluation at global step 250: val_loss = 7.1040
Cleaning CUDA cache
Checkpoint saved: /content/drive/MyDrive/LightLM/model_testing/model.checkpoint.epoch0_step250_global250.pt (keeping max 2 checkpoints)
Epoch: 0 | Step: 251/4914 | Global Step: 251 | loss: 6.8651 | norm: 2.0416 | lr: 8.5916030534e-05 | tok/s: 8143.6 | dataset idx: 32256/628963
Epoch: 0 | Step: 252/4914 | Global Step: 252 | loss: 7.1266 | norm: 2.1679 | lr: 8.6157760814e-05 | tok/s: 7298.8 | dataset idx: 32384/628963
Epoch: 0 | Step: 253/4914 | Global Step: 253 | loss: 7.0604 | norm: 1.9215 | lr: 8.6399491094e-05 | tok/s: 6561.5 | dataset idx: 32512/628963
Epoch: 0 | Step: 254/4914 | Global Step: 254 | loss: 6.8443 | norm: 1.9641 | lr: 8.6641221374e-05 | tok/s: 6715.7 | dataset idx: 32640/628963
Epoch: 0 | Step: 255/4914 | Global Step: 255 | loss: 6.9626 | norm: 2.0257 | lr: 8.6882951654e-05 | tok/s: 7841.0 | dataset idx: 32768/628963
Epoch: 0 | Step: 256/4914 | Global Step: 256 | loss: 6.6847 | norm: 1.9657 | lr: 8.71

 58%|█████▊    | 579/1000 [00:24<00:16, 25.11it/s]

Epoch finished.
current_idx: 632121 ,end_idx: 632123, new_idx: 632123, len_dataset: 632123, start_idx: 628963
Starting new epoch...
Current epoch: 1
Loading tokenized dataset from cache: /content/drive/MyDrive/LightLM/cache


 58%|█████▊    | 582/1000 [00:26<01:32,  4.54it/s]

Dataset loaded from cache.


100%|██████████| 1000/1000 [00:43<00:00, 22.87it/s]


Evaluation at global step 500: val_loss = 7.6966
Cleaning CUDA cache
Checkpoint saved: /content/drive/MyDrive/LightLM/model_testing/model.checkpoint.epoch0_step500_global500.pt (keeping max 2 checkpoints)
Epoch: 0 | Step: 501/4914 | Global Step: 501 | loss: 7.8100 | norm: 22.2423 | lr: 1.4634860051e-04 | tok/s: 9301.7 | dataset idx: 64256/628963
Epoch: 0 | Step: 502/4914 | Global Step: 502 | loss: 7.8240 | norm: 19.9852 | lr: 1.4659033079e-04 | tok/s: 6100.5 | dataset idx: 64384/628963
Epoch: 0 | Step: 503/4914 | Global Step: 503 | loss: 7.5533 | norm: 24.4381 | lr: 1.4683206107e-04 | tok/s: 6397.4 | dataset idx: 64512/628963
Epoch: 0 | Step: 504/4914 | Global Step: 504 | loss: 7.9130 | norm: 22.6959 | lr: 1.4707379135e-04 | tok/s: 7818.6 | dataset idx: 64640/628963
Epoch: 0 | Step: 505/4914 | Global Step: 505 | loss: 7.5415 | norm: 19.0359 | lr: 1.4731552163e-04 | tok/s: 8819.2 | dataset idx: 64768/628963
Epoch: 0 | Step: 506/4914 | Global Step: 506 | loss: 7.6967 | norm: 17.0950 | lr

100%|██████████| 1000/1000 [00:43<00:00, 23.19it/s]


Evaluation at global step 750: val_loss = 9.7744
Cleaning CUDA cache
Deleted old checkpoint: model.checkpoint.epoch0_step250_global250.pt
Checkpoint saved: /content/drive/MyDrive/LightLM/model_testing/model.checkpoint.epoch0_step750_global750.pt (keeping max 2 checkpoints)
Epoch: 0 | Step: 751/4914 | Global Step: 751 | loss: 9.9360 | norm: 4.1644 | lr: 2.0678117048e-04 | tok/s: 8848.2 | dataset idx: 96256/628963
Epoch: 0 | Step: 752/4914 | Global Step: 752 | loss: 9.8888 | norm: 4.0264 | lr: 2.0702290076e-04 | tok/s: 5656.4 | dataset idx: 96384/628963
Epoch: 0 | Step: 753/4914 | Global Step: 753 | loss: 9.6770 | norm: 7.0489 | lr: 2.0726463104e-04 | tok/s: 5341.8 | dataset idx: 96512/628963
Epoch: 0 | Step: 754/4914 | Global Step: 754 | loss: 9.9269 | norm: 5.3013 | lr: 2.0750636132e-04 | tok/s: 8032.6 | dataset idx: 96640/628963
Epoch: 0 | Step: 755/4914 | Global Step: 755 | loss: 9.7153 | norm: 5.1699 | lr: 2.0774809160e-04 | tok/s: 8876.0 | dataset idx: 96768/628963
Epoch: 0 | Step:

 16%|█▌        | 159/1000 [00:06<00:38, 22.07it/s]

Epoch finished.
current_idx: 632121 ,end_idx: 632123, new_idx: 632123, len_dataset: 632123, start_idx: 628963
Starting new epoch...
Current epoch: 2
Loading tokenized dataset from cache: /content/drive/MyDrive/LightLM/cache


 16%|█▌        | 162/1000 [00:08<03:12,  4.36it/s]

Dataset loaded from cache.


100%|██████████| 1000/1000 [00:43<00:00, 22.77it/s]


Evaluation at global step 1000: val_loss = 11.9456
Cleaning CUDA cache
Deleted old checkpoint: model.checkpoint.epoch0_step500_global500.pt
Checkpoint saved: /content/drive/MyDrive/LightLM/model_testing/model.checkpoint.epoch0_step1000_global1000.pt (keeping max 2 checkpoints)
Epoch: 0 | Step: 1001/4914 | Global Step: 1001 | loss: 11.9602 | norm: 2.1881 | lr: 2.6721374046e-04 | tok/s: 8793.9 | dataset idx: 128256/628963
Epoch: 0 | Step: 1002/4914 | Global Step: 1002 | loss: 11.9862 | norm: 2.3250 | lr: 2.6745547074e-04 | tok/s: 7555.4 | dataset idx: 128384/628963
Epoch: 0 | Step: 1003/4914 | Global Step: 1003 | loss: 11.4528 | norm: 2.1549 | lr: 2.6769720102e-04 | tok/s: 7117.1 | dataset idx: 128512/628963
Epoch: 0 | Step: 1004/4914 | Global Step: 1004 | loss: 12.4366 | norm: 2.2639 | lr: 2.6793893130e-04 | tok/s: 7022.3 | dataset idx: 128640/628963
Epoch: 0 | Step: 1005/4914 | Global Step: 1005 | loss: 11.6936 | norm: 2.1091 | lr: 2.6818066158e-04 | tok/s: 7338.4 | dataset idx: 128768

 74%|███████▍  | 739/1000 [00:31<00:10, 24.74it/s]

Epoch finished.
current_idx: 632121 ,end_idx: 632123, new_idx: 632123, len_dataset: 632123, start_idx: 628963
Starting new epoch...
Current epoch: 3
Loading tokenized dataset from cache: /content/drive/MyDrive/LightLM/cache


 74%|███████▍  | 742/1000 [00:32<00:51,  5.02it/s]

Dataset loaded from cache.


100%|██████████| 1000/1000 [00:43<00:00, 22.77it/s]


Evaluation at global step 1250: val_loss = 15.9548
Cleaning CUDA cache
Deleted old checkpoint: model.checkpoint.epoch0_step1000_global1000.pt
Checkpoint saved: /content/drive/MyDrive/LightLM/model_testing/model.checkpoint.epoch0_step1250_global1250.pt (keeping max 2 checkpoints)
Epoch: 0 | Step: 1251/4914 | Global Step: 1251 | loss: 16.0705 | norm: 2.8390 | lr: 3.2764631043e-04 | tok/s: 8683.4 | dataset idx: 160256/628963
Epoch: 0 | Step: 1252/4914 | Global Step: 1252 | loss: 16.3883 | norm: 2.6701 | lr: 3.2788804071e-04 | tok/s: 7559.1 | dataset idx: 160384/628963
Epoch: 0 | Step: 1253/4914 | Global Step: 1253 | loss: 16.2076 | norm: 3.2271 | lr: 3.2812977099e-04 | tok/s: 7765.5 | dataset idx: 160512/628963
Epoch: 0 | Step: 1254/4914 | Global Step: 1254 | loss: 15.9050 | norm: 2.6266 | lr: 3.2837150127e-04 | tok/s: 7251.7 | dataset idx: 160640/628963
Epoch: 0 | Step: 1255/4914 | Global Step: 1255 | loss: 16.1407 | norm: 2.5187 | lr: 3.2861323155e-04 | tok/s: 7985.8 | dataset idx: 1607

In [None]:

# Release the Colab runtime once the cells above have finished.
# NOTE(review): presumably this disconnects the session and discards the VM's
# local state, so only files written to Google Drive remain — confirm against
# the Colab documentation before relying on it.
from google.colab import runtime
runtime.unassign()