In [1]:
from google.colab import drive
import os
import sys

# Mount Google Drive inside the Colab VM so files stored there become
# reachable through the local filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install transformers>=4.21.0 --quiet
!pip install torch torchvision torchaudio --quiet
!pip install tqdm --quiet
!pip install datasets --quiet
!pip install datatrove --quiet

try:
    !pip install flash-attn --no-build-isolation --quiet
    print("Flash Attention がインストールされました")
except:
    print("Flash Attention のインストールに失敗 - 標準Attentionを使用します")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m115.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
Flash Attention がインストールされました


In [3]:
# The project's modules live on Google Drive: make the LightLM folder the
# working directory and give it top priority on the import path.
lightlm_path = '/content/drive/MyDrive/LightLM'

os.chdir(lightlm_path)
print(f"作業ディレクトリ: {os.getcwd()}")

# Only prepend once, so re-running this cell does not stack duplicate entries.
already_on_path = lightlm_path in sys.path
if not already_on_path:
    sys.path[:0] = [lightlm_path]

作業ディレクトリ: /content/drive/MyDrive/LightLM


In [4]:
# Project modules (model / trainer). Presumably resolved from the LightLM
# directory that an earlier cell puts on sys.path -- run that cell first.
from model import Transformer, ModelConfig
from trainer import Trainer, TrainerConfig, DataLoader

# Third-party and standard-library dependencies used by the notebook.
from transformers import AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm
import os
import time
import json

# Confirmation message (Japanese: "all imports completed").
print("全てのインポート完了")

全てのインポート完了


In [5]:
# Let PyTorch use faster reduced-precision internals for float32 matmuls,
# and hand any cached CUDA memory back before the model is built.
torch.set_float32_matmul_precision('high')
torch.cuda.empty_cache()

# Tokenizer comes from the SmolLM-360M checkpoint; the end-of-sequence token
# is reused as the padding token.
tokenizer_id = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.pad_token = tokenizer.eos_token

# Train on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

In [6]:
print("学習・モデル設定を構成中...")

# Values shared between the trainer and the model configuration.
sequence_length = 512
hidden_size = 512

# float16 on GPU because the T4 does not support bfloat16; float32 on CPU.
training_dtype = "float32" if device != 'cuda' else "float16"

train_config = TrainerConfig(
    # --- general run options ---
    vocab_size=tokenizer.vocab_size,
    num_epochs=4,
    use_ddp=False,
    use_moe=True,
    use_lossfreebalance=False,
    clean_cuda_cache=True,
    use_compile=True,
    use_dtype=training_dtype,

    # --- batching: 2 sequences x 64 accumulation steps x 512 tokens ---
    seed=42,
    max_seq_len=sequence_length,
    batch_size=2,
    accumulation_steps=64,

    # --- optimisation ---
    learning_rate=5e-4,
    betas=(0.90, 0.95),
    weight_decay=0.1,
    warmup_ratio=0.1,
    update_rate=1e-5,

    # --- evaluation ---
    val_ratio=0.005,
    steps_for_eval=1000,
    eval_interval=250,
    eval_log_file="/content/drive/MyDrive/LightLM/log/eval-small.txt",

    # --- checkpointing ---
    checkpoints_frequency=250,
    path_to_checkpoints="/content/drive/MyDrive/LightLM/model_testing-small",
    # 0 keeps every checkpoint, -1 keeps only the newest one. On Colab the
    # Drive trash means old weights still eat storage whichever value is used.
    max_checkpoints_to_keep=0,

    # --- dataset ---
    tokenized_dataset_path="HuggingFaceFW/fineweb-edu",
    # Alternatives used previously for sub_target_files:
    #   ""                                        -> all data
    #   "data/CC-MAIN-2025-26/*.parquet"          -> one whole dump
    #   "data/CC-MAIN-2025-26/000_00049.parquet"  -> a single shard
    #   (shard 000_00047 is currently left out of the list below)
    sub_target_files=[
        "data/CC-MAIN-2025-26/000_00048.parquet",
        "data/CC-MAIN-2025-26/000_00049.parquet",
    ],

    # --- resuming (inactive while continue_train is False) ---
    continue_train=False,
    checkpoint_path='model_testing/model.checkpoint.epoch0_step16000_global16000.pt',
)

# Model size is kept deliberately small (in the spirit of GPT-2) because of the
# tiny storage quota of free Google Drive and the limited compute available.
config = ModelConfig(
    vocab_size=tokenizer.vocab_size,

    # --- transformer dimensions ---
    num_dims=hidden_size,
    num_heads=16,
    num_kv_heads=4,  # fewer KV heads than query heads (GQA) for efficiency
    num_layers=12,
    ffn_hidden_dims=hidden_size * 4,
    context_len=sequence_length,

    # --- normalisation / positional encoding ---
    rmsnorm_eps=1e-6,
    rope_theta=1e5,

    # --- runtime switches ---
    use_cache=False,
    use_flash=True,  # when available
    use_moe=True,    # simple configuration

    # --- mixture of experts ---
    moe_num_experts=3,  # one fewer than before
    moe_active_experts=1,
    moe_shared_experts=1,
    moe_eps=1e-6,
    moe_aux_loss_coef=0.01,
    use_lossfreebalance=False,
)

学習・モデル設定を構成中...


In [7]:
# Make sure the checkpoint and log directories exist before training starts.
for required_dir in (
    "/content/drive/MyDrive/LightLM/model_testing-small",
    "/content/drive/MyDrive/LightLM/log",
):
    os.makedirs(required_dir, exist_ok=True)


# Build the model from the configuration prepared above.
model = Transformer(config)

# Count parameters in a single pass: every parameter contributes to the total,
# and only those with requires_grad set contribute to the trainable count.
total_params = 0
trainable_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count

print(f"実際のパラメータ数: {total_params:,} ({total_params/1e6:.1f}M)")
print(f"学習可能パラメータ: {trainable_params:,}")

実際のパラメータ数: 184,056,320 (184.1M)
学習可能パラメータ: 184,056,320


In [8]:
# Data loader: reads the tokenized dataset, reusing the on-Drive cache
# directory instead of re-tokenizing when a cache is present.
token_cache_dir = "/content/drive/MyDrive/LightLM/cache-small"
data_loader = DataLoader(
    train_config,
    tokenizer=tokenizer,
    hf_split="train",
    cache=token_cache_dir,
    use_cache=True,
)

# Trainer: binds the training configuration to the model and tokenizer.
trainer = Trainer(train_config, model, tokenizer)

Initializing DataLoader...
Loading tokenized dataset from cache: /content/drive/MyDrive/LightLM/cache-small
Dataset loaded from cache.
Total tokens loaded:  215,939,584
DataLoader initialized. Dataset size: 421757, Train size: 419649, Val size: 2108
Train indices: 0 to 419649, Val indices: 419649 to 421757
Device: cuda:0
Model's trainable params: 184.06M
Tokens per step: 65536
use torch.compile(): True
Use GradScaler: Yes (dtype: float16)
Use MoE: Yes 
Number of experts: 3
Number of used experts during inference: 1
Method of aux_loss: default
Number of parameters will be used during inference: 108.55M


In [None]:
# Start the training run, feeding the trainer from the data loader built above.
# Long-running: the logged schedule is 4 epochs x 3279 steps per epoch.
trainer.train(data_loader)

Calculating number of training steps...
Preparing for training...
Calculating number of training steps...
data_loader.num_train_steps(): 209825, accumulation_steps: 64
DDP: False 
Use MoE: True
Use loss-free balancing: False
Batch size per GPU: 2
Max sequence length: 512
Gradient accumulation steps: 64
Effective batch size: 128
Total tokens per step: 65536
Total tokens per epoch: 214892544
Number of steps per epoch: 3279
Starting training...
Total epochs: 4, steps per epoch: 3279, total steps: 13116


W0922 16:22:48.225000 248 torch/_inductor/utils.py:1436] [2/0_1] Not enough SMs to use max_autotune_gemm mode
Online softmax is disabled on the fly since Inductor decides to
split the reduction. Cut an issue to PyTorch if this is an
important use case and you want to speed it up with online
softmax.



Epoch: 0 | Step: 0/3279 | Global Step: 0 | loss: 11.1721 | norm: 3.9628 | lr: 2.5362318841e-05 | tok/s: 2061.0 | dataset idx: 128/419649
Epoch: 0 | Step: 1/3279 | Global Step: 1 | loss: 10.9384 | norm: 4.3347 | lr: 2.5724637681e-05 | tok/s: 6912.2 | dataset idx: 256/419649
Epoch: 0 | Step: 2/3279 | Global Step: 2 | loss: 10.8035 | norm: 3.7924 | lr: 2.6086956522e-05 | tok/s: 8573.3 | dataset idx: 384/419649
Epoch: 0 | Step: 3/3279 | Global Step: 3 | loss: 10.5687 | norm: 4.4411 | lr: 2.6449275362e-05 | tok/s: 8157.7 | dataset idx: 512/419649
Epoch: 0 | Step: 4/3279 | Global Step: 4 | loss: 10.5330 | norm: 3.9896 | lr: 2.6811594203e-05 | tok/s: 8378.7 | dataset idx: 640/419649
Epoch: 0 | Step: 5/3279 | Global Step: 5 | loss: 10.2340 | norm: 4.4276 | lr: 2.7173913043e-05 | tok/s: 7053.6 | dataset idx: 768/419649
Epoch: 0 | Step: 6/3279 | Global Step: 6 | loss: 10.1171 | norm: 4.2185 | lr: 2.7536231884e-05 | tok/s: 6058.4 | dataset idx: 896/419649
Epoch: 0 | Step: 7/3279 | Global Step: 7 

In [None]:

# Release the Colab runtime once the preceding cells have finished,
# presumably so the session does not keep holding the GPU after training.
from google.colab import runtime
runtime.unassign()