In [None]:
#%env TORCH_FABRIC_DISABLE_ATOMIC_SAVE=1
#%env PL_DISABLE_FSSPEC=1

In [None]:
# Delete any existing /content/bd3lms checkout (recursively, ignoring a missing
# directory) so the `git clone` cell below does not collide with an old tree.
!rm -rf /content/bd3lms

In [1]:
# Mount Google Drive at /content/drive (Colab-only API). Later cells read and
# write run directories under /content/drive/MyDrive/bd3lms_storage_final.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Clone the bd3lms fork into /content/bd3lms.
# NOTE(review): no branch or commit is pinned here, while the patch cells below
# match exact source text - a change in the repository can stop them applying.
!cd /content && git clone https://github.com/ntua-el21050/bd3lms.git


Cloning into 'bd3lms'...
remote: Enumerating objects: 753, done.[K
remote: Counting objects: 100% (214/214), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 753 (delta 192), reused 168 (delta 168), pack-reused 539 (from 2)[K
Receiving objects: 100% (753/753), 1.12 MiB | 18.80 MiB/s, done.
Resolving deltas: 100% (488/488), done.


In [4]:
# Apply fix to diffusion.py: Override sampling_eps from config when resuming checkpoint
#
# The cell rewrites the text of `on_load_checkpoint` in the cloned repository.
# The replacement keeps the original body and adds one step: when `self.var_min`
# is truthy and the checkpoint's state_dict contains 'sampling_eps_min', the
# 'sampling_eps_min' / 'sampling_eps_max' state_dict entries are overwritten
# with tensors built from `config.training.sampling_eps_min/max`.
#
# Outcomes:
#   * original text found  -> file is rewritten, success message printed
#   * patch marker found   -> nothing is written (safe to re-run the cell)
#   * neither found        -> RuntimeError, so later cells cannot silently
#                             train against an unpatched diffusion.py
import os

diffusion_file = '/content/bd3lms/diffusion.py'

# Explicit UTF-8: the replacement text below contains a non-ASCII character.
with open(diffusion_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Exact text of the method as it appears in the repository (2-space indent).
old_code = '''  def on_load_checkpoint(self, checkpoint):
    print('Loading checkpoint at', checkpoint['global_step'])
    self._restarting_skip_val_flag = True

    # for models compiled with `torch.compile`
    if '_orig_mod.' in list(checkpoint['state_dict'].keys())[0]:
      checkpoint = self._replace_ckpt_keys(checkpoint)

    if self.ema:
      self.ema.load_state_dict(checkpoint['ema'])
    if 'sampling_eps_min' in checkpoint.keys():
      self.sampling_eps_min = checkpoint['sampling_eps_min']
      self.sampling_eps_max = checkpoint['sampling_eps_max']
    # Copied from:
    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
    self.fast_forward_epochs = checkpoint['loops'][
      'fit_loop']['epoch_progress']['current']['completed']
    self.fast_forward_batches = checkpoint['loops'][
      'fit_loop']['epoch_loop.batch_progress'][
        'current']['completed']'''

# Same method with the state_dict override inserted before the fast-forward
# bookkeeping.
new_code = '''  def on_load_checkpoint(self, checkpoint):
    print('Loading checkpoint at', checkpoint['global_step'])
    self._restarting_skip_val_flag = True

    # for models compiled with `torch.compile`
    if '_orig_mod.' in list(checkpoint['state_dict'].keys())[0]:
      checkpoint = self._replace_ckpt_keys(checkpoint)

    if self.ema:
      self.ema.load_state_dict(checkpoint['ema'])
    if 'sampling_eps_min' in checkpoint.keys():
      self.sampling_eps_min = checkpoint['sampling_eps_min']
      self.sampling_eps_max = checkpoint['sampling_eps_max']

    # Override sampling_eps in the checkpoint state_dict BEFORE Lightning loads it
    # This is the only reliable way to change buffer values when resuming
    if self.var_min and 'sampling_eps_min' in checkpoint['state_dict']:
      checkpoint['state_dict']['sampling_eps_min'] = torch.tensor(
        self.config.training.sampling_eps_min)
      checkpoint['state_dict']['sampling_eps_max'] = torch.tensor(
        self.config.training.sampling_eps_max)
      print(f'✓ Overriding sampling_eps in checkpoint before load: '
            f'min={self.config.training.sampling_eps_min}, '
            f'max={self.config.training.sampling_eps_max}')

    # Copied from:
    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
    self.fast_forward_epochs = checkpoint['loops'][
      'fit_loop']['epoch_progress']['current']['completed']
    self.fast_forward_batches = checkpoint['loops'][
      'fit_loop']['epoch_loop.batch_progress'][
        'current']['completed']'''

if old_code in content:
    content = content.replace(old_code, new_code)
    with open(diffusion_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print('✅ Successfully patched diffusion.py - sampling_eps override applied!')
elif 'Overriding sampling_eps in checkpoint before load' in content:
    # The marker string only exists in the replacement text, so its presence
    # means a previous run of this cell already rewrote the file.
    print('✅ Patch already exists in diffusion.py!')
else:
    # Fail loudly: the fine-tuning cells rely on this override being in place.
    print('⚠️ Could not find the code to patch. It may already be patched or the format differs.')
    raise RuntimeError(
        '❌ Patch not found and could not be applied. '
        f'{diffusion_file} does not contain the expected on_load_checkpoint source.')

✅ Successfully patched diffusion.py - sampling_eps override applied!


In [5]:
# Patch main.py to disable TQDMProgressBar when resuming from checkpoint
#
# The cell rewrites the trainer instantiation in the cloned main.py so that
# `enable_progress_bar` is passed to the trainer and is False whenever
# `config.checkpointing.resume_ckpt_path` is truthy.
#
# Outcomes:
#   * original text found  -> file is rewritten, success message printed
#   * patch marker found   -> nothing is written (safe to re-run the cell)
#   * neither found        -> RuntimeError instead of a printed message, so a
#                             failed patch cannot go unnoticed
import os

main_file = '/content/bd3lms/main.py'

with open(main_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Exact text of the trainer instantiation as it appears in the repository.
old_code = '''  trainer = hydra.utils.instantiate(
    config.trainer,
    default_root_dir=os.getcwd(),
    callbacks=callbacks,
    strategy=hydra.utils.instantiate(config.strategy),
    logger=wandb_logger)'''

# Same call with the progress bar switched off when a resume path is set.
new_code = '''  # Disable TQDMProgressBar when resuming from checkpoint (Lightning bug workaround)
  enable_pb = not config.checkpointing.resume_ckpt_path

  trainer = hydra.utils.instantiate(
    config.trainer,
    default_root_dir=os.getcwd(),
    callbacks=callbacks,
    strategy=hydra.utils.instantiate(config.strategy),
    logger=wandb_logger,
    enable_progress_bar=enable_pb)'''

if old_code in content:
    content = content.replace(old_code, new_code)
    with open(main_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print('✅ Successfully patched main.py - TQDMProgressBar disabled for checkpoint resumption!')
elif 'Disable TQDMProgressBar when resuming from checkpoint' in content:
    # The marker comment only exists in the replacement text, so the file was
    # already rewritten by an earlier run of this cell.
    print('✅ Patch already exists in main.py!')
else:
    print('⚠️ Could not find the code to patch. It may already be patched or the format differs.')
    raise RuntimeError(
        '❌ Patch not found and could not be applied. '
        f'{main_file} does not contain the expected trainer instantiation.')


✅ Successfully patched main.py - TQDMProgressBar disabled for checkpoint resumption!


In [6]:
# Patch utils.py to fix checkpoint path detection (fsspec issue with Google Drive)
#
# The cell rewrites `fsspec_exists` in the cloned utils.py. The replacement
# still asks fsspec first, but falls back to `os.path.exists` when fsspec
# reports the path as missing or raises.
#
# Outcomes:
#   * original text found  -> file is rewritten, success message printed
#   * patch marker found   -> nothing is written (safe to re-run the cell)
#   * neither found        -> RuntimeError instead of a printed message, so a
#                             failed patch cannot go unnoticed
import os

utils_file = '/content/bd3lms/utils.py'

with open(utils_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Exact text of the helper as it appears in the repository.
old_code = '''def fsspec_exists(filename):
  """Check if a file exists using fsspec."""
  fs, _ = fsspec.core.url_to_fs(filename)
  return fs.exists(filename)'''

# Same helper with an os.path.exists fallback.
new_code = '''def fsspec_exists(filename):
  """Check if a file exists using fsspec."""
  try:
    fs, _ = fsspec.core.url_to_fs(filename)
    exists = fs.exists(filename)
    if not exists:
      # Fallback to os.path.exists for local paths (Google Drive in Colab)
      import os
      exists = os.path.exists(filename)
    return exists
  except Exception as e:
    # If fsspec fails, try standard os.path.exists
    import os
    return os.path.exists(filename)'''

if old_code in content:
    content = content.replace(old_code, new_code)
    with open(utils_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print('✅ Successfully patched utils.py - checkpoint detection fixed!')
elif 'Fallback to os.path.exists' in content:
    # The marker comment only exists in the replacement text, so the file was
    # already rewritten by an earlier run of this cell.
    print('✅ Patch already exists in utils.py!')
else:
    print('⚠️ Could not find the code to patch. It may already be patched.')
    raise RuntimeError(
        '❌ Patch not found and could not be applied. '
        f'{utils_file} does not contain the expected fsspec_exists source.')


✅ Successfully patched utils.py - checkpoint detection fixed!


In [7]:
# Install the Python dependencies for the training/eval runs, quietly (-q).
# Every package is pinned to an exact version except `wandb`.
# NOTE(review): leaving `wandb` unpinned means a fresh session may resolve a
# different version than the one used for the recorded outputs.
!pip install -q \
    torchmetrics==1.6.2 \
    datasets==3.3.2 \
    einops==0.8.1 \
    fsspec==2024.2.0 \
    hydra-core==1.3.2 \
    lightning==2.5.0.post0 \
    omegaconf==2.3.0 \
    packaging==23.2 \
    pandas==2.2.1 \
    rich==13.7.1 \
    scikit-learn==1.5.1 \
    timm==0.9.16 \
    transformers==4.49.0 \
    matplotlib==3.10.0 \
    wandb


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m931.6/931.6 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.9/170.9 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
# Put /content at the end of the kernel's module search path.
# NOTE(review): this only changes the notebook kernel's own `sys.path`; the
# `!python main.py` cells run in separate processes.
import sys
sys.path.insert(len(sys.path), "/content")


In [None]:
# Train base model
# Runs main.py in train mode from inside the cloned repo: `tiny` model, bd3lm
# algorithm, lm1b_wrap data, sequence length 128 and block_size 128, on a single
# device with batch size 4. The run is capped at 500 steps and uses 500 train /
# 100 validation / 100 test samples, with the sampling_eps range set to
# [0.0, 1.0]. The fine-tuning cells below resume from this run's last.ckpt.
!cd /content/bd3lms && python main.py \
  mode=train \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=128 \
  trainer.devices=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  trainer.max_steps=500 \
  data.max_train_samples=500 \
  data.max_valid_samples=100 \
  data.max_test_samples=100 \
  training.nll_diagram=False \
  algo.fix_clipping=False \
  training.sampling_eps_min=0.0 \
  training.sampling_eps_max=1.0

Seed set to 1
[2mCONFIG[0m
[2m├── [0m[2mmode[0m
[2m│   [0m[2m└── [0m[2;40mtrain                                                                   [0m
[2m├── [0m[2mdiffusion[0m
[2m│   [0m[2m└── [0m[2;40mabsorbing_state                                                         [0m
[2m├── [0m[2mseed[0m
[2m│   [0m[2m└── [0m[2;40m1                                                                       [0m
[2m├── [0m[2mblock_size[0m
[2m│   [0m[2m└── [0m[2;40m128                                                                     [0m
[2m├── [0m[2mdata[0m
[2m│   [0m[2m└── [0m[2;91;40mmax_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                       [0m
[2m│   [0m[2m    [0m[2;91;40mmax_train_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m500                                                  [0m
[2m│   [0m[2m    [0m[2;91;40mmax_valid_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m100        

In [None]:
# Sync to drive
# Creates the Drive storage folder if needed, then copies the local
# outputs/lm1b tree (run configs, logs and checkpoints) into it.
# rsync flags: -a = archive mode (recursive, preserves attributes), -v = verbose.
# No trailing slash on the source, so the `lm1b` directory itself is created
# inside the destination.
!mkdir -p /content/drive/MyDrive/bd3lms_storage_final
!rsync -av \
  /content/bd3lms/outputs/lm1b \
  /content/drive/MyDrive/bd3lms_storage_final


sending incremental file list
lm1b/
lm1b/2026.01.05/
lm1b/2026.01.05/022135/
lm1b/2026.01.05/022135/config_tree.txt
lm1b/2026.01.05/022135/main.log
lm1b/2026.01.05/022135/.hydra/
lm1b/2026.01.05/022135/.hydra/config.yaml
lm1b/2026.01.05/022135/.hydra/hydra.yaml
lm1b/2026.01.05/022135/.hydra/overrides.yaml
lm1b/2026.01.05/022135/checkpoints/
lm1b/2026.01.05/022135/checkpoints/16-500.ckpt
lm1b/2026.01.05/022135/checkpoints/best.ckpt
lm1b/2026.01.05/022135/checkpoints/last.ckpt

sent 1,098,513,284 bytes  received 196 bytes  169,002,073.85 bytes/sec
total size is 1,098,244,406  speedup is 1.00


In [None]:
# Fine-tune block size 128, clipping
# Resumes from the base run's last.ckpt on Drive (run 022135) with
# trainer.max_steps raised from 500 to 600 and block_size kept at 128.
# "Clipping" here is the narrowed sampling_eps range [0, 0.5]; the matching
# "no clipping" cell uses [0, 1].
# `2>&1 | grep -v ...` merges stderr into stdout and hides log lines that
# mention TQDMProgressBar or val_progress_bar.
# NOTE(review): because of the pipe, the shell status of this cell is grep's,
# not python's - a failed run is only visible in the printed log.
!cd /content/bd3lms && python main.py \
  mode=train \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=128 \
  trainer.devices=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  trainer.max_steps=600 \
  data.max_train_samples=500 \
  data.max_valid_samples=100 \
  data.max_test_samples=100 \
  training.nll_diagram=False \
  algo.fix_clipping=False \
  training.sampling_eps_min=0 \
  training.sampling_eps_max=0.5 \
  checkpointing.resume_ckpt_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/022135/checkpoints/last.ckpt 2>&1 | grep -v "TQDMProgressBar\|val_progress_bar"

Seed set to 1
CONFIG
├── mode
│   └── train                                                                   
├── diffusion
│   └── absorbing_state                                                         
├── seed
│   └── 1                                                                       
├── block_size
│   └── 128                                                                     
├── data
│   └── max_samples: null                                                       
│       max_train_samples: 500                                                  
│       max_valid_samples: 100                                                  
│       max_test_samples: 100                                                   
│       train: lm1b                                                             
│       valid: lm1b                                                             
│       tokenizer_name_or_path: bert-base-uncased                               
│       cache_dir: /share/kulesh

In [9]:
#eval_bd3lm_best block size 128, clipping
# Runs main.py in ppl_eval mode (block_size 128) on best.ckpt of run 024407,
# using 100 validation / 100 test samples.
# NOTE(review): 024407 is presumably the run written by the block-size-128
# "clipping" fine-tune cell - confirm the timestamp. The checkpoint is read
# from Drive, so a "Sync to drive" cell must have copied it there first; in
# notebook order the next sync cell appears after this one.

!cd /content/bd3lms && python main.py \
  mode=ppl_eval \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=128 \
  trainer.devices=1 \
  trainer.num_nodes=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  eval.checkpoint_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/024407/checkpoints/best.ckpt \
  data.max_valid_samples=100 \
  data.max_test_samples=100

Seed set to 1
[2mCONFIG[0m
[2m├── [0m[2mmode[0m
[2m│   [0m[2m└── [0m[2;40mppl_eval                                                                [0m
[2m├── [0m[2mdiffusion[0m
[2m│   [0m[2m└── [0m[2;40mabsorbing_state                                                         [0m
[2m├── [0m[2mseed[0m
[2m│   [0m[2m└── [0m[2;40m1                                                                       [0m
[2m├── [0m[2mblock_size[0m
[2m│   [0m[2m└── [0m[2;40m128                                                                     [0m
[2m├── [0m[2mdata[0m
[2m│   [0m[2m└── [0m[2;91;40mmax_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                       [0m
[2m│   [0m[2m    [0m[2;91;40mmax_train_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                 [0m
[2m│   [0m[2m    [0m[2;91;40mmax_valid_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m100        

In [None]:
# Fine-tune block size 128, no clipping
# Resumes from the base run's last.ckpt on Drive (run 022135) with
# trainer.max_steps raised from 500 to 600 and block_size kept at 128.
# "No clipping" here is the full sampling_eps range [0, 1].
# `2>&1 | grep -v ...` merges stderr into stdout and hides log lines that
# mention TQDMProgressBar or val_progress_bar.
# NOTE(review): because of the pipe, the shell status of this cell is grep's,
# not python's - a failed run is only visible in the printed log.
!cd /content/bd3lms && python main.py \
  mode=train \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=128 \
  trainer.devices=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  trainer.max_steps=600 \
  data.max_train_samples=500 \
  data.max_valid_samples=100 \
  data.max_test_samples=100 \
  training.nll_diagram=False \
  algo.fix_clipping=False \
  training.sampling_eps_min=0 \
  training.sampling_eps_max=1 \
  checkpointing.resume_ckpt_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/022135/checkpoints/last.ckpt 2>&1 | grep -v "TQDMProgressBar\|val_progress_bar"

Seed set to 1
CONFIG
├── mode
│   └── train                                                                   
├── diffusion
│   └── absorbing_state                                                         
├── seed
│   └── 1                                                                       
├── block_size
│   └── 128                                                                     
├── data
│   └── max_samples: null                                                       
│       max_train_samples: 500                                                  
│       max_valid_samples: 100                                                  
│       max_test_samples: 100                                                   
│       train: lm1b                                                             
│       valid: lm1b                                                             
│       tokenizer_name_or_path: bert-base-uncased                               
│       cache_dir: /share/kulesh

In [2]:
# List the contents of run directory 025009 on Drive (presumably a manual check
# that the run and its checkpoints folder were synced).
!ls /content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/025009/

checkpoints  config_tree.txt  main.log


In [10]:
#eval_bd3lm_best block size 128, no clipping
# Runs main.py in ppl_eval mode (block_size 128) on best.ckpt of run 025009,
# using 100 validation / 100 test samples.
# NOTE(review): 025009 is presumably the run written by the block-size-128
# "no clipping" fine-tune cell - confirm the timestamp. The checkpoint is read
# from Drive, so a "Sync to drive" cell must have copied it there first; in
# notebook order the next sync cell appears after this one.

!cd /content/bd3lms && python main.py \
  mode=ppl_eval \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=128 \
  trainer.devices=1 \
  trainer.num_nodes=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  eval.checkpoint_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/025009/checkpoints/best.ckpt \
  data.max_valid_samples=100 \
  data.max_test_samples=100

Seed set to 1
[2mCONFIG[0m
[2m├── [0m[2mmode[0m
[2m│   [0m[2m└── [0m[2;40mppl_eval                                                                [0m
[2m├── [0m[2mdiffusion[0m
[2m│   [0m[2m└── [0m[2;40mabsorbing_state                                                         [0m
[2m├── [0m[2mseed[0m
[2m│   [0m[2m└── [0m[2;40m1                                                                       [0m
[2m├── [0m[2mblock_size[0m
[2m│   [0m[2m└── [0m[2;40m128                                                                     [0m
[2m├── [0m[2mdata[0m
[2m│   [0m[2m└── [0m[2;91;40mmax_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                       [0m
[2m│   [0m[2m    [0m[2;91;40mmax_train_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                 [0m
[2m│   [0m[2m    [0m[2;91;40mmax_valid_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m100        

In [None]:
# Sync to drive
# Creates the Drive storage folder if needed, then copies the local
# outputs/lm1b tree (run configs, logs and checkpoints) into it.
# rsync flags: -a = archive mode (recursive, preserves attributes), -v = verbose.
# No trailing slash on the source, so the `lm1b` directory itself is created
# inside the destination.
!mkdir -p /content/drive/MyDrive/bd3lms_storage_final
!rsync -av \
  /content/bd3lms/outputs/lm1b \
  /content/drive/MyDrive/bd3lms_storage_final

sending incremental file list
lm1b/
lm1b/2026.01.05/
lm1b/2026.01.05/022135/
lm1b/2026.01.05/022135/.hydra/
lm1b/2026.01.05/022135/checkpoints/
lm1b/2026.01.05/024407/
lm1b/2026.01.05/024407/config_tree.txt
lm1b/2026.01.05/024407/main.log
lm1b/2026.01.05/024407/.hydra/
lm1b/2026.01.05/024407/.hydra/config.yaml
lm1b/2026.01.05/024407/.hydra/hydra.yaml
lm1b/2026.01.05/024407/.hydra/overrides.yaml
lm1b/2026.01.05/024407/checkpoints/
lm1b/2026.01.05/024407/checkpoints/best.ckpt
lm1b/2026.01.05/024407/checkpoints/last.ckpt
lm1b/2026.01.05/025009/
lm1b/2026.01.05/025009/config_tree.txt
lm1b/2026.01.05/025009/main.log
lm1b/2026.01.05/025009/.hydra/
lm1b/2026.01.05/025009/.hydra/config.yaml
lm1b/2026.01.05/025009/.hydra/hydra.yaml
lm1b/2026.01.05/025009/.hydra/overrides.yaml
lm1b/2026.01.05/025009/checkpoints/
lm1b/2026.01.05/025009/checkpoints/best.ckpt
lm1b/2026.01.05/025009/checkpoints/last.ckpt

sent 1,464,700,511 bytes  received 357 bytes  108,496,360.59 bytes/sec
total size is 2,562,585,

In [None]:
# Fine-tune block size 16, clipping
# Resumes from the base run's last.ckpt on Drive (run 022135, trained with
# block_size=128) with trainer.max_steps raised from 500 to 600 and
# block_size switched to 16.
# "Clipping" here is the narrowed sampling_eps range [0.3, 0.8]; the matching
# "no clipping" cell uses [0, 1].
# `2>&1 | grep -v ...` merges stderr into stdout and hides log lines that
# mention TQDMProgressBar or val_progress_bar.
# NOTE(review): because of the pipe, the shell status of this cell is grep's,
# not python's - a failed run is only visible in the printed log.
!cd /content/bd3lms && python main.py \
  mode=train \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=16 \
  trainer.devices=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  trainer.max_steps=600 \
  data.max_train_samples=500 \
  data.max_valid_samples=100 \
  data.max_test_samples=100 \
  training.nll_diagram=False \
  algo.fix_clipping=False \
  training.sampling_eps_min=0.3 \
  training.sampling_eps_max=0.8 \
  checkpointing.resume_ckpt_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/022135/checkpoints/last.ckpt 2>&1 | grep -v "TQDMProgressBar\|val_progress_bar"

Seed set to 1
CONFIG
├── mode
│   └── train                                                                   
├── diffusion
│   └── absorbing_state                                                         
├── seed
│   └── 1                                                                       
├── block_size
│   └── 16                                                                      
├── data
│   └── max_samples: null                                                       
│       max_train_samples: 500                                                  
│       max_valid_samples: 100                                                  
│       max_test_samples: 100                                                   
│       train: lm1b                                                             
│       valid: lm1b                                                             
│       tokenizer_name_or_path: bert-base-uncased                               
│       cache_dir: /share/kulesh

In [11]:
#eval_bd3lm_best block size 16, clipping
# Runs main.py in ppl_eval mode (block_size 16) on best.ckpt of run 025849,
# using 100 validation / 100 test samples.
# NOTE(review): 025849 is presumably the run written by the block-size-16
# "clipping" fine-tune cell - confirm the timestamp. The checkpoint is read
# from Drive, so a "Sync to drive" cell must have copied it there first; in
# notebook order the next sync cell appears after this one.

!cd /content/bd3lms && python main.py \
  mode=ppl_eval \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=16 \
  trainer.devices=1 \
  trainer.num_nodes=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  eval.checkpoint_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/025849/checkpoints/best.ckpt \
  data.max_valid_samples=100 \
  data.max_test_samples=100

Seed set to 1
[2mCONFIG[0m
[2m├── [0m[2mmode[0m
[2m│   [0m[2m└── [0m[2;40mppl_eval                                                                [0m
[2m├── [0m[2mdiffusion[0m
[2m│   [0m[2m└── [0m[2;40mabsorbing_state                                                         [0m
[2m├── [0m[2mseed[0m
[2m│   [0m[2m└── [0m[2;40m1                                                                       [0m
[2m├── [0m[2mblock_size[0m
[2m│   [0m[2m└── [0m[2;40m16                                                                      [0m
[2m├── [0m[2mdata[0m
[2m│   [0m[2m└── [0m[2;91;40mmax_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                       [0m
[2m│   [0m[2m    [0m[2;91;40mmax_train_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                 [0m
[2m│   [0m[2m    [0m[2;91;40mmax_valid_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m100        

In [None]:
# Fine-tune block size 16, no clipping
# Resumes from the base run's last.ckpt on Drive (run 022135, trained with
# block_size=128) with trainer.max_steps raised from 500 to 600 and
# block_size switched to 16.
# "No clipping" here is the full sampling_eps range [0, 1].
# `2>&1 | grep -v ...` merges stderr into stdout and hides log lines that
# mention TQDMProgressBar or val_progress_bar.
# NOTE(review): because of the pipe, the shell status of this cell is grep's,
# not python's - a failed run is only visible in the printed log.
!cd /content/bd3lms && python main.py \
  mode=train \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=16 \
  trainer.devices=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  trainer.max_steps=600 \
  data.max_train_samples=500 \
  data.max_valid_samples=100 \
  data.max_test_samples=100 \
  training.nll_diagram=False \
  algo.fix_clipping=False \
  training.sampling_eps_min=0 \
  training.sampling_eps_max=1 \
  checkpointing.resume_ckpt_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/022135/checkpoints/last.ckpt 2>&1 | grep -v "TQDMProgressBar\|val_progress_bar"

Seed set to 1
CONFIG
├── mode
│   └── train                                                                   
├── diffusion
│   └── absorbing_state                                                         
├── seed
│   └── 1                                                                       
├── block_size
│   └── 16                                                                      
├── data
│   └── max_samples: null                                                       
│       max_train_samples: 500                                                  
│       max_valid_samples: 100                                                  
│       max_test_samples: 100                                                   
│       train: lm1b                                                             
│       valid: lm1b                                                             
│       tokenizer_name_or_path: bert-base-uncased                               
│       cache_dir: /share/kulesh

In [12]:
#eval_bd3lm_best block size 16, no clipping
# Runs main.py in ppl_eval mode (block_size 16) on best.ckpt of run 030413,
# using 100 validation / 100 test samples.
# NOTE(review): 030413 is presumably the run written by the block-size-16
# "no clipping" fine-tune cell - confirm the timestamp. The checkpoint is read
# from Drive, so a "Sync to drive" cell must have copied it there first; in
# notebook order the next sync cell appears after this one.

!cd /content/bd3lms && python main.py \
  mode=ppl_eval \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=16 \
  trainer.devices=1 \
  trainer.num_nodes=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  eval.checkpoint_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/030413/checkpoints/best.ckpt \
  data.max_valid_samples=100 \
  data.max_test_samples=100

Seed set to 1
[2mCONFIG[0m
[2m├── [0m[2mmode[0m
[2m│   [0m[2m└── [0m[2;40mppl_eval                                                                [0m
[2m├── [0m[2mdiffusion[0m
[2m│   [0m[2m└── [0m[2;40mabsorbing_state                                                         [0m
[2m├── [0m[2mseed[0m
[2m│   [0m[2m└── [0m[2;40m1                                                                       [0m
[2m├── [0m[2mblock_size[0m
[2m│   [0m[2m└── [0m[2;40m16                                                                      [0m
[2m├── [0m[2mdata[0m
[2m│   [0m[2m└── [0m[2;91;40mmax_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                       [0m
[2m│   [0m[2m    [0m[2;91;40mmax_train_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                 [0m
[2m│   [0m[2m    [0m[2;91;40mmax_valid_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m100        

In [None]:
# Sync to drive
# Creates the Drive storage folder if needed, then copies the local
# outputs/lm1b tree (run configs, logs and checkpoints) into it.
# rsync flags: -a = archive mode (recursive, preserves attributes), -v = verbose.
# No trailing slash on the source, so the `lm1b` directory itself is created
# inside the destination.
!mkdir -p /content/drive/MyDrive/bd3lms_storage_final
!rsync -av \
  /content/bd3lms/outputs/lm1b \
  /content/drive/MyDrive/bd3lms_storage_final

sending incremental file list
lm1b/
lm1b/2026.01.05/
lm1b/2026.01.05/022135/
lm1b/2026.01.05/022135/.hydra/
lm1b/2026.01.05/022135/checkpoints/
lm1b/2026.01.05/024407/
lm1b/2026.01.05/024407/.hydra/
lm1b/2026.01.05/024407/checkpoints/
lm1b/2026.01.05/025009/
lm1b/2026.01.05/025009/.hydra/
lm1b/2026.01.05/025009/checkpoints/
lm1b/2026.01.05/025827/
lm1b/2026.01.05/025827/config_tree.txt
lm1b/2026.01.05/025827/main.log
lm1b/2026.01.05/025827/.hydra/
lm1b/2026.01.05/025827/.hydra/config.yaml
lm1b/2026.01.05/025827/.hydra/hydra.yaml
lm1b/2026.01.05/025827/.hydra/overrides.yaml
lm1b/2026.01.05/025838/
lm1b/2026.01.05/025838/config_tree.txt
lm1b/2026.01.05/025838/main.log
lm1b/2026.01.05/025838/.hydra/
lm1b/2026.01.05/025838/.hydra/config.yaml
lm1b/2026.01.05/025838/.hydra/hydra.yaml
lm1b/2026.01.05/025838/.hydra/overrides.yaml
lm1b/2026.01.05/025849/
lm1b/2026.01.05/025849/config_tree.txt
lm1b/2026.01.05/025849/main.log
lm1b/2026.01.05/025849/.hydra/
lm1b/2026.01.05/025849/.hydra/config.yam

In [None]:
# Fine-tune block size 4, clipping
# Resumes from the base run's last.ckpt on Drive (run 022135, trained with
# block_size=128) with trainer.max_steps raised from 500 to 600 and
# block_size switched to 4.
# "Clipping" here is the narrowed sampling_eps range [0.5, 1]; the matching
# "no clipping" cell uses [0, 1].
# `2>&1 | grep -v ...` merges stderr into stdout and hides log lines that
# mention TQDMProgressBar or val_progress_bar.
# NOTE(review): because of the pipe, the shell status of this cell is grep's,
# not python's - a failed run is only visible in the printed log.
!cd /content/bd3lms && python main.py \
  mode=train \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=4 \
  trainer.devices=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  trainer.max_steps=600 \
  data.max_train_samples=500 \
  data.max_valid_samples=100 \
  data.max_test_samples=100 \
  training.nll_diagram=False \
  algo.fix_clipping=False \
  training.sampling_eps_min=0.5 \
  training.sampling_eps_max=1 \
  checkpointing.resume_ckpt_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/022135/checkpoints/last.ckpt 2>&1 | grep -v "TQDMProgressBar\|val_progress_bar"

Seed set to 1
CONFIG
├── mode
│   └── train                                                                   
├── diffusion
│   └── absorbing_state                                                         
├── seed
│   └── 1                                                                       
├── block_size
│   └── 4                                                                       
├── data
│   └── max_samples: null                                                       
│       max_train_samples: 500                                                  
│       max_valid_samples: 100                                                  
│       max_test_samples: 100                                                   
│       train: lm1b                                                             
│       valid: lm1b                                                             
│       tokenizer_name_or_path: bert-base-uncased                               
│       cache_dir: /share/kulesh

In [13]:
#eval_bd3lm_best block size 4, clipping
# Runs main.py in ppl_eval mode (block_size 4) on best.ckpt of run 031018,
# using 100 validation / 100 test samples.
# NOTE(review): 031018 is presumably the run written by the block-size-4
# "clipping" fine-tune cell - confirm the timestamp. The checkpoint is read
# from Drive, so a "Sync to drive" cell must have copied it there first; in
# notebook order the next sync cell appears after this one.

!cd /content/bd3lms && python main.py \
  mode=ppl_eval \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=4 \
  trainer.devices=1 \
  trainer.num_nodes=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  eval.checkpoint_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/031018/checkpoints/best.ckpt \
  data.max_valid_samples=100 \
  data.max_test_samples=100

Seed set to 1
[2mCONFIG[0m
[2m├── [0m[2mmode[0m
[2m│   [0m[2m└── [0m[2;40mppl_eval                                                                [0m
[2m├── [0m[2mdiffusion[0m
[2m│   [0m[2m└── [0m[2;40mabsorbing_state                                                         [0m
[2m├── [0m[2mseed[0m
[2m│   [0m[2m└── [0m[2;40m1                                                                       [0m
[2m├── [0m[2mblock_size[0m
[2m│   [0m[2m└── [0m[2;40m4                                                                       [0m
[2m├── [0m[2mdata[0m
[2m│   [0m[2m└── [0m[2;91;40mmax_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                       [0m
[2m│   [0m[2m    [0m[2;91;40mmax_train_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                 [0m
[2m│   [0m[2m    [0m[2;91;40mmax_valid_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m100        

In [None]:
# Fine-tune block size 4, no clipping
# Resumes from the base run's last.ckpt on Drive (run 022135, trained with
# block_size=128) with trainer.max_steps raised from 500 to 600 and
# block_size switched to 4.
# "No clipping" here is the full sampling_eps range [0, 1].
# `2>&1 | grep -v ...` merges stderr into stdout and hides log lines that
# mention TQDMProgressBar or val_progress_bar.
# NOTE(review): because of the pipe, the shell status of this cell is grep's,
# not python's - a failed run is only visible in the printed log.
!cd /content/bd3lms && python main.py \
  mode=train \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=4 \
  trainer.devices=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  trainer.max_steps=600 \
  data.max_train_samples=500 \
  data.max_valid_samples=100 \
  data.max_test_samples=100 \
  training.nll_diagram=False \
  algo.fix_clipping=False \
  training.sampling_eps_min=0 \
  training.sampling_eps_max=1 \
  checkpointing.resume_ckpt_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/022135/checkpoints/last.ckpt 2>&1 | grep -v "TQDMProgressBar\|val_progress_bar"

Seed set to 1
CONFIG
├── mode
│   └── train                                                                   
├── diffusion
│   └── absorbing_state                                                         
├── seed
│   └── 1                                                                       
├── block_size
│   └── 4                                                                       
├── data
│   └── max_samples: null                                                       
│       max_train_samples: 500                                                  
│       max_valid_samples: 100                                                  
│       max_test_samples: 100                                                   
│       train: lm1b                                                             
│       valid: lm1b                                                             
│       tokenizer_name_or_path: bert-base-uncased                               
│       cache_dir: /share/kulesh

In [14]:
#eval_bd3lm_best block size 4, no clipping
# Runs main.py in ppl_eval mode (block_size 4) on best.ckpt of run 031558,
# using 100 validation / 100 test samples.
# NOTE(review): 031558 is presumably the run written by the block-size-4
# "no clipping" fine-tune cell - confirm the timestamp. The checkpoint is read
# from Drive, so a "Sync to drive" cell must have copied it there first; in
# notebook order the next sync cell appears after this one.

!cd /content/bd3lms && python main.py \
  mode=ppl_eval \
  model=tiny \
  algo=bd3lm \
  data=lm1b_wrap \
  model.length=128 \
  model.attn_backend=sdpa \
  block_size=4 \
  trainer.devices=1 \
  trainer.num_nodes=1 \
  loader.global_batch_size=4 \
  loader.batch_size=4 \
  loader.eval_batch_size=4 \
  eval.checkpoint_path=/content/drive/MyDrive/bd3lms_storage_final/lm1b/2026.01.05/031558/checkpoints/best.ckpt \
  data.max_valid_samples=100 \
  data.max_test_samples=100

Seed set to 1
[2mCONFIG[0m
[2m├── [0m[2mmode[0m
[2m│   [0m[2m└── [0m[2;40mppl_eval                                                                [0m
[2m├── [0m[2mdiffusion[0m
[2m│   [0m[2m└── [0m[2;40mabsorbing_state                                                         [0m
[2m├── [0m[2mseed[0m
[2m│   [0m[2m└── [0m[2;40m1                                                                       [0m
[2m├── [0m[2mblock_size[0m
[2m│   [0m[2m└── [0m[2;40m4                                                                       [0m
[2m├── [0m[2mdata[0m
[2m│   [0m[2m└── [0m[2;91;40mmax_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                       [0m
[2m│   [0m[2m    [0m[2;91;40mmax_train_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40mnull                                                 [0m
[2m│   [0m[2m    [0m[2;91;40mmax_valid_samples[0m[2;97;40m:[0m[2;97;40m [0m[2;40m100        

In [None]:
# Sync to drive
# Creates the Drive storage folder if needed, then copies the local
# outputs/lm1b tree (run configs, logs and checkpoints) into it.
# rsync flags: -a = archive mode (recursive, preserves attributes), -v = verbose.
# No trailing slash on the source, so the `lm1b` directory itself is created
# inside the destination.
!mkdir -p /content/drive/MyDrive/bd3lms_storage_final
!rsync -av \
  /content/bd3lms/outputs/lm1b \
  /content/drive/MyDrive/bd3lms_storage_final

sending incremental file list
lm1b/
lm1b/2026.01.05/
lm1b/2026.01.05/022135/
lm1b/2026.01.05/022135/.hydra/
lm1b/2026.01.05/022135/checkpoints/
lm1b/2026.01.05/024407/
lm1b/2026.01.05/024407/.hydra/
lm1b/2026.01.05/024407/checkpoints/
lm1b/2026.01.05/025009/
lm1b/2026.01.05/025009/.hydra/
lm1b/2026.01.05/025009/checkpoints/
lm1b/2026.01.05/025827/
lm1b/2026.01.05/025827/.hydra/
lm1b/2026.01.05/025838/
lm1b/2026.01.05/025838/.hydra/
lm1b/2026.01.05/025849/
lm1b/2026.01.05/025849/.hydra/
lm1b/2026.01.05/025849/checkpoints/
lm1b/2026.01.05/030413/
lm1b/2026.01.05/030413/.hydra/
lm1b/2026.01.05/030413/checkpoints/
lm1b/2026.01.05/031018/
lm1b/2026.01.05/031018/config_tree.txt
lm1b/2026.01.05/031018/main.log
lm1b/2026.01.05/031018/.hydra/
lm1b/2026.01.05/031018/.hydra/config.yaml
lm1b/2026.01.05/031018/.hydra/hydra.yaml
lm1b/2026.01.05/031018/.hydra/overrides.yaml
lm1b/2026.01.05/031018/checkpoints/
lm1b/2026.01.05/031018/checkpoints/best.ckpt
lm1b/2026.01.05/031018/checkpoints/last.ckpt
lm