In [10]:
import os
# Input FASTA for preprocessing. This is a bare relative filename; it is
# resolved with os.path.abspath() where the preprocessing config is built.
concat_path = "XTT22_train.fa"

In [12]:
# Build the single-entry preprocessing config that is later passed to
# `preprocess_evo2 --config preprocess_config.yaml`.
# Both paths are written into the YAML as absolute paths.
output_dir = os.path.abspath("preprocessed_data")
full_fasta_path = os.path.abspath(concat_path)

# The YAML text is kept verbatim; only the two paths above are interpolated.
output_yaml = f"""
- datapaths: ["{full_fasta_path}"]
  output_dir: "{output_dir}"
  output_prefix: XTT22_train
  train_split: 0.9
  valid_split: 0.05
  test_split: 0.05
  overwrite: True
  embed_reverse_complement: true
  random_reverse_complement: 0.0
  random_lineage_dropout: 0.0
  include_sequence_id: false
  transcribe: "back_transcribe"
  force_uppercase: false
  indexed_dataset_dtype: "uint8"
  tokenizer_type: "Byte-Level"
  vocab_file: null
  vocab_size: null
  merges_file: null
  pretrained_tokenizer_model: null
  special_tokens: null
  fast_hf_tokenizer: true
  append_eod: true
  enforce_sample_length: null
  ftfy: false
  workers: 1
  preproc_concurrency: 100000
  chunksize: 25
  drop_empty_sequences: true
  nnn_filter: false  # If you split your fasta on NNN (in human these are contigs), then you should set this to true.
  seed: 12342  # Not relevant because we are not using random reverse complement or lineage dropout.
"""

# Write the config followed by one trailing newline.
with open("preprocess_config.yaml", "w") as config_file:
    config_file.write(output_yaml + "\n")

In [13]:
# Run Evo2 preprocessing using the config written to preprocess_config.yaml.
# The recorded run below processed 12092 sequences in about 1133 s with 1 worker.
!preprocess_evo2 --config preprocess_config.yaml

[NeMo I 2025-05-04 03:51:59 nemo_logging:393] Using byte-level tokenization
[NeMo I 2025-05-04 03:51:59 nemo_logging:393] Created temporary binary datasets: /workspace/preprocessed_data/XTT22_train_byte-level_train.bin.tmp /workspace/preprocessed_data/XTT22_train_byte-level_val.bin.tmp /workspace/preprocessed_data/XTT22_train_byte-level_test.bin.tmp
[NeMo I 2025-05-04 04:10:51 nemo_logging:393] Average preprocessing time per sequence: 0.027723908739850228
[NeMo I 2025-05-04 04:10:51 nemo_logging:393] Average indexing time per sequence: 0.07471058235001256
[NeMo I 2025-05-04 04:10:51 nemo_logging:393] Number of sequences processed: 12092
[NeMo I 2025-05-04 04:10:51 nemo_logging:393] Finished preprocessing XTT22_train ([PosixPath('/workspace/XTT22_train.fa')]) in 1132.750 seconds with 1 workers.


In [14]:
# List the preprocessing outputs with human-readable sizes.
# NOTE(review): the recorded listing also shows zero-byte sc3_train_*.bin.tmp
# files timestamped before this run started; they look like leftovers from an
# earlier run with a different output_prefix. Confirm and remove if unused.
!ls -lh preprocessed_data/

total 9.3G
-rw-r--r-- 1 root root 1.5G May  4 04:10 XTT22_train_byte-level_test.bin
-rw-r--r-- 1 root root  13K May  4 04:10 XTT22_train_byte-level_test.idx
-rw-r--r-- 1 root root  12G May  4 04:10 XTT22_train_byte-level_train.bin
-rw-r--r-- 1 root root 213K May  4 04:10 XTT22_train_byte-level_train.idx
-rw-r--r-- 1 root root 568M May  4 04:10 XTT22_train_byte-level_val.bin
-rw-r--r-- 1 root root  12K May  4 04:10 XTT22_train_byte-level_val.idx
-rw-r--r-- 1 root root    0 May  4 03:44 sc3_train_byte-level_test.bin.tmp
-rw-r--r-- 1 root root    0 May  4 03:44 sc3_train_byte-level_train.bin.tmp
-rw-r--r-- 1 root root    0 May  4 03:44 sc3_train_byte-level_val.bin.tmp


In [1]:
# Fetch the arcinstitute/savanna_evo2_1b_base checkpoint (hf:// path) and write
# the converted 1b model to ./nemo2_evo2_1b_8k, which the 1b training cell
# passes as --ckpt-dir.
!evo2_convert_to_nemo2 \
  --model-path hf://arcinstitute/savanna_evo2_1b_base \
  --model-size 1b --output-dir nemo2_evo2_1b_8k

savanna_evo2_1b_base.pt: 100%|█████████████| 2.71G/2.71G [04:17<00:00, 10.5MB/s]
[NeMo I 2025-05-04 13:41:01 nemo_logging:393] Using byte-level tokenization
[INFO     | pytorch_lightning.utilities.rank_zero]: GPU available: True (cuda), used: False
[INFO     | pytorch_lightning.utilities.rank_zero]: TPU available: False, using: 0 TPU cores
[INFO     | pytorch_lightning.utilities.rank_zero]: HPU available: False, using: 0 HPUs
[NeMo W 2025-05-04 13:41:01 nemo_logging:405] /usr/local/lib/python3.12/dist-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
    
[NeMo I 2025-05-04 13:41:01 nemo_logging:393] Fixing mis-match between ddp-config & mcore-optimizer config
[NeMo I 2025-05-04 13:41:01 nemo_logging:393] Rank 0 has data parallel group : [0]
[NeMo I 2025-05-04 13:41:01 nemo_logging:393] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2025-05-04 13:41:01 nemo_logging:393] All d

In [2]:
import os
from pathlib import Path
# Describe the three preprocessed splits in training_data_config.yaml.
# Every entry points at "<abs path>/preprocessed_data/XTT22_train_byte-level_<suffix>"
# and carries a weight of 1.0.
dataset_root = Path(os.path.abspath("preprocessed_data"))
output_pfx = str(dataset_root / "XTT22_train_byte-level")

# (file suffix, dataset_split label) for each split; note that the validation
# files use the short suffix "val".
split_table = (
    ("train", "train"),
    ("val", "validation"),
    ("test", "test"),
)
split_entries = [
    f"- dataset_prefix: {output_pfx}_{suffix}\n"
    f"  dataset_split: {split_label}\n"
    f"  dataset_weight: 1.0\n"
    for suffix, split_label in split_table
]
# Leading newline kept so the text matches the triple-quoted layout.
output_yaml = "\n" + "".join(split_entries)

# Write the config followed by one trailing newline.
with open("training_data_config.yaml", "w") as config_file:
    config_file.write(output_yaml + "\n")

In [None]:
# Fine-tune the 1b model from the converted checkpoint in ./nemo2_evo2_1b_8k
# on the splits listed in training_data_config.yaml (100 steps, validation
# every 50 steps, single device / single node).
#
# --dataset-dir is given as a literal path. It was previously written as
# {preprocessed_data}, which IPython only expands when a Python variable of
# that name exists; none is defined in this notebook, so the braces would be
# passed to the shell unchanged.
#
# NOTE(review): --seq-length 1 looks like a debugging leftover. The checkpoint
# directory is named "..._8k" and the other training cell uses 8192. Confirm
# the intended sequence length before relying on results from this run.
!train_evo2 \
    -d training_data_config.yaml \
    --dataset-dir ./preprocessed_data \
    --model-size 1b \
    --devices 1 \
    --num-nodes 1 \
    --seq-length 1 \
    --micro-batch-size 1 \
    --lr 0.0001 \
    --warmup-steps 5 \
    --max-steps 100 \
    --ckpt-dir nemo2_evo2_1b_8k \
    --clip-grad 1 \
    --wd 0.01 \
    --activation-checkpoint-recompute-num-layers 1 \
    --val-check-interval 50 \
    --ckpt-async-save

[NeMo I 2025-05-05 10:33:34 nemo_logging:393] Using byte-level tokenization
[NeMo W 2025-05-05 10:33:34 nemo_logging:405] WandB is currently turned off.
[NeMo W 2025-05-05 10:33:34 nemo_logging:405] User-set tensorboard is currently turned off. Internally one may still be set by NeMo2.
[INFO     | pytorch_lightning.utilities.rank_zero]: Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
[INFO     | pytorch_lightning.utilities.rank_zero]: GPU available: True (cuda), used: True
[INFO     | pytorch_lightning.utilities.rank_zero]: TPU available: False, using: 0 TPU cores
[INFO     | pytorch_lightning.utilities.rank_zero]: HPU available: False, using: 0 HPUs
[NeMo I 2025-05-05 10:33:34 nemo_logging:393] Experiments will be logged at results/evo2/dev
[NeMo W 2025-05-05 10:33:34 nemo_logging:405] There were no checkpoints found in checkpoint_dir or no checkpoi

In [14]:
# Install peft into the environment of the running kernel. %pip is used
# instead of !pip because a shell `pip` may belong to a different interpreter.
# TODO: pin the peft version once a known-good one is chosen, so re-runs are
# reproducible.
%pip install -q peft

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/looseversion-1.3.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/lightning_utilities-0.12.0.dev0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/dill-0.3.9-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/o

In [19]:
# Overwrite the installed bionemo.evo2 run/train.py with the local
# /workspace/train_lora2.py. This replaces the packaged file in place with no
# backup, and both paths are absolute and specific to this container.
# NOTE(review): presumably this adds LoRA support (peft is installed in this
# notebook); confirm against train_lora2.py, which is not shown here.
!cp /workspace/train_lora2.py /usr/local/lib/python3.12/dist-packages/bionemo/evo2/run/train.py

In [8]:
# Launch a 7b training run (sequence length 8192, 100 steps, validation every
# 50 steps) on the splits listed in training_data_config.yaml.
# NOTE(review): unlike the 1b run, no --ckpt-dir is passed here, and the only
# checkpoint converted in this notebook is the 1b one. Confirm where the 7b
# base weights come from (e.g. whether the train.py in use loads them itself).
!train_evo2 \
    -d training_data_config.yaml \
    --dataset-dir ./preprocessed_data \
    --model-size 7b \
    --devices 1 \
    --num-nodes 1 \
    --seq-length 8192 \
    --micro-batch-size 1 \
    --lr 0.0001 \
    --warmup-steps 5 \
    --max-steps 100 \
    --clip-grad 1 \
    --wd 0.01 \
    --activation-checkpoint-recompute-num-layers 1 \
    --val-check-interval 50 \
    --ckpt-async-save

[NeMo I 2025-05-04 13:57:53 nemo_logging:393] Using byte-level tokenization
[NeMo W 2025-05-04 13:57:53 nemo_logging:405] WandB is currently turned off.
[NeMo W 2025-05-04 13:57:53 nemo_logging:405] User-set tensorboard is currently turned off. Internally one may still be set by NeMo2.
[INFO     | pytorch_lightning.utilities.rank_zero]: Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
[INFO     | pytorch_lightning.utilities.rank_zero]: GPU available: True (cuda), used: True
[INFO     | pytorch_lightning.utilities.rank_zero]: TPU available: False, using: 0 TPU cores
[INFO     | pytorch_lightning.utilities.rank_zero]: HPU available: False, using: 0 HPUs
[NeMo I 2025-05-04 13:57:53 nemo_logging:393] Experiments will be logged at results/evo2/dev
[NeMo W 2025-05-04 13:57:53 nemo_logging:405] There were no checkpoints found in checkpoint_dir or no checkpoi