In [1]:
import os
# FASTA file to preprocess; resolved to an absolute path when the preprocessing config is built.
concat_path = "XTT22_train.fa"

In [2]:
# Resolve the input FASTA and the output directory against the current working directory.
full_fasta_path = os.path.abspath(concat_path)
output_dir = os.path.abspath("preprocessed_data")

# Preprocessing configuration: a YAML list with a single entry for the one FASTA file.
output_yaml = f"""
- datapaths: ["{full_fasta_path}"]
  output_dir: "{output_dir}"
  output_prefix: XTT22_train
  train_split: 0.9
  valid_split: 0.05
  test_split: 0.05
  overwrite: True
  embed_reverse_complement: true
  random_reverse_complement: 0.0
  random_lineage_dropout: 0.0
  include_sequence_id: false
  transcribe: "back_transcribe"
  force_uppercase: false
  indexed_dataset_dtype: "uint8"
  tokenizer_type: "Byte-Level"
  vocab_file: null
  vocab_size: null
  merges_file: null
  pretrained_tokenizer_model: null
  special_tokens: null
  fast_hf_tokenizer: true
  append_eod: true
  enforce_sample_length: null
  ftfy: false
  workers: 1
  preproc_concurrency: 100000
  chunksize: 25
  drop_empty_sequences: true
  nnn_filter: false  # If you split your fasta on NNN (in human these are contigs), then you should set this to true.
  seed: 12342  # Not relevant because we are not using random reverse complement or lineage dropout.
"""

# Persist the config next to the notebook; the explicit "\n" keeps the file's trailing newline.
with open("preprocess_config.yaml", "w") as config_file:
    config_file.write(output_yaml + "\n")

In [3]:
# Run the Evo2 preprocessing CLI against the preprocess_config.yaml written by the previous cell.
!preprocess_evo2 --config preprocess_config.yaml

[NeMo I 2025-05-24 12:37:06 nemo_logging:393] Using byte-level tokenization
[NeMo I 2025-05-24 12:37:06 nemo_logging:393] Created temporary binary datasets: /workspace/preprocessed_data/XTT22_train_byte-level_train.bin.tmp /workspace/preprocessed_data/XTT22_train_byte-level_val.bin.tmp /workspace/preprocessed_data/XTT22_train_byte-level_test.bin.tmp
[NeMo I 2025-05-24 13:12:11 nemo_logging:393] Average preprocessing time per sequence: 0.04470627161196968
[NeMo I 2025-05-24 13:12:11 nemo_logging:393] Average indexing time per sequence: 0.1463382052460373
[NeMo I 2025-05-24 13:12:11 nemo_logging:393] Number of sequences processed: 12092
[NeMo I 2025-05-24 13:12:11 nemo_logging:393] Finished preprocessing XTT22_train ([PosixPath('/workspace/XTT22_train.fa')]) in 2105.082 seconds with 1 workers.


In [4]:
# List the preprocessing output directory with human-readable file sizes.
!ls -lh preprocessed_data/

total 14G
-rw-r--r-- 1 root root 936M May 24 13:11 XTT22_train_byte-level_test.bin
-rw-r--r-- 1 root root  12K May 24 13:12 XTT22_train_byte-level_test.idx
-rw-r--r-- 1 root root  13G May 24 13:12 XTT22_train_byte-level_train.bin
-rw-r--r-- 1 root root 213K May 24 13:12 XTT22_train_byte-level_train.idx
-rw-r--r-- 1 root root 411M May 24 13:12 XTT22_train_byte-level_val.bin
-rw-r--r-- 1 root root  12K May 24 13:12 XTT22_train_byte-level_val.idx


In [3]:
!cp /workspace/hyena_modified.py /usr/local/lib/python3.12/dist-packages/nemo/collections/llm/gpt/model/hyena.py

In [4]:
# Run the Evo2 -> NeMo2 converter on the Savanna 7B checkpoint; output goes to ./nemo2_evo2_7b.
!evo2_convert_to_nemo2 \
  --model-path /workspace/savanna_evo2_7b/savanna_evo2_7b.pt \
  --model-size 7b --output-dir nemo2_evo2_7b

Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
[NeMo I 2025-06-02 07:46:00 nemo_logging:393] Using byte-level tokenization
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[NeMo W 2025-06-02 07:46:00 nemo_logging:405] /usr/local/lib/python3.12/dist-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
    
[NeMo I 2025-06-02 07:46:00 nemo_logging:393] Fixing mis-match between ddp-config & mcore-optimizer config
[NeMo I 2025-06-02 07:46:00 nemo_logging:393] Rank 0 has data parallel group : [0]
[NeMo I 2025-06-02 07:46:00 nemo_logging:393] Rank 0 has combined group of data parallel and context parallel : [0

In [1]:
# ==================== Workaround for NCCL timeouts ====================
import os
import shutil
import subprocess
import time
from datetime import datetime


def install_patched_script(patched_path, installed_path, backup_path):
    """Replace ``installed_path`` with ``patched_path``, keeping a one-time backup.

    The backup is only created when ``backup_path`` does not exist yet, so running
    this repeatedly never overwrites the pristine original with an already-patched
    file. Copy failures are reported and skipped rather than raised, matching the
    best-effort behaviour of the shell ``cp`` calls this replaces.

    Returns:
        True if the patched file was copied into place, False otherwise.
    """
    if not os.path.exists(backup_path):
        try:
            shutil.copy(installed_path, backup_path)
        except OSError as backup_error:
            print(f"⚠️ 备份失败 {installed_path} -> {backup_path}: {backup_error}")
    try:
        shutil.copy(patched_path, installed_path)
    except OSError as install_error:
        print(f"⚠️ 替换失败 {patched_path} -> {installed_path}: {install_error}")
        return False
    return True


# 1. Back up and replace the installed training script
print("🔧 备份并替换训练脚本...")
install_patched_script(
    '/workspace/bionemo_train.py',
    '/usr/local/lib/python3.12/dist-packages/bionemo/evo2/run/train.py',
    '/usr/local/lib/python3.12/dist-packages/bionemo/evo2/run/train.py.backup',
)

# 2. Set NCCL and distributed-training environment variables
print("🔧 配置NCCL超时和优化参数...")

# NCCL timeout: raised to 2 hours.
# NOTE(review): NCCL_TIMEOUT and TORCH_DISTRIBUTED_TIMEOUT do not appear to be variables that
# NCCL / PyTorch read on their own — presumably the patched train.py consumes them; confirm.
os.environ['NCCL_TIMEOUT'] = '7200'  # 2-hour timeout (seconds)
# NOTE(review): PyTorch's docs advise enabling only one of blocking-wait and
# async-error-handling; confirm which of the two is actually intended here.
os.environ['TORCH_NCCL_BLOCKING_WAIT'] = '1'  # newer TORCH_-prefixed variable name
os.environ['TORCH_NCCL_ASYNC_ERROR_HANDLING'] = '1'  # newer TORCH_-prefixed variable name
os.environ['NCCL_DEBUG'] = 'INFO'  # verbose NCCL debug output

# PyTorch distributed timeout
os.environ['TORCH_DISTRIBUTED_TIMEOUT'] = '7200'  # seconds
os.environ['TORCH_NCCL_TRACE_BUFFER_SIZE'] = '1024'  # enable NCCL tracing

# Data loading and communication tuning
os.environ['NCCL_BUFFSIZE'] = '8388608'  # 8 MB buffer
# NOTE(review): the NCCL docs list only 64/128/256/512 as accepted NCCL_NTHREADS values,
# so 8 may be ignored — TODO confirm.
os.environ['NCCL_NTHREADS'] = '8'  # NCCL thread count
os.environ['NCCL_MIN_NTHREADS'] = '4'  # minimum thread count

# Reduce memory fragmentation and avoid parallelism conflicts
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # avoid tokenizer parallelism conflicts
os.environ['OMP_NUM_THREADS'] = '4'  # cap OpenMP threads

# Transport settings: '0' leaves P2P and shared-memory transports enabled
os.environ['NCCL_P2P_DISABLE'] = '0'
os.environ['NCCL_SHM_DISABLE'] = '0'

# Echo the key settings so they are visible in the cell output.
print("环境变量设置完成:")
for key in ['NCCL_TIMEOUT', 'TORCH_DISTRIBUTED_TIMEOUT', 'NCCL_DEBUG', 'NCCL_BUFFSIZE']:
    print(f"  {key}: {os.environ.get(key)}")

# 3. 定义带监控的训练函数
def run_training_with_monitoring():
    """Launch ``train_evo2`` and stream its output with timestamps and markers.

    The command line is assembled from the hard-coded ``training_config`` below and
    executed directly as an argument list (no shell), so every option reaches
    ``train_evo2`` exactly as written. stdout and stderr are merged and read line by
    line; each line is printed prefixed with the seconds elapsed since launch, and
    lines that mention dataset preparation, NCCL, or errors are echoed a second time
    with a marker.

    Because reading a line blocks, a long silence can only be reported once output
    resumes: when a line arrives more than five minutes after the previous one, a
    notice with the length of the gap is printed before that line.

    Returns:
        None. Success or failure (including the child's return code) is printed.
        ``KeyboardInterrupt`` terminates the child process; any other exception is
        reported and swallowed.
    """
    import shlex  # only needed to render the command for display

    # Training configuration
    training_config = {
        'data_config': 'training_data_config.yaml',
        'dataset_dir': 'preprocessed_data',
        'model_size': '7b',
        'devices': 2,
        'num_nodes': 1,
        'seq_length': 1,
        'micro_batch_size': 1,
        'lr': 0.0001,
        'warmup_steps': 5,
        'max_steps': 200000,
        'ckpt_dir': 'nemo2_evo2_7b',
        'clip_grad': 1,
        'wd': 0.01,
        'activation_checkpoint_recompute_num_layers': 1,
        'val_check_interval': 1000
    }

    # Build the command as an argument list. Joining a backslash-continued string with
    # spaces leaves "\ " sequences that the shell passes on as stray " " arguments.
    cmd = [str(part) for part in (
        'train_evo2',
        '-d', training_config['data_config'],
        '--dataset-dir', training_config['dataset_dir'],
        '--model-size', training_config['model_size'],
        '--devices', training_config['devices'],
        '--num-nodes', training_config['num_nodes'],
        '--seq-length', training_config['seq_length'],
        '--micro-batch-size', training_config['micro_batch_size'],
        '--lr', training_config['lr'],
        '--warmup-steps', training_config['warmup_steps'],
        '--max-steps', training_config['max_steps'],
        '--ckpt-dir', training_config['ckpt_dir'],
        '--clip-grad', training_config['clip_grad'],
        '--wd', training_config['wd'],
        '--activation-checkpoint-recompute-num-layers',
        training_config['activation_checkpoint_recompute_num_layers'],
        '--val-check-interval', training_config['val_check_interval'],
        '--ckpt-async-save',
    )]

    silence_threshold = 300  # seconds between lines before a gap notice is printed
    dataset_keywords = ('dataset', 'preparing', 'loading', 'barrier', 'build', 'index')
    error_markers = ('error', 'timeout', 'fail')

    print(f"🚀 开始训练时间: {datetime.now()}")
    print(f"📋 训练命令: {shlex.join(cmd)}")
    print("="*80)

    # Bound before the try so the interrupt handler never sees an undefined name.
    process = None
    try:
        # Start the training process with stderr folded into stdout, line-buffered.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            bufsize=1
        )

        start_time = time.time()
        last_output_time = start_time
        dataset_preparation_detected = False

        # readline returns '' only at EOF, i.e. once the child has closed its output.
        for line in iter(process.stdout.readline, ''):
            current_time = time.time()
            elapsed = current_time - start_time
            text = line.rstrip()
            lowered = line.lower()

            # Report the silence that preceded this line, measured from the previous
            # line of any kind (not just keyword lines).
            elapsed_no_output = current_time - last_output_time
            if elapsed_no_output > silence_threshold:
                if dataset_preparation_detected:
                    print(f"\n⏳ [数据集准备] 已有 {elapsed_no_output:.1f} 秒无输出，数据集构建中...")
                else:
                    print(f"\n⏳ [等待中] 已有 {elapsed_no_output:.1f} 秒无输出...")
            last_output_time = current_time

            # Timestamped echo of the raw line
            print(f"[{elapsed:.1f}s] {text}")

            # Dataset-preparation hints
            if any(keyword in lowered for keyword in dataset_keywords):
                print(f"📊 数据集准备阶段: {text}")
                dataset_preparation_detected = True

            # NCCL-related lines
            if 'nccl' in lowered:
                print(f"🔗 NCCL通信: {text}")

            # Error-looking lines
            if any(err in lowered for err in error_markers):
                print(f"❌ 错误信息: {text}")

        # Output is exhausted; collect the exit status.
        return_code = process.wait()

        if return_code == 0:
            print(f"\n✅ 训练成功完成! 总耗时: {time.time() - start_time:.1f} 秒")
        else:
            print(f"\n❌ 训练失败，返回码: {return_code}")

    except KeyboardInterrupt:
        print("\n🛑 用户中断训练")
        if process is not None:
            process.terminate()
            # Reap the child so it does not linger; escalate if it ignores SIGTERM.
            try:
                process.wait(timeout=30)
            except subprocess.TimeoutExpired:
                process.kill()
    except Exception as e:
        print(f"\n💥 训练出错: {e}")

# 4. Start training
print("🎯 启动带监控的训练...")
run_training_with_monitoring()

🔧 设置PyTorch分布式超时: 7200秒
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
[NeMo I 2025-06-05 09:21:58 nemo_logging:393] Using byte-level tokenization
启用 LoRA 微调...
模型结构调试信息:
--------------------------------------------------
模型总共有 1 个模块
尝试访问模型的其他属性...
模块结构:
   1.  (HyenaModel)

目标模式: ['module.decoder.layers.17.self_attention.linear_qkv']

所有线性层 (0):

找到的注意力相关层 (0):

找到的线性层 (0):

找到的QKV层 (0):
⚠️  没有找到任何线性层！模型可能还没有完全初始化。
这在 NeMo/Megatron 框架中是正常的，模型结构会在训练开始时初始化。
LoRA 配置已保存，将在模型完全初始化后应用。
总参数数量: 0
可训练参数数量: 0
⚠️  警告: 模型总参数数量为0，可能模型初始化有问题
⚠️  警告: 没有可训练的参数，LoRA 可能没有正确应用
LoRA 将在训练开始时自动应用
已添加 LoRA 回调，将在训练开始时应用 LoRA
[NeMo W 2025-06-05 09:21:58 nemo_logging:405] WandB is currently turned off.
[NeMo W 2025-06-05 09:21:58 nemo_logging:405] User-set tensorboard is 

In [None]:
# ==================== 备选方案：分步骤训练 ====================
# 如果上面的方法仍然出现NCCL超时，可以尝试这个分步骤的方法

def run_training_with_dataset_warmup():
    """Two-step training: a one-step single-GPU warm-up run, then the multi-GPU run.

    Step 1 runs ``train_evo2`` on one device for a single step with a 30-minute
    timeout, capturing its output. If it exits with code 0, the ``warmup_ckpt``
    directory is removed and ``run_training_with_monitoring()`` is called; otherwise
    the return code and captured stderr are printed.

    The warm-up command is executed as an argument list (no shell), so every option
    reaches ``train_evo2`` exactly as written.

    Returns:
        None. Timeouts and other exceptions are reported and swallowed.
    """
    # Local imports keep this cell usable without relying on another cell's imports.
    import shlex
    import shutil
    import subprocess

    print("🔥 步骤1: 数据集预热（单GPU模式）")
    print("="*60)

    # Single-GPU warm-up command. Built as a list: joining a backslash-continued string
    # with spaces leaves "\ " sequences that the shell passes on as stray " " arguments.
    # NOTE(review): the main run passes the converted model directory as --ckpt-dir, while
    # this passes a directory nothing visible here creates — confirm this is intended.
    warmup_cmd = [
        'train_evo2',
        '-d', 'training_data_config.yaml',
        '--dataset-dir', 'preprocessed_data',
        '--model-size', '7b',
        '--devices', '1',
        '--num-nodes', '1',
        '--seq-length', '1',
        '--micro-batch-size', '1',
        '--max-steps', '1',
        '--ckpt-dir', 'warmup_ckpt',
        '--val-check-interval', '1',
    ]

    print(f"预热命令: {shlex.join(warmup_cmd)}")

    # Run the warm-up
    try:
        result = subprocess.run(
            warmup_cmd,
            capture_output=True,
            text=True,
            timeout=1800  # 30-minute timeout
        )

        if result.returncode == 0:
            print("✅ 数据集预热成功!")
            print("🚀 步骤2: 开始正式多GPU训练")
            print("="*60)

            # Remove the warm-up checkpoint directory; a missing directory is not an error.
            shutil.rmtree('warmup_ckpt', ignore_errors=True)

            # Now run the regular multi-GPU training
            run_training_with_monitoring()
        else:
            print("❌ 数据集预热失败:")
            print(f"返回码: {result.returncode}")
            print(f"错误输出: {result.stderr}")

    except subprocess.TimeoutExpired:
        print("❌ 数据集预热超时（30分钟）")
    except Exception as e:
        print(f"❌ 数据集预热出错: {e}")

# 取消注释下面的行来使用分步骤方法
# print("🔄 启动分步骤训练...")
# run_training_with_dataset_warmup()


In [None]:
# ==================== 故障排除工具 ====================
# 用于诊断NCCL和训练问题

def diagnose_system():
    """Print a system diagnostic report for troubleshooting training runs.

    Shells out (via IPython ``!`` escapes, so this only works inside a notebook/IPython
    session) for GPU, memory, disk, network, environment and process information, then
    reports the PyTorch/CUDA versions and the visible CUDA devices. Returns None.
    """
    
    print("🔍 系统诊断报告")
    print("="*60)
    
    # GPU status: per-GPU index, name, memory used/total and utilisation as CSV
    print("📟 GPU状态:")
    !nvidia-smi --query-gpu=index,name,memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits
    
    # Host memory: first two lines of `free -h`
    print("\n💾 内存使用:")
    !free -h | head -2
    
    # Disk space: the header line plus /dev-backed filesystems
    print("\n💽 磁盘空间:")
    !df -h | grep -E '^/dev|File'
    
    # Network interfaces: interface header lines and IPv4 addresses
    print("\n🔗 网络接口:")
    !ip addr show | grep -E '^[0-9]+:|inet '
    
    # Environment variables whose names start with NCCL or TORCH
    print("\n🔧 NCCL环境变量:")
    !env | grep -E '^(NCCL|TORCH)' | sort
    
    # Running training-related processes (the grep itself is filtered out)
    print("\n📊 进程状态:")
    !ps aux | grep -E '(train_evo2|python.*train)' | grep -v grep
    
    print("\n🐍 PyTorch和CUDA版本:")
    # Imported here rather than at the top of the notebook, so torch is only loaded when
    # the diagnostics are actually run.
    import torch
    print(f"PyTorch版本: {torch.__version__}")
    print(f"CUDA可用: {torch.cuda.is_available()}")
    print(f"CUDA版本: {torch.version.cuda}")
    print(f"CUDA设备数: {torch.cuda.device_count()}")
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
            # total_memory is integer-divided by 1024**3, so the figure is rounded down to whole GiB
            print(f"    内存: {torch.cuda.get_device_properties(i).total_memory // 1024**3} GB")

def check_nccl_connectivity():
    """Smoke-test NCCL initialisation in a separate Python process.

    The test script is run with the current interpreter via ``-c`` instead of being
    ``exec``-ed in the notebook kernel. This keeps its side effects (the
    ``MASTER_ADDR``/``MASTER_PORT`` environment variables and the default process
    group) out of the kernel, where they would be inherited by later training
    subprocesses or make a second run of this check fail. The child's stdout and
    stderr are captured and printed, and the check is abandoned after 5 minutes.

    NOTE(review): the script initialises a group with ``world_size=1``, so it checks
    that NCCL can start on this machine, not that two GPUs can talk to each other.

    Returns:
        None. Results are printed.
    """
    # Local imports keep this cell usable without relying on another cell's imports.
    import os
    import subprocess
    import sys

    print("\n🔗 NCCL连接性测试:")

    test_script = '''
import torch
import torch.distributed as dist
import os

if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
    try:
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        
        # 测试NCCL初始化
        dist.init_process_group("nccl", rank=0, world_size=1)
        print("✅ NCCL初始化成功")
        
        # 测试tensor操作
        tensor = torch.ones(2).cuda()
        print(f"✅ CUDA tensor创建成功: {tensor}")
        
        dist.destroy_process_group()
        print("✅ NCCL清理成功")
    except Exception as e:
        print(f"❌ NCCL测试失败: {e}")
else:
    print("⚠️ 需要至少2个GPU来进行NCCL测试")
'''

    # Force UTF-8 in the child so the non-ASCII status messages survive the pipe.
    child_env = dict(os.environ, PYTHONIOENCODING="utf-8")
    try:
        result = subprocess.run(
            [sys.executable, "-c", test_script],
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            env=child_env,
            timeout=300,  # NCCL initialisation can hang; do not block the notebook forever
        )
    except subprocess.TimeoutExpired:
        print("❌ NCCL测试超时（5分钟）")
        return

    print(result.stdout, end="")
    if result.stderr:
        print(result.stderr, end="")
    if result.returncode != 0:
        print(f"❌ NCCL测试进程异常退出，返回码: {result.returncode}")

# Run the diagnostics
diagnose_system()
check_nccl_connectivity()
