In [None]:
# Evo2 训练脚本 - NCCL超时问题解决方案

此脚本解决了在数据集构建阶段发生的NCCL超时问题。主要策略包括：
1. 增加NCCL和分布式通信超时时间
2. 启用详细的调试日志
3. 优化数据加载配置
4. 添加进度监控


In [None]:
## 1. 替换训练脚本


In [None]:
# Install the patched training script over the packaged bionemo one, keeping a
# one-time backup of the original file.
import shutil
from pathlib import Path

installed_script = Path('/usr/local/lib/python3.12/dist-packages/bionemo/evo2/run/train.py')
backup_script = installed_script.with_name(installed_script.name + '.backup')
patched_script = Path('/workspace/bionemo_train.py')

# Fail loudly instead of printing a success message after a failed copy.
for required in (installed_script, patched_script):
    if not required.is_file():
        raise FileNotFoundError(f"找不到文件: {required}")

# Create the backup only once. Re-running this cell after the replacement would
# otherwise overwrite the pristine backup with the already-patched script.
if backup_script.exists():
    print(f"备份已存在，跳过备份: {backup_script}")
else:
    shutil.copy(installed_script, backup_script)

# Replace the installed script with the patched version.
shutil.copy(patched_script, installed_script)

print("训练脚本已替换完成")


In [None]:
## 2. 设置环境变量解决NCCL超时问题


In [None]:
import os

# Environment variables for the distributed run. They are written to
# os.environ so that processes launched later from this kernel inherit them.
distributed_env = {
    # Two-hour timeouts.
    # NOTE(review): neither NCCL_TIMEOUT nor TORCH_DISTRIBUTED_TIMEOUT seems to be
    # a documented NCCL/PyTorch variable; the collective timeout is normally the
    # `timeout` argument of torch.distributed.init_process_group. Kept in case the
    # replacement train.py reads them -- confirm.
    'NCCL_TIMEOUT': '7200',
    'TORCH_DISTRIBUTED_TIMEOUT': '7200',

    # Blocking wait and async error handling. Current PyTorch releases spell these
    # with a TORCH_ prefix and treat the bare NCCL_* names as deprecated aliases,
    # so both spellings are set to cover old and new versions.
    # NOTE(review): PyTorch documents that only one of blocking wait / async error
    # handling should be enabled at a time -- confirm which one is wanted.
    'NCCL_BLOCKING_WAIT': '1',
    'TORCH_NCCL_BLOCKING_WAIT': '1',
    'NCCL_ASYNC_ERROR_HANDLING': '1',
    'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1',

    # Verbose NCCL logging and the PyTorch NCCL trace buffer.
    'NCCL_DEBUG': 'INFO',
    'TORCH_NCCL_TRACE_BUFFER_SIZE': '1024',

    # Larger NCCL buffer (8 MiB).
    'NCCL_BUFFSIZE': '8388608',
    # NCCL_NTHREADS is intentionally not set: the previous value '8' is not one of
    # the values NCCL documents as accepted (64, 128, 256, 512).

    # Cap the allocator split size to limit fragmentation.
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:512',

    # Dataset-preparation settings: no tokenizer parallelism, 4 OpenMP threads.
    'TOKENIZERS_PARALLELISM': 'false',
    'OMP_NUM_THREADS': '4',
}
os.environ.update(distributed_env)

print("环境变量设置完成:")
for shown_name in ('NCCL_TIMEOUT', 'TORCH_DISTRIBUTED_TIMEOUT', 'NCCL_DEBUG'):
    print(f"{shown_name}: {os.environ.get(shown_name)}")


In [None]:
## 3. 检查系统资源和GPU状态


In [None]:
# Show GPU status via nvidia-smi
!nvidia-smi
print("\n" + "="*50)

# Show memory usage in human-readable units
!free -h
print("\n" + "="*50)

# Show disk usage (first 10 lines of the df output)
!df -h | head -10


In [None]:
## 4. 设置训练参数


In [None]:
# Settings that the launch cells below turn into train_evo2 command-line options.
training_config = dict(
    data_config='training_data_config.yaml',
    dataset_dir='preprocessed_data',  # adjust to the actual dataset location
    model_size='7b',
    devices=2,
    num_nodes=1,
    seq_length=1,  # NOTE(review): a sequence length of 1 looks like a placeholder -- confirm
    micro_batch_size=1,
    lr=0.0001,
    warmup_steps=5,
    max_steps=200000,
    ckpt_dir='nemo2_evo2_7b',
    clip_grad=1,
    wd=0.01,
    activation_checkpoint_recompute_num_layers=1,
    val_check_interval=1000,
    ckpt_async_save=True,
)

# Echo the configuration, one "key: value" pair per line, in insertion order.
print("训练配置:")
print("\n".join(f"{name}: {setting}" for name, setting in training_config.items()))


In [None]:
## 5. 启动训练（带超时监控）


In [None]:
import shlex
import subprocess
import threading
import time
from datetime import datetime

def _build_train_command(config):
    """Return the ``train_evo2`` invocation for ``config`` as an argv list.

    An argv list is used instead of a shell string: joining backslash-continued
    lines with spaces yields ``\\ `` sequences, which a shell parses as literal
    single-space arguments.

    ``--ckpt-async-save`` is appended unless ``config['ckpt_async_save']`` is
    present and falsy, so a missing key keeps the flag.
    """
    option_keys = (
        ('-d', 'data_config'),
        ('--dataset-dir', 'dataset_dir'),
        ('--model-size', 'model_size'),
        ('--devices', 'devices'),
        ('--num-nodes', 'num_nodes'),
        ('--seq-length', 'seq_length'),
        ('--micro-batch-size', 'micro_batch_size'),
        ('--lr', 'lr'),
        ('--warmup-steps', 'warmup_steps'),
        ('--max-steps', 'max_steps'),
        ('--ckpt-dir', 'ckpt_dir'),
        ('--clip-grad', 'clip_grad'),
        ('--wd', 'wd'),
        ('--activation-checkpoint-recompute-num-layers',
         'activation_checkpoint_recompute_num_layers'),
        ('--val-check-interval', 'val_check_interval'),
    )
    argv = ['train_evo2']
    for flag, key in option_keys:
        argv.extend((flag, str(config[key])))
    if config.get('ckpt_async_save', True):
        argv.append('--ckpt-async-save')
    return argv


def run_training_with_monitoring(stall_warning_seconds=300):
    """Run ``train_evo2`` with the global ``training_config`` and stream its output.

    Each output line is echoed with the elapsed time; lines mentioning dataset
    preparation keywords are highlighted. A background watchdog prints a warning
    whenever the child has produced no output for ``stall_warning_seconds``.
    The check runs on its own thread because reading the pipe blocks while the
    child is silent.

    Args:
        stall_warning_seconds: Seconds of silence before a warning is printed
            (and between repeated warnings).

    Returns:
        The child's exit code, or ``None`` if the process could not be started.
        KeyboardInterrupt and other exceptions are reported via ``print`` and
        the child is terminated before returning.
    """
    argv = _build_train_command(training_config)

    print(f"开始训练时间: {datetime.now()}")
    print(f"训练命令: {shlex.join(argv)}")
    print("\n" + "="*80)

    stage_keywords = ('dataset', 'preparing', 'loading', 'barrier')
    start_time = time.time()
    # Shared between the reader loop and the watchdog thread.
    activity = {'last_output': start_time, 'last_warning': start_time}
    finished = threading.Event()

    def _warn_when_silent():
        # Wake up periodically; Event.wait returns True once `finished` is set.
        poll_seconds = max(1.0, min(30.0, stall_warning_seconds))
        while not finished.wait(poll_seconds):
            now = time.time()
            quiet_since = max(activity['last_output'], activity['last_warning'])
            if now - quiet_since > stall_warning_seconds:
                print(f"\n[警告] 已有 {(now - activity['last_output']):.1f} 秒无输出，可能在准备数据集...")
                activity['last_warning'] = now

    watchdog = threading.Thread(target=_warn_when_silent, daemon=True)
    process = None
    return_code = None

    try:
        # No shell: arguments are passed verbatim and terminate() reaches the
        # training process itself rather than an intermediate shell.
        process = subprocess.Popen(
            argv,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )
        watchdog.start()

        for line in process.stdout:
            activity['last_output'] = time.time()
            elapsed = activity['last_output'] - start_time
            text = line.rstrip()

            print(f"[{elapsed:.1f}s] {text}")

            if any(keyword in line.lower() for keyword in stage_keywords):
                print(f"*** 数据集准备阶段: {text} ***")

        return_code = process.wait()

        if return_code == 0:
            print(f"\n训练成功完成! 总耗时: {time.time() - start_time:.1f} 秒")
        else:
            print(f"\n训练失败，返回码: {return_code}")

    except KeyboardInterrupt:
        print("\n用户中断训练")
    except Exception as e:
        print(f"\n训练出错: {e}")
    finally:
        finished.set()
        if watchdog.is_alive():
            watchdog.join(timeout=1.0)
        if process is not None:
            # Still running means we were interrupted or failed mid-stream:
            # stop the child and reap it so it does not linger.
            if process.poll() is None:
                process.terminate()
                try:
                    process.wait(timeout=30)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait()
            if process.stdout is not None:
                process.stdout.close()
            return_code = process.returncode

    return return_code

# 启动训练
run_training_with_monitoring()


In [None]:
## 6. 替代方案：分步骤训练（如果仍有超时问题）


In [None]:
# 如果上面的方法仍然超时，可以尝试分步骤方法

def run_training_with_dataset_warmup():
    """Warm up the dataset with a one-step single-device run, then train normally.

    Step 1 runs ``train_evo2`` on one device for a single step, writing its
    checkpoint to ``warmup_ckpt``, with output captured. Step 2 calls
    ``run_training_with_monitoring()`` only if step 1 exits with code 0;
    otherwise the captured stderr (or the launch error) is printed.

    The command is passed as an argv list rather than a shell string: joining
    backslash-continued lines with spaces yields ``\\ `` sequences, which a
    shell parses as literal single-space arguments.
    """
    print("步骤1: 数据集预热（单进程）")
    warmup_argv = [
        'train_evo2',
        '-d', str(training_config['data_config']),
        '--dataset-dir', str(training_config['dataset_dir']),
        '--model-size', str(training_config['model_size']),
        '--devices', '1',
        '--num-nodes', '1',
        '--seq-length', str(training_config['seq_length']),
        '--micro-batch-size', '1',
        '--max-steps', '1',
        '--ckpt-dir', 'warmup_ckpt',
    ]

    print(f"预热命令: {shlex.join(warmup_argv)}")

    # Run the warm-up. Without a shell, a missing executable raises OSError;
    # report it like any other warm-up failure.
    try:
        result = subprocess.run(warmup_argv, capture_output=True, text=True)
    except OSError as exc:
        print(f"数据集预热失败: {exc}")
        return

    if result.returncode != 0:
        print(f"数据集预热失败: {result.stderr}")
        return

    print("数据集预热成功！")

    print("\n步骤2: 开始正式多GPU训练")
    # Warm-up succeeded: run the regular multi-GPU training.
    run_training_with_monitoring()

# 取消注释下面的行来使用分步骤方法
# run_training_with_dataset_warmup()


In [None]:
## 7. 故障排除和日志分析


In [None]:
# List running processes whose command line matches train_evo2 or "python ... train",
# excluding the grep process itself
!ps aux | grep -E '(train_evo2|python.*train)' | grep -v grep

print("\n检查NCCL环境变量:")
# Show every environment entry containing "NCCL"
!env | grep NCCL

print("\n检查PyTorch版本和CUDA:")
import torch
# Report the PyTorch version, whether CUDA is available, and the visible device count
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA可用: {torch.cuda.is_available()}")
print(f"CUDA设备数: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    # Print the name of each visible CUDA device
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
