In [1]:
# 导入必要的库
import tensorflow as tf
import numpy as np
import os
import ast
import logging
import string
import random
import yaml
from datetime import datetime

from dimenet.model.dimenet_pp import create_dimenet_pp_from_data_container
from dimenet.model.activations import swish
from dimenet.training.trainer import Trainer
from dimenet.training.metrics import Metrics
from dimenet.training.data_container import DataContainer
from dimenet.training.data_provider import DataProvider


In [2]:
# 配置日志系统
logger = logging.getLogger()
logger.handlers = []
ch = logging.StreamHandler()
formatter = logging.Formatter(
        fmt='%(asctime)s (%(levelname)s): %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.setLevel('INFO')

# 设置TensorFlow日志级别
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
tf.get_logger().setLevel('WARN')
tf.autograph.set_verbosity(2)

In [3]:
# 加载配置文件
with open('config_pp.yaml', 'r') as c:
    config = yaml.safe_load(c)

# 冻结训练参数
freeze_backbone = config.get('freeze_backbone', False)
pretrained_model_path = config.get('pretrained_model_path', None)

# 解析字符串配置（处理None等特殊值）
for key, val in config.items():
    if type(val) is str:
        try:
            config[key] = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            pass

# 提取DimeNet++模型配置参数
model_name = config['model_name']
if model_name != "dimenet++":
    raise ValueError(f"只支持DimeNet++模型，当前配置: '{model_name}'")

out_emb_size = config['out_emb_size']
int_emb_size = config['int_emb_size']
basis_emb_size = config['basis_emb_size']
extensive = config['extensive']
    
# 提取通用配置参数
emb_size = config['emb_size']
num_blocks = config['num_blocks']
num_spherical = config['num_spherical']
num_radial = config['num_radial']
output_init = config['output_init']
cutoff = config['cutoff']
envelope_exponent = config['envelope_exponent']
num_before_skip = config['num_before_skip']
num_after_skip = config['num_after_skip']
num_dense_output = config['num_dense_output']

# 提取训练配置参数
num_train = config['num_train']
num_valid = config['num_valid']
data_seed = config['data_seed']
dataset = config['dataset']
logdir = config['logdir']
num_steps = config['num_steps']
ema_decay = config['ema_decay']
learning_rate = config['learning_rate']
warmup_steps = config['warmup_steps']
decay_rate = config['decay_rate']
decay_steps = config['decay_steps']
batch_size = config['batch_size']
evaluation_interval = config['evaluation_interval']
save_interval = config['save_interval']
restart = config['restart']
comment = config['comment']
targets = config['targets']

print(f"配置加载完成！")
print(f"模型: {model_name}")
print(f"冻结主干: {freeze_backbone}")
print(f"预训练路径: {pretrained_model_path}")
print(f"训练步数: {num_steps}")
print(f"批次大小: {batch_size}")


配置加载完成！
模型: dimenet++
冻结主干: True
预训练路径: pretrained/dimenet_pp/alpha
训练步数: 50000
批次大小: 16


In [4]:
# 加载FIA数据集
json_file = 'data/FIA49k_Al.json'
xlsx_file = 'data/FIA49k.xlsx'

# 创建数据容器
data_container = DataContainer(
    json_file=json_file, 
    xlsx_file=xlsx_file, 
    cutoff=cutoff, 
    target_keys=None  # 自动从Excel文件确定目标键
)

print(f"数据容器加载完成！")
print(f"目标键: {data_container.target_keys}")
print(f"数据量: {len(data_container)}")
print(f"目标值形状: {data_container.targets.shape}")


成功提取 2215 个化合物的FIA目标值
目标值形状: (2215, 4)
FIA列: ['fia_gas-DSDBLYP', 'fia_gas-PBEh3c', 'fia_solv-DSDBLYP', 'fia_solv-PBEh3c']
目标值标准化完成 - 均值: [469.7716  461.22308 282.7777  274.22876], 标准差: [57.100338 61.692844 47.78473  52.19486 ]
数据容器加载完成！
目标键: ['fia_gas-DSDBLYP', 'fia_gas-PBEh3c', 'fia_solv-DSDBLYP', 'fia_solv-PBEh3c']
数据量: 2215
目标值形状: (2215, 4)


In [5]:
# 显示数据集划分情况
print("=== 数据集划分 ===")
total_samples = len(data_container)
num_test = total_samples - num_train - num_valid

print(f"总样本数: {total_samples}")
print(f"训练集: {num_train} 样本 ({num_train/total_samples*100:.1f}%)")
print(f"验证集: {num_valid} 样本 ({num_valid/total_samples*100:.1f}%)")
print(f"测试集: {num_test} 样本 ({num_test/total_samples*100:.1f}%)")

# 验证数据划分
if num_test <= 0:
    print("警告: 测试集样本数为0或负数, 请检查数据划分参数")
else:
    print("数据划分正常")


=== 数据集划分 ===
总样本数: 2215
训练集: 1600 样本 (72.2%)
验证集: 400 样本 (18.1%)
测试集: 215 样本 (9.7%)
数据划分正常


In [6]:
# 创建DimeNet++模型
print("=== 创建DimeNet++模型 ===")
model = create_dimenet_pp_from_data_container(
    data_container,
    emb_size=emb_size, out_emb_size=out_emb_size,
    int_emb_size=int_emb_size, basis_emb_size=basis_emb_size,
    num_blocks=num_blocks, num_spherical=num_spherical, num_radial=num_radial,
    cutoff=cutoff, envelope_exponent=envelope_exponent,
    num_before_skip=num_before_skip, num_after_skip=num_after_skip,
    num_dense_output=num_dense_output,
    activation=swish, extensive=extensive, output_init=output_init,
    freeze_backbone=freeze_backbone)

print(f"模型创建完成！目标数量: {model.num_targets}")

# 构建模型（通过前向传播）
print("构建模型...")
data_provider = DataProvider(data_container, num_train, num_valid, batch_size,
                           seed=data_seed, randomized=True)
train_dataset = data_provider.get_dataset('train').prefetch(tf.data.experimental.AUTOTUNE)
train_dataset_iter = iter(train_dataset)
inputs, targets = next(train_dataset_iter)
_ = model(inputs)  # 这会构建模型

print(f"模型构建完成！")
print(f"模型参数总数: {model.count_params()}")

# 显示可训练层信息（在模型构建后）
if hasattr(model, 'print_trainable_layers'):
    model.print_trainable_layers()
print(f"   可训练参数数量: {model.get_trainable_params_count()}")

=== 创建DimeNet++模型 ===
根据数据容器创建模型，目标数量: 4
目标键: ['fia_gas-DSDBLYP', 'fia_gas-PBEh3c', 'fia_solv-DSDBLYP', 'fia_solv-PBEh3c']
Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB

主干网络已冻结，只训练输出头
主干网络已冻结，只训练输出头
模型创建完成！目标数量: 4
构建模型...
FIA目标值形状: [None, 4]
FIA目标键: ['fia_gas-DSDBLYP', 'fia_gas-PBEh3c', 'fia_solv-DSDBLYP', 'fia_solv-PBEh3c']
目标值数量: 4
FIA目标值形状: (16, 4)
FIA目标值示例: [[-0.07125839  0.03458285  0.03160409  0.14777401]
 [ 0.02963555  0.09814543 -0.43191    -0.3118239 ]]


2025-09-25 09:40:27.810409: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


模型构建完成！
模型参数总数: 1889670
=== 可训练层信息 ===
  emb_block: 已冻结
  rbf_layer: 已冻结
  sbf_layer: 已冻结
  int_block_0: 已冻结
  int_block_1: 已冻结
  int_block_2: 已冻结
  int_block_3: 已冻结
  output_block_0: 可训练, 参数数量: 231936
  output_block_1: 可训练, 参数数量: 231936
  output_block_2: 可训练, 参数数量: 231936
  output_block_3: 可训练, 参数数量: 231936
  output_block_4: 可训练, 参数数量: 231936
   可训练参数数量: 1159680


In [None]:
# 加载预训练权重
if pretrained_model_path and freeze_backbone:
    print("=== 加载预训练权重 ===")
    print(f"正在加载预训练权重: {pretrained_model_path}")
    
    # 检查预训练模型是否存在
    if os.path.exists(pretrained_model_path):
        try:
            # 尝试加载兼容的权重，跳过不兼容的输出层
            print("尝试加载兼容的预训练权重...")
            checkpoint_path = os.path.join(pretrained_model_path, 'ckpt')
            if os.path.exists(checkpoint_path + '.index'):
                # 使用tf.train.Checkpoint只加载兼容的变量
                ckpt = tf.train.Checkpoint(model=model)
                # 只恢复兼容的变量，跳过输出层
                status = ckpt.restore(checkpoint_path)
                # 检查哪些变量被恢复了
                restored_vars = status.expect_partial()
                print(f"预训练权重部分加载成功，跳过了不兼容的输出层")
            else:
                print("未找到checkpoint文件, 跳过预训练权重加载")
            
            # 显示可训练参数数量
            if hasattr(model, 'get_trainable_params_count'):
                trainable_params = model.get_trainable_params_count()
                print(f"可训练参数数量: {trainable_params}")
        except Exception as e:
            print(f"预训练权重加载失败: {e}")
            print("将使用随机初始化的权重")
    else:
        print(f"预训练模型路径不存在: {pretrained_model_path}")
        print("将使用随机初始化的权重")
elif freeze_backbone:
    print("=== 冻结主干网络 ===")
    if hasattr(model, 'get_trainable_params_count'):
        trainable_params = model.get_trainable_params_count()
        print(f"   可训练参数数量: {trainable_params}")
else:
    print("=== 正常训练模式 ===")


=== 加载预训练权重 ===
正在加载预训练权重: pretrained/dimenet_pp/alpha
尝试加载兼容的预训练权重...
✅ 预训练权重部分加载成功，跳过了不兼容的输出层
   可训练参数数量: 1159680


In [7]:
# 创建训练器
print("=== 创建训练器 ===")
trainer = Trainer(model, learning_rate, warmup_steps,
                  decay_steps, decay_rate,
                  ema_decay=ema_decay, max_grad_norm=1000,
                  freeze_backbone=freeze_backbone)

print(f"✅ 训练器初始化完成！")
print(f"   冻结主干: {freeze_backbone}")
print(f"   学习率: {learning_rate}")
print(f"   优化器: {type(trainer.optimizer).__name__}")

=== 创建训练器 ===
✅ 训练器初始化完成！
   冻结主干: True
   学习率: 0.001
   优化器: Adam


In [8]:
# 测试集评估函数
def evaluate_on_test_set(model, data_provider, batch_size, trainer, max_batches=10):
    """在测试集上评估模型"""
    test_dataset = data_provider.get_dataset('test').prefetch(tf.data.experimental.AUTOTUNE)
    test_dataset_iter = iter(test_dataset)
    test_metrics = Metrics('test', data_container.target_keys)
    
    num_test = len(data_container) - num_train - num_valid
    num_batches = min(int(np.ceil(num_test / batch_size)), max_batches)
    
    for i in range(num_batches):
        try:
            inputs, targets = next(test_dataset_iter)
            preds = model(inputs, training=False)
            loss, mae = trainer.compute_loss(targets, preds)
            nsamples = tf.shape(preds)[0]
            test_metrics.update_state(loss, loss, mae, nsamples)
        except StopIteration:
            break
        except Exception as e:
            print(f"测试批次 {i} 出错: {e}")
            break
    
    return test_metrics

In [9]:
# 学习率已经在Trainer中根据freeze_backbone自动调整
print(f"学习率调度器: {type(trainer.optimizer.learning_rate).__name__}")
print(f"基础学习率: {learning_rate}")
print(f"冻结训练: {freeze_backbone}")

学习率调度器: LinearWarmupExponentialDecay
基础学习率: 0.001
冻结训练: True


In [None]:
# 创建训练和验证数据集
print("=== 创建训练和验证数据集 ===")

# 创建训练和验证字典
train = {}
validation = {}

# 创建训练数据集
train['dataset'] = data_provider.get_dataset('train').prefetch(tf.data.experimental.AUTOTUNE)
train['dataset_iter'] = iter(train['dataset'])

# 创建验证数据集
validation['dataset'] = data_provider.get_dataset('val').prefetch(tf.data.experimental.AUTOTUNE)
validation['dataset_iter'] = iter(validation['dataset'])

# 创建训练和验证指标
train['metrics'] = Metrics('train', data_container.target_keys)
validation['metrics'] = Metrics('val', data_container.target_keys)

print(f"✅ 数据集创建完成！")
print(f"   训练集: {num_train} 样本")
print(f"   验证集: {num_valid} 样本")

# 创建日志目录
log_dir = f"logs_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(log_dir, exist_ok=True)
step_ckpt_folder = log_dir
best_loss_file = os.path.join(log_dir, 'best_loss.npz')
best_ckpt_file = os.path.join(log_dir, 'best_ckpt')

# 初始化最佳指标
metrics_best = {k: np.inf for k in validation['metrics'].keys()}
metrics_best['step'] = 0

# 创建summary writer
summary_writer = tf.summary.create_file_writer(log_dir)

print(f"✅ 日志目录创建完成: {log_dir}")

# 测试数据集迭代器和模型
print("🔍 测试数据集迭代器...")
try:
    test_inputs, test_targets = next(train['dataset_iter'])
    print(f"✅ 训练数据集迭代器正常 - 输入形状: {test_inputs['node_attr'].shape}, 目标形状: {test_targets.shape}")
except Exception as e:
    print(f"❌ 训练数据集迭代器出错: {e}")

print("🔍 测试模型前向传播...")
try:
    test_preds = model(test_inputs, training=True)
    print(f"✅ 模型前向传播正常 - 预测形状: {test_preds.shape}")
except Exception as e:
    print(f"❌ 模型前向传播出错: {e}")

### Set up checkpointing and load latest checkpoint

=== 创建训练和验证数据集 ===
✅ 数据集创建完成！
   训练集: 1600 样本
   验证集: 400 样本
✅ 日志目录创建完成: logs_20250922_204908
🔍 测试数据集迭代器...
❌ 训练数据集迭代器出错: 'node_attr'
🔍 测试模型前向传播...
✅ 模型前向传播正常 - 预测形状: (16, 4)


In [12]:
# Set up checkpointing
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=trainer.optimizer, model=model)
manager = tf.train.CheckpointManager(ckpt, step_ckpt_folder, max_to_keep=3)

# Restore latest checkpoint
ckpt_restored = tf.train.latest_checkpoint(log_dir)
if ckpt_restored is not None:
    ckpt.restore(ckpt_restored)

### Training loop

Note that the warning `UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.` is expected. It is due to the backward pass of `tf.gather` producing sparse gradients, which the previous layer has to convert to a dense tensor.

In [None]:
with summary_writer.as_default():
    steps_per_epoch = int(np.ceil(num_train / batch_size))
    
    print(f"🚀 开始训练！总步数: {num_steps}, 每轮: {steps_per_epoch}步")
    print("=" * 50)

    if ckpt_restored is not None:
        step_init = ckpt.step.numpy()
        print(f"📂 从检查点恢复: 步骤 {step_init}")
    else:
        step_init = 1
        print(f"🆕 从头开始训练")
    
    for step in range(step_init, num_steps + 1):
        # Update step number
        ckpt.step.assign(step)
        tf.summary.experimental.set_step(step)

        # 添加调试输出
        if step <= 5 or step % 10 == 0:
            print(f"🔍 步骤 {step}: 开始训练...")
        
        try:
            # Perform training step
            loss = trainer.train_on_batch(train['dataset_iter'], train['metrics'])
            
            if step <= 5 or step % 10 == 0:
                print(f"✅ 步骤 {step}: 训练完成，损失 = {loss:.6f}")
                
        except StopIteration:
            print(f"⚠️ 步骤 {step}: 数据集迭代器耗尽，重新创建...")
            train['dataset_iter'] = iter(train['dataset'])
            loss = trainer.train_on_batch(train['dataset_iter'], train['metrics'])
        except Exception as e:
            print(f"❌ 步骤 {step}: 训练出错 - {e}")
            break

        # 每100步显示训练进度
        if step % 100 == 0:
            epoch = step // steps_per_epoch
            progress = (step - step_init) / (num_steps - step_init + 1) * 100
            print(f"📊 步骤 {step}/{num_steps} (轮次 {epoch+1}) - 进度: {progress:.1f}% - 损失: {train['metrics'].loss:.6f}")

        # Save progress
        if (step % save_interval == 0):
            manager.save()
            print(f"💾 检查点已保存 (步骤 {step})")

        # Evaluate model and log results
        if (step % evaluation_interval == 0):
            print(f"\n🔍 评估模型 (步骤 {step})...")

            # Save backup variables and load averaged variables
            trainer.save_variable_backups()
            trainer.load_averaged_variables()

            # Compute results on the validation set
            for i in range(int(np.ceil(num_valid / batch_size))):
                trainer.test_on_batch(validation['dataset_iter'], validation['metrics'])

            # Update and save best result
            if validation['metrics'].mean_mae < metrics_best['mean_mae_val']:
                metrics_best['step'] = step
                metrics_best.update(validation['metrics'].result())

                np.savez(best_loss_file, **metrics_best)
                model.save_weights(best_ckpt_file)
                print(f"🏆 新的最佳模型已保存! 验证MAE: {validation['metrics'].mean_mae:.6f}")

            for key, val in metrics_best.items():
                if key != 'step':
                    tf.summary.scalar(key + '_best', val)

            epoch = step // steps_per_epoch
            print(f"📈 步骤 {step}/{num_steps} (轮次 {epoch+1}): 训练损失: {train['metrics'].loss:.6f}, 验证损失: {validation['metrics'].loss:.6f}")
            print(f"   训练MAE: {train['metrics'].mean_mae:.6f}, 验证MAE: {validation['metrics'].mean_mae:.6f}, 最佳验证MAE: {metrics_best['mean_mae_val']:.6f}")
            print("-" * 50)

            train['metrics'].write()
            validation['metrics'].write()

            train['metrics'].reset_states()
            validation['metrics'].reset_states()

            # Restore backup variables
            trainer.restore_variable_backups()
    
    print(f"\n🎉 训练完成! 最佳验证MAE: {metrics_best['mean_mae_val']:.6f}, 模型保存在: {log_dir}")

🚀 开始训练！总步数: 50000, 每轮: 100步
🆕 从头开始训练
🔍 步骤 1: 开始训练...
✅ 步骤 1: 训练完成，损失 = 239.933670
🔍 步骤 2: 开始训练...
✅ 步骤 2: 训练完成，损失 = 180.053955
🔍 步骤 3: 开始训练...
✅ 步骤 3: 训练完成，损失 = 196.125381
🔍 步骤 4: 开始训练...
✅ 步骤 4: 训练完成，损失 = 183.205048
🔍 步骤 5: 开始训练...
✅ 步骤 5: 训练完成，损失 = 253.412033
🔍 步骤 10: 开始训练...
✅ 步骤 10: 训练完成，损失 = 158.679871
🔍 步骤 20: 开始训练...
✅ 步骤 20: 训练完成，损失 = 145.188889
🔍 步骤 30: 开始训练...
✅ 步骤 30: 训练完成，损失 = 147.169403
🔍 步骤 40: 开始训练...
✅ 步骤 40: 训练完成，损失 = 86.645569


In [None]:
# 训练完成后在测试集上评估模型
print("=== 测试集评估 ===")

# 加载最佳模型权重
if os.path.exists(best_ckpt_file):
    print(f"加载最佳模型权重: {best_ckpt_file}")
    model.load_weights(best_ckpt_file)
else:
    print("未找到最佳模型权重，使用当前模型")

# 在测试集上评估
print("在测试集上评估模型...")
test_metrics = evaluate_on_test_set(model, data_provider, batch_size, trainer, max_batches=50)

# 显示测试结果
print(f"\n📊 测试集评估结果:")
print(f"   测试损失: {test_metrics.loss:.6f}")
print(f"   测试MAE: {test_metrics.mean_mae:.6f}")

# 显示每个目标的详细结果
print(f"\n📈 各目标详细结果:")
for i, target_key in enumerate(data_container.target_keys):
    target_mae = test_metrics.mae[i] if hasattr(test_metrics, 'mae') else "N/A"
    print(f"   {target_key}: MAE = {target_mae}")

# 保存测试结果
test_results = {
    'test_loss': test_metrics.loss,
    'test_mae': test_metrics.mean_mae,
    'target_keys': data_container.target_keys
}
np.savez(os.path.join(log_dir, 'test_results.npz'), **test_results)
print(f"\n💾 测试结果已保存到: {os.path.join(log_dir, 'test_results.npz')}")

print("\n🎯 训练和测试完成！所有结果已保存到日志目录。")
