In [None]:
## install finrl library
!pip install git+https://github.com/AI4Finance-Foundation/FinRL.git

# 导入库

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import os
from stable_baselines3.common.logger import configure
from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv
import glob
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False    # 用来正常显示负号
import torch
import time
from stable_baselines3.common.logger import configure
from stable_baselines3.common.utils import set_random_seed
# Report whether PyTorch can see a CUDA device; `use_cuda` is reused by later cells.
print("检查GPU可用性...")
use_cuda = torch.cuda.is_available()
if use_cuda:
    cuda_device_count = torch.cuda.device_count()
    cuda_device_name = torch.cuda.get_device_name(0)
    print(f"✓ 发现 {cuda_device_count} 个可用的GPU设备")
    print(f"✓ 当前使用: {cuda_device_name}")
else:
    print("✗ 未发现可用的GPU，将使用CPU进行训练")

# Make sure the directory the trained models are saved into exists.
check_and_make_directories([TRAINED_MODEL_DIR])

# Seed the random number generators so runs are repeatable. Forwarding the CUDA
# flag lets Stable-Baselines3 also apply its GPU-side determinism settings; with
# the default (using_cuda=False) those are skipped even when training on a GPU.
set_random_seed(0, using_cuda=use_cuda)

# 加载数据

In [None]:
# Path of the pre-processed training set (the error message below points to
# process_data.ipynb as the notebook to run first when it is missing).
processed_data_file = "data/processed_data/train_data_small.csv"

# Stop early with an actionable message when the CSV has not been generated yet.
if os.path.exists(processed_data_file) is False:
    raise FileNotFoundError(
        f"找不到处理后的数据文件: {processed_data_file}，请先运行 process_data.ipynb"
    )

# Read the CSV, promote its first column to the index and blank the index name.
train = pd.read_csv(processed_data_file)
index_column = train.columns[0]
train = train.set_index(index_column)
train.index.name = ""

print(f"加载训练数据: {len(train)} 条记录")

# Preview the first rows as the cell output.
train.head()

# 构建交易环境

In [None]:
# Derive the environment dimensions from the training data.
stock_dimension = len(train["tic"].unique())
# 1 + 2 values per ticker + one value per (indicator, ticker) pair.
# NOTE(review): presumably cash + (price, holdings) per ticker + indicators,
# matching the state layout StockTradingEnv expects — confirm.
state_space = 1 + 2 * stock_dimension + len(INDICATORS) * stock_dimension
print(f"股票数量: {stock_dimension}, 状态空间维度: {state_space}")

# Per-ticker transaction cost of 0.1%. Two independent lists are built on
# purpose: the previous chained assignment bound both names to the SAME list
# object, so an in-place change to one cost list would silently alter the other.
buy_cost_list = [0.001] * stock_dimension
sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension  # start without holding any shares

env_kwargs = {
    # NOTE(review): hmax is passed through to StockTradingEnv unchanged; it
    # looks like a per-trade share cap rather than a position limit — confirm
    # against the installed FinRL version.
    "hmax": 503,
    "initial_amount": 1000000,  # starting cash
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4,
}

# Build the training environment and obtain the Stable-Baselines3 wrapper
# returned by get_sb_env(); its second return value is not used here.
e_train_gym = StockTradingEnv(df=train, **env_kwargs)
env_train, _ = e_train_gym.get_sb_env()
print(f"环境类型: {type(env_train)}")

# 算法选择

In [None]:
# Algorithm switches: set a flag to True to train the corresponding agent.
if_using_a2c = False
if_using_ddpg = False
if_using_ppo = True
if_using_td3 = True
if_using_sac = True

# Device handles. Without CUDA both names refer to the same CPU device object.
cpu_device = torch.device("cpu")
if use_cuda:
    gpu_device = torch.device("cuda")
    print(f"CPU设备: {cpu_device}")
    print(f"GPU设备: {gpu_device}")
else:
    gpu_device = cpu_device
    print("未检测到GPU，所有模型将使用CPU")

# Training budgets. A2C keeps a fixed budget; the other algorithms get a larger
# budget when a GPU is available.
a2c_timesteps = 50000
if use_cuda:
    ddpg_timesteps = ppo_timesteps = 100000
    td3_timesteps = sac_timesteps = 80000
else:
    ddpg_timesteps = ppo_timesteps = 50000
    td3_timesteps = sac_timesteps = 30000

# Summarise which algorithms are enabled and the device label shown for each.
accelerated_label = "GPU" if use_cuda else "CPU"
print("选中的算法及其训练设备:")
for algo_label, algo_enabled, device_label in (
    ("A2C", if_using_a2c, "CPU"),
    ("DDPG", if_using_ddpg, accelerated_label),
    ("PPO", if_using_ppo, accelerated_label),
    ("TD3", if_using_td3, accelerated_label),
    ("SAC", if_using_sac, accelerated_label),
):
    selection_mark = "✓" if algo_enabled else "✗"
    print(f"{algo_label}: {selection_mark} (设备: {device_label})")

## A2C 模型

In [None]:
# Train an A2C agent (the whole cell is skipped unless its flag is set).
if if_using_a2c:
    print("\n======== 开始训练 A2C 模型 ========")
    agent = DRLAgent(env=env_train)

    # A2C hyper-parameters. Note that this model is pinned to the CPU device.
    A2C_PARAMS = dict(
        n_steps=5,
        ent_coef=0.01,
        learning_rate=0.0007,
        device=cpu_device,
    )
    model_a2c = agent.get_model("a2c", model_kwargs=A2C_PARAMS)

    # Send training logs to stdout, CSV and TensorBoard under the results folder.
    tmp_path = f"{RESULTS_DIR}/a2c"
    new_logger_a2c = configure(tmp_path, ["stdout", "csv", "tensorboard"])
    model_a2c.set_logger(new_logger_a2c)

    # Run the training and measure wall-clock time around it.
    train_start_time = time.time()
    print(f"开始训练，总步数: {a2c_timesteps}")
    trained_a2c = agent.train_model(
        total_timesteps=a2c_timesteps,
        tb_log_name="a2c",
        model=model_a2c,
    )
    train_end_time = time.time()

    train_time = train_end_time - train_start_time
    train_minutes = train_time / 60
    print(f"A2C 训练完成，耗时: {train_time:.2f}秒 ({train_minutes:.2f}分钟)")

    # Persist the trained agent.
    save_path = f"{TRAINED_MODEL_DIR}/agent_a2c"
    trained_a2c.save(save_path)
    print(f"模型已保存至 {save_path}")

## DDPG 模型

In [None]:
# Train a DDPG agent in stages (the whole cell is skipped unless its flag is set).
if if_using_ddpg:
    import gc  # kept local to this cell, as in the original notebook

    print("\n======== 开始训练 DDPG 模型 ========")
    agent = DRLAgent(env=env_train)

    # DDPG hyper-parameters, chosen small to limit memory use.
    DDPG_PARAMS = {
        "buffer_size": 10000,  # reduced replay buffer
        "learning_rate": 0.0005,
        "batch_size": 32,  # reduced batch size
        "device": gpu_device,  # GPU when available, otherwise the CPU device
    }

    model_ddpg = agent.get_model("ddpg", model_kwargs=DDPG_PARAMS)

    # Log to stdout and CSV only (no TensorBoard sink for this model).
    tmp_path = RESULTS_DIR + "/ddpg"
    new_logger_ddpg = configure(tmp_path, ["stdout", "csv"])
    model_ddpg.set_logger(new_logger_ddpg)

    # Split the budget into stages. The previous floor division dropped any
    # remainder (and produced zero stages when the budget was smaller than one
    # stage); the leftover steps now run as a final, shorter stage so the number
    # of steps trained always equals the number announced.
    train_start_time = time.time()
    total_steps = ddpg_timesteps
    steps_per_stage = 10000  # steps per full stage
    full_stages, leftover_steps = divmod(total_steps, steps_per_stage)
    stage_sizes = [steps_per_stage] * full_stages
    if leftover_steps:
        stage_sizes.append(leftover_steps)
    stages = len(stage_sizes)

    print(f"开始分阶段训练DDPG，总步数: {total_steps}，分为{stages}个阶段")

    # NOTE(review): each stage is a separate train_model() call; whether step
    # counters carry over between calls depends on FinRL's train_model — confirm.
    for stage, stage_steps in enumerate(stage_sizes):
        print(f"阶段 {stage+1}/{stages}，训练步数: {stage_steps}")
        model_ddpg = agent.train_model(
            model=model_ddpg,
            tb_log_name=f"ddpg_stage_{stage}",
            total_timesteps=stage_steps,
        )

        # Force a garbage-collection pass between stages.
        gc.collect()

        # Write a checkpoint after every second stage so that a late failure
        # does not lose all progress.
        if (stage + 1) % 2 == 0:
            checkpoint_path = f"{TRAINED_MODEL_DIR}/agent_ddpg_checkpoint_{stage+1}"
            model_ddpg.save(checkpoint_path)
            print(f"保存阶段性检查点: {checkpoint_path}")

    # Report wall-clock training time.
    train_end_time = time.time()
    train_time = train_end_time - train_start_time
    print(f"DDPG 训练完成，耗时: {train_time:.2f}秒 ({train_time/60:.2f}分钟)")

    # Persist the model returned by the last stage as the final agent.
    trained_ddpg = model_ddpg
    trained_ddpg.save(TRAINED_MODEL_DIR + "/agent_ddpg")
    print(f"模型已保存至 {TRAINED_MODEL_DIR}/agent_ddpg")

    # Final garbage-collection pass.
    gc.collect()
    print("内存已清理")

## PPO 模型

In [None]:
# Train a PPO agent in stages, checkpointing after every stage (the whole cell
# is skipped unless its flag is set).
if if_using_ppo:
    import gc  # kept local to this cell, as in the original notebook

    print("\n======== 开始训练 PPO 模型 ========")
    agent = DRLAgent(env=env_train)

    # PPO hyper-parameters; a larger batch is used when CUDA is available.
    PPO_PARAMS = {
        "n_steps": 2048,
        "ent_coef": 0.01,
        "learning_rate": 0.00025,
        "batch_size": 256 if use_cuda else 128,
        "device": gpu_device,  # GPU when available, otherwise the CPU device
    }
    model_ppo = agent.get_model("ppo", model_kwargs=PPO_PARAMS)

    # Log to stdout and CSV only (no TensorBoard sink for this model).
    tmp_path = RESULTS_DIR + "/ppo"
    new_logger_ppo = configure(tmp_path, ["stdout", "csv"])
    model_ppo.set_logger(new_logger_ppo)

    # Split the budget into stages. Floor division alone dropped the remainder:
    # with the CPU budget of 50000 and 20000 steps per stage only 40000 steps
    # were requested while 50000 were announced. The leftover steps now run as a
    # final, shorter stage.
    train_start_time = time.time()
    total_steps = ppo_timesteps
    steps_per_stage = 20000  # steps per full stage
    full_stages, leftover_steps = divmod(total_steps, steps_per_stage)
    stage_sizes = [steps_per_stage] * full_stages
    if leftover_steps:
        stage_sizes.append(leftover_steps)
    stages = len(stage_sizes)

    print(f"开始分阶段训练PPO，总步数: {total_steps}，分为{stages}个阶段")

    # NOTE(review): each stage is a separate train_model() call; whether step
    # counters carry over between calls depends on FinRL's train_model — confirm.
    try:
        for stage, stage_steps in enumerate(stage_sizes):
            print(f"阶段 {stage+1}/{stages}，训练步数: {stage_steps}")
            model_ppo = agent.train_model(
                model=model_ppo,
                tb_log_name=f"ppo_stage_{stage}",
                total_timesteps=stage_steps,
            )

            # Force a garbage-collection pass between stages.
            gc.collect()

            # Write a checkpoint after every stage.
            checkpoint_path = f"{TRAINED_MODEL_DIR}/agent_ppo_checkpoint_{stage+1}"
            model_ppo.save(checkpoint_path)
            print(f"保存阶段性检查点: {checkpoint_path}")

    except KeyboardInterrupt:
        # A manual interrupt keeps whatever has been learned so far; execution
        # then continues below and the same model is also saved as the final one.
        print("\n训练被用户中断，保存当前模型...")
        interrupt_path = f"{TRAINED_MODEL_DIR}/agent_ppo_interrupted"
        model_ppo.save(interrupt_path)
        print(f"中断模型已保存至: {interrupt_path}")

    # Report wall-clock training time.
    train_end_time = time.time()
    train_time = train_end_time - train_start_time
    print(f"PPO 训练完成，耗时: {train_time:.2f}秒 ({train_time/60:.2f}分钟)")

    # Persist the model returned by the last completed stage as the final agent.
    trained_ppo = model_ppo
    trained_ppo.save(TRAINED_MODEL_DIR + "/agent_ppo")
    print(f"最终模型已保存至 {TRAINED_MODEL_DIR}/agent_ppo")

    # Final garbage-collection pass.
    gc.collect()
    print("内存已清理")

## TD3 模型

In [None]:
# Train a TD3 agent (the whole cell is skipped unless its flag is set).
if if_using_td3:
    print("\n======== 开始训练 TD3 模型 ========")
    agent = DRLAgent(env=env_train)

    # TD3 hyper-parameters; a larger batch is used when CUDA is available.
    td3_batch_size = 256 if use_cuda else 100
    TD3_PARAMS = dict(
        batch_size=td3_batch_size,
        buffer_size=1000000,
        learning_rate=0.001,
        device=gpu_device,  # GPU when available, otherwise the CPU device
    )
    model_td3 = agent.get_model("td3", model_kwargs=TD3_PARAMS)

    # Send training logs to stdout, CSV and TensorBoard under the results folder.
    tmp_path = f"{RESULTS_DIR}/td3"
    new_logger_td3 = configure(tmp_path, ["stdout", "csv", "tensorboard"])
    model_td3.set_logger(new_logger_td3)

    # Run the training and measure wall-clock time around it.
    train_start_time = time.time()
    print(f"开始训练，总步数: {td3_timesteps}")
    trained_td3 = agent.train_model(
        total_timesteps=td3_timesteps,
        tb_log_name="td3",
        model=model_td3,
    )
    train_end_time = time.time()

    train_time = train_end_time - train_start_time
    train_minutes = train_time / 60
    print(f"TD3 训练完成，耗时: {train_time:.2f}秒 ({train_minutes:.2f}分钟)")

    # Persist the trained agent.
    save_path = f"{TRAINED_MODEL_DIR}/agent_td3"
    trained_td3.save(save_path)
    print(f"模型已保存至 {save_path}")

## SAC 模型

In [None]:
# Train a SAC agent (the whole cell is skipped unless its flag is set).
if if_using_sac:
    print("\n======== 开始训练 SAC 模型 ========")
    agent = DRLAgent(env=env_train)

    # SAC hyper-parameters; a larger batch is used when CUDA is available.
    sac_batch_size = 256 if use_cuda else 128
    SAC_PARAMS = dict(
        batch_size=sac_batch_size,
        buffer_size=300000,
        learning_rate=0.0001,
        learning_starts=100,
        ent_coef="auto_0.1",
        device=gpu_device,  # GPU when available, otherwise the CPU device
    )
    model_sac = agent.get_model("sac", model_kwargs=SAC_PARAMS)

    # Send training logs to stdout, CSV and TensorBoard under the results folder.
    tmp_path = f"{RESULTS_DIR}/sac"
    new_logger_sac = configure(tmp_path, ["stdout", "csv", "tensorboard"])
    model_sac.set_logger(new_logger_sac)

    # Run the training and measure wall-clock time around it.
    train_start_time = time.time()
    print(f"开始训练，总步数: {sac_timesteps}")
    trained_sac = agent.train_model(
        total_timesteps=sac_timesteps,
        tb_log_name="sac",
        model=model_sac,
    )
    train_end_time = time.time()

    train_time = train_end_time - train_start_time
    train_minutes = train_time / 60
    print(f"SAC 训练完成，耗时: {train_time:.2f}秒 ({train_minutes:.2f}分钟)")

    # Persist the trained agent.
    save_path = f"{TRAINED_MODEL_DIR}/agent_sac"
    trained_sac.save(save_path)
    print(f"模型已保存至 {save_path}")

# 可视化训练过程

In [None]:
# 可视化训练过程中的模型性能
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os


def visualize_training_results(model_name):
    """Plot and save the reward curve recorded while training ``model_name``.

    The folder ``RESULTS_DIR/<model_name>`` is searched for a ``*.monitor.csv``
    file first. When none exists the function falls back to ``progress.csv`` in
    the same folder, so that runs which only attached a CSV logger can still be
    plotted. The figure is written to ``results/<model_name>_reward.png`` and
    then shown.

    Args:
        model_name: Sub-folder name under ``RESULTS_DIR`` (e.g. ``"ppo"``); also
            used in the plot title and in the output file name.

    Returns:
        None. A message is printed and the function returns early when no log
        file, or no usable reward column, is found.
    """
    log_dir = os.path.join(RESULTS_DIR, model_name)
    # Sorted so the file that gets picked does not depend on directory order.
    monitor_files = sorted(glob.glob(os.path.join(log_dir, "*.monitor.csv")))
    # NOTE(review): the training cells attach a Stable-Baselines3 logger with a
    # "csv" sink, which is expected to write progress.csv here — confirm.
    progress_file = os.path.join(log_dir, "progress.csv")

    if monitor_files:
        # The first line is skipped, as before (presumably a metadata header
        # that precedes the column names — confirm).
        data = pd.read_csv(monitor_files[0], skiprows=1)
    elif os.path.isfile(progress_file):
        data = pd.read_csv(progress_file)
    else:
        print(f"未找到 {model_name} 的训练记录文件")
        return

    # "r" is the column the monitor format was read with originally; the other
    # names are the reward columns a progress.csv is assumed to carry
    # (NOTE(review): verify against an actual log file).
    reward_column = next(
        (
            candidate
            for candidate in ("r", "train/reward", "rollout/ep_rew_mean")
            if candidate in data.columns
        ),
        None,
    )
    if reward_column is None:
        print(f"{model_name} 的训练记录中没有奖励列，可用列: {list(data.columns)}")
        return

    # Drop rows where the reward was not logged and re-number the x axis.
    rewards = data[reward_column].dropna().reset_index(drop=True)
    if rewards.empty:
        print(f"{model_name} 的训练记录中没有奖励数据")
        return

    # Raw rewards plus a rolling mean. min_periods=1 keeps the smoothed curve
    # visible when fewer than 100 rows were logged (it was all-NaN before).
    plt.figure(figsize=(10, 5))
    plt.plot(rewards, label="奖励", alpha=0.3)
    plt.plot(
        rewards.rolling(window=100, min_periods=1).mean(),
        label="奖励均值(窗口=100)",
    )
    plt.title(f"{model_name} 模型训练奖励")
    plt.xlabel("训练步数")
    plt.ylabel("奖励")
    plt.legend()
    # Make sure the output folder exists before saving the figure.
    os.makedirs("results", exist_ok=True)
    plt.savefig(f"results/{model_name}_reward.png")
    plt.show()


# Plot the training curve of every enabled algorithm and collect the upper-case
# names of those algorithms for the summary cell.
algorithm_switches = (
    ("a2c", if_using_a2c),
    ("ddpg", if_using_ddpg),
    ("ppo", if_using_ppo),
    ("td3", if_using_td3),
    ("sac", if_using_sac),
)

trained_models = []
for algo_key, algo_enabled in algorithm_switches:
    if not algo_enabled:
        continue
    visualize_training_results(algo_key)
    trained_models.append(algo_key.upper())

# 总结训练结果

In [None]:
# Print the closing summary: device used, algorithms trained, where the models
# were saved, and the suggested next notebook.
device_summary = "GPU (CUDA)" if use_cuda else "CPU"
model_summary = ", ".join(trained_models)
for summary_line in (
    "\n======== 训练完成 ========",
    f"使用设备: {device_summary}",
    f"训练的模型: {model_summary}",
    f"所有模型已保存至 {TRAINED_MODEL_DIR} 目录",
    "\n下一步: 运行 back_test.ipynb 来评估训练好的模型表现",
):
    print(summary_line)