In [9]:
import numpy as np
from gym.utils import seeding
from gym import spaces
import gym
from stable_baselines3 import PPO
import pandas as pd

In [13]:
class TradingEnv(gym.Env):
    """Skeleton trading environment that replays the rows of a DataFrame.

    Each row of ``df`` is one time step and is exposed to the agent as a
    ``float32`` vector with one entry per column. Three discrete actions are
    declared (0 = Buy, 1 = Hold, 2 = Sell); their effects and the reward
    signal are still placeholders, so every step currently yields reward 0.

    Args:
        df: pandas DataFrame holding numeric data, one row per time step.
    """

    def __init__(self, df):
        super(TradingEnv, self).__init__()

        # Price data; each row is one time step.
        self.df = df
        self.reward_range = (0, 1)

        # Define the action and observation spaces.
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(
            low=0, high=np.inf, shape=(self.df.shape[1],), dtype=np.float32
        )

        # Initialise the environment state.
        self.reset()

    def _get_observation(self):
        """Return the row at ``current_step`` as an array in the observation dtype."""
        return np.asarray(self.df.iloc[self.current_step], dtype=np.float32)

    def step(self, action):
        """Apply ``action`` and advance the environment by one row.

        Args:
            action: 0 (Buy), 1 (Hold) or 2 (Sell).

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``obs`` is the row
            reached *after* the action, ``reward`` is currently always 0,
            ``done`` is True once the last row has been reached, and ``info``
            is an empty dict.
        """
        # TODO: compute the state change and reward caused by the action.
        done = False
        reward = 0

        if action == 0:  # Buy
            pass  # TODO: add logic
        elif action == 1:  # Hold
            pass  # TODO: add logic
        elif action == 2:  # Sell
            pass  # TODO: add logic

        # Advance before reading the observation so the agent receives the
        # state that follows its action instead of the row it already saw.
        self.current_step += 1

        last_index = len(self.df) - 1
        if self.current_step >= last_index:
            done = True
            # Clamp so a stray step() after the episode ended cannot index
            # past the end of the DataFrame.
            self.current_step = last_index

        return self._get_observation(), reward, done, {}

    def reset(self):
        """Rewind to the first row and return it as the initial observation."""
        self.current_step = 0
        return self._get_observation()


In [16]:
def evaluate(model, num_episodes=100, deterministic=False):
    """Run the model for several episodes and report the episode rewards.

    The environment is obtained from ``model.get_env()``. The loop relies on
    ``done`` being usable as a single truth value, i.e. a plain bool or a
    one-element array.
    NOTE(review): presumably this means a vectorized env wrapping exactly one
    sub-environment -- confirm before using it with several parallel envs.

    Args:
        model: Object exposing ``get_env()`` and
            ``predict(obs, deterministic=...)``.
        num_episodes: Number of episodes to run; must be at least 1.
        deterministic: Forwarded to ``model.predict``. The default (False)
            keeps the previous sampling behaviour.

    Returns:
        Tuple ``(mean_reward, std_reward)`` as Python floats, computed over
        the per-episode cumulative rewards.

    Raises:
        ValueError: If ``num_episodes`` is smaller than 1.
    """
    if num_episodes < 1:
        # An empty reward list would only yield NaN plus a NumPy warning.
        raise ValueError("num_episodes must be at least 1")

    env = model.get_env()  # environment attached to the model
    all_rewards = []  # cumulative reward of each episode

    for _ in range(num_episodes):
        obs = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, _ = model.predict(obs, deterministic=deterministic)
            obs, reward, done, _ = env.step(action)
            total_reward += reward

        all_rewards.append(total_reward)

    mean_reward = np.mean(all_rewards)
    std_reward = np.std(all_rewards)

    print(f"Mean reward: {mean_reward} +/- {std_reward}")

    # Hand the statistics back so callers can log or compare them.
    return float(mean_reward), float(std_reward)

In [17]:
# Load the data. The first CSV column is a saved row index; without
# index_col=0 it is read as a feature column named "Unnamed: 0" and would be
# fed to the agent as part of every observation.
df = pd.read_csv('../csv/combined_data.csv', index_col=0)

# The date strings cannot be used as model input as-is, so convert them to
# the number of seconds elapsed since the earliest timestamp.
timestamps = pd.to_datetime(df['date'])
df['date'] = (timestamps - timestamps.min()).dt.total_seconds()
display(df)

# Create the environment
env = TradingEnv(df)

# Create the agent
model = PPO('MlpPolicy', env, verbose=1)

# Train the agent
model.learn(total_timesteps=10000)

# Evaluate the model
# NOTE(review): an episode only ends after walking the whole DataFrame, so
# 100 evaluation episodes can take very long on a large file -- consider a
# smaller num_episodes or a shorter evaluation slice.
evaluate(model, num_episodes=100)

# Save the model
model.save("../model/rl_v1.zip")

# Load the model back
model = PPO.load("../model/rl_v1.zip")

Unnamed: 0,date,open,high,low,close
0,0.0,2507.9,2508.7,2506.3,2508.7
1,60.0,2508.7,2512.0,2508.5,2512.0
2,120.0,2512.0,2516.8,2511.5,2516.0
3,180.0,2516.2,2516.4,2514.9,2514.9
4,240.0,2514.9,2515.7,2514.8,2515.0
...,...,...,...,...,...
1456555,133909800.0,4139.0,4139.1,4138.9,4139.0
1456556,133909860.0,4139.0,4139.3,4138.9,4139.2
1456557,133909920.0,4139.2,4139.5,4139.1,4139.4
1456558,133909980.0,4139.5,4139.6,4138.8,4138.9


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1776 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1252        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010931309 |
|    clip_fraction        | 0.0606      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.736       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0255     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00767    |
|    value_loss         