In [2]:
import numpy as np
from gym.utils import seeding
from gym import spaces
import gym
from stable_baselines3 import PPO
import pandas as pd

2023-06-15 07:11:29.694675: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
class TradingEnv(gym.Env):
    """Single-asset trading environment driven by a price DataFrame.

    Each row of ``df`` is one time step. The agent trades exactly one unit per
    step at that row's ``'close'`` price.

    Actions (``Discrete(3)``):
        0 -- buy one unit, 1 -- hold, 2 -- sell one unit.

    Observation:
        All columns of the current row followed by ``holdings`` and ``cash``,
        i.e. a float32 vector of length ``df.shape[1] + 2``.

    Reward:
        Change in portfolio value (``cash + holdings * close``) relative to the
        value recorded at the previous step.

    Note:
        No check is made on available cash or holdings, so both may become
        negative (buying on credit / short selling). The observation bounds
        are therefore unbounded in both directions.
    """

    def __init__(self, df):
        """Store the price data, declare the spaces, and reset the state.

        Args:
            df: DataFrame with one row per time step. Must contain a
                ``'close'`` column; every column is exposed in the observation.
        """
        super(TradingEnv, self).__init__()

        # Price data
        self.df = df
        # The reward is a difference of portfolio values: it can be negative
        # and has no fixed upper bound.
        self.reward_range = (-np.inf, np.inf)

        # Define the action and observation spaces.
        self.action_space = spaces.Discrete(3)
        # +2 for holdings and cash. The lower bound is -inf because cash and
        # holdings are not constrained to stay non-negative by step().
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.df.shape[1] + 2,),
            dtype=np.float32,
        )

        # Initialise the episode state.
        self.reset()

    def _get_observation(self):
        """Build the observation for ``self.current_step``.

        The row index is clamped to the last row so that the terminal
        observation (returned together with ``done=True``) is still valid.
        """
        row_index = min(self.current_step, len(self.df) - 1)
        row = self.df.iloc[row_index]
        return np.append(row, [self.holdings, self.cash]).astype(np.float32)

    def step(self, action):
        """Execute ``action`` at the current row's close price and advance.

        Args:
            action: 0 (buy one unit), 1 (hold) or 2 (sell one unit).

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``observation``
            describes the row the *next* action will trade on, ``reward`` is
            the change in portfolio value, ``done`` is True once every row has
            been consumed, and ``info`` is an empty dict.
        """
        price = self.df.iloc[self.current_step]['close']
        previous_value = self.portfolio_value

        if action == 0:  # Buy
            self.cash -= price
            self.holdings += 1
        elif action == 1:  # Hold
            pass
        elif action == 2:  # Sell
            self.cash += price
            self.holdings -= 1

        # Trading at `price` leaves the value unchanged at this instant, so the
        # reward reflects the price move on the holdings carried into the step.
        self.portfolio_value = self.cash + self.holdings * price
        reward = self.portfolio_value - previous_value

        self.current_step += 1
        done = self.current_step >= len(self.df)

        # Return the observation for the new current_step, not the row that
        # was just traded on; otherwise the agent always acts on stale data.
        return self._get_observation(), reward, done, {}

    def reset(self):
        """Reset to the first row with no holdings and the initial cash.

        Returns:
            The observation for the first row.
        """
        self.current_step = 0
        self.holdings = 0  # number of units held
        self.cash = 1000000  # cash balance
        self.portfolio_value = self.cash
        return self._get_observation()


In [None]:
def evaluate(model, num_episodes=100):
    """
    Roll out the model on its own environment and report the episode returns.

    The environment is obtained from ``model.get_env()``. For each episode the
    environment is reset and stepped with ``model.predict`` until ``done`` is
    truthy, summing the rewards. The loop condition ``while not done`` requires
    ``done`` to be a scalar or a length-1 array, so this works for a single
    (possibly vectorised) environment only.

    Args:
        model: Object exposing ``get_env()`` and ``predict(obs)``; ``predict``
            must return a pair whose first element is the action.
        num_episodes: Number of episodes to run.

    Returns:
        Tuple ``(mean_reward, std_reward)`` of the per-episode returns as
        floats. Both are NaN when ``num_episodes`` is 0.
    """
    env = model.get_env()  # environment attached to the model
    all_rewards = []  # one accumulated return per episode

    for _episode in range(num_episodes):
        obs = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, _state = model.predict(obs)
            obs, reward, done, _info = env.step(action)
            total_reward += reward

        all_rewards.append(total_reward)

    if all_rewards:
        mean_reward = np.mean(all_rewards)
        std_reward = np.std(all_rewards)
    else:
        # Avoid numpy's empty-slice warning; the printed text is still "nan".
        mean_reward = std_reward = float("nan")

    print(f"Mean reward: {mean_reward} +/- {std_reward}")

    # Hand the statistics back so callers can log or compare them.
    return float(mean_reward), float(std_reward)

In [6]:
# Load the price data.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which looks
# like a saved row index; as written it becomes part of every observation.
# Consider read_csv(..., index_col=0) if that is not intended -- confirm.
df = pd.read_csv('../csv/test_combined_data_small.csv')

# The date strings cannot be used as numeric features, so convert them to
# seconds elapsed since the earliest timestamp in the file.
df['date'] = pd.to_datetime(df['date'])
df['date'] = (df['date'] - df['date'].min()).dt.total_seconds()
display(df)

# Create the environment.
env = TradingEnv(df)

# Create the agent.
# NOTE(review): no seed is passed, so training results will differ between runs.
model = PPO('MlpPolicy', env, verbose=1)

# Train the agent.
model.learn(total_timesteps=10000)

# Evaluate the model. evaluate() uses model.get_env(), i.e. the same data the
# agent was trained on, so this is an in-sample score.
evaluate(model, num_episodes=100)

# Save the model.
model.save("../model/rl_v1.zip")

# Load the model.
# model = PPO.load("../model/rl_v1.zip")

Unnamed: 0,date,open,high,low,close
0,0.0,4130.3,4132.3,4128.5,4128.9
1,60.0,4128.9,4130.0,4127.0,4128.6
2,120.0,4128.7,4129.3,4128.0,4128.4
3,180.0,4128.4,4129.3,4127.0,4129.2
4,240.0,4129.5,4132.0,4129.5,4130.4
...,...,...,...,...,...
994,59640.0,4154.1,4154.9,4153.7,4154.4
995,59700.0,4154.4,4154.9,4153.5,4153.7
996,59760.0,4153.7,4153.8,4152.5,4152.6
997,59820.0,4152.6,4152.7,4149.1,4149.5


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 999      |
|    ep_rew_mean     | 101      |
| time/              |          |
|    fps             | 1583     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 999         |
|    ep_rew_mean          | 1e+03       |
| time/                   |             |
|    fps                  | 1225        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009403418 |
|    clip_fraction        | 0.0346      |
|    clip_range           | 0.2         |
|    entropy_loss   