In [15]:
# Library
import sys, os
sys.path.append(os.path.abspath('../..'))

from hdf5_loader import StockDatasetHDF5
from config import *
import subclass as sc

import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from collections import defaultdict, OrderedDict
from datetime import datetime, timedelta
import os
import gc
import re
import h5py
import random
import importlib
import shutil

import torch
import torch.nn as nn
from torch.utils.data import IterableDataset, DataLoader

np.set_printoptions(precision=4, suppress=True, linewidth=120)

In [16]:
import gymnasium as gym
from gymnasium.spaces import Dict, Box, Discrete

In [17]:
def initialize_log_dir(base_log_dir="./tensorboard_logs"):
    """Wipe the log root and return a timestamped run path beneath it.

    Whatever currently lives at ``base_log_dir`` is deleted and an empty
    directory is created in its place. The returned run path is only
    composed; it is not created on disk.

    Args:
        base_log_dir: Root directory for TensorBoard logs.

    Returns:
        ``base_log_dir`` joined with the current local time formatted as
        ``YYYYMMDD-HHMMSS``.
    """
    log_root_present = os.path.exists(base_log_dir)
    if log_root_present:
        # Remove logs left behind by earlier runs.
        shutil.rmtree(base_log_dir)
    # Recreate an empty log root.
    os.makedirs(base_log_dir, exist_ok=True)

    run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return os.path.join(base_log_dir, run_stamp)

---

In [18]:
class StockTradingEnv(gym.Env):
    """Two-action trading environment driven by a sample generator.

    Observations are pulled one at a time from ``sc.get_samples``. The agent
    is either flat (``position == 0``) or holding (``position == 1``):

    * flat + action 0: stay flat (``tout`` is incremented)
    * flat + action 1: enter at the current price
    * holding + action 1: keep holding (``tin`` is incremented)
    * holding + action 0: exit; the episode terminates and the reward is the
      percentage price change since entry

    When the sample generator runs dry, ``step`` reports ``truncated=True``
    and returns the last valid observation, and ``reset`` rebuilds the
    generator.
    """

    # Sentinel returned by ``_get_obs`` once the sample generator is exhausted.
    _EXHAUSTED = -1

    def __init__(self, hz_dim, targ_hz, ticker_list, date_range, render_mode=None, **kwargs):
        """Build the data source, the sample generator and the gym spaces.

        Args:
            hz_dim: Mapping from horizon key to the second dimension of that
                horizon's observation ``Box``.
            targ_hz: Target horizon key, forwarded to ``sc.get_samples``.
            ticker_list: Tickers forwarded to ``StockDatasetHDF5``.
            date_range: Date range forwarded to ``StockDatasetHDF5``.
            render_mode: ``None`` or one of ``self.metadata["render_modes"]``.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(**kwargs)
        self.hz_dim = hz_dim
        self.targ_hz = targ_hz
        self.ticker_list = ticker_list
        self.date_range = date_range

        self.tout = 0      # steps spent flat in the current episode
        self.tin = 0       # steps spent holding in the current episode
        self.position = 0  # 0 = flat, 1 = holding

        # Defined up front so the attributes always exist, even before the
        # first observation or the first entry.
        self.current_price = None
        self.enter_price = None
        self._last_obs = None

        self.hdf5_inst = StockDatasetHDF5(
            ticker_list=ticker_list,
            date_range=date_range,
        )
        self.envgen = self._make_sample_generator()

        self.observation_space = Dict(
            {hz: Box(-100, 100, shape=( sc.FEATURE_NUM, hz_dim[hz]), dtype=np.float32) for hz in THZ}
        )
        self.action_space = Discrete(2)

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode

    def _make_sample_generator(self):
        """Create a fresh sample generator over the configured data source."""
        return sc.get_samples(self.hdf5_inst, self.hz_dim, self.targ_hz)

    def _is_exhausted(self, observation):
        """Return True when ``observation`` is the exhaustion sentinel."""
        # The isinstance guard keeps the comparison away from dict observations.
        return isinstance(observation, int) and observation == self._EXHAUSTED

    def _get_obs(self):  # return s_t
        """Pull the next state from the generator.

        The ``'current_price'`` entry is popped off the state and stored on
        ``self.current_price``; the remainder is returned as the observation.

        Returns:
            The next observation, or ``self._EXHAUSTED`` (-1) when the
            generator has no more samples.
        """
        try:
            state = next(self.envgen)
        except StopIteration:
            return self._EXHAUSTED

        if state:
            self.current_price = state.pop('current_price')
        self._last_obs = state
        return state

    def _get_info(self):  # return aux. infos
        """Return auxiliary info for the current step (currently empty)."""
        return dict()

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Raises:
            RuntimeError: If the sample generator yields nothing even right
                after being rebuilt.
        """
        super().reset(seed=seed)

        self.tout = 0
        self.tin = 0
        self.position = 0

        observation = self._get_obs()
        if self._is_exhausted(observation):
            # Previously the -1 sentinel was handed back as the observation.
            # Rebuild the generator so a new pass over the data can start.
            # NOTE(review): assumes sc.get_samples may be called again on the
            # same dataset instance -- confirm.
            self.envgen = self._make_sample_generator()
            observation = self._get_obs()
            if self._is_exhausted(observation):
                raise RuntimeError("sample generator produced no observations")

        info = self._get_info()
        return observation, info

    def step(self, action):
        """Apply ``action`` and advance to the next sample.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` is True when a held position is closed;
            ``truncated`` is True when the sample generator is exhausted, in
            which case the last valid observation is returned.
        """
        terminated = False
        reward = 0

        if self.position == 0 and action == 0:
            self.tout += 1
        elif self.position == 0 and action == 1:
            self.enter_price = self.current_price
            self.position = 1
        elif self.position == 1 and action == 1:
            self.tin += 1
        elif self.position == 1 and action == 0:
            terminated = True
            # Percentage return between entry and the current price.
            reward = (self.current_price-self.enter_price) / self.enter_price * 100

        observation = self._get_obs()
        info = self._get_info()

        # The old check compared against 0 while _get_obs signals exhaustion
        # with -1, so truncation was never reported and the sentinel leaked
        # out as an observation.
        truncated = self._is_exhausted(observation)
        if truncated:
            observation = self._last_obs

        return observation, reward, terminated, truncated, info

    def render(self):
        """Rendering is not implemented."""
        pass

    def close(self):
        """No resources are released here."""
        pass

In [19]:
class CustomAddLayer(nn.Module):
    """Linear projection of one tensor, summed with a second one, then ReLU.

    Computes ``relu(affine(cur) + next)``, where ``affine`` maps ``in_dim``
    features to ``out_dim`` features.
    """

    def __init__(self, in_dim, out_dim):
        """Create the ``in_dim -> out_dim`` projection and the activation."""
        super().__init__()
        self.affine = nn.Linear(in_dim, out_dim)
        self.relu = nn.ReLU()

    def forward(self, cur, next):
        """Project ``cur``, add ``next`` to it and apply ReLU to the sum."""
        projected = self.affine(cur)
        return self.relu(projected + next)

In [20]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCNNExtractor(BaseFeaturesExtractor):
    """Per-horizon 1-D CNN encoders whose latents are merged along ``hz_order``.

    One encoder is built for every entry of the Dict observation space. Each
    encoder stacks six depthwise-separable stages (a stride-2 depthwise conv
    followed by a pointwise conv that doubles the channel count), then
    global-average-pools and flattens, so its latent size is the input
    channel count times ``2**6``.

    The latents are then folded together in ``hz_order``: starting from the
    first horizon, each ``CustomAddLayer`` projects the running state to the
    next horizon's latent size and adds that horizon's latent.
    """

    def __init__(self, observation_space:Dict, hz_order:list):
        """Build the encoders and the merge layers.

        Args:
            observation_space: Dict space; each subspace has shape
                ``(feature_num, hz_dim)``.
            hz_order: Horizon keys in the order their latents are merged.

        Raises:
            ValueError: If ``hz_order`` is empty.
        """
        # features_dim=1 is a placeholder; the real value is assigned at the
        # end, once the latent sizes are known.
        super(CustomCNNExtractor, self).__init__(observation_space, features_dim=1)

        if not hz_order:
            raise ValueError("hz_order must contain at least one horizon key")

        extractors = {}; hz_latent_dim = {}
        # We need to know size of the output of this extractor,
        # so go over all the spaces and compute output feature sizes
        for hz, subspace in observation_space.spaces.items():  # (feature_num, hz_dim)
            layers = []
            feat_dim = subspace.shape[0]
            for layer_num in range(6):
                # Depthwise conv: one filter per channel, halves the length.
                layers.append((f"{hz}_depth_conv{layer_num}", 
                               nn.Conv1d(feat_dim, feat_dim, groups=feat_dim, kernel_size=5, stride=2, padding=2)))
                layers.append((f"{hz}_batch_norm{layer_num}-1", nn.BatchNorm1d(feat_dim)))
                layers.append((f"{hz}_relu{layer_num}-1", nn.ReLU()))

                # Pointwise conv: mixes channels and doubles their count.
                layers.append((f"{hz}_point_conv{layer_num}", nn.Conv1d(feat_dim, feat_dim*2, kernel_size=1)))
                layers.append((f"{hz}_batch_norm{layer_num}-2", nn.BatchNorm1d(feat_dim*2)))
                layers.append((f"{hz}_relu{layer_num}-2", nn.ReLU()))

                feat_dim *= 2

            layers.append((f"{hz}_avg_pool", nn.AdaptiveAvgPool1d(1)))
            layers.append((f"{hz}_flatten", nn.Flatten()))
            layers = OrderedDict(layers)

            extractors[hz] = nn.Sequential(layers)  # -> (latent_dim)
            hz_latent_dim[hz] = feat_dim

        self.extractors = nn.ModuleDict(extractors)

        self.hzs = hz_order
        mergers = {}
        for hz_idx in range(len(self.hzs)-1):
            cur_hz, next_hz = self.hzs[hz_idx], self.hzs[hz_idx+1]
            mergers[cur_hz] = CustomAddLayer(hz_latent_dim[cur_hz], hz_latent_dim[next_hz])
        # Must be an nn.ModuleDict: layers kept in a plain dict are not
        # registered as submodules, so their parameters would be missing from
        # .parameters() (never optimised), .to(device) and state_dict().
        self.mergers = nn.ModuleDict(mergers)

        # Update the features dim manually. Indexing the last horizon directly
        # (rather than the leaked loop variable) also works when hz_order has
        # a single entry.
        self._features_dim = hz_latent_dim[self.hzs[-1]]

    def forward(self, observations) -> torch.Tensor:
        """Encode every horizon in ``hz_order`` and fold the latents together.

        Args:
            observations: Mapping from horizon key to that horizon's input
                tensor (presumably batched as ``(batch, feature_num, hz_dim)``
                -- confirm against the caller).

        Returns:
            The merged latent, sized like the last horizon's latent.
        """
        latent_tensors = dict()
        for hz in self.hzs:
            latent_tensors[hz] = self.extractors[hz](observations[hz])

        state = latent_tensors[self.hzs[0]]
        for hz_idx in range(len(self.hzs)-1):
            state = self.mergers[self.hzs[hz_idx]](state, latent_tensors[self.hzs[hz_idx+1]])

        return state

In [21]:
from stable_baselines3.common.policies import ActorCriticPolicy

class CustomActorCriticPolicy(ActorCriticPolicy):
    """Actor-critic policy that uses ``CustomCNNExtractor`` by default.

    The extractor class and its keyword arguments are filled in only when
    the caller has not provided them. Hard-wiring them into the
    ``super().__init__`` call raised a duplicate-keyword ``TypeError``
    whenever they were also passed in -- for example when a policy is
    rebuilt from its saved constructor parameters.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``ActorCriticPolicy`` with extractor defaults."""
        # Use the custom feature extractor unless the caller chose another.
        kwargs.setdefault("features_extractor_class", CustomCNNExtractor)
        # Default merge order of the horizon latents inside the extractor.
        if kwargs.get("features_extractor_kwargs") is None:
            kwargs["features_extractor_kwargs"] = dict(hz_order=['1m', '5m', '30m', '1d', '1w'])
        super(CustomActorCriticPolicy, self).__init__(*args, **kwargs)

In [22]:
from stable_baselines3.common.callbacks import BaseCallback
from torch.utils.tensorboard import SummaryWriter

class TensorboardRewardCallback(BaseCallback):
    """Write the final reward of every finished episode to TensorBoard.

    Episode ends are detected on every environment step through the
    ``dones`` flags. Reading the reward only at rollout end, as before,
    sees just the last step of each rollout and silently drops every
    episode that finishes on an earlier step.

    Attributes:
        episode_rewards: Final-step reward of each finished episode.
        elap_ep: Number of finished episodes seen so far.
        flag: Index of the current ``episode_reward/<flag>`` scalar tag.
        writer: ``SummaryWriter`` the scalars are written to.
    """

    def __init__(self, verbose=0, log_dir="./tensorboard_logs/train"):
        """Create the callback.

        Args:
            verbose: Verbosity level forwarded to ``BaseCallback``.
            log_dir: Directory the ``SummaryWriter`` writes to.
        """
        super(TensorboardRewardCallback, self).__init__(verbose)
        self.episode_rewards = []
        self.elap_ep = 0
        self.writer = SummaryWriter(log_dir)
        self.flag = 0
        # Rollout counter; keeps the step passed to logger.dump() advancing
        # once per rollout, as it did before.
        self._rollout_idx = 0

    def _record_episode(self, final_reward) -> None:
        """Store one finished episode's final reward and log it if non-zero."""
        if final_reward != 0:
            self.writer.add_scalar(f"episode_reward/{self.flag}", final_reward, self.elap_ep)
            # Move on to the next tag once the current one has covered more
            # than 1000 episodes.
            if self.elap_ep - self.flag*1000 > 1000: self.flag += 1

        self.episode_rewards.append(final_reward)
        self.elap_ep += 1

    def _on_step(self) -> bool:
        """Record the reward of every environment whose episode just ended."""
        step_rewards = self.locals["rewards"]
        step_dones = self.locals["dones"]
        for reward, done in zip(step_rewards, step_dones):
            if done:
                self._record_episode(reward)
        return True

    def _on_rollout_end(self) -> None:
        """Dump the logger once per rollout."""
        self.logger.dump(self._rollout_idx)
        self._rollout_idx += 1

    def _on_training_end(self) -> None:
        """Flush pending scalars so they reach disk when training finishes."""
        self.writer.flush()

In [23]:
# Tickers handed to the trading environment.
ticker_list = [
    'AAPL',   # Apple Inc.
    'MSFT',   # Microsoft Corporation
    'GOOGL',  # Alphabet Inc. (Google)
    'META',   # Meta Platforms, Inc. (Facebook)
    'IBM',    # International Business Machines Corporation
    'INTC',   # Intel Corporation
]

# Start and end bounds come from the star-imported config module.
date_range = [ST, ED]

# Every horizon in THZ gets the same size, 128.
hz_dim = dict.fromkeys(THZ, 128)

# Target horizon key passed to the environment.
targ_hz = '5m'

In [24]:
# Re-import the `sc` (subclass) helper module so that edits made to it on disk
# take effect without restarting the kernel. The result is bound to `_`,
# presumably just to keep the cell from echoing the module repr.
_ = importlib.reload(sc)

In [25]:
from stable_baselines3.common.logger import configure
from stable_baselines3 import A2C, PPO, SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

from torch.utils.tensorboard import SummaryWriter
# To inspect the logs, run from this notebook's directory:
#   tensorboard --logdir ./tensorboard_logs
# Wipe earlier logs; the returned timestamped path is kept for reference.
initial_dir = initialize_log_dir()

# The raw environment gets its own name so that the factory lambda below does
# not close over `env`, which is rebound to the vectorised wrapper.
raw_env = StockTradingEnv(hz_dim, targ_hz, ticker_list, date_range)

# NOTE(review): check_env resets and steps the environment, so it presumably
# consumes a few samples from the generator before training starts.
check_env(raw_env, warn=False)
env = DummyVecEnv([lambda: raw_env])

model = A2C(
    policy=CustomActorCriticPolicy,
    env=env,
    verbose=0,
    seed=42,
    tensorboard_log="./tensorboard_logs/",
    device='cpu',
    learning_rate=5e-5,
    normalize_advantage=True
)

# Create the reward-logging callback.
callback = TensorboardRewardCallback()

# Train the model. The writer is flushed even when the run is interrupted
# (e.g. by KeyboardInterrupt), so already-logged rewards reach disk.
try:
    model.learn(total_timesteps=200000, callback=callback)
finally:
    callback.writer.flush()

KeyboardInterrupt: 