In [1]:
import yfinance as yf
import pandas as pd
import ta  # Technical Analysis library
from sklearn.model_selection import train_test_split

# 사용할 주식 티커 목록
tickers = [
    "GM"
]

data = {}

# 각 티커에 대해 데이터를 다운로드하고 데이터프레임에 저장
for ticker in tickers:
    df = yf.download(ticker, start='2018-01-01', end='2024-01-01')
    df['ticker'] = ticker  # 티커 열 추가

    # 이동 평균 (Moving Average)

    # 상대 강도 지수 (RSI: Relative Strength Index)
    df['RSI'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi()

    # MACD (Moving Average Convergence Divergence)
    macd = ta.trend.MACD(df['Close'])

    # 볼린저 밴드 (Bollinger Bands)
    bollinger = ta.volatility.BollingerBands(df['Close'], window=20, window_dev=2)
    df['BB_High'] = bollinger.bollinger_hband()
    df['BB_Low'] = bollinger.bollinger_lband()
    df['BB_Middle'] = bollinger.bollinger_mavg()

    # NaN 값을 각 feature의 평균으로 대체
    df['RSI'].fillna(df['RSI'].mean(), inplace=True)
    df['BB_High'].fillna(df['BB_High'].mean(), inplace=True)
    df['BB_Low'].fillna(df['BB_Low'].mean(), inplace=True)
    df['BB_Middle'].fillna(df['BB_Middle'].mean(), inplace=True)

    data[ticker] = df

# 여러 데이터프레임을 하나로 결합
combined_df = pd.concat(data.values(), ignore_index=False)

# 필요한 열 선택 (모든 feature 포함)
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'RSI', 'BB_High', 'BB_Low', 'BB_Middle', 'ticker']
combined_df = combined_df[features]

# 'Date'를 인덱스로 설정 (필요한 경우)
combined_df.index.name = 'Date'

# 데이터프레임을 70% 훈련 세트와 30% 테스트 세트로 분할
train_df, test_df = train_test_split(combined_df, test_size=0.3, shuffle=False)

print("Train Set:")
print(train_df.head())
print("Test Set:")
print(test_df.head())


[*********************100%%**********************]  1 of 1 completed

Train Set:
                 Open       High        Low      Close    Volume        RSI  \
Date                                                                          
2018-01-02  41.240002  41.869999  41.150002  41.799999   6934600  49.596247   
2018-01-03  42.209999  42.950001  42.200001  42.820000  14591600  49.596247   
2018-01-04  43.090000  44.250000  43.009998  44.139999  17298700  49.596247   
2018-01-05  44.500000  44.639999  43.959999  44.009998   9643300  49.596247   
2018-01-08  44.040001  44.590000  43.520000  44.220001  13099600  49.596247   

              BB_High     BB_Low  BB_Middle ticker  
Date                                                
2018-01-02  42.560135  36.328955  39.444545     GM  
2018-01-03  42.560135  36.328955  39.444545     GM  
2018-01-04  42.560135  36.328955  39.444545     GM  
2018-01-05  42.560135  36.328955  39.444545     GM  
2018-01-08  42.560135  36.328955  39.444545     GM  
Test Set:
                 Open       High        Low      Close




In [4]:
import gym
import numpy as np
import pandas as pd
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import configure
import torch
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

cumu_lst = []
cumu_hap = 0

class StockTradingEnv(gym.Env):
    global cumu_lst, cumu_hap
    metadata = {'render.modes': ['human']}

    def __init__(self, df, initial_balance=100000):
        super(StockTradingEnv, self).__init__()

        self.df = df.drop(columns=['ticker'])  # ticker 열 제거
        self.action_space = spaces.Discrete(3)  # Buy, Hold, Sell
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(len(self.df.columns),), dtype=np.float32)

        self.initial_balance = initial_balance
        self.reset()

    def reset(self):
        self.current_step = 0
        self.done = False
        self.total_profit = 0
        self.inventory = []
        self.balance = self.initial_balance
        self.state = self.df.iloc[self.current_step].values
        self.returns = [0]

        return self.state

    def step(self, action):
        global cumu_lst
        self.done = self.current_step >= len(self.df) - 1
        reward = 0
        additional_reward = 0

        if action == 0 and not self.done:  # Buy
            if self.balance >= self.state[3]:  # Check if balance is sufficient to buy
                self.inventory.append(self.state[3])  # Close price
                self.balance -= self.state[3]

        elif action == 1 and not self.done:  # Sell
            if len(self.inventory) > 0:
                bought_price = self.inventory.pop(0)
                reward = self.state[3] - bought_price  # Current price - Bought price
                self.total_profit += reward
                self.balance += self.state[3]

        # 추가된 보상 요소들
        if self.state[5] >= 70 and action == 1:  # RSI가 70 이상일 때 매도
            additional_reward += 0.6
        elif self.state[5] <= 30 and action == 0:  # RSI가 30 이하일 때 매수
            additional_reward += 0.6

        if self.state[3] >= self.state[6] and action == 1:  # 주가가 BB 상한선 이상일 때 매도
            additional_reward += 0.6
        elif self.state[3] <= self.state[7] and action == 0:  # 주가가 BB 하한선 이하일 때 매수
            additional_reward += 0.6

        if self.state[4] > self.df['Volume'].mean() and (action == 0 or action == 1):  # 거래량이 평균 이상일 때 매수 또는 매도
            additional_reward += 0.6

        total_reward = reward + additional_reward

        # Sharpe Ratio
        if len(self.returns) > 1:
            sharpe_ratio = np.mean(self.returns) / (np.std(self.returns) + 1e-10)
        else:
            sharpe_ratio = 0

        # Drawdown
        peak = np.max(self.returns)
        drawdown = peak - self.returns[-1]

        # 정규화된 Sharpe Ratio와 Drawdown
        normalized_sharpe = np.tanh(sharpe_ratio)  # -1에서 1 사이로 정규화
        normalized_drawdown = np.tanh(drawdown)  # -1에서 1 사이로 정규화

        # 스케일링 팩터
        sharpe_scale = 0.5
        drawdown_scale = 0.5

        # 보상 함수에 샤프 비율과 드로우다운 포함
        stability_bonus = sharpe_scale * normalized_sharpe - drawdown_scale * normalized_drawdown
        total_reward += stability_bonus

        # Update returns
        self.returns.append(total_reward)

        if not self.done:
            self.current_step += 1
            self.state = self.df.iloc[self.current_step].values
        else:
            self.state = np.zeros(self.observation_space.shape)  # Done 상태에서 반환할 값
            cumu_lst.append(self.total_profit)
            self.total_profit = 0

        return self.state, total_reward, self.done, {"reward": total_reward, "additional_reward": additional_reward, "sharpe_ratio": sharpe_ratio, "drawdown": drawdown, "stability_bonus": stability_bonus}

    def render(self, mode='human', close=False):
        print(f'Step: {self.current_step}, Balance: {self.balance}, Total Profit: {self.total_profit}, Sharpe Ratio: {np.mean(self.returns) / np.std(self.returns) if len(self.returns) > 1 else 0}, Drawdown: {(self.max_balance - self.balance) / self.max_balance}')

In [5]:
class EvalCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(EvalCallback, self).__init__(verbose)
        self.episode_rewards = []
        self.episode_lengths = []
        self.action_distribution = []
        self.state_visit_frequency = {}
        self.cumulative_rewards = []  # 누적 보상을 저장하기 위한 리스트

    def _on_step(self) -> bool:
        # Record current step action for action distribution analysis
        self.action_distribution.append(self.locals["actions"])
        # Record state visit frequency
        state_str = str(self.locals["new_obs"])
        if state_str in self.state_visit_frequency:
            self.state_visit_frequency[state_str] += 1
        else:
            self.state_visit_frequency[state_str] = 1
        return True

    def _on_rollout_end(self) -> None:
        # Log the reward and length for each episode
        episode_reward = sum(self.locals["rewards"])
        episode_length = len(self.locals["rewards"])
        self.episode_rewards.append(episode_reward)
        self.episode_lengths.append(episode_length)
        self.cumulative_rewards.append(sum(self.episode_rewards))  # 누적 보상을 추가
        self.logger.record('rollout/episode_reward', np.sum(self.episode_rewards))
        self.logger.record('rollout/episode_length', np.sum(self.episode_lengths))

    def _on_training_end(self) -> None:
        # Save evaluation results to file or print them
        print(f"Mean episode reward: {np.mean(self.episode_rewards)}")
        print(f"Mean episode length: {np.mean(self.episode_lengths)}")
        print(f"Cumulative reward: {np.sum(self.episode_rewards)}")

        # Plot action distribution
        plt.figure(figsize=(12, 6))
        plt.hist(self.action_distribution, bins=3, align='left', rwidth=0.8)
        plt.xticks(range(3), ['Buy', 'Hold', 'Sell'])
        plt.xlabel('Actions')
        plt.ylabel('Frequency')
        plt.title('Action Distribution')
        plt.show()

        # Plot episode rewards
        plt.figure(figsize=(12, 6))
        plt.plot(self.episode_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Episode Reward')
        plt.title('Episode Reward over Episodes')
        plt.show()

        # Plot cumulative rewards
        plt.figure(figsize=(12, 6))
        plt.plot(self.cumulative_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Cumulative Reward')
        plt.title('Cumulative Reward over Episodes')
        plt.show()

        # Plot state visit frequency
        state_visit_sorted = sorted(self.state_visit_frequency.items(), key=lambda item: item[1], reverse=True)
        states, visits = zip(*state_visit_sorted)
        plt.figure(figsize=(12, 6))
        plt.bar(states[:10], visits[:10])
        plt.xlabel('States')
        plt.ylabel('Visit Frequency')
        plt.title('Top 10 Most Visited States')
        plt.show()

In [6]:
# 데이터 로드 및 train/test 분할
train_df, test_df = train_test_split(df, test_size=0.3, shuffle=False)

# 학습 환경 설정
train_env = DummyVecEnv([lambda: StockTradingEnv(train_df)])

# CUDA 사용 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
log_dir = "./ppo_stock_trading_v11"
os.makedirs(log_dir, exist_ok=True)
new_logger = configure(log_dir, ["stdout", "tensorboard"])

# PPO 모델 학습
from stable_baselines3.common.callbacks import CheckpointCallback

# 체크포인트 저장 콜백 설정 (예: 10000 타임스텝마다 저장)
checkpoint_callback = CheckpointCallback(save_freq=20000, save_path='./GM/GM_Checkpoint_6/',
                                         name_prefix='ppo_stock_trading')

# PPO 모델 학습
# model = PPO('MlpPolicy', train_env, learning_rate=0.0003, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, verbose=1, device=device)
model = PPO('MlpPolicy', train_env, verbose=1, device=device)

model.set_logger(new_logger)

# 'stable_baselines3' 모듈에 임시로 '__version__' 속성을 추가합니다.
# import stable_baselines3 as sb3
# sb3.__version__ = "1.0.0"

model.learn(total_timesteps=200000, callback=[checkpoint_callback, EvalCallback()])

# 테스트 환경 설정
test_env = DummyVecEnv([lambda: StockTradingEnv(test_df)])

# 학습된 모델로 예측 및 결과 시각화
obs = test_env.reset()
total_rewards = 0
total_additional_rewards = 0
total_stability_bonus = 0

for _ in range(len(test_df)):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    total_rewards += info[0]['reward']
    total_additional_rewards += info[0]['additional_reward']
    total_stability_bonus += info[0]['stability_bonus']
    test_env.render()
    if done:
        break

print(f"Final Balance: {test_env.get_attr('balance')[0]}")
print(f"Total Profit: {test_env.get_attr('total_profit')[0]}")
print(f"Total Rewards: {total_rewards}")
print(f"Total Additional Rewards: {total_additional_rewards}")
print(f"Total Stability Bonus: {total_stability_bonus}")
print(f"profit: {total_rewards - total_additional_rewards - total_stability_bonus}")

# TensorBoard 로그 파일을 로컬에서 실행
# !tensorboard --logdir ./ppo_stock_trading

Logging to ./ppo_stock_trading_v11




Using cuda device
---------------------------------
| rollout/           |          |
|    episode_length  | 1        |
|    episode_reward  | 1.76     |
| time/              |          |
|    fps             | 489      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------


KeyboardInterrupt: 

In [7]:
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# 테스트 환경 설정
test_df = df  # 테스트용 데이터프레임 설정 (기존 데이터프레임 사용)
test_env = DummyVecEnv([lambda: StockTradingEnv(test_df)])

# 체크포인트 디렉토리 설정
checkpoint_dir = './Single_management/GM/GM_Checkpoint_2/'
checkpoint_files = [os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint_dir) if f.endswith('.zip')]

results = []

# 각 체크포인트 평가
for checkpoint_file in checkpoint_files:
    model = PPO.load(checkpoint_file, env=test_env)

    obs = test_env.reset()
    total_rewards = 0
    total_additional_rewards = 0
    total_stability_bonus = 0
    done = False

    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = test_env.step(action)
        total_rewards += info[0]['reward']
        total_additional_rewards += info[0]['additional_reward']
        total_stability_bonus += info[0]['stability_bonus']

    results.append({
        'checkpoint': checkpoint_file,
        'total_rewards': total_rewards,
        'total_profit': total_rewards - total_additional_rewards - total_stability_bonus,
        'total_additional_rewards': total_additional_rewards,
        'total_stability_bonus': total_stability_bonus,
    })

# 결과 출력
for result in results:
    print(f"Checkpoint: {result['checkpoint']}")
    print(f"Total Rewards: {result['total_rewards']}")
    print(f"total_additional_rewards: {result['total_additional_rewards']}")
    print(f"Total Stability_bonus: {result['total_stability_bonus']}")
    print(f"Total Profit: {result['total_profit']}")
    print("="*50)



Checkpoint: ./Single_management/GM/GM_Checkpoint_2/ppo_stock_trading_100000_steps.zip
Total Rewards: 1148.3732718985868
total_additional_rewards: 1091.4000000000185
Total Stability_bonus: 62.46325450356475
Total Profit: -5.48998260499642
Checkpoint: ./Single_management/GM/GM_Checkpoint_2/ppo_stock_trading_120000_steps.zip
Total Rewards: 1177.2282928760183
total_additional_rewards: 1097.4000000000183
Total Stability_bonus: 93.22830966068689
Total Profit: -13.400016784686912
Checkpoint: ./Single_management/GM/GM_Checkpoint_2/ppo_stock_trading_140000_steps.zip
Total Rewards: 1284.8180554519588
total_additional_rewards: 1098.0000000000182
Total Stability_bonus: 180.0580495010333
Total Profit: 6.760005950907299
Checkpoint: ./Single_management/GM/GM_Checkpoint_2/ppo_stock_trading_160000_steps.zip
Total Rewards: 1317.4143704493968
total_additional_rewards: 1104.000000000017
Total Stability_bonus: 207.74437228045363
Total Profit: 5.669998168926071
Checkpoint: ./Single_management/GM/GM_Checkpoi