``` python
import gym
from gym import spaces

class CustomEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self, arg1, arg2, ...):
        super(CustomEnv, self).__init__()
        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions:
        self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
        # Example for using image as input:
        self.observation_space = spaces.Box(low=0, high=255, shape=
                        (HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)

    def step(self, action):
        # Execute one time step within the environment
        ...
    def reset(self):
        # Reset the state of the environment to an initial state
        ...
    def render(self, mode='human', close=False):
        # Render the environment to the screen
```

- `action_space`: contains all of the actions available to the agent
- `observation_space`: contains all of the environment data observed by the agent
- `reset`: resets the environment to an initial state
- `step`: advances the environment by one time step
- `render`: prints a rendition of the environment

## Stock Trading Env

The model (agent) observes the price of the stock before taking an action (a trade).

`observation_space` contains most of the input variables, such as the open, high, low and close prices and the daily volume; these are the variables the agent observes.

At each `step`, the agent considers which action to take based on the current and past prices of the stock.

`action_space` has three actions: buy, sell, hold

We also need to know the amount of a given stock to buy or sell at each step (time). Using a `Box` space, we can create an action space that has a discrete number of action types (buy, sell, hold) as well as a continuous spectrum of amounts, such as 'buy x%, sell y%, hold z%'.

`reward` is the account balance multiplied by some fraction of the number of time steps. The reason for delaying the reward this way is that we want the agent to favor long-term investment over short-term trading. (i.e. holding for the long run rather than day trading...??)

### Define action_space & observation_space

The stock price data is passed to the `environment` as a `pandas` data frame.

In [1]:
import random

import gym
import numpy as np
from gym import spaces




In [12]:
# Normalization constants: StockTradingEnv divides raw values by these when
# it builds an observation.
# 2147483647 == 2**31 - 1.
MAX_ACCOUNT_BALANCE = 2147483647  # divisor for balance / net worth; also the upper reward bound
MAX_NUM_SHARES = 2147483647  # divisor for volume and share counts
MAX_SHARE_PRICE = 5000  # divisor for OHLC prices and the cost basis
MAX_OPEN_POSITIONS = 5  # not referenced by the environment code in these notes
MAX_STEPS = 20000  # denominator of the reward's delay modifier in step()

# Cash the agent starts with after every reset().
INITIAL_ACCOUNT_BALANCE = 10000

class StockTradingEnv(gym.Env):
    """Single-stock trading environment that follows the gym interface.

    The environment walks through a price data frame one row per step.
    An action is a pair ``(action_type, amount)``:

    * ``action_type < 1``      -> buy, spending ``amount`` of what the balance
      can afford;
    * ``1 <= action_type < 2`` -> sell ``amount`` of the shares held;
    * ``action_type >= 2``     -> hold (nothing is traded).

    ``reset()`` must be called before ``step()`` or ``render()`` because it
    is the method that creates the account attributes and ``current_step``.
    """

    # Render modes supported by render(); this class only prints text.
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        """Store the price data and declare the action/observation spaces.

        Args:
            df: pandas data frame with 'Open', 'High', 'Low', 'Close' and
                'Volume' columns. Rows are addressed with integer labels
                through ``.loc``, so this assumes a default 0..n-1 index
                (TODO confirm for data frames that were sorted or filtered).
        """
        super().__init__()

        self.df = df
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)

        # Continuous action: [action_type in 0..3, amount in 0..1].
        # low/high give the per-component bounds of the Box. They are built
        # as float16 up front so that Box does not have to down-cast integer
        # bounds to its dtype.
        self.action_space = spaces.Box(
            low=np.array([0, 0], dtype=np.float16),
            high=np.array([3, 1], dtype=np.float16),
            dtype=np.float16)

        # Five rows of market data (Open, High, Low, Close, Volume) plus one
        # row of account statistics, six columns each; one step is one row
        # of df.
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(6, 6), dtype=np.float16)

    def _next_observation(self):
        """Build the observation for the current step.

        Returns:
            A 2-D array: five rows of market data divided by their MAX_*
            constants, followed by one row of account statistics. The shape
            is (6, 6) when the ``.loc`` slice yields six rows.
        """
        # NOTE(review): .loc label slicing includes both end points, so this
        # window is the current row and the five rows AFTER it, not the
        # preceding days -- confirm whether this look-ahead is intended.
        window = self.df.loc[self.current_step:self.current_step + 5]
        frame = np.array([
            window['Open'].values / MAX_SHARE_PRICE,
            window['High'].values / MAX_SHARE_PRICE,
            window['Low'].values / MAX_SHARE_PRICE,
            window['Close'].values / MAX_SHARE_PRICE,
            window['Volume'].values / MAX_NUM_SHARES,
        ])

        # Account statistics, each divided by the matching MAX_* constant.
        account_row = [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
        ]]

        return np.append(frame, account_row, axis=0)

    def _take_action(self, action):
        """Apply one buy/sell/hold action to the account.

        Args:
            action: indexable pair; ``action[0]`` selects the action type and
                ``action[1]`` is the fraction to trade (see class docstring).
        """
        # The trade executes at a random price between this row's Open and
        # Close.
        current_price = random.uniform(
            self.df.loc[self.current_step, "Open"],
            self.df.loc[self.current_step, "Close"])

        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy: spend `amount` of what the balance can afford.
            total_possible = int(self.balance / current_price)
            shares_bought = int(total_possible * amount)

            # Skip the bookkeeping when nothing is bought: with no shares
            # held either, the average-cost update would divide by zero
            # (NaN plus a RuntimeWarning for NumPy floats, ZeroDivisionError
            # for plain Python numbers).
            if shares_bought > 0:
                prev_cost = self.cost_basis * self.shares_held
                additional_cost = shares_bought * current_price

                self.balance -= additional_cost
                # Weighted average price paid for all shares now held.
                self.cost_basis = (
                    prev_cost + additional_cost) / (self.shares_held + shares_bought)
                self.shares_held += shares_bought

        elif action_type < 2:
            # Sell: dispose of `amount` of the shares held.
            shares_sold = int(self.shares_held * amount)
            proceeds = shares_sold * current_price
            self.balance += proceeds
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += proceeds

        # Any other action_type is a hold: the account is only re-valued.
        self.net_worth = self.balance + self.shares_held * current_price

        # Track the highest net worth seen since the last reset.
        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        # With no position left there is no average cost to report.
        if self.shares_held == 0:
            self.cost_basis = 0

    def step(self, action):
        """Apply `action`, advance one row and return the gym step tuple.

        Returns:
            ``(obs, reward, done, info)`` where ``reward`` is the balance
            times ``current_step / MAX_STEPS``, ``done`` is True once the
            net worth is zero or less, and ``info`` is an empty dict.
        """
        self._take_action(action)

        self.current_step += 1

        # The data frame is finite: wrap to the first row once fewer than six
        # rows remain for the observation window.
        if self.current_step > len(self.df) - 6:
            self.current_step = 0

        # NOTE: the modifier is based on the row index, not on the number of
        # steps taken since reset(), so it drops to 0 right after a
        # wrap-around.
        delay_modifier = self.current_step / MAX_STEPS

        reward = self.balance * delay_modifier
        done = self.net_worth <= 0

        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self):
        """Reset the account and jump to a random row of the data frame.

        Returns:
            The observation for the new starting row.
        """
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Random starting row that still leaves six rows for the observation
        # window (randint includes both end points).
        self.current_step = random.randint(0, len(self.df) - 6)

        return self._next_observation()

    def render(self, mode='human', close=False):
        """Print the current account state to stdout.

        `mode` and `close` are accepted for interface compatibility and are
        not used by this implementation.
        """
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE

        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(
            f'Shares held: {self.shares_held} (Total sold: {self.total_shares_sold})')
        print(
            f'Avg cost for held shares: {self.cost_basis} (Total sales value: {self.total_sales_value})')
        print(
            f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth})')
        print(f'Profit: {profit}')

In [18]:
# NOTE(review): gym.make() looks an environment up by its registered string
# id; here it receives a list of factories instead, which matches the
# TypeError recorded below. A custom class that is not registered is
# normally instantiated directly (StockTradingEnv(df)) -- confirm and replace.
env = gym.make([lambda: StockTradingEnv(df)])

TypeError: expected string or bytes-like object

In [3]:
import tianshou as ts

In [10]:
import pandas as pd
import numpy as np
df = pd.read_csv("aapl.csv")

### DummyVectorEnv

In [13]:
# Wrap ten environment factories each for training and for testing in
# tianshou DummyVectorEnv objects; every factory builds its own
# StockTradingEnv over the same df.
train_envs = ts.env.DummyVectorEnv([lambda: StockTradingEnv(df) for _ in range(10)])
test_envs = ts.env.DummyVectorEnv([lambda: StockTradingEnv(df) for _ in range(10)])

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


### Build the Network

In [17]:
import torch

# Use the space's shape when it is non-empty, otherwise fall back to `.n`.
# NOTE(review): the AttributeError recorded below shows that
# `env.observation_space` was a list when this cell ran, so `env` was not a
# single StockTradingEnv instance -- rebind `env` to one before this cell.
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n
# NOTE(review): `activation` is passed as the string 'ReLu'; torch spells the
# module `nn.ReLU` -- confirm what type tianshou's Net expects here.
net = ts.utils.net.common.Net(state_shape, action_shape, activation='ReLu')
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# Seed NumPy, torch and both vectorized environments with the same value.
# NOTE(review): `args` is not defined anywhere in these notes -- it has to be
# created (e.g. by an argument parser) before this cell can run.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)

In [4]:
import gym
import json
import datetime as dt
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from env.StockTradingEnv import StockTradingEnv
import pandas as pd
df = pd.read_csv('aapl.csv')
# Sort chronologically AND rebuild the index: sort_values() keeps the old row
# labels, so label-based `.loc[step:step + 5]` slicing would otherwise follow
# the pre-sort labels instead of the date order. (Assumes the imported
# StockTradingEnv slices rows like the class defined in these notes -- confirm.)
df = df.sort_values('Date').reset_index(drop=True)

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])

# Train a PPO2 agent with an MLP policy for 20000 time steps.
model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=20000)

# Run the trained policy for 2000 steps, printing the account after each one.
obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()

ModuleNotFoundError: No module named 'stable_baselines'

In [None]:
# NOTE(review): unfinished cell -- gym.make() is called without the
# environment id it needs; fill in a registered id or build the environment
# class directly.
train_envs = gym.make()