# Code implementation exploration
This notebook explores how the code of stable-baselines and OpenAI Gym actually works, using small "dummy" cases.
   
   
1. Environment
2. Agent framework

In [6]:
# inspired by: 
# https://costa.sh/blog-the-32-implementation-details-of-ppo.html

## 1. Environment

In [16]:
import numpy as np
import pandas as pd
import gym
from gym import error, spaces, utils
from env import FinancialMarketEnv

In [123]:
# import dummy data, keeping only the date, ticker and adjusted close price columns
dumdat = pd.read_csv("data/preprocessed/done_data.csv", index_col=0)[["datadate", "tic", "adjcp"]]
# index every row by its trading-day number (0, 1, 2, ...), so that
# dumdat.loc[day] selects all rows belonging to one day
dumdat.index = dumdat["datadate"].factorize()[0]
# only pick 3 stocks
dumdat = dumdat[dumdat["tic"].isin(["AAPL", "AXP", "BA"])]
# reduce data set to 5 days (4 possible steps); label-based slicing includes both ends
episode_length_steps = 5
dumdat = dumdat.loc[0:episode_length_steps - 1]
print("number of possible steps: ", len(dumdat.index.unique()) - 1)
dumdat.head(20)

number of possible steps:  4


Unnamed: 0,datadate,tic,adjcp
0,20090102,AAPL,12.964286
0,20090102,AXP,19.33
0,20090102,BA,45.25
1,20090105,AAPL,13.511429
1,20090105,AXP,19.95
1,20090105,BA,46.17
2,20090106,AAPL,13.288571
2,20090106,AXP,21.07
2,20090106,BA,46.31
3,20090107,AAPL,13.001429


In [124]:
# creating a dummy environment for FinancialMarketEnv: Dummy_FinancialMarketEnv
# it is very similar to the actual environment used in the thesis, FinancialMarketEnv,
# but there are fewer variables tracked and the action consists of simply appending
# the sampled actions to the state vector, since it is just for illustration

class Dummy_FinancialMarketEnv(gym.Env):
    """Minimal illustration environment that walks through a price data set day by day.

    The state is a flat vector of length ``2 * assets_dim + 1``:

        [cash balance, adjcp of each asset ..., last action of each asset ...]

    No trading logic is implemented: the cash balance and the reward never change,
    and the action passed to ``step`` is simply stored in the last ``assets_dim``
    entries of the state.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data indexed by day number (0, 1, 2, ...), one row per asset and day,
        with an ``adjcp`` column.
    day : int
        Day used to build the state at construction time (``reset`` always
        restarts at day 0).
    assets_dim : int
        Number of assets, i.e. rows per day in ``df``.
    initial_cash_balance : float
        Value written to the first entry of every state.
    """

    def __init__(self, df, day=0, assets_dim=3, initial_cash_balance=10000):
        self.df = df
        self.day = day
        self.data = self.df.loc[self.day, :]
        self.assets_dim = assets_dim
        self.initial_cash_balance = initial_cash_balance
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.assets_dim,))
        # The observation holds 1 cash entry + 1 price per asset + 1 action per asset.
        # The declared shape must match the state actually returned, otherwise
        # wrappers that pre-allocate buffers from observation_space cannot store it.
        # The bounds are left open because the appended action values may be
        # negative and step() does not clip them.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(2 * self.assets_dim + 1,)
        )
        self.actions = [0]*self.assets_dim
        self.state = self._build_state()
        self.reward = 0
        self.terminal_state = False
        self.step_counter = 0

    def _build_state(self):
        """Assemble [cash, prices of the current day, last actions] as a numpy array."""
        return np.asarray(
            [self.initial_cash_balance]
            + self.data["adjcp"].values.tolist()
            + self.actions
        )

    def step(self, action=None):
        """Advance by one day and store ``action`` in the state.

        Parameters
        ----------
        action : list, pandas.Series, numpy.ndarray or None
            Values to write into the last ``assets_dim`` state entries. Any other
            type (e.g. ``None``) leaves the previously stored actions unchanged.

        Returns
        -------
        tuple
            ``(state, reward, done, info)``. When the current day is already the
            last day of ``df``, nothing is advanced: the unchanged state is
            returned together with ``done=True``.
        """
        self.action = action
        # the episode ends once the last available day has been reached
        self.terminal_state = bool(self.day >= self.df.index.unique()[-1])
        if self.terminal_state:
            print("reached end of dataset at step", self.step_counter)
            return self.state, self.reward, self.terminal_state, {}

        self.step_counter += 1
        self.day += 1
        self.data = self.df.loc[self.day, :]
        if isinstance(self.action, (list, pd.core.series.Series, np.ndarray)):
            # np.asarray makes plain lists work as well (a list has no .tolist())
            self.actions = np.asarray(self.action).tolist()
        # state must be an array, which is also easier for computation
        self.state = self._build_state()
        return self.state, self.reward, self.terminal_state, {}

    def reset(self):
        """Restart the episode at day 0 with zeroed actions and return the start state."""
        self.day = 0
        self.data = self.df.loc[self.day, :]
        self.actions = [0]*self.assets_dim
        self.state = self._build_state()
        self.reward = 0
        self.terminal_state = False
        self.step_counter = 0
        return self.state

    def render(self, mode="human", close=False):
        """Return the current state; ``mode`` and ``close`` are accepted but unused."""
        return self.state

# instantiate environment object
env = Dummy_FinancialMarketEnv(df=dumdat)

# When resetting the environment, it should return the current state,
# and by definition the current state after a reset is the starting state.
# We have defined the starting state as: the cash balance of 10'000, then the first
# asset prices of the provided data set, and at the end the asset holdings, which must
# be 0 for each asset at the beginning, as we start out with cash only before the
# first trade.
# state = [initial_cash_balance, price AAPL, price AXP, 
#          price BA, holdings AAPL, holdings AXP, holdings BA]
env.reset()

array([10000.        ,    12.96428571,    19.33      ,    45.25      ,
           0.        ,     0.        ,     0.        ])

In [125]:
# create a Dummy_agent 
class Dummy_agent():
    """Toy agent that either draws random actions from its environment or does nothing.

    Parameters
    ----------
    env : environment exposing ``action_space.sample()``
        Defaults to the module-level ``env`` that exists when this class is defined.
    state, reward :
        Stored on the instance as given; not used by ``sample_action``.
    action_sampling : bool
        Whether ``sample_action`` draws from the action space (True) or returns None.
    """
    def __init__(self, env=env, state=None, reward=None, action_sampling=True):
        self.env, self.action_sampling = env, action_sampling
        self.state, self.reward = state, reward

    def sample_action(self):
        """Return a random action, or None when action sampling is switched off."""
        if not self.action_sampling:
            # sampling disabled: take no action and just observe the next day
            return None
        # sampling enabled: draw from the action space of the environment
        return self.env.action_space.sample()

# instantiate agent object with all defaults (action sampling switched on)
agent = Dummy_agent()

In [126]:
# create the training algorithm
def _unwrap_single_env(candidate_env):
    """Return the plain environment held by a one-environment vector wrapper.

    Objects without an ``envs`` attribute are returned unchanged.
    """
    wrapped_envs = getattr(candidate_env, "envs", None)
    if wrapped_envs is None:
        return candidate_env
    if len(wrapped_envs) != 1:
        raise ValueError(
            "training_algorithm only supports a single environment, "
            f"got a wrapper holding {len(wrapped_envs)}"
        )
    return wrapped_envs[0]


def training_algorithm(instantiated_agent, instantiated_env, n_episodes=3):
    """Run ``n_episodes`` episodes and print all observations of each episode.

    Parameters
    ----------
    instantiated_agent : object with a ``sample_action()`` method
        Provides the action for every step.
    instantiated_env : environment with ``reset()`` and ``step(action=...)``
        ``step`` must return ``(obs, reward, done, info)`` with 7-entry observations.
        A vector wrapper exposing a one-element ``envs`` list is unwrapped, because
        the loop below uses the plain (unbatched) single-environment interface.
    n_episodes : int
        Number of episodes to run.

    Returns
    -------
    None
        Results are only printed.
    """
    # use the objects that were passed in, not whatever `agent` / `env` happen to
    # be defined globally in the notebook
    active_env = _unwrap_single_env(instantiated_env)
    for i in range(1, n_episodes+1):
        all_obs = [active_env.reset()]
        while True:
            action = instantiated_agent.sample_action()
            obs, reward, done, info = active_env.step(action=action)
            all_obs.append(obs)
            if done:
                print(f"all observations in episode {i}:")
                print(pd.DataFrame(all_obs, columns=["cash", "AAPL_price", "AXP_price", 
                                                     "BA_price","AAPL_holdings", "AXP_holdings", 
                                                     "BA_holdings"]))
                print("true termination\n")
                print()
                break
    print("=========")

### 1.1. How the termination and seeding works
There are two ways an episode can be terminated:
- **True termination** of an episode happens, when the condition of terminal_state is true (see example below, defined in the environment). 
This condition is coded within the environment. The termination condition can be the death of an agent (like falling out of the grid world), or reaching a certain goal (like the end of the labyrinth).   
In this thesis, the "true termination condition" is such that if the ending of the provided (train, validation, test) data set is reached, the episode naturally ends.   
Note: in other use cases in financial trading it could also make sense to terminate the episode when a certain wealth level (=> goal) is reached (and then liquidate all stocks and migrate to an island and receive an extra reward etc.).    
In asset / investment management, the goal is usually to stay invested over longer periods (e.g. years, until retirement, reaching financial independence, over multiple generations,...) rather than to realize short-term gains (although these would be nice, they are not the main goal of classical investment management). Hence, terminating an episode when the end of the data set is reached makes the most sense.

- **Time limit termination**: When the episode could go on indefinitely, we can set a limit to the maximum number of time steps within one episode. 

#### True episode termination - 3 episodes, no actions (agent only moves from one day to the other within the environment)

**Description**:
Below in the output, we see that for every episode, the same data in the same sequential order is sampled.   
The first entry of a list (10000) is the initial cash balance, the next 3 entries are the asset prices of the 3 assets we look at, the final 3 empty entries (0) are number of asset holdings.    
Currently, we don't take any action, we only sample a new day at each step. This is done to make it simpler to understand what is going on.

In [147]:
# get data and reduce data set to 5 days (4 possible steps)
episode_length_steps = 5
dumdat = dumdat.loc[0:episode_length_steps-1]

# instantiate environment object
env = Dummy_FinancialMarketEnv(df=dumdat)
env.reset()
# instantiate agent object (action_sampling=False: we won't take any action for now,
# we just observe the state of the next day)
agent = Dummy_agent(env=env, state=None, reward=None, action_sampling=False)

# number of episodes chosen in total: 3; chosen arbitrarily, just for illustration
n_episodes = 3
# run the training algorithm (pass the variable instead of repeating the literal)
training_algorithm(instantiated_agent=agent, instantiated_env=env, n_episodes=n_episodes)

reached end of dataset at step 4
all observations in episode 1:
      cash  AAPL_price  AXP_price  BA_price  AAPL_holdings  AXP_holdings  \
0  10000.0   12.964286      19.33     45.25            0.0           0.0   
1  10000.0   13.511429      19.95     46.17            0.0           0.0   
2  10000.0   13.288571      21.07     46.31            0.0           0.0   
3  10000.0   13.001429      20.01     44.76            0.0           0.0   
4  10000.0   13.242857      20.04     44.79            0.0           0.0   
5  10000.0   13.242857      20.04     44.79            0.0           0.0   

   BA_holdings  
0          0.0  
1          0.0  
2          0.0  
3          0.0  
4          0.0  
5          0.0  
true termination


reached end of dataset at step 4
all observations in episode 2:
      cash  AAPL_price  AXP_price  BA_price  AAPL_holdings  AXP_holdings  \
0  10000.0   12.964286      19.33     45.25            0.0           0.0   
1  10000.0   13.511429      19.95     46.17      

#### True episode termination - 3 episodes, sampled actions randomly from defined action space
Now, we are going to have a look at what happens if we sample actions from the environment.  
Recall that we defined the action space in the environment above, and from this action space we are going to sample. Here, we don't set any random seed.  

The actions here are very simple and only for illustration: the actions vector is simply appended to the state vector, so instead of [0,0,0] at the end of each state vector, we are going to fill these numbers with the corresponding actions sampled.

In [130]:
# this is how the agent we defined above samples the action space:
# every time this is run, a new sample is generated
# (non-deterministic), since no seeding is done here
env.action_space.sample()

array([-0.98027253, -0.26714474,  0.53676194], dtype=float32)

**Description**:
Below in the output, we see that for every episode, again, the same data in the same sequential order is sampled.   
  
The actions were sampled without fixing the random seed, hence they (= asset holdings) are different in every episode.

In [131]:
# get data and reduce data set to 5 days (4 possible steps)
episode_length_steps = 5
dumdat = dumdat.loc[0:episode_length_steps-1]
# instantiate environment object
env = Dummy_FinancialMarketEnv(df=dumdat)
env.reset()
# instantiate agent object (action_sampling=True: this time, we will sample actions
# from the pre-defined action space)
agent = Dummy_agent(env=env, state=None, reward=None, action_sampling=True)

# number of episodes chosen in total: 3; chosen arbitrarily, just for illustration
n_episodes = 3
# run the training algorithm (pass the variable instead of repeating the literal)
training_algorithm(instantiated_agent=agent, instantiated_env=env, n_episodes=n_episodes)

reached end of dataset at step 4
all observations in episode 1:
      cash  AAPL_price  AXP_price  BA_price  AAPL_holdings  AXP_holdings  \
0  10000.0   12.964286      19.33     45.25       0.000000      0.000000   
1  10000.0   13.511429      19.95     46.17      -0.187715     -0.994984   
2  10000.0   13.288571      21.07     46.31       0.206030     -0.341867   
3  10000.0   13.001429      20.01     44.76      -0.398915      0.419919   
4  10000.0   13.242857      20.04     44.79      -0.914911      0.826951   
5  10000.0   13.242857      20.04     44.79      -0.914911      0.826951   

   BA_holdings  
0     0.000000  
1     0.524657  
2     0.260012  
3    -0.299572  
4     0.146937  
5     0.146937  
true termination


reached end of dataset at step 4
all observations in episode 2:
      cash  AAPL_price  AXP_price  BA_price  AAPL_holdings  AXP_holdings  \
0  10000.0   12.964286      19.33     45.25       0.000000      0.000000   
1  10000.0   13.511429      19.95     46.17      

**Description**: 
Now we are going to use seeding in order to check, how it works.  
We want to sample actions but we want this to be reproducible by using the same seed.  
  
env.seed(random_seed) is a function that simply returns the seed.    
In order to seed the action space, the function env.action_space.seed(random_seed) needs to be called.  
This returns a seeded numpy rng (random numbers generator), which then samples actions randomly but in the same order every time we start anew.  
This is demonstrated below.

In [132]:
# this call only returns the seed here (see the output below); as described in the text
# above, the action space has to be seeded separately via env.action_space.seed(...)
env.seed(11)

11

In [178]:
# Seed the action space, draw four samples, then re-seed with the same value and
# draw four more: the samples within one run differ from each other, but both runs
# produce them in the same order.
for run_idx in range(2):
    if run_idx:
        print(" ")
    env.action_space.seed(11)
    for draw_idx in range(4):
        print(env.action_space.sample())

[-0.02584589 -0.76794004  0.3478417 ]
[ 0.41330826 -0.38325065  0.8453528 ]
[-0.00483404  0.18476054  0.6489266 ]
[-0.7800391  -0.5151238   0.47571707]
 
[-0.02584589 -0.76794004  0.3478417 ]
[ 0.41330826 -0.38325065  0.8453528 ]
[-0.00483404  0.18476054  0.6489266 ]
[-0.7800391  -0.5151238   0.47571707]


In [134]:
# https://harald.co/2019/07/30/reproducibility-issues-using-openai-gym/

#### What happens if we create a vectorized environment (wrap the vector class around it)?
This is something I have to do in my work as well because the PPO and A2C agent need it (see also paper described in thesis). They do this to speed up processing /learning.

In [135]:
from stable_baselines3.common.vec_env import DummyVecEnv

In [169]:
# DummyVecEnv takes a list of callables, each of which returns an environment;
# here the list holds a single factory building a fresh dummy environment
vecenv = DummyVecEnv([lambda: Dummy_FinancialMarketEnv(df=dumdat)])
vecenv

<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x23119983f08>

In [170]:
# what are the differences between the vectorized env and the normal env from before?
print(env)
print(vecenv)
print("")
# this is a list of environments, but since we have only passed one, its length is 1
print(vecenv.envs) 
print(vecenv.envs[0]) # this is the only environment we have passed
print("")
# both expose the same action space
print(env.action_space)
print(vecenv.action_space)
print("")
print(env.reset()) # returns first state
#print(vecenv.reset()) # does not work, returns error
# resetting the wrapped environment directly works
print(vecenv.envs[0].reset())
print("")

<Dummy_FinancialMarketEnv instance>
<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv object at 0x0000023119983F08>

[<__main__.Dummy_FinancialMarketEnv object at 0x0000023119983908>]
<Dummy_FinancialMarketEnv instance>

Box(-1.0, 1.0, (3,), float32)
Box(-1.0, 1.0, (3,), float32)

[10000.            12.96428571    19.33          45.25
     0.             0.             0.        ]
[10000.            12.96428571    19.33          45.25
     0.             0.             0.        ]



In [176]:

# seed the action space of the vectorized env and draw one sample; the output below
# equals the first sample drawn from env.action_space after seeding it with 11
vecenv.action_space.seed(11)
vecenv.action_space.sample()


array([-0.02584589, -0.76794004,  0.3478417 ], dtype=float32)

In [146]:
# get data and reduce data set to 5 days (4 possible steps)
episode_length_steps = 5
dumdat = dumdat.loc[0:episode_length_steps-1]
# instantiate environment object and wrap this same object in a vectorized environment
env = Dummy_FinancialMarketEnv(df=dumdat)
env.reset()
vecenv = DummyVecEnv([lambda: env])
# instantiate agent object (action_sampling=True: this time, we will sample actions
# from the pre-defined action space)
agent = Dummy_agent(env=env, state=None, reward=None, action_sampling=True)

# number of episodes chosen in total: 3; chosen arbitrarily, just for illustration
n_episodes = 3
# run the training algorithm (pass the variable instead of repeating the literal)
training_algorithm(instantiated_agent=agent, instantiated_env=vecenv, n_episodes=n_episodes)

reached end of dataset at step 4
all observations in episode 1:
      cash  AAPL_price  AXP_price  BA_price  AAPL_holdings  AXP_holdings  \
0  10000.0   12.964286      19.33     45.25       0.000000      0.000000   
1  10000.0   13.511429      19.95     46.17      -0.967771      0.839267   
2  10000.0   13.288571      21.07     46.31       0.390801      0.289887   
3  10000.0   13.001429      20.01     44.76      -0.685475      0.904185   
4  10000.0   13.242857      20.04     44.79      -0.502355      0.542519   
5  10000.0   13.242857      20.04     44.79      -0.502355      0.542519   

   BA_holdings  
0     0.000000  
1     0.411189  
2    -0.259051  
3    -0.001251  
4     0.905264  
5     0.905264  
true termination


reached end of dataset at step 4
all observations in episode 2:
      cash  AAPL_price  AXP_price  BA_price  AAPL_holdings  AXP_holdings  \
0  10000.0   12.964286      19.33     45.25       0.000000      0.000000   
1  10000.0   13.511429      19.95     46.17      

array([10000.        ,    12.96428571,    19.33      ,    45.25      ,
           0.        ,     0.        ,     0.        ])