In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import config


from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.specs import ArraySpec, BoundedArraySpec
from tf_agents.trajectories import time_step as ts
from sklearn.preprocessing import StandardScaler

In [231]:
class Environment(py_environment.PyEnvironment):
    """Daily portfolio-rebalancing environment backed by the CSV at ``config.FILEPATH``.

    Action
        A vector of ``len(config.TICKERS) + 1`` non-negative weights. Slot 0 is
        cash and slot ``i + 1`` belongs to ``config.TICKERS[i]``. The weights
        are normalised to sum to one; an (almost) all-zero action is replaced
        by an equal split.
    Observation
        The ``config.OBS_COL`` columns of the scaled, per-ticker feature frame
        for the current date, flattened to one dimension.
    Reward
        Change in portfolio value between two consecutive steps. The terminal
        step always reports a reward of 0.

    An episode is a randomly positioned window of the CSV and ends after
    ``config.EPISODE_LENGTH // 3`` steps. Dates without rows (the clock moves
    one calendar day per step) reuse the last known observation, holdings and
    prices.

    NOTE(review): the literal 3 used for window alignment and episode length
    presumably equals the number of rows per date (one per ticker) -- confirm
    before changing ``config.TICKERS``.
    """

    def __init__(self):
        # Run the base initialiser first so PyEnvironment's own bookkeeping
        # exists before reset() fills it in.
        super().__init__()
        self._action_spec = BoundedArraySpec((len(config.TICKERS) + 1,),
                                             dtype=np.float64,
                                             minimum=0,
                                             maximum=1,
                                             name='action')

        self._observation_spec = ArraySpec(shape=(len(config.OBS_COL),),
                                           dtype=np.float64,
                                           name='observation')
        self._episode_ended = False
        self.reset()

    def action_spec(self):
        """Return the spec of the allocation vector accepted by ``step``."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the flattened observation vector."""
        return self._observation_spec

    # ------------------------------------------------------------------ #
    # private helpers
    # ------------------------------------------------------------------ #
    def _load_market_data(self):
        """Read the CSV and fit the scaler once, then reuse both on later resets."""
        if getattr(self, "_full_df", None) is None:
            full_df = pd.read_csv(config.FILEPATH, index_col=0).reset_index(drop=True)
            full_df["Date"] = pd.to_datetime(full_df["Date"])
            scaler = StandardScaler()
            scaler.fit(full_df[config.EN_COLS].values)
            self._full_df = full_df
            self.scaler = scaler
        return self._full_df

    def _open_prices(self):
        """Map ticker -> ``Open`` price for the current time slice ({} if no rows)."""
        return self.df_time_slice.set_index("Ticker")["Open"].to_dict()

    def _wide_frame(self, columns, transform=None):
        """Lay the current slice out side by side as ``<ticker>_<column>`` columns.

        ``transform`` (if given) is applied to each ticker's value array
        before it is wrapped in a frame. Must only be called on a non-empty
        slice.
        """
        frames = []
        for ticker, grp in self.df_time_slice.groupby("Ticker"):
            values = grp[columns].values
            if transform is not None:
                values = transform(values)
            frame = pd.DataFrame(values)
            frame.columns = [ticker + "_" + c for c in columns]
            frames.append(frame)
        return pd.concat(frames, axis=1, join="inner")

    def _refresh_state(self):
        """Recompute ``_state`` and the ``info_`` snapshot from the current slice."""
        scaled = self.get_observations()
        self._state = scaled[config.OBS_COL].values.flatten()
        self.info_ = {"state": self._state,
                      "money_split": self.ratio,
                      "share_num": self.current_share_distribution,
                      "value": self.portfolio_value,
                      "time": self.current_time,
                      "reward": self.step_reward,
                      "scaled_output": scaled}

    # ------------------------------------------------------------------ #
    # PyEnvironment hooks
    # ------------------------------------------------------------------ #
    def _reset(self):
        """Start a new episode on a random window of the data, fully in cash.

        Returns:
            The ``ts.restart`` time step carrying the first observation.

        Raises:
            ValueError: if the data has too few rows for one episode.
        """
        full_df = self._load_market_data()

        # History of unscaled close prices, filled by get_observations_unscaled().
        self.return_mem = pd.DataFrame(columns=[t + "_Close" for t in config.TICKERS])
        self._episode_ended = False
        self.step_index = 0
        self.timeframe = pd.Timedelta(1, unit='d')
        self.init_cash = 1000
        self.current_cash = self.init_cash
        self.portfolio_value = self.init_cash

        # Last seen Open price per ticker, used when a ticker has no row for a date.
        self.prev_price_share = {}
        self.prev_price_cash = {}

        # Everything starts in the cash slot.
        self.ratio = np.zeros((len(config.TICKERS) + 1))
        self.ratio[0] = 1

        self.max_index = full_df.shape[0]
        if self.max_index - config.EPISODE_LENGTH <= 3:
            raise ValueError(
                "Not enough rows (%d) in %s for an episode of length %d"
                % (self.max_index, config.FILEPATH, config.EPISODE_LENGTH))
        # Snap the window to a multiple of 3 rows (see class NOTE about the 3).
        start_point = (np.random.choice(
            np.arange(3, self.max_index - config.EPISODE_LENGTH)) // 3) * 3
        end_point = start_point + config.EPISODE_LENGTH // 3 * 3
        # .loc is end-inclusive; the slice is a fresh frame, the cache is untouched.
        self.df = full_df.loc[start_point:end_point + 2].reset_index(drop=True)

        self.init_time = self.df.loc[0, "Date"]
        self.current_time = self.init_time
        self.df_time_slice = self.df[self.df['Date'] == self.init_time]

        self.current_share_distribution = self.calculate_actual_shares_from_money_split()
        self.previous_value = self.portfolio_value
        self.current_money_distribution, self.portfolio_value = \
            self.calculate_money_from_num_stocks()

        self.step_reward = 0
        self._refresh_state()
        return ts.restart(self._state)

    def _step(self, action):
        """Rebalance to ``action``, advance one day and report the value change.

        Args:
            action: iterable of ``len(config.TICKERS) + 1`` non-negative weights
                (slot 0 is cash).

        Returns:
            A transition time step, or a termination time step (reward 0) once
            ``config.EPISODE_LENGTH // 3`` steps have been taken. If the episode
            had already ended, the environment is reset instead.

        Raises:
            ValueError: if the transition time step cannot be built.
        """
        if self._episode_ended:
            return self.reset()

        total = sum(action)  # hoisted: the original recomputed it for every element
        if total <= 1e-3:
            # Degenerate (near-zero) action: fall back to an equal split.
            self.ratio = [1 / len(action) for _ in action]
        else:
            self.ratio = [a / total for a in action]

        self.current_share_distribution = self.calculate_actual_shares_from_money_split()
        self.step_time()
        self.step_index += 1

        self._refresh_state()
        reward = self.info_["reward"]
        self._episode_ended = self.step_index >= config.EPISODE_LENGTH // 3

        if self._episode_ended:
            reward = 0
            return ts.termination(self._state, reward)

        try:
            return ts.transition(self._state, reward=reward, discount=1)
        except Exception as e:
            # Dump the state that led to the failure, then re-raise with the
            # original exception chained so its traceback is not lost.
            print(self._state)
            print(reward)
            print(action)
            print(self.step_index)
            print(self.df_time_slice)
            print(self.current_time)
            print(self.ratio)
            print(e)
            raise ValueError(
                "could not build transition at step %d: %s" % (self.step_index, e)
            ) from e

    # ------------------------------------------------------------------ #
    # simulation
    # ------------------------------------------------------------------ #
    def step_time(self):
        """Advance the clock by one ``timeframe`` and re-value the holdings.

        Updates ``df_time_slice``, ``previous_value``, ``portfolio_value``,
        ``current_money_distribution``, ``ratio`` and ``step_reward``.
        """
        self.current_time += self.timeframe
        self.df_time_slice = self.df[self.df['Date'] == self.current_time]
        self.previous_value = self.portfolio_value
        self.current_money_distribution, self.portfolio_value = \
            self.calculate_money_from_num_stocks()
        self.ratio = self.normalize_money_dist()
        self.step_reward = self.portfolio_value - self.previous_value

    def get_observations(self):
        """Return the scaled ``config.EN_COLS`` features, one column group per ticker.

        When the current date has no rows, the previously returned frame is
        handed back unchanged.
        """
        if self.df_time_slice.empty:
            return self.prev_obs
        obs = self._wide_frame(config.EN_COLS, transform=self.scaler.transform)
        self.prev_obs = obs
        return obs

    def get_observations_unscaled(self):
        """Return the raw ``config.COLS`` values per ticker and log the closes.

        Each call on a non-empty slice appends the ``<ticker>_Close`` columns
        to ``return_mem``. When the current date has no rows, the previously
        returned frame is handed back and nothing is logged.
        """
        if self.df_time_slice.empty:
            return self.prev_obs_unscaled
        obs = self._wide_frame(config.COLS)

        closes = obs[[t + "_Close" for t in config.TICKERS]]
        if self.return_mem.empty:
            # Avoid concatenating with an empty object-dtype frame.
            self.return_mem = closes.reset_index(drop=True)
        else:
            self.return_mem = pd.concat([self.return_mem, closes], ignore_index=True)
        self.prev_obs_unscaled = obs
        return obs

    def calculate_actual_shares_from_money_split(self):
        """Convert ``ratio`` into share counts at the current ``Open`` prices.

        Also sets ``current_cash`` from the cash slot of ``ratio``. A ticker
        missing from the current slice is priced at its last seen ``Open``.
        When the slice has no rows at all, the existing share counts are
        returned unchanged and cash is left as is.

        Returns:
            List with one share count per entry of ``config.TICKERS``.
        """
        price_dict = self._open_prices()
        if not price_dict:
            return self.current_share_distribution

        num_shares = []
        for i, ticker in enumerate(config.TICKERS):
            price = price_dict[ticker] if ticker in price_dict \
                else self.prev_price_share[ticker]
            num_shares.append(self.ratio[i + 1] * self.portfolio_value / price)

        self.current_cash = self.ratio[0] * self.portfolio_value
        self.prev_price_share.update(price_dict)
        return num_shares

    def calculate_money_from_num_stocks(self):
        """Value the cash and share holdings at the current ``Open`` prices.

        A ticker missing from the current slice is priced at its last seen
        ``Open``. When the slice has no rows at all, the previous distribution
        and total are returned unchanged.

        Returns:
            ``(money_dist, total)`` where ``money_dist[0]`` is cash and
            ``money_dist[i + 1]`` is the value held in ``config.TICKERS[i]``.
        """
        price_dict = self._open_prices()
        if not price_dict:
            return self.current_money_distribution, self.portfolio_value

        money_dist = [self.current_cash]
        for i, ticker in enumerate(config.TICKERS):
            price = price_dict[ticker] if ticker in price_dict \
                else self.prev_price_cash[ticker]
            money_dist.append(self.current_share_distribution[i] * price)

        self.prev_price_cash.update(price_dict)
        return money_dist, sum(money_dist)

    def normalize_money_dist(self):
        """Return ``current_money_distribution`` as fractions of ``portfolio_value``."""
        return [amount / self.portfolio_value
                for amount in self.current_money_distribution]

In [243]:
# Constructing the environment already runs reset(): it reads config.FILEPATH
# and samples a random episode window, so no separate reset call is needed here.
env = Environment()

In [244]:
# Cash balance right after construction; _reset() starts it at init_cash (1000).
env.current_cash

1000.0

In [245]:
# Fixed allocation used for the smoke-test rollout: slot 0 is cash, the
# remaining slots follow config.TICKERS (4 entries assumes three tickers --
# confirm against config). Built as a float64 array so it matches the dtype
# declared by Environment.action_spec().
action = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float64)

In [246]:
# Roll the environment forward with the fixed allocation until it reports a
# terminal step, reading termination from the returned TimeStep rather than
# from the environment's private flag.
MAX_STEPS = 9_999  # safety cap so a never-terminating episode cannot hang the cell

time_step = env.current_time_step()
for _ in range(MAX_STEPS):
    if time_step.is_last():
        print(env.step_index)
        break
    time_step = env.step(action)

# Final portfolio value, printed once whether or not the episode terminated.
print(env.portfolio_value)

[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[100.0, 198.53550115661474, 301.60898014320827, 397.9015508663369]
[99.80460321661599, 197.97673175882795, 297.3083228558131, 399.542332217978]
[99.80460321661599, 197.97673175882795, 297.3083228558131, 399.542332217978]
[99.80460321661599, 197.97673175882795, 29