# PPO with MLP policy

In [None]:
# Enable IPython's autoreload extension so imported modules (e.g. the local
# `envs` package) are reloaded automatically before code is executed, instead
# of requiring a kernel restart after each edit.
%load_ext autoreload
%autoreload 2

## Imports

In [None]:
import pandas as pd
import numpy as np
import time

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

from envs.gym_market_env import CustomEnv

## Environments

In [None]:
# Load the training and validation splits from the ETL output directory.
# The file names suggest both were produced after a PCA step (presumably
# dimensionality reduction of the raw features — confirm against the ETL code).
train_csv_path = "../etl/train_dataset_after_pca.csv"
eval_csv_path = "../etl/val_dataset_after_pca.csv"

train = pd.read_csv(train_csv_path)
eval_df = pd.read_csv(eval_csv_path)

In [None]:
# NOTE(review): `discount` is not referenced anywhere in the visible part of
# the notebook — confirm whether it is still needed.
discount = 0.2

# Scaling factors handed to the training environment. Their exact effect is
# defined inside CustomEnv (presumably they scale positive and negative
# rewards respectively — confirm against envs/gym_market_env.py).
reward_multiplicator = 144.5
negative_reward_multiplicator = 140.6

# Observation columns: every column whose name contains "f_", plus
# "feature_0" and "weight". The list is built once from the training frame
# and shared by both environments so their observation layouts are identical.
feature_columns = [c for c in train.columns.values if "f_" in c] + ["feature_0", "weight"]

# Environment the agent is trained on (scaled rewards).
train_py_env = CustomEnv(
    trades=train,
    features=feature_columns,
    reward_column="resp",
    weight_column="weight",
    include_weight=True,
    reward_multiplicator=reward_multiplicator,
    negative_reward_multiplicator=negative_reward_multiplicator,
)

# Evaluation environment with unscaled rewards (both multiplicators are 1).
# It wraps the validation frame; the previous version passed `train` here,
# so the "evaluation" environment replayed the training data.
eval_py_env = CustomEnv(
    trades=eval_df,
    features=feature_columns,
    reward_column="resp",
    weight_column="weight",
    include_weight=True,
    reward_multiplicator=1,
    negative_reward_multiplicator=1,
)

## Metrics

In [None]:
def calculate_u_metric(df, model):
    """Score a policy on ``df`` and return ``(t, u, ratio_of_ones)``.

    The policy is queried once for all rows. Each row's reward is
    ``action * weight * resp``; rewards are summed per ``date`` and the
    daily sums ``p_i`` are combined as::

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / number_of_dates)
        u = min(max(t, 0), 6) * sum(p_i)

    The input frame is not modified.

    Args:
        df: DataFrame containing the observation columns (every column whose
            name contains ``"f_"``, plus ``"feature_0"`` and ``"weight"``)
            as well as ``"resp"`` and ``"date"``.
        model: Object whose ``predict(observations)`` returns a tuple with
            the action array as its first element.

    Returns:
        Tuple ``(t, u, ratio_of_ones)``. ``ratio_of_ones`` is the sum of the
        actions divided by their count (the share of 1-actions when actions
        are 0/1). When ``t`` is undefined (no dates, or every daily sum is
        zero, e.g. the policy never trades) ``t`` and ``u`` are ``0.0``
        instead of NaN; ``ratio_of_ones`` is ``0.0`` for an empty frame.

    Raises:
        AssertionError: If the predicted actions contain NaN.
    """
    print("evaluating policy")

    feature_columns = [c for c in df.columns if "f_" in c] + ["feature_0", "weight"]
    actions = model.predict(df[feature_columns].values)[0]
    assert not np.isnan(np.sum(actions))

    sum_of_actions = np.sum(actions)
    print("np_sum(actions)", sum_of_actions)

    # Work on standalone Series rather than writing "action"/"trade_reward"
    # columns into the caller's DataFrame.
    action_series = pd.Series(data=actions, index=df.index)
    trade_reward = action_series * df["weight"] * df["resp"]

    daily_reward = trade_reward.groupby(df["date"]).sum()

    sum_of_pi = daily_reward.sum()
    sum_of_pi_x_pi = (daily_reward * daily_reward).sum()

    print("sum of pi: {sum_of_pi}".format(sum_of_pi=sum_of_pi))

    number_of_dates = daily_reward.shape[0]
    if number_of_dates == 0 or sum_of_pi_x_pi == 0:
        # 0/0 would yield NaN (or ZeroDivisionError for an empty frame);
        # a policy with no realised reward scores zero.
        t = 0.0
    else:
        # 250 is presumably the number of trading days per year — confirm.
        t = sum_of_pi / np.sqrt(sum_of_pi_x_pi) * np.sqrt(250 / number_of_dates)
    print("t: {t}".format(t=t))

    u = np.min([np.max([t, 0]), 6]) * sum_of_pi
    print("u: {u}".format(u=u))

    number_of_actions = len(actions)
    ratio_of_ones = sum_of_actions / number_of_actions if number_of_actions else 0.0
    print("ration of ones", ratio_of_ones)
    print("length of df", number_of_actions)

    print("finished evaluating policy")

    return t, u, ratio_of_ones

## Hyperparameters

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook; the
# training length is set by `total_timesteps` in the `model.learn` call.
# Confirm whether this value is still needed.
total_episodes = 1000

In [None]:
# DummyVecEnv is given a list of zero-argument factories; this one hands back
# the already-constructed training environment.
def _make_train_env():
    return train_py_env


train_env = DummyVecEnv([_make_train_env])


In [None]:
# Train a PPO agent with the library's "MlpPolicy" on the vectorised
# training environment.
model = PPO('MlpPolicy', train_env, verbose=1)
# `learn` documents `total_timesteps` as an int; the literal 2.4e6 is a
# float, so convert it explicitly (same value, 2,400,000 steps).
model.learn(total_timesteps=int(2.4e6))


# Print (t, u, ratio_of_ones) for the validation frame, then the training frame.
print(calculate_u_metric(eval_df, model))
print(calculate_u_metric(train, model))
            
        

In [None]:
# Display the observation matrix for the validation frame. Column names are
# taken from the training frame, as in the environment setup.
inspect_columns = [c for c in train.columns.values if "f_" in c]
inspect_columns = inspect_columns + ["feature_0", "weight"]
eval_df[inspect_columns].values

In [None]:
# Query the trained policy on every validation row and display the raw
# return value of `predict`.
predict_columns = [c for c in train.columns.values if "f_" in c]
predict_columns = predict_columns + ["feature_0", "weight"]
eval_observations = eval_df[predict_columns].values
model.predict(eval_observations)

In [None]:
# Re-run the metric on the validation frame; as the last expression in the
# cell, the returned (t, u, ratio_of_ones) tuple is displayed by the notebook.
calculate_u_metric(eval_df, model)