# 2) Train — Features & PPO / A2C / DQN (SB3)

- Builds indicators via `features.add_indicators`.
- Trains PPO, A2C and DQN agents on a minimal discrete env defined **in this notebook**.
- Saves each model and the list of feature columns used.

In [1]:
import sys, os
sys.path.insert(0, os.path.abspath('..')) # Path fix

import os, json
from pathlib import Path
import numpy as np
import pandas as pd
from dotenv import load_dotenv

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import gymnasium as gym
from gymnasium import spaces

import features  # your features.py

# --- Configuration (overridable through environment variables / a .env file) ---
load_dotenv()
SYMBOL = os.getenv("TRAINING_SYMBOL", "EURUSD")
TIMEFRAME = os.getenv("TIMEFRAME", "M15")
SPLIT_RATIO = float(os.getenv("SPLIT_RATIO", "0.8"))
# Fail fast: a ratio outside (0, 1] would otherwise slice the frame into an
# empty or nonsensical training split without any error.
if not 0.0 < SPLIT_RATIO <= 1.0:
    raise ValueError(f"SPLIT_RATIO must be in (0, 1], got {SPLIT_RATIO}")

# --- Load the OHLC dataset for the configured symbol / timeframe ---
DATA_CSV = Path("data") / f"ohlc_{SYMBOL}_{TIMEFRAME}.csv"
assert DATA_CSV.exists(), f"Missing dataset {DATA_CSV}. Run 1_Data.ipynb or provide CSV."

df = pd.read_csv(DATA_CSV, parse_dates=["time"], index_col="time")

# --- Feature engineering ---
# Rows containing NaN are removed after the indicators are added (presumably the
# indicator warm-up rows — confirm against features.add_indicators). dropna()
# returns a new frame instead of mutating in place, so re-running the cell is
# idempotent.
df_feat = features.add_indicators(df.copy()).dropna()

# Every numeric column (OHLCV + indicators) is used as an observation feature.
candidates = df_feat.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = candidates

# --- Chronological train / validation split (no shuffling) ---
n_split = int(len(df_feat) * SPLIT_RATIO)
df_train = df_feat.iloc[:n_split].copy()
df_val = df_feat.iloc[n_split:].copy()
if df_train.empty:
    raise ValueError(
        f"Training split is empty ({len(df_feat)} usable rows, SPLIT_RATIO={SPLIT_RATIO})."
    )

print("Train:", df_train.shape, "Val:", df_val.shape, "Features:", len(feature_cols))


Train: (39672, 17) Val: (9918, 17) Features: 17


In [2]:

# Minimal discrete trading environment (buy/hold/sell).
class TradingEnv(gym.Env):
    """Minimal single-asset trading environment with three discrete actions.

    Actions map to a target position for the next bar:
    ``0`` -> short (-1), ``1`` -> flat (0), ``2`` -> long (+1). Note that
    action ``1`` sets the position to flat; it does not keep the previous one.

    Observation: the ``feature_cols`` values of the current bar as a float32
    vector.

    Reward: ``position * r - cost`` where ``r`` is the fractional change in
    ``close`` from the observed bar to the *next* bar, and ``cost`` is
    ``trade_cost`` whenever the chosen position differs from the previous one.
    The reward deliberately uses the bar that follows the observation: rewarding
    the move that ended at the observed bar would pay the agent for information
    already contained in its input (look-ahead leakage).

    Episodes never terminate; they are truncated once the last row is reached.

    Parameters
    ----------
    df_feat : pandas.DataFrame
        Feature frame; must contain a ``close`` column and every name in
        ``feature_cols``, with at least two rows. Values are snapshotted into
        NumPy arrays at construction, so later edits to the frame are not seen.
    feature_cols : list of str
        Columns exposed to the agent as the observation.
    trade_cost : float, default 1e-4
        Cost subtracted from the reward on every position change.

    Raises
    ------
    ValueError
        If ``df_feat`` has fewer than two rows.
    """

    metadata = {"render_modes": []}

    # Discrete action -> target position. Unknown actions raise KeyError.
    _ACTION_TO_POSITION = {0: -1, 1: 0, 2: 1}

    def __init__(self, df_feat, feature_cols, trade_cost=1e-4):
        super().__init__()
        self.df = df_feat
        self.cols = feature_cols
        self.trade_cost = float(trade_cost)
        self.n = len(self.df)
        if self.n < 2:
            raise ValueError(
                f"TradingEnv needs at least 2 rows to compute a return, got {self.n}"
            )
        self.idx = 0
        self.position = 0  # -1, 0, +1

        # Convert once up front: a per-step pandas row lookup plus dtype
        # conversion dominates step time. float64 -> float32 mirrors the
        # conversion chain of a per-row `.astype(float)` then float32 cast.
        self._features = (
            self.df[list(self.cols)].to_numpy(dtype=np.float64).astype(np.float32)
        )
        self._close = self.df["close"].to_numpy(dtype=np.float64)

        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(len(self.cols),), dtype=np.float32
        )
        self.action_space = spaces.Discrete(3)  # 0=sell, 1=hold, 2=buy

    def _obs(self):
        """Return the feature vector of the current bar (float32, fresh copy)."""
        return self._features[self.idx].copy()

    def reset(self, seed=None, options=None):
        """Start a new episode at the first row with a flat position.

        Returns the initial observation and an empty info dict.
        """
        super().reset(seed=seed)
        self.idx = 0  # rewards look forward, so no previous bar is required
        self.position = 0
        obs = self._obs()
        return obs, {}

    def step(self, action):
        """Apply ``action`` over the next bar.

        Returns ``(obs, reward, terminated, truncated, info)`` where ``obs`` is
        the observation of the new current bar and ``info`` carries the
        position taken (``"position"``) and the bar's return (``"ret"``).
        Raises ``KeyError`` for an action outside ``{0, 1, 2}``.
        """
        # `.item()` unwraps NumPy scalars and size-1 arrays into a plain Python
        # number, so unhashable ndarray actions no longer break the lookup.
        pos_new = self._ACTION_TO_POSITION[np.asarray(action).item()]

        # Advance first: the position chosen from the observation at bar t is
        # held over t -> t+1, so the reward never uses already-observed prices.
        self.idx += 1
        prev = self._close[self.idx - 1]
        curr = self._close[self.idx]
        ret = (curr - prev) / (prev + 1e-12)  # epsilon guards against a zero price

        # reward is position * return minus cost if changed position
        reward = pos_new * ret - (self.trade_cost if pos_new != self.position else 0.0)
        self.position = pos_new

        terminated = False
        truncated = self.idx >= (self.n - 1)
        obs = self._obs()
        info = {"position": self.position, "ret": ret}
        return obs, float(reward), terminated, truncated, info

In [4]:

# Train PPO
import numpy as np
from stable_baselines3.common.env_util import make_vec_env

def make_env():
    """Build one training environment over ``df_train`` for the PPO agent.

    The env is wrapped in ``Monitor`` so that episode reward/length statistics
    are recorded, matching the A2C and DQN environment factories in this
    notebook.
    """
    # Imported locally because this cell's import lines do not bring it in.
    from stable_baselines3.common.monitor import Monitor

    return Monitor(TradingEnv(df_train, feature_cols))

# Wrap the single training env in a (one-element) vectorised env.
env = DummyVecEnv([make_env])

# Fixed seed so repeated runs are comparable; override with the SEED env var.
SEED = int(os.getenv("SEED", "42"))
model = PPO("MlpPolicy", env, verbose=1, seed=SEED)

# Training budget is configurable through the environment (default: 100k steps).
total_timesteps = int(os.getenv("TOTAL_TIMESTEPS", "100000"))
model.learn(total_timesteps=total_timesteps)

# Persist the trained agent.
Path("models").mkdir(parents=True, exist_ok=True)
model_path = Path("models") / f"ppo_{SYMBOL}_{TIMEFRAME}.zip"
model.save(model_path.as_posix())

# Persist the feature column list next to the model so the observation layout
# used for training is recorded.
with open(Path("models") / "selected_features.json", "w", encoding="utf-8") as f:
    json.dump(feature_cols, f, indent=2)

print("Saved model to:", model_path.resolve())

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 710  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 587         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009429634 |
|    clip_fraction        | 0.0859      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -8.04       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0214     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00715    |
|    value_loss           | 0.00153     |
-----------------------------------------
-----------------

In [3]:
# === Train A2C (same env/data as PPO) ===
import os, json
from pathlib import Path
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

def make_env():
    """Return a Monitor-wrapped ``TradingEnv`` built on the training split for A2C."""
    base_env = TradingEnv(df_train, feature_cols)
    return Monitor(base_env)

# Wrap the single training env in a (one-element) vectorised env.
env_a2c = DummyVecEnv([make_env])

# Fixed seed so repeated runs are comparable; override with the SEED env var.
SEED = int(os.getenv("SEED", "42"))
a2c = A2C("MlpPolicy", env_a2c,
          verbose=1,
          n_steps=5,        # SB3 default
          gamma=0.99,
          learning_rate=7e-4,
          seed=SEED)

# Training budget is configurable through the environment (default: 100k steps).
total_timesteps = int(os.getenv("TOTAL_TIMESTEPS", "100000"))
a2c.learn(total_timesteps=total_timesteps)

# Persist the trained agent.
Path("models").mkdir(parents=True, exist_ok=True)
a2c_path = Path("models") / f"a2c_{SYMBOL}_{TIMEFRAME}.zip"
a2c.save(a2c_path.as_posix())

# (Write once is enough; safe to overwrite)
with open(Path("models") / "selected_features.json", "w", encoding="utf-8") as f:
    json.dump(feature_cols, f, indent=2)

print("Saved A2C model to:", a2c_path.resolve())


Using cpu device
------------------------------------
| time/                 |          |
|    fps                | 359      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.05    |
|    explained_variance | -12.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.00607  |
|    value_loss         | 5.1e-05  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 362      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.05    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | -0.00202 |
|    value_loss      

In [4]:
# === Train DQN (same env/data as PPO) ===
import os, json
from pathlib import Path
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

def make_env():
    """Create the DQN training environment: a ``TradingEnv`` over ``df_train`` inside ``Monitor``."""
    monitored_env = Monitor(TradingEnv(df_train, feature_cols))
    return monitored_env

# Wrap the single training env in a (one-element) vectorised env.
env_dqn = DummyVecEnv([make_env])

# Fixed seed so repeated runs are comparable; override with the SEED env var.
SEED = int(os.getenv("SEED", "42"))
dqn = DQN("MlpPolicy", env_dqn,
          verbose=1,
          learning_rate=1e-3,
          buffer_size=100_000,
          learning_starts=1_000,
          batch_size=64,
          tau=1.0,
          gamma=0.99,
          train_freq=4,
          target_update_interval=1_000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          seed=SEED)

# Training budget is configurable through the environment (default: 100k steps).
total_timesteps = int(os.getenv("TOTAL_TIMESTEPS", "100000"))
dqn.learn(total_timesteps=total_timesteps)

# Persist the trained agent.
Path("models").mkdir(parents=True, exist_ok=True)
dqn_path = Path("models") / f"dqn_{SYMBOL}_{TIMEFRAME}.zip"
dqn.save(dqn_path.as_posix())

# (Write once is enough; safe to overwrite)
with open(Path("models") / "selected_features.json", "w", encoding="utf-8") as f:
    json.dump(feature_cols, f, indent=2)

print("Saved DQN model to:", dqn_path.resolve())


Using cpu device
Saved DQN model to: G:\My Drive\Bots DRL\DRL\DRL-MT5-Lab\notebooks\models\dqn_EURUSD_M15.zip
