In [11]:
import os

import gymnasium as gym
import pandas as pd
from gymnasium import spaces
import numpy as np
import logging
import torch
import mplfinance as mpf
import pandas as pd
import datetime

# Module-level logger; no handlers or levels are configured in this cell.
logger = logging.getLogger(__name__)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
def render_to_file(**kwargs):
    """Append closed transactions to a CSV log file and optionally print them.

    Only transactions whose ``CloseStep`` is >= 0 (i.e. already closed) are
    reported; still-open positions are skipped.

    Keyword Args:
        log_header (bool): Emit the column header before the rows. Default False.
        log_filename (str): Path of the CSV log. Parent directories are created
            on demand. When empty, nothing is written to disk.
        printout (bool): Also print a fixed-width table to stdout. Default False.
        balance, balance_initial: Accepted for backward compatibility; not used.
        transaction_close_this_step (list[dict]): Position dictionaries. Closed
            entries must provide every key listed in ``csv_columns`` below.
        done_information (str): Optional free text appended after the rows
            (and printed when ``printout`` is set). Falsy values are ignored.
    """
    log_header = kwargs.get("log_header", False)
    log_filename = kwargs.get("log_filename", "")
    printout = kwargs.get("printout", False)
    transactions = kwargs.get("transaction_close_this_step") or []
    done_information = kwargs.get("done_information", "")

    csv_columns = (
        "Ticket", "Type", "ActionTime", "ActionStep", "ActionPrice",
        "CloseTime", "ClosePrice", "OpenBal", "CloseBal", "Status",
        "Info", "PIPS", "SL", "PT", "CloseStep", "DeltaStep",
    )
    closed = [_tr for _tr in transactions if _tr["CloseStep"] >= 0]

    # ---- CSV output -------------------------------------------------------
    # Rows are joined field by field so that no source-code indentation or
    # stray spaces end up inside the CSV cells.
    log = ""
    if log_header:
        log += ",".join(csv_columns) + "\n"
    for _tr in closed:
        log += ",".join(f"{_tr[column]}" for column in csv_columns) + "\n"
    if done_information:
        log += done_information

    if log and log_filename:
        dir_path = os.path.dirname(log_filename)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        # The context manager closes the file; no explicit close() is needed.
        with open(log_filename, "a+") as _f:
            _f.write(log)

    # ---- Console output ---------------------------------------------------
    # Header and rows share the same column widths so the table lines up.
    tr_lines = ""
    if log_header:
        tr_lines += (
            f'{"Ticket":>8} {"Type":>4} {"ActionStep":>10} {"ActionPrice":>12} '
            f'{"CloseStep":>9} {"ClosePrice":>12} {"OpenBal":>12} {"CloseBal":>12} '
            f'{"Status":>6} {"Info":>19} {"PIPS":>6} {"SL":>6} {"PT":>6} {"DeltaStep":>9}\n'
        )
    for _tr in closed:
        tr_lines += (
            f'{_tr["Ticket"]:>8} {_tr["Type"]:>4} {_tr["ActionStep"]:>10} {_tr["ActionPrice"]:>12.5f} '
            f'{_tr["CloseStep"]:>9} {_tr["ClosePrice"]:>12.5f} {_tr["OpenBal"]:>12.2f} {_tr["CloseBal"]:>12.2f} '
            f'{_tr["Status"]:>6} {_tr["Info"]:>19} {_tr["PIPS"]:>6.0f} {_tr["SL"]:>6.0f} {_tr["PT"]:>6.0f} '
            f'{_tr["DeltaStep"]:>9}\n'
        )

    if printout and tr_lines:
        print(tr_lines)
        if done_information:
            print(done_information)

In [None]:
class ForexTradingEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, file, cf, asset, features, sequence_length=24, logger_show=False, save_plot=False):
        """Create the environment from a price CSV and a configuration object.

        Args:
            file: Path of the CSV file read with ``pd.read_csv``.
            cf: Configuration object exposing ``env_parameters(name)`` and
                ``symbol(asset, name)``.
            asset: Symbol name used to look up per-symbol settings.
            features: Column names that make up one observation row.
            sequence_length: Number of past rows contained in an observation.
            logger_show: When true, per-step messages are sent to the logger.
            save_plot: Stored on the instance as ``self.save_plot``.
        """
        super().__init__()
        # Store the arguments, load the data and read the configuration values.
        self._initialize_parameters(
            file=file,
            cf=cf,
            asset=asset,
            features=features,
            sequence_length=sequence_length,
            logger_show=logger_show,
            save_plot=save_plot,
        )
        # Define the action and observation spaces.
        self._initialize_spaces()
        # Put the environment into its initial state.
        self.reset()

    # ကိန်းရှင်များကို စတင်သတ်မှတ်သည်။
    def _initialize_parameters(self, file, cf, asset, features, sequence_length, logger_show, save_plot):
        """Store constructor arguments, load the price data and read config values.

        The arguments are the ones received by ``__init__``. Configuration is
        read through ``cf.env_parameters(name)`` for environment-wide settings
        and ``cf.symbol(asset, name)`` for per-symbol settings.
        """
        # --- Constructor arguments -------------------------------------------
        self.csv_file = file
        self.cf = cf
        self.symbol_col = asset
        self.features = features
        self.sequence_length = sequence_length
        self.logger_show = logger_show
        self.save_plot = save_plot

        # --- Price data ---------------------------------------------------------
        self.data = pd.read_csv(file)
        # Every observation needs `sequence_length` preceding rows, which caps
        # the highest step index an episode may reach.
        self.max_steps = len(self.data) - self.sequence_length - 1

        # --- Environment-wide settings -------------------------------------------
        env_param = self.cf.env_parameters
        # Threshold that turns the continuous action into BUY / SELL / HOLD.
        self.action_threshold = env_param('action_threshold')
        self.balance_initial = env_param('balance')
        # Magnitude of the per-step shaping reward for a position that is still
        # open: positive while it is in profit, negative while it is in loss.
        self.good_position_reward_scale = env_param("good_position_reward_scale")
        # Distance, in points, kept between the best price seen and the trailing stop.
        self.trailing_distance = env_param("trailing_stop_distance_points")

        # Bonus for a balanced win/loss trading performance (currently disabled).
        # self.consistency_reward = self.cf.env_parameters("consistency_reward")

        # --- Per-symbol settings ----------------------------------------------------
        symbol_settings = (
            ("stop_loss", "stop_loss_max"),
            ("profit_taken", "profit_taken_max"),
            ("point", "point"),
            ("transaction_fee", "transaction_fee"),
            ("over_night_penalty", "over_night_penalty"),
            ("max_current_holding", "max_current_holding"),
        )
        for attribute, config_key in symbol_settings:
            setattr(self, attribute, self.cf.symbol(self.symbol_col, config_key))

        # Multiplier applied to the current drawdown when penalising the reward.
        self.drawdown_penalty_factor = env_param("drawdown_penalty_factor")


    # Action နှင့် Observation Spaces ကို သတ်မှတ်သည်။
    def _initialize_spaces(self):
        """Define the action space and the observation space."""
        # A single continuous action in [-1, 1]; `_get_action_name` converts it
        # to BUY / HOLD / SELL using `action_threshold`.
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

        # An observation is a window of the last `sequence_length` rows, one
        # column per feature, so the whole sequence is seen at once.
        window_shape = (self.sequence_length, len(self.features))
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=window_shape, dtype=np.float32
        )

    # Environment ကို အစပြုအခြေအနေသို့ ပြန်လည်သတ်မှတ်သည်။
    def reset(self, *, seed = None, options = None):
        """Restore the initial episode state and return the first observation.

        Args:
            seed: Forwarded to the parent ``reset``.
            options: Forwarded to the parent ``reset``.

        Returns:
            tuple: ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed, options=options)

        starting_balance = self.balance_initial

        # Trade bookkeeping
        self.ticket_id = 0
        self.ttl_rewards = 0  # cumulative reward of the episode
        self.balance = starting_balance
        self.positions = []

        # Equity tracking: the curve and its running peak both start at the
        # initial balance.
        self.equity_curve = [starting_balance]
        self.peak_equity = starting_balance
        self.max_drawdown = 0.0
        self.current_drawdown = 0.0

        # The first observation needs `sequence_length` rows of history.
        self.current_step = self.sequence_length
        logger.info(f"--- Environment reset. Starting at step {self.current_step} --total rewards: {self.ttl_rewards}")

        return self._next_observation(), {}


    # AI model အတွက် လက်ရှိ market condition ကိုကိုယ်စားပြုတဲ့ observation data ကို ပြင်ဆင်ပေးဖို့ဖြစ်ပါတယ်။
    def _next_observation(self):
        """Return the feature window that ends just before the current step.

        Returns:
            np.ndarray: float32 array with the rows
            ``[current_step - sequence_length, current_step)`` of ``self.data``
            restricted to the ``self.features`` columns.

        Raises:
            ValueError: If the window contains NaN or infinite values.
        """
        window_start = self.current_step - self.sequence_length
        window = self.data.iloc[window_start:self.current_step][self.features]
        obs = np.ascontiguousarray(window.values, dtype=np.float32)

        # Validate directly in NumPy. The result has to be a NumPy array anyway,
        # so copying the window to the torch device and back on every step just
        # to test for NaN/inf only added transfer overhead.
        if not np.isfinite(obs).all():
            logger.error(f"Invalid observation at step {self.current_step}")
            raise ValueError(f"Invalid observation at step {self.current_step}")

        return obs



    def _get_action_name(self, _action):
        """Map a continuous action value to ``"BUY"``, ``"SELL"`` or ``"HOLD"``.

        Values at or above ``action_threshold`` are BUY, values at or below
        ``-action_threshold`` are SELL, everything in between is HOLD.
        """
        threshold = self.action_threshold
        if _action >= threshold:
            return "BUY"
        if _action <= -threshold:
            return "SELL"
        return "HOLD"

    def step(self, action):
        """Advance the environment by one bar.

        Order of operations:
          1. Every open position is settled or trailed by ``_calculate_reward``.
          2. If the action maps to BUY/SELL and fewer than ``max_current_holding``
             positions are still open, a new position is opened at the current
             close price and 100 units of balance are held as margin.
          3. The step counter advances and the next observation is built.
          4. Equity and drawdown are updated and the drawdown penalty
             (``current_drawdown * drawdown_penalty_factor``) is subtracted
             from the reward.

        Args:
            action: Indexable whose first element is the continuous action value.

        Returns:
            tuple: ``(observation, reward, done, truncated, info)``. ``done`` is
            true when the data is exhausted or the balance is not positive;
            ``truncated`` is always ``False``.
        """
        bar = self.data.iloc[self.current_step]
        _c, _t, _day = bar['close'], bar['time'], bar['day']

        reward = 0               # total reward of this step
        position_reward = 0      # summed reward of all positions evaluated this step
        action_hold_reward = 0   # HOLD is neither rewarded nor penalised
        _msg = []
        _action = action[0]      # action arrives as e.g. [0.75]

        open_position = 0
        for position in self.positions:
            if position['Status'] != 0:
                continue
            single_reward, closed, position_msg = self._calculate_reward(position)
            # Collect the messages of every position; assigning the returned
            # list to `_msg` would discard those of earlier positions.
            _msg.extend(position_msg)
            position_reward += single_reward
            if not closed:
                open_position += 1
        reward += position_reward

        # Continuous actions: [1 -> 0.5] LONG | [0.5 -> -0.5] HOLD |[-0.5 -> -1] SHORT
        action_name = self._get_action_name(_action)

        if open_position < self.max_current_holding and action_name in ['BUY', 'SELL']:
            self.ticket_id += 1
            position = {
                "Ticket"        :   self.ticket_id,
                "Symbol"        :   self.symbol_col,
                "ActionTime"    :   _t,
                "Type"          :   action_name,
                "Lot"           :   1,
                "ActionPrice"   :   _c,
                "SL"            :   self.stop_loss,
                "PT"            :   self.profit_taken,
                "MaxDD"         :   0,
                "Swap"          :   0.0,
                "CloseTime"     :   "",
                "ClosePrice"    :   0.0,
                "Point"         :   self.point,
                "Reward"        :   self.transaction_fee,
                "DateDuration"  :   _day,
                "Status"        :   0,   # 0 = open; set to 1 (SL), 2 (PT) or 3 (forced) on close
                "PIPS"          :   0,   # realised profit/loss, filled in on close
                "ActionStep"    :   self.current_step,
                "CloseStep"     :   -1,  # -1 until the position is closed
                "DeltaStep"     :   0,
                "OpenBal"       :   self.balance - 100,
                "CloseBal"      :   0,
                "HighestPrice"  :   _c,
                "LowestPrice"   :   _c,
            }
            self.positions.append(position)

            # No transaction-fee penalty is applied on opening. Instead 100 units
            # of balance are held like a margin requirement, which keeps the
            # model from opening an excessive number of positions; the amount is
            # returned when the position closes.
            self.balance -= 100
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]} {position["Type"]} Rwd:{position["PIPS"]} SL:{position["SL"]} PT:{position["PT"]}')

        reward += action_hold_reward

        # Move to the next time step
        self.current_step += 1

        # The episode ends when the data is exhausted or the balance is gone.
        done = self.current_step > self.max_steps or self.balance <= 0

        obs = self._next_observation()
        _msg.append(f'---idle----step:{self.current_step}, RF:{action_name} Action:{_action} Balance: {self.balance} reward:{reward} total_rewards:{self.ttl_rewards} position_reward:{position_reward} action_hold_reward:{action_hold_reward}')

        current_equity = self._calculate_current_equity()
        self.equity_curve.append(current_equity)
        self._calculate_drawdown()  # updates peak_equity, current_drawdown and max_drawdown

        # Drawdown penalty: `current_drawdown` is a fraction of the peak equity.
        drawdown_penalty = self.current_drawdown * self.drawdown_penalty_factor
        reward -= drawdown_penalty
        _msg.append(f'Drawdown Penalty: -{drawdown_penalty:.4f} (DD:{self.current_drawdown:.4f})')

        # The running total is updated only after the penalty has been applied.
        self.ttl_rewards += reward

        if done:
            buy_positions = [p for p in self.positions if p["Type"] == "BUY"]
            sell_positions = [p for p in self.positions if p["Type"] == "SELL"]

            buy_count = len(buy_positions)
            sell_count = len(sell_positions)
            total_positions = len(self.positions)

            # Win rate = share of positions with a positive realised result.
            buy_wins = len([p for p in buy_positions if p["PIPS"] > 0])
            sell_wins = len([p for p in sell_positions if p["PIPS"] > 0])

            buy_win_rate = buy_wins / buy_count if buy_count > 0 else 0
            sell_win_rate = sell_wins / sell_count if sell_count > 0 else 0

            _m = f'--- Positions: {total_positions} (Buy:{buy_count}, Sell:{sell_count}) | '
            _m += f'WinRates: Buy:{buy_win_rate:.1%}, Sell:{sell_win_rate:.1%} | '
            _m += f'TotalRewards: {self.ttl_rewards} Balance: {self.balance}'

            _msg.append(_m)
            # With `logger_show` the summary is logged by the loop below, so it
            # is logged directly only when that loop does not run.
            if not self.logger_show:
                logger.info(_m)

        if self.logger_show:
            for _m in _msg:
                logger.info(_m)

        info = {
            "info": _msg,
            "sharpe_ratio": self._calculate_sharpe(),
            "max_drawdown": self.max_drawdown,
            "current_equity": current_equity,
            "peak_equity": self.peak_equity,
            "equity_curve_length": len(self.equity_curve),
        }
        truncated = False
        return obs, reward, done, truncated, info




    def _calculate_reward(self, position):
        """Evaluate one open position against the current bar.

        The position is closed when the bar touches its stop loss (checked
        first, so a bar that touches both levels counts as a stop-out), when it
        touches its profit target, or when the end of the data is near (forced
        close at the bar's close price). Otherwise the trailing stop is
        updated and a small shaping reward is returned.

        Args:
            position (dict): Position record created in ``step``; it is updated
                in place.

        Returns:
            tuple: ``(reward, closed, messages)`` where ``reward`` is the close
            result in points (or the shaping reward while the position stays
            open), ``closed`` tells whether the position was closed on this bar
            and ``messages`` is a list of log strings.
        """
        bar = self.data.iloc[self.current_step]
        _h, _l, _c, _t = bar['high'], bar['low'], bar['close'], bar['time']
        _msg = []

        entry_price = position['ActionPrice']
        direction = position['Type']
        # PT is a positive and SL a negative number of points, so the same
        # formula places the target on the winning side and the stop on the
        # losing side of the entry price.
        profit_target_price = entry_price + position['PT'] / self.point if direction == 'BUY' else entry_price - position['PT'] / self.point
        stop_loss_price = entry_price + position['SL'] / self.point if direction == 'BUY' else entry_price - position['SL'] / self.point
        closed = False
        close_position_reward = 0.0
        good_position_reward = 0.0

        stop_hit = (direction == 'BUY' and _l <= stop_loss_price) or (direction == 'SELL' and _h >= stop_loss_price)
        target_hit = (direction == 'BUY' and _h >= profit_target_price) or (direction == 'SELL' and _l <= profit_target_price)
        # Too few rows remain to keep the position open.
        near_end_of_data = self.current_step + 5 + self.sequence_length >= len(self.data)
        level_info = f'{profit_target_price:.5f} | {stop_loss_price:.5f}'

        if stop_hit:
            close_position_reward = position['SL']
            self._close_position(position, 1, stop_loss_price, _t, close_position_reward, level_info)
            closed = True
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: Rwd:{position["PIPS"]}, SL:{position["SL"]}, DeltaStep:{position["DeltaStep"]}')

        elif target_hit:
            close_position_reward = position['PT']
            self._close_position(position, 2, profit_target_price, _t, close_position_reward, level_info)
            closed = True
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: Rwd:{position["PIPS"]}, SL:{position["SL"]}, DeltaStep:{position["DeltaStep"]}')

        elif near_end_of_data:
            close_position_reward = (_c - position["ActionPrice"] if direction == 'BUY' else position["ActionPrice"] - _c) * self.point
            # `_close_position` records CloseBal after the balance has been
            # credited, the same as for stop-loss and profit-target closes.
            self._close_position(position, 3, _c, _t, close_position_reward, level_info)
            closed = True
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: Rwd:{position["PIPS"]}, Cls:End, DeltaStep:{position["DeltaStep"]}')

        else:
            # ---- Trailing stop --------------------------------------------------
            # Track the best close seen so far and pull the stop towards it,
            # but only ever in the favourable direction.
            trailing_happened = False
            if direction == "BUY":
                if _c > position["HighestPrice"]:
                    position["HighestPrice"] = _c
                trailing_price = position["HighestPrice"] - self.trailing_distance / self.point
                if trailing_price > stop_loss_price:
                    stop_loss_price = trailing_price
                    # Store the new stop as points relative to the entry price.
                    position["SL"] = (stop_loss_price - entry_price) * self.point
                    trailing_happened = True
            elif direction == "SELL":
                if _c < position["LowestPrice"]:
                    position["LowestPrice"] = _c
                trailing_price = position["LowestPrice"] + self.trailing_distance / self.point
                if trailing_price < stop_loss_price:
                    stop_loss_price = trailing_price
                    position["SL"] = (entry_price - stop_loss_price) * self.point
                    trailing_happened = True

            # ---- Shaping reward -------------------------------------------------
            # NOTE(review): at break-even (delta == 0) a BUY is rewarded while a
            # SELL is penalised; kept as found, confirm this is intended.
            delta = _c - entry_price
            if direction == "BUY":
                reward_sign = 1 if delta >= 0 else -1
            elif direction == "SELL":
                reward_sign = -1 if delta >= 0 else 1
            else:
                reward_sign = 0  # unknown direction: no shaping reward

            good_position_reward = reward_sign * self.good_position_reward_scale

            # Small bonus only when the stop actually moved on this bar.
            if trailing_happened:
                good_position_reward += 0.001

            position['Info'] = f'{profit_target_price:.5f} | {stop_loss_price:.5f}'
            position['CloseBal'] = self.balance
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: NO_Close, PT:{position["PT"]}, SL:{position["SL"]}')

        return close_position_reward + good_position_reward, closed, _msg

    def _close_position(self, position, status, close_price, close_time, close_reward, info):
        """Mark ``position`` as closed, return its margin and settle the balance.

        Args:
            position (dict): Position record, updated in place.
            status (int): 1 = stop loss, 2 = profit target, 3 = forced close.
            close_price: Price at which the position is considered closed.
            close_time: Timestamp of the closing bar.
            close_reward: Result in points before the transaction fee.
            info (str): Text stored in the position's ``Info`` field.
        """
        position['CloseTime'] = close_time
        position['ClosePrice'] = close_price
        position['Status'] = status
        position['CloseStep'] = self.current_step
        position['PIPS'] = close_reward - self.transaction_fee
        position['DeltaStep'] = self.current_step - position['ActionStep']
        position['Info'] = info

        # Return the 100 held as margin when the position was opened, plus the result.
        self.balance += 100 + position['PIPS']
        position['CloseBal'] = self.balance


    def _calculate_sharpe(self, risk_free_rate=0.0, periods_per_day=288):
        """Return the Sharpe ratio of the episode's equity curve so far.

        Per-step returns are computed from consecutive equity values and the
        ratio is scaled by ``sqrt(periods_per_day)``. The default of 288 is
        the number of 5-minute bars in a day, so the result is scaled to one
        day of bars rather than to a year.

        Args:
            risk_free_rate: Per-step risk-free return subtracted from the mean.
            periods_per_day: Number of steps used for the square-root scaling.

        Returns:
            float: The scaled Sharpe ratio, or 0.0 when it is undefined (fewer
            than two equity points, a zero equity value, or zero / non-finite
            volatility).
        """
        if len(self.equity_curve) < 2:
            return 0.0

        equity = np.asarray(self.equity_curve, dtype=np.float64)
        previous = equity[:-1]
        # A zero equity value makes the step return undefined (division by
        # zero), which would otherwise turn the whole ratio into NaN.
        if np.any(previous == 0):
            return 0.0

        returns = np.diff(equity) / previous
        volatility = np.std(returns)
        if volatility == 0 or not np.isfinite(volatility):
            return 0.0

        sharpe = (np.mean(returns) - risk_free_rate) / volatility
        return float(sharpe * np.sqrt(periods_per_day))

    def _calculate_drawdown(self):
        """Refresh the peak equity and the current / maximum drawdown.

        The drawdown is the distance of the latest equity value from the
        running peak, expressed as a fraction of that peak.
        """
        latest_equity = self.equity_curve[-1]

        # Raise the running peak when a new high is reached.
        if latest_equity > self.peak_equity:
            self.peak_equity = latest_equity
        peak = self.peak_equity

        self.current_drawdown = (peak - latest_equity) / peak

        # Remember the deepest drawdown of the episode.
        if self.current_drawdown > self.max_drawdown:
            self.max_drawdown = self.current_drawdown


    def _calculate_current_equity(self):
        """Return the balance plus the unrealised result of all open positions.

        The unrealised result of a position is the move of the current close
        price away from its entry price, in the position's favourable
        direction, multiplied by ``self.point``.
        """
        equity = self.balance
        for open_pos in self.positions:
            if open_pos['Status'] != 0:
                # Closed positions are already reflected in the balance.
                continue

            last_close = self.data.iloc[self.current_step]["close"]
            if open_pos['Type'] == 'BUY':
                price_move = last_close - open_pos['ActionPrice']
            else:
                price_move = open_pos['ActionPrice'] - last_close

            equity += price_move * self.point

        return equity

    def render(self, mode='human', title=None, **kwargs):
        """Write the position log through ``render_to_file``.

        Args:
            mode: ``'human'`` writes the log file and prints the table,
                ``'file'`` only writes the log file. Any other value does nothing.
            title: Not used.
            **kwargs: Not used.
        """
        if mode not in ('human', 'file'):
            return

        # The log is written next to the data, with "split/" swapped for "log/".
        render_to_file(
            log_header=True,
            log_filename=self.csv_file.replace("split/", "log/"),
            printout=(mode == 'human'),
            balance=self.balance,
            balance_initial=self.balance_initial,
            transaction_close_this_step=self.positions,
            done_information=False,
        )

In [None]:
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np

class TrainingMetricsCallback(BaseCallback):
    """Collect the Sharpe ratio and maximum drawdown of finished episodes.

    The values are read from the ``info`` dictionaries returned by the
    environments. Every 10 finished episodes the mean of the last 10 values
    is recorded on the Stable-Baselines3 logger.

    Args:
        check_freq: Stored on the instance; not used by the callback itself.
        verbose: Verbosity level passed to ``BaseCallback``.
    """

    def __init__(self, check_freq=1000, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.sharpe_ratios = []
        self.drawdowns = []
        self.episode_count = 0

    def _on_step(self) -> bool:
        """Record metrics for every environment whose episode just ended.

        Returns:
            bool: Always ``True`` so that training continues.
        """
        infos = self.locals.get('infos', ())
        dones = self.locals.get('dones', ())

        # The environment reports its metrics on every step, so the `dones`
        # flags decide which infos belong to a finished episode. All
        # environments of the vectorised env are inspected, not only the first.
        for info, done in zip(infos, dones):
            if not done:
                continue

            # The environment publishes the value as "sharpe_ratio"; "sharpe"
            # is still accepted as a fallback key.
            sharpe = info.get('sharpe_ratio', info.get('sharpe'))
            max_drawdown = info.get('max_drawdown')
            if sharpe is None or max_drawdown is None:
                continue

            self.episode_count += 1
            self.sharpe_ratios.append(sharpe)
            self.drawdowns.append(max_drawdown)

            # Log to tensorboard every 10 episodes
            if self.episode_count % 10 == 0:
                self.logger.record('train/mean_sharpe', np.mean(self.sharpe_ratios[-10:]))
                self.logger.record('train/max_drawdown', np.mean(self.drawdowns[-10:]))
                self.logger.record('train/episodes', self.episode_count)

        return True

In [None]:

from stable_baselines3.common.vec_env import DummyVecEnv
from src.utils.transformer import linear_schedule
from src.utils.transformer import CustomCombinedExtractor
from stable_baselines3 import PPO
import torch.nn as nn
from stable_baselines3.common.utils import set_random_seed

# Seed used for the global seeding below and by `single_csv_training`
# (environment seeding and the PPO `seed` argument).
BASE_SEED = 42
# NOTE(review): this module-level value is not read by the code visible in this
# notebook; the training call passes its own `number_envs` argument.
number_envs = 4
# Set the Stable-Baselines3 global seed.
set_random_seed(BASE_SEED)


def single_csv_training(csv_file, env_config_file, asset, model_name ='', cf = None, number_envs = 1,
                        total_timesteps=500000, tensorboard_log="/content/drive/MyDrive/data/log"):
    """Train (or continue training) a PPO agent on a single CSV file.

    Args:
        csv_file: Path of the training CSV passed to ``ForexTradingEnv``.
        env_config_file: Path of the configuration file; only read when ``cf``
            is not supplied.
        asset: Symbol name, used for the environment and the run name.
        model_name: Path of a saved model to continue from. When empty a new
            model is created.
        cf: Already loaded configuration object. When ``None`` it is created
            with ``EnvConfig(env_config_file)``.
        number_envs: Number of environment copies inside the ``DummyVecEnv``.
        total_timesteps: Number of environment steps to train for.
        tensorboard_log: Directory for TensorBoard runs; ``None`` disables
            TensorBoard logging.

    The trained model is saved next to the data, with ``split/`` replaced by
    ``model/`` and ``.csv`` replaced by ``_single_test.zip``.
    """
    if cf is None:
        cf = EnvConfig(env_config_file)

    features = cf.env_parameters("observation_list")
    sequence_length = cf.env_parameters("backward_window")
    print(features)

    # NOTE(review): the schedule is built for 1e6 steps while training runs for
    # `total_timesteps`; confirm against `linear_schedule` that this is intended.
    lr_schedule = linear_schedule(3e-4, 1e-5, total_timesteps=1e6)
    policy_kwargs = dict(
        # Project feature extractor applied to the observation window.
        features_extractor_class=CustomCombinedExtractor,
        features_extractor_kwargs=dict(sequence_length=sequence_length),
        # Separate [256, 256] hidden layers for the actor (pi) and the critic (vf).
        # A plain dict is used: wrapping it in a list is the deprecated form.
        net_arch=dict(pi=[256, 256], vf=[256, 256]),
        activation_fn=nn.ReLU,
        # Orthogonal initialisation is switched off (the author found this
        # better for financial data).
        ortho_init=False,
    )

    def make_env():
        """Build one independent environment instance."""
        return ForexTradingEnv(
            csv_file,
            cf,
            asset,
            features=features,
            sequence_length=sequence_length,
            logger_show=True
        )

    env = DummyVecEnv([make_env for _ in range(number_envs)])
    # Seed the vectorised environment with the base seed.
    env.seed(BASE_SEED)

    if model_name:
        model = PPO.load(
            model_name,
            env=env,
            learning_rate=lr_schedule,
            tensorboard_log=tensorboard_log,
        )
    else:
        model = PPO(
            'MlpPolicy',
            env,
            device='auto',  # CUDA when available, otherwise CPU
            verbose=1,
            vf_coef=0.5,
            target_kl=0.05,  # early stopping of the policy update on large KL
            normalize_advantage=True,
            policy_kwargs=policy_kwargs,
            learning_rate=lr_schedule,
            max_grad_norm=0.5,  # gradient clipping
            seed=BASE_SEED,
            # Without a log directory `tb_log_name` is ignored and nothing is
            # written for TensorBoard.
            tensorboard_log=tensorboard_log,
        )

    # Train the agent
    logger.info("Starting model training...")
    callback = TrainingMetricsCallback()
    model.learn(
        total_timesteps=total_timesteps,
        callback=callback,
        tb_log_name=f"{asset}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}",
    )
    logger.info("Model training complete")

    model_filename = csv_file.replace("split/", "model/").replace(".csv", "_single_test.zip")
    model_dir = os.path.dirname(model_filename)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model.save(model_filename)


In [None]:
from src.utils.read_config import EnvConfig

# Training run configuration.
asset = "EURUSD"
env_config_file = '/content/drive/MyDrive/configure.json'
cf = EnvConfig(env_config_file)
split_cfg = cf.data_processing_parameters("train_eval_split")
base_path = split_cfg["base_path"].format(symbol=asset)
# Single quotes inside the f-string: reusing the enclosing quote character in a
# replacement field is a syntax error before Python 3.12.
csv_file = f"{base_path}/{split_cfg['train_dir']}/{asset}_2022_12.csv"
model_name = '' # f'./data/model/{asset}/weekly/{asset}_2023_71'
single_csv_training(csv_file=csv_file, env_config_file =env_config_file, asset= asset, model_name=model_name, cf=cf, number_envs=4)


['mean_std_open', 'mean_std_high', 'mean_std_low', 'mean_std_close', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'london_session', 'ny_session', 'overlap_session', 'macd', 'boll_ub', 'boll_lb', 'rsi_30', 'atr', 'volatility_ratio', 'close_30_sma', 'close_60_sma', 'returns_5', 'returns_24']
Using cpu device




-----------------------------
| time/              |      |
|    fps             | 260  |
|    iterations      | 1    |
|    time_elapsed    | 31   |
|    total_timesteps | 8192 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 14            |
|    iterations           | 2             |
|    time_elapsed         | 1111          |
|    total_timesteps      | 16384         |
| train/                  |               |
|    approx_kl            | 0.004656008   |
|    clip_fraction        | 0.0285        |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | 3.0398369e-05 |
|    learning_rate        | 1.48e-05      |
|    loss                 | 1.42e+05      |
|    n_updates            | 10            |
|    policy_gradient_loss | 0.00102       |
|    std                  | 0.998         |
|    value_loss           | 3.28e+05      

KeyboardInterrupt: 

In [None]:
import numpy as np
import os
import glob
from stable_baselines3 import PPO

def eval(data_directory, env_config_file, model_file, asset, mode='human', save_plot=False, sequence_length=None,
         csv_file=None, deterministic=False):
    """Run a saved PPO model over one CSV file and render the resulting trades.

    Note: the function name shadows the built-in ``eval``; it is kept because
    callers use it.

    Args:
        data_directory: Directory that contains the evaluation CSV files.
        env_config_file: Path of the configuration file.
        model_file: Path of the saved PPO model.
        asset: Symbol name.
        mode: Render mode passed to ``env.render`` after the episode.
        save_plot: Passed on to the environment.
        sequence_length: Observation window length; defaults to the
            ``backward_window`` configuration value.
        csv_file: File to evaluate. Defaults to ``{asset}_2022_14.csv`` inside
            ``data_directory``.
        deterministic: Passed to ``model.predict``.
    """
    cf = EnvConfig(env_config_file)
    features = cf.env_parameters("observation_list")
    if sequence_length is None:
        sequence_length = cf.env_parameters("backward_window")
    print(f"Using sequence_length: {sequence_length}")

    # One device for both loading the model and running the policy.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Honour `data_directory` and `asset` instead of a fixed absolute path.
    if csv_file is None:
        csv_file = os.path.join(data_directory, f"{asset}_2022_14.csv")
    env = ForexTradingEnv(csv_file, cf, asset, features, sequence_length, save_plot=save_plot)

    model = PPO.load(model_file, env=env, device=device)
    model.policy.to(device)
    observation, info = env.reset()

    done = False
    total_buy = 0
    total_sell = 0
    total_rewards = 0
    while not done:
        action, _states = model.predict(observation, deterministic=deterministic)
        print(f"Action: {action}")

        observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        print(f"info: {info}")

        total_rewards += reward
        # Count signals with the same threshold the environment applies.
        action_value = float(np.asarray(action).reshape(-1)[0])
        if action_value >= env.action_threshold:
            total_buy += 1
        elif action_value <= -env.action_threshold:
            total_sell += 1

    env.render(mode = mode)
    print(f'------rewards:{total_rewards}-----buy:{total_buy}--sell:{total_sell}------')

In [None]:
# Evaluation run configuration.
asset = "EURUSD"
env_config_file = '/content/drive/MyDrive/configure.json'
# Presumably the archive written by `single_csv_training` for the 2022_12 CSV —
# confirm that the configured base path resolves to this location.
model_file = f'/content/drive/MyDrive/data/model/{asset}/train/{asset}_2022_12_single_test.zip'
data_directory = f"/content/drive/MyDrive/data/split/{asset}/train"
save_plot = False

# Run the evaluation; 'human' mode is forwarded to `env.render`.
eval(data_directory, env_config_file, model_file, asset, mode='human', save_plot=save_plot)
