In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install stable_baselines3
!pip install gymnasium
!pip install torch
!pip install scikit-learn
!pip install pandas
!pip install numpy
!pip install finta
!pip install mplfinance

In [None]:
import os
import json

import logging
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
# from stable_baselines3.common.callbacks import LearningRateSchedule
from stable_baselines3 import PPO
import gymnasium as gym
from gymnasium import spaces
import torch.nn.functional as F
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import datetime

def linear_schedule(start_lr=3e-4, end_lr=1e-5):
    """Build a linearly decaying learning-rate schedule.

    Args:
        start_lr: Value produced when ``progress_remaining`` is 1.0.
        end_lr: Value produced when ``progress_remaining`` is 0.0.

    Returns:
        A callable mapping ``progress_remaining`` to
        ``end_lr + (start_lr - end_lr) * progress_remaining``.
    """
    def schedule(progress_remaining):
        # Linear interpolation between end_lr (progress 0) and start_lr (progress 1).
        return end_lr + (start_lr - end_lr) * progress_remaining

    return schedule

# Module-level logger used by the functions and classes below. No handler or
# level is configured here, so the logging module's defaults apply.
logger = logging.getLogger(__name__)

# Compute device referenced by the model classes: CUDA when available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class TimeSeriesTransformer(nn.Module):
    """
    A Transformer-based model for time series data.
    This class projects input features to an embedding, adds positional
    encodings, and then processes the inputs using a Transformer encoder.
    Finally, a decoder layer is used to produce the output.
    Args:
        input_size (int): Number of features in the input time series data
            (e.g. price plus indicator columns).
        embed_dim (int): Dimensionality of the learned embedding space. Must be
            divisible by ``num_heads``.
        num_heads (int): Number of attention heads in each Transformer layer.
        num_layers (int): Number of Transformer encoder layers.
        sequence_length (int): Length of the input sequences (time steps). The
            learned positional encoding has exactly this many positions.
        dropout (float, optional): Dropout probability to apply in the
            Transformer encoder layers. Defaults to 0.1.
    Attributes:
        model_type (str): Identifier for the model type ('Transformer').
        embedding (nn.Linear): Linear layer for input feature embedding.
        positional_encoding (torch.nn.Parameter): Learned parameter of shape
            (1, sequence_length, embed_dim), initialised to zeros, used to
            retain temporal information.
        transformer_encoder (nn.TransformerEncoder): Stack of pre-norm,
            batch-first Transformer encoder layers with a final LayerNorm.
        decoder (nn.Linear): Linear layer used to produce the final output
            dimensions.
    Forward Inputs:
        src (torch.Tensor): Input tensor of shape (batch_size, sequence_length,
            input_size).
    Forward Returns:
        torch.Tensor: Output tensor of shape (batch_size, embed_dim) from the
            last time step.
    Raises:
        ValueError: If the model output contains NaN or Inf values, indicating
            numerical instability.
    Note:
        The encoder layers are created with ``batch_first=True`` so that
        attention is computed along the time axis of each sample. Parameter
        names and shapes are unchanged by this flag, so older checkpoints still
        load, but they were trained with attention over the batch axis and
        will not produce the same outputs.
    """

    def __init__(self, input_size, embed_dim, num_heads, num_layers, sequence_length, dropout=0.1):
        super().__init__()
        self.model_type = 'Transformer'
        self.embed_dim = embed_dim

        # Embedding layer to project input features to embed_dim dimensions
        self.embedding = nn.Linear(input_size, embed_dim).to(device)

        # Learned positional encoding, broadcast over the batch dimension
        self.positional_encoding = nn.Parameter(torch.zeros(1, sequence_length, embed_dim).to(device))

        # Transformer encoder layer. batch_first=True is required because the
        # input, the positional encoding and the final `output[:, -1, :]`
        # slice all use the (batch, sequence, feature) layout; with the
        # PyTorch default (batch_first=False) dim 0 would be treated as the
        # sequence axis and samples in a batch would attend to each other.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True,
            norm_first=True  # Apply LayerNorm before attention and feedforward
        ).to(device)
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
            norm=nn.LayerNorm(embed_dim).to(device)  # LayerNorm at the end of the encoder
        )

        # Decoder layer to produce final output
        self.decoder = nn.Linear(embed_dim, embed_dim).to(device)

    def forward(self, src):
        # Apply embedding layer and add positional encoding
        src = self.embedding(src) + self.positional_encoding

        # Pass through the transformer encoder
        output = self.transformer_encoder(src)

        # Pass through the decoder layer
        output = self.decoder(output)

        # Fail loudly on NaN or Inf values rather than propagating them
        if not torch.isfinite(output).all():
            logger.error("Transformer output contains NaN or Inf values")
            raise ValueError("Transformer output contains NaN or Inf values")

        # Return the output from the last time step
        return output[:, -1, :]

class CustomCombinedExtractor(BaseFeaturesExtractor):
    """
    A custom feature extractor that normalizes input observations and processes them
    using a transformer-based architecture for dimensionality reduction and enhanced
    feature representation.
    Parameters:
        observation_space (gym.spaces.Box): Defines the shape and limits of input data.
            ``observation_space.shape[1]`` is used as the number of input features.
        sequence_length (int): The length of the time series to be processed.
    Attributes:
        layernorm_before (nn.LayerNorm): Normalizes input data to improve training stability.
        transformer (TimeSeriesTransformer): Processes normalized input sequences and extracts features.
    Methods:
        forward(observations):
            Applies layer normalization to the incoming observations, then passes them
            through the transformer. Raises a ValueError if invalid values (NaNs or inf)
            are detected in the output.
    """

    def __init__(self, observation_space: gym.spaces.Box, sequence_length):
        # embed_dim must be divisible by num_heads. It is also the size of the
        # feature vector this extractor produces, so features_dim is derived
        # from it instead of being repeated as a separate literal.
        embed_dim = 64
        num_heads = 2
        super().__init__(observation_space, features_dim=embed_dim)
        num_features = observation_space.shape[1]

        # Layer normalization applied before the transformer. It is placed on
        # the same device as the transformer's layers so that the extractor is
        # usable right after construction.
        self.layernorm_before = nn.LayerNorm(num_features).to(device)

        self.transformer = TimeSeriesTransformer(
            input_size=num_features,
            embed_dim=embed_dim,
            num_heads=num_heads,
            num_layers=2,
            sequence_length=sequence_length
        )

    def forward(self, observations):
        # Remember where the caller's tensor lives so the result can be
        # returned on the same device.
        input_device = observations.device

        # Follow the device the module's weights are actually on. The owning
        # policy may move this module after construction (e.g. `.to('cpu')` on
        # a CUDA machine), so the global `device` is not a reliable target.
        module_device = self.layernorm_before.weight.device
        normalized_observations = self.layernorm_before(observations.float().to(module_device))

        x = self.transformer(normalized_observations)
        if not torch.isfinite(x).all():
            logger.error("Invalid values in transformer output")
            raise ValueError("Invalid values in transformer output")

        # No-op when both devices already match.
        return x.to(input_device)

class EnvConfig():
    """Read-only accessor for the JSON configuration file.

    The file is parsed once at construction and kept in ``self.config``.
    Each accessor returns either a whole top-level section or, when a truthy
    ``item`` is given, a single entry of that section.

    Args:
        config_file: Path to the JSON configuration file.
    """

    def __init__(self, config_file):
        self.config = {}
        with open(config_file) as handle:
            self.config = json.load(handle)

    @staticmethod
    def _select(section, item):
        """Return ``section[item]`` for a truthy ``item``, else the section itself."""
        return section[item] if item else section

    def env_parameters(self, item=''):
        """Return the ``"env"`` section, or one entry of it."""
        return self._select(self.config["env"], item)

    def symbol(self, asset="GBPUSD", item=''):
        """Return the configuration of one trading pair.

        Args:
            asset (str, optional): Key under ``"symbol"``. Defaults to "GBPUSD".
            item (str, optional): Entry name; when empty the whole dict for
                ``asset`` is returned. Defaults to ''.

        Returns:
            The asset's dict, or the value stored under ``item``.
        """
        return self._select(self.config["symbol"][asset], item)

    def data_processing_parameters(self, item=''):
        """Return the ``"data_processing"`` section, or one entry of it."""
        return self._select(self.config["data_processing"], item)

    def trading_hour(self, place="New York"):
        """Return the trading-hour entry of one market.

        Args:
            place (str, optional): Key under ``"trading_hour"``. Defaults to
                "New York". A falsy value returns the whole section.

        Returns:
            The entry stored for ``place``, or the whole ``"trading_hour"`` section.
        """
        return self._select(self.config["trading_hour"], place)


In [None]:
import pandas as pd
import logging
from finta import TA
from sklearn.preprocessing import StandardScaler

def patch_missing_data(df, dt_col_name='time', cf=None):
    """Rename raw OHLC(V) columns and fill short gaps on a 5-minute grid.

    The frame is grouped into weeks ending on Friday. Each non-empty week is
    reindexed to a continuous 5-minute UTC index spanning its first to last
    bar, and missing price/volume values are forward-filled for at most 12
    consecutive bars; longer gaps stay NaN.

    Args:
        df (pd.DataFrame): Raw data with 5 columns (datetime, open, high, low,
            close) or 6 columns (the same plus volume), in that order. The
            caller's frame is not modified.
        dt_col_name (str): Name of the datetime column after renaming. If it
            is not present, one of 'time', 'timestamp', 'date', 'datetime' is
            used instead.
        cf: Configuration object exposing ``data_processing_parameters``;
            the keys ``"min_bars_per_week"`` and ``"required_cols"`` are read.

    Returns:
        pd.DataFrame: Patched data with the datetime values as a regular column.

    Raises:
        ValueError: If ``cf`` is missing, the column count is not 5 or 6, or
            required columns are absent.
        KeyError: If no datetime column can be found.
    """
    if cf is None:
        raise ValueError("cf (configuration object) must be provided.")

    min_bars = cf.data_processing_parameters("min_bars_per_week")

    # e.g. ["time", "open", "high", "low", "close"]
    required_cols = cf.data_processing_parameters("required_cols")

    # Work on a copy: the column rename and datetime conversion below would
    # otherwise change the caller's DataFrame in place.
    df = df.copy()

    # A sixth column, when present, is treated as volume.
    if df.shape[1] == 6:
        df.columns = required_cols + ['vol']
    elif df.shape[1] == 5:
        df.columns = required_cols
    else:
        raise ValueError(f"Invalid number of columns: {df.shape[1]} =>{required_cols}")

    logger.warning(f"shape of  column: {df.shape[1]}")
    # 1. Column validation
    if missing := set(required_cols) - set(df.columns):
        raise ValueError(f"Missing columns: {missing}")

    # 2. Auto-detect datetime column
    dt_candidates = {'time', 'timestamp', 'date', 'datetime'}
    if dt_col_name not in df.columns:
        found = list(dt_candidates & set(df.columns))
        if not found:
            raise KeyError(f"No datetime column found. Tried: {dt_candidates}")
        dt_col_name = found[0]
        logger.info(f"Using datetime column: {dt_col_name}")

    # 3. Convert to datetime index
    df[dt_col_name] = pd.to_datetime(df[dt_col_name], utc=True)
    df = df.set_index(dt_col_name).sort_index()

    # Loop-invariant settings, computed once instead of once per week.
    fill_limit = 12  # do not fill gaps longer than 1 hour (12 five-minute bars)
    fill_cols = ['open', 'high', 'low', 'close']
    if 'vol' in df.columns:
        fill_cols.append('vol')

    patched_weeks = []  # one patched frame per non-empty week

    # Week-by-week groups (weeks ending on Friday)
    for w, week_df in df.groupby(pd.Grouper(freq='W-FRI')):
        if week_df.empty:
            continue

        if len(week_df) != min_bars:
            logger.warning(f"Week {w} has {len(week_df)}/{min_bars} bars")

        # Continuous 5-minute index from the week's first to last bar
        new_index = pd.date_range(
            start=week_df.index.min(),
            end=week_df.index.max(),
            freq='5min',
            tz='UTC'
        )

        # Reindex, then forward-fill short gaps from the preceding bar
        week_df = week_df.reindex(new_index)
        week_df[fill_cols] = week_df[fill_cols].ffill(limit=fill_limit)
        patched_weeks.append(week_df)

    # Merge all weeks back together
    if patched_weeks:
        all_df = pd.concat(patched_weeks)
    else:
        all_df = df.copy()

    # Reindexing drops the index name, so the reset column is called 'index'.
    return all_df.reset_index().rename(columns={'index': dt_col_name})


def add_time_feature(df, symbol):
    """Append calendar, cyclical and session columns derived from 'time'.

    Args:
        df (pd.DataFrame): Frame containing a 'time' column.
        symbol: Value written to every row of the new 'symbol' column.

    Returns:
        pd.DataFrame: New frame with 'time' as a UTC datetime column followed
        by the original columns and the added features.

    Raises:
        KeyError: If the 'time' column is missing.
    """
    if 'time' not in df.columns:
        raise KeyError("'time' column missing after patch_missing_data")

    frame = df.set_index('time')
    frame.index = pd.to_datetime(frame.index, utc=True)
    stamps = frame.index

    def _on_circle(values, period):
        # Sine/cosine pair of `values` mapped onto a circle of length `period`.
        angle = 2 * np.pi * values / period
        return np.sin(angle).round(6), np.cos(angle).round(6)

    # Calendar fields
    frame['weekday'] = stamps.dayofweek  # 0=Monday
    frame['day'] = stamps.day
    frame['week'] = stamps.isocalendar().week
    frame['month'] = stamps.month
    frame['year'] = stamps.year
    frame['hour'] = stamps.hour

    # Cyclical encodings: hour of day, and 5-minute block within the hour (0-11)
    frame['hour_sin'], frame['hour_cos'] = _on_circle(frame['hour'], 24)
    frame['minute_block'] = stamps.minute // 5
    frame['minute_sin'], frame['minute_cos'] = _on_circle(frame['minute_block'], 12)

    # Market-session flags by hour (GMT): 1 inside [start, end), else 0
    hour = frame['hour']
    sessions = (
        ('london_session', 8, 16),
        ('ny_session', 13, 21),
        ('overlap_session', 13, 16),
    )
    for column, start, end in sessions:
        frame[column] = ((hour >= start) & (hour < end)).astype(int)

    frame['symbol'] = symbol
    return frame.reset_index()

def tech_indicators(df, cf=None):
    """Add technical indicators, standardize configured columns and clean NaN/Inf.

    Steps:
        1. Copy raw open/high/low/close into ``mean_std_*`` columns so the
           unscaled prices survive the standardization below.
        2. Compute finta indicators (forward-filled), returns and a
           high-low/close volatility ratio.
        3. Standardize the columns listed in the ``"scale_cols"`` config entry.
        4. Replace Inf/NaN with 0, clip numeric columns to [-1e5, 1e5] and
           round them to 6 decimals.

    Args:
        df (pd.DataFrame): Frame with 'open', 'high', 'low', 'close' columns.
            The caller's frame is not modified.
        cf: Configuration object exposing ``data_processing_parameters``;
            the keys ``"indicator_period"`` and ``"scale_cols"`` are read.

    Returns:
        pd.DataFrame: New frame with the added and cleaned columns.

    Raises:
        ValueError: If ``cf`` is not provided.
    """
    if cf is None:
        raise ValueError("cf (configuration object) must be provided.")

    period = cf.data_processing_parameters("indicator_period")
    scale_cols = cf.data_processing_parameters("scale_cols")

    # Work on a copy so the new columns are not written into the caller's frame.
    df = df.copy()

    # 1. Preserve raw prices before normalization
    raw_cols = ['mean_std_open', 'mean_std_high', 'mean_std_low', 'mean_std_close']
    df[raw_cols] = df[['open', 'high', 'low', 'close']].copy()

    # 2. Indicators
    # NOTE: the 'macd' column stores the MACD *signal* line.
    df['macd'] = TA.MACD(df).SIGNAL.ffill().round(6)
    bb = TA.BBANDS(df)
    df['boll_ub'] = bb['BB_UPPER'].ffill()
    df['boll_lb'] = bb['BB_LOWER'].ffill()

    df['rsi_30'] = TA.RSI(df, period=period).ffill()
    df['dx_30'] = TA.ADX(df, period=period).ffill()
    df['close_30_sma'] = TA.SMA(df, period=period).ffill()
    df['close_60_sma'] = TA.SMA(df, period=period * 2).ffill()
    df['atr'] = TA.ATR(df, period=period).ffill()

    # Returns and volatility ratio
    df['returns_5'] = df['close'].pct_change(5, fill_method=None).round(6)
    df['returns_24'] = df['close'].pct_change(24, fill_method=None).round(6)
    # Round the ratio itself; previously `.round(6)` bound only to `close`
    # because attribute access binds tighter than division.
    df['volatility_ratio'] = ((df['high'] - df['low']) / df['close']).round(6)

    # 3. Normalize. Infinities (e.g. from a division by a zero close) are
    # turned into NaN first: StandardScaler skips NaN when fitting but
    # rejects infinite input.
    df = df.replace([np.inf, -np.inf], np.nan)
    scaler = StandardScaler()
    df[scale_cols] = scaler.fit_transform(df[scale_cols])
    df = df.fillna(0)

    # 4. Clip and round numeric columns only
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].clip(lower=-1e5, upper=1e5).round(6)
    return df
class TimeSeriesScaler:
    """
    Holds one MinMaxScaler for the ``mean_std_*`` price columns.

    ``fit_and_transform`` fits the scaler on a frame and scales it;
    ``transform`` reuses the fitted scaler on later frames. Both return a
    scaled copy and leave their argument untouched.
    """

    def __init__(self):
        self.price_cols = ['mean_std_open', 'mean_std_high', 'mean_std_low', 'mean_std_close']
        # MinMaxScaler with default settings
        self.scaler = MinMaxScaler()
        self.is_fitted = False

    def _scaled_copy(self, df, operation):
        """Return a copy of ``df`` whose price columns went through ``operation``."""
        result = df.copy()
        result[self.price_cols] = operation(result[self.price_cols])
        return result

    def fit_and_transform(self, df):
        """Fit the scaler on ``df``'s price columns and return the scaled copy."""
        logger.info("Fitting Scaler on current week data (TRAIN set base)")
        scaled = self._scaled_copy(df, self.scaler.fit_transform)
        # Only flagged as fitted once fitting succeeded.
        self.is_fitted = True
        return scaled

    def transform(self, df):
        """Scale ``df``'s price columns with the already fitted scaler.

        Raises:
            ValueError: If the scaler has not been fitted yet.
        """
        if not self.is_fitted:
            raise ValueError("Scaler must be fitted on the training data first!")

        logger.info("Transforming current week data using fitted scaler.")
        return self._scaled_copy(df, self.scaler.transform)


def split_time_series_v2(df, symbol='EURUSD', cf=None, scaler_manager=None):
    """
    Split data into weekly CSV files (weeks ending on Friday), prepend a
    lookback context taken from the previous week, and scale each chunk.

    For every non-empty week:
        * the last ``sequence_length`` rows of the previous non-empty week are
          prepended as context (none for the first week, or when
          ``sequence_length`` is 0);
        * the week is flagged "eval" when the first row of the week itself
          (context excluded) has a NaN or a zero in any indicator column;
        * while the scaler is not fitted, eval weeks are skipped; the first
          non-eval week fits the scaler (on the chunk including context) and
          is saved as train;
        * afterwards every week is transformed and saved to the eval or train
          directory according to its flag.

    Args:
        df (pd.DataFrame): Input data with a 'time' column or a datetime
            index, plus the indicator columns. The caller's frame is not
            modified.
        symbol (str): Trading symbol, used in the output path and file names.
        cf (object): Configuration manager; the ``"train_eval_split"`` entry
            (``base_path``, ``train_dir``, ``eval_dir``) and the
            ``"sequence_length"`` entry are read.
        scaler_manager (object): TimeSeriesScaler-like instance providing
            ``is_fitted``, ``fit_and_transform`` and ``transform``.

    Raises:
        ValueError: If ``scaler_manager`` or ``cf`` is missing, or ``df`` has
            neither a 'time' column nor a datetime index.
    """
    if scaler_manager is None:
        raise ValueError("scaler_manager (TimeSeriesScaler instance) must be provided.")
    if cf is None:
        raise ValueError("cf (configuration object) must be provided.")

    split_cfg = cf.data_processing_parameters("train_eval_split")
    base_path = split_cfg["base_path"].format(symbol=symbol)

    sequence_length = cf.data_processing_parameters("sequence_length")

    # Index by UTC timestamps. A new frame is built so the caller's 'time'
    # column is not overwritten.
    if 'time' in df.columns:
        timestamps = pd.to_datetime(df['time'], utc=True)
        df = df.drop(columns='time').set_index(timestamps)
    elif not pd.api.types.is_datetime64_any_dtype(df.index):
        raise ValueError("DataFrame must have a 'time' column or a datetime index.")

    # Indicator columns inspected to decide train vs. eval
    indicator_cols = ['macd', 'boll_ub', 'boll_lb', 'rsi_30', 'dx_30', 'close_30_sma', 'close_60_sma', 'atr']

    prev_week_df = None  # previous non-empty week, without its own context

    # 'W-FRI' groups rows into weeks that end on Friday.
    for week_start, week_df in df.groupby(pd.Grouper(freq='W-FRI')):
        if week_df.empty:
            continue

        # The current week becomes the context source for the next iteration,
        # whether or not it ends up being saved.
        context_source, prev_week_df = prev_week_df, week_df

        # 1. Prepend the lookback context. The explicit `> 0` check matters:
        # `iloc[-0:]` would select the *entire* previous week.
        if context_source is not None and sequence_length > 0:
            context_df = context_source.iloc[-sequence_length:]
            current_chunk = pd.concat([context_df, week_df])
        else:
            current_chunk = week_df.copy()

        # 2. Decide train/eval from the week's own first row (context excluded)
        first_row = week_df[indicator_cols].iloc[0]
        is_eval = bool(first_row.isna().any() or (first_row == 0).any())

        # 3. Scale the chunk (context included)
        if not scaler_manager.is_fitted and not is_eval:
            # First week with complete indicators: fit the scaler on it.
            week_df_transformed = scaler_manager.fit_and_transform(current_chunk)
            dir_type = 'train'
        elif scaler_manager.is_fitted:
            week_df_transformed = scaler_manager.transform(current_chunk)
            dir_type = 'eval' if is_eval else 'train'
        else:
            # Not fitted yet and indicators incomplete: nothing to save.
            logger.warning(f"Skipping {week_start}: Indicators not ready for fitting and not fitted yet.")
            continue

        # 4. Save to the train or eval directory
        path = os.path.join(base_path, split_cfg[f"{dir_type}_dir"])
        os.makedirs(path, exist_ok=True)

        iso_year, iso_week, _ = week_start.isocalendar()
        fname = f"{symbol}_{iso_year}_{iso_week:02d}.csv"

        week_df_transformed.reset_index().to_csv(os.path.join(path, fname), index=False)
        logger.critical(f"Saved {dir_type} file: {fname} (Total rows: {len(week_df_transformed)})")


In [None]:
# Data-preparation driver cell: load raw M5 bars for one symbol, patch gaps,
# add features, then write the weekly train/eval CSV files.
symbol = 'EURUSD'
file = f'/content/drive/MyDrive/data/raw/{symbol}_M5.csv'
# 1. Load & clean
raw = pd.read_csv(file)
cf = EnvConfig('/content/drive/MyDrive/configure.json')
df = patch_missing_data(raw,cf=cf)
# 2. Feature engineering
df = add_time_feature(df, symbol=symbol)
df = tech_indicators(df, cf=cf)

# 3. Split & save
# Build the scaler instance that is shared across all weekly chunks
scaler_manager = TimeSeriesScaler()

# Run the weekly split; files are written to the paths given in the config
split_time_series_v2(df, symbol=symbol, cf=cf, scaler_manager=scaler_manager)


In [None]:
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np
class TrainingMetricsCallback(BaseCallback):
    """Collect per-episode Sharpe ratio and drawdown and log rolling means.

    An ``info`` dict is counted as a finished episode when it contains both a
    ``"sharpe"`` and a ``"drawdown"`` key. Every 10 counted episodes the mean
    of the last 10 values of each metric is recorded on the SB3 logger.

    Args:
        check_freq (int): Stored on the instance; not used by this callback.
        verbose (int): Verbosity level forwarded to ``BaseCallback``.

    Attributes:
        sharpe_ratios (list): Every Sharpe ratio seen so far.
        drawdowns (list): Every drawdown seen so far.
        episode_count (int): Number of episodes counted so far.
    """

    def __init__(self, check_freq=1000, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.sharpe_ratios = []
        self.drawdowns = []
        self.episode_count = 0

    def _on_step(self) -> bool:
        """Record metrics from every sub-environment's info dict; never stops training."""
        # `infos` holds one dict per sub-environment. All of them are
        # inspected so that episodes finishing in environments other than the
        # first are not lost when training on a vectorized env.
        for info in self.locals.get('infos', ()):
            if "sharpe" not in info or "drawdown" not in info:
                continue

            self.episode_count += 1
            self.sharpe_ratios.append(info['sharpe'])
            self.drawdowns.append(info['drawdown'])

            # Log every 10 episodes
            if self.episode_count % 10 == 0:
                self.logger.record('train/mean_sharpe', np.mean(self.sharpe_ratios[-10:]))
                # NOTE(review): the key says "max" but the value is the mean
                # of the last 10 drawdowns — confirm which one is intended.
                self.logger.record('train/max_drawdown', np.mean(self.drawdowns[-10:]))
                self.logger.record('train/episodes', self.episode_count)

        return True


In [None]:
def render_to_file(**kwargs):
    """Append closed-transaction records to a log file and optionally print them.

    Keyword Args:
        log_header (bool): When True, emit the column header lines. Default False.
        log_filename (str): File to append the comma-separated log to. Missing
            parent directories are created. When empty, nothing is written.
        printout (bool): When True, print the fixed-width table (and
            ``done_information``, if any) to stdout. Default False.
        transaction_close_this_step (list[dict]): Transactions to log; only
            entries with ``CloseStep >= 0`` are written.
        done_information (str): Extra text appended after the records.
        balance, balance_initial: Accepted for call compatibility; not used.

    Returns:
        None
    """
    log_header = kwargs.get("log_header", False)
    log_filename = kwargs.get("log_filename", "")
    printout = kwargs.get("printout", False)
    transaction_close_this_step = kwargs.get("transaction_close_this_step", [])
    done_information = kwargs.get("done_information", "")

    _header = ""
    _header_comma = ""
    if log_header:
        # The backslash continuations are part of the string literals: the
        # leading whitespace of the continued lines ends up in the output.
        _header = f'{"Ticket":>8} {"Type":>4} {"ActionStep":16} \
                    {"ActionPrice":>12} {"CloseStep":8} {"ClosePrice":>12} \
                    {"OpenBal":>12} {"CloseBal":>12} {"Status":8} {"Info":>8} {"PIPS":>6} {"SL":>6} {"PT":>6} {"DeltaStep":8}\n'

        _header_comma = f'{"Ticket,Type,ActionTime,ActionStep,ActionPrice,CloseTime,ClosePrice, OpenBal, CloseBal, Status, Info, PIPS,SL,PT,CloseStep,DeltaStep"}\n'

    table_rows = []  # fixed-width rows for stdout
    csv_rows = []    # comma-separated rows for the log file
    for _tr in transaction_close_this_step or []:
        if _tr["CloseStep"] < 0:
            continue  # skip entries whose CloseStep is negative
        table_rows.append(f'{_tr["Ticket"]:>8} {_tr["Type"]:>4} {_tr["ActionStep"]:16} \
                    {_tr["ActionPrice"]:.5f} {_tr["CloseStep"]:8} {_tr["ClosePrice"]:.5f} \
                    {_tr["OpenBal"]:.2f} {_tr["CloseBal"]:.2f} {_tr["Status"]:8}  {_tr["Info"]:>8}  {_tr["PIPS"]:4.0f} {_tr["SL"]:4.0f} {_tr["PT"]:4.0f} {_tr["DeltaStep"]:8}\n')

        csv_rows.append(f'{_tr["Ticket"]},{_tr["Type"]},{_tr["ActionTime"]},{_tr["ActionStep"]}, \
                    {_tr["ActionPrice"]},{_tr["CloseTime"]},{_tr["ClosePrice"]}, \
                    {_tr["OpenBal"]},{_tr["CloseBal"]}, {_tr["Status"]},{_tr["Info"]},{_tr["PIPS"]},{_tr["SL"]},{_tr["PT"]},{_tr["CloseStep"]},{_tr["DeltaStep"]}\n')

    log = _header_comma + "".join(csv_rows)
    if done_information:
        log += done_information

    # Only touch the filesystem when there is both content and a target file.
    if log and log_filename:
        dir_path = os.path.dirname(log_filename)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(log_filename, 'a+') as _f:
            _f.write(log)

    tr_lines = _header + "".join(table_rows)
    if printout and tr_lines:
        print(tr_lines)
        if done_information:
            print(done_information)

In [None]:
class ForexTradingEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, file, cf, asset, features, sequence_length=24, logger_show=False, save_plot=False):
        """Build the environment from a CSV file and a configuration object.

        Args:
            file: Path of the CSV file holding the market data.
            cf: Configuration object queried through ``env_parameters`` and
                ``symbol``.
            asset: Symbol name used for the per-symbol configuration lookups.
            features: Column names that make up one observation row.
            sequence_length: Number of past rows contained in one observation.
            logger_show: When true, the collected step messages are logged at
                the end of an episode.
            save_plot: Stored on the instance as ``self.save_plot``.
        """
        super().__init__()

        # 1) load data and configuration, 2) declare the spaces, 3) start a
        # first episode so the instance is immediately usable.
        self._initialize_parameters(
            file,
            cf,
            asset,
            features,
            sequence_length,
            logger_show,
            save_plot,
        )
        self._initialize_spaces()
        self.reset()

    # Initialise the instance variables.
    def _initialize_parameters(self, file, cf, asset, features, sequence_length, logger_show, save_plot):
        # Params to variables
        self.csv_file               =   file
        self.cf                     =   cf
        self.symbol_col             =   asset
        self.features               =   features
        self.sequence_length        =   sequence_length
        self.logger_show            =   logger_show
        self.save_plot              =   save_plot

        self.data                   =   pd.read_csv(file)
        # We use sequence transformer, so max steps will be this
        self.max_steps              =   len(self.data) - self.sequence_length - 1

        # Configs to variables
        # Agent က Action က Continuous Action ကို Discrete Action သို့ပြောင်းပေးသော threshold
        self.action_threshold       =   self.cf.env_parameters('action_threshold')
        self.balance_initial        =   self.cf.env_parameters('balance')

        # position close မဖြစ်သေးရင်
        # buy ထားပြီး price up ဖြစ်နေရင် reward ပေး။ sell ထားပြီး price down ဖြစ်နေရင် reward ပေး
        # position management မှာလည်း သုံး။
        # buy မှာ မြတ်နေရင် tp အပေါ်ရွေ့ sl အပေါ်ရွေ့။  ရှုံးနေရင် tp အောက်ရွေ့ sl အပေါ်တင်,
        # sell မှာ မြတ်နေရင် tp အောက်ရွေ့ sl အောက်ရွေ့။ ရှုံးနေရင် tp အပေါ်တင် sl အောက်ချ
        self.good_position_reward_scale = self.cf.env_parameters("good_position_reward_scale") # ဥပမာ: 0.01
        # ရည်ရွယ်ချက် ၂: SL/PT Trailing အတွက် တန်ဖိုး (Move Step Size)
        self.trailing_distance = self.cf.env_parameters("trailing_stop_distance_points")

        # အရှုံးနဲ့အမြတ် မျှတမှုရှိတဲ့ trading performance အတွက် ပေးတဲ့ bonus reward 0.01
        # self.consistency_reward = self.cf.env_parameters("consistency_reward")
        self.stop_loss = self.cf.symbol(self.symbol_col, "stop_loss_max")
        self.profit_taken = self.cf.symbol(self.symbol_col, "profit_taken_max")
        self.point = self.cf.symbol(self.symbol_col, "point")
        self.transaction_fee = self.cf.symbol(self.symbol_col, "transaction_fee")
        self.over_night_penalty = self.cf.symbol(self.symbol_col, "over_night_penalty")
        self.max_current_holding = self.cf.symbol(self.symbol_col, "max_current_holding")
        # Drawdown Penalty Factor
        self.drawdown_penalty_factor = self.cf.env_parameters("drawdown_penalty_factor")
        self.margin_requirement = self.cf.env_parameters('margin_requirement')


    # Define the action and observation spaces.
    def _initialize_spaces(self):
        """Declare the action and observation spaces.

        The action is a single continuous value in ``[-1, 1]``; it is mapped
        to BUY / HOLD / SELL by ``_get_action_name`` using
        ``self.action_threshold``. An observation is a window of
        ``sequence_length`` rows with one column per configured feature.
        """
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

        # One row per past candle in the window, one column per feature.
        window_shape = (self.sequence_length, len(self.features))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=window_shape, dtype=np.float32)

    # Reset the environment to its initial state.
    def reset(self, *, seed = None, options = None):
        """Start a new episode.

        Clears the positions, restores the initial balance, restarts the
        equity and drawdown tracking and moves the cursor to the first step
        that has a full observation window behind it.

        Args:
            seed: Forwarded to the parent class ``reset``.
            options: Forwarded to the parent class ``reset``.

        Returns:
            tuple: ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed, options=options)

        # Trade bookkeeping.
        self.ticket_id = 0
        self.ttl_rewards = 0  # running sum of step rewards
        self.positions = []
        self.balance = self.balance_initial

        # Equity tracking: the curve and its peak both start at the initial balance.
        self.equity_curve = [self.balance_initial]
        self.peak_equity = self.balance_initial
        self.max_drawdown = 0.0
        self.current_drawdown = 0.0

        # The first observation needs ``sequence_length`` rows of history.
        self.current_step = self.sequence_length
        logger.info(f"--- Environment reset. Starting at step {self.current_step} --total rewards: {self.ttl_rewards}")

        return self._next_observation(), {}


    # Prepares the observation data that represents the current market condition for the model.
    def _next_observation(self):

        # သင့်တော်တဲ့ obs Historical Data ယူခြင်း
        obs = self.data.iloc[
            self.current_step - self.sequence_length: self.current_step
        ][self.features].values

        # NumPy array → PyTorch tensor ပြောင်းမယ်
        # Data type ကို float32 လုပ်မယ်
        # GPU/CPU device ပေါ်ကို ရွှေ့မယ်
        obs = torch.tensor(obs, dtype=torch.float32).to(device)

        # Data Validation စစ်ဆေးခြင်း
        # NaN (Not a Number) values ရှိမရှိစစ်မယ်
        # Infinite values ရှိမရှိစစ်မယ်
        # Invalid data ရှိရင် error ပြမယ်
        if torch.isnan(obs).any() or torch.isinf(obs).any():
            logger.error(f"Invalid observation at step {self.current_step}")
            raise ValueError(f"Invalid observation at step {self.current_step}")

        # NumPy Array ပြန်ပြောင်းခြင်း
        # GPU memory → CPU memory ပြန်ရွှေ့မယ်
        # PyTorch tensor → NumPy array ပြန်ပြောင်းမယ်
        # Gym environment က NumPy arrays ကို ပိုကြိုက်တယ်။ Memory management အတွက် ကောင်းတယ်
        return obs.cpu().numpy()  # obs



    def _get_action_name(self, _action):
        """Convert continuous action to discrete action name"""
        if _action >= self.action_threshold:
            return "BUY"
        elif _action <= -self.action_threshold:
            return "SELL"
        else:
            return "HOLD"

    def step(self, action):
        """Advance the environment by one bar.

        The method settles every open position against the current bar,
        optionally opens a new position from ``action``, moves to the next
        bar, records the equity and subtracts the drawdown penalty from the
        step reward.

        Args:
            action: Sequence whose first element is the continuous action
                value (for example ``[0.75]``).

        Returns:
            tuple: ``(observation, reward, terminated, truncated, info)``.
            ``info`` is empty while the episode is running. When the episode
            ends it holds the collected messages under ``"info"`` together
            with ``"sharpe"``, ``"drawdown"``, ``"current_equity"``,
            ``"peak_equity"`` and ``"equity_curve_length"``.
        """
        bar = self.data.iloc[self.current_step]
        _c, _t, _day = bar['close'], bar['time'], bar['day']

        reward = 0  # total reward of this step
        position_reward = 0  # reward of the last evaluated position (only logged)
        # Holding is neither rewarded nor penalised; the value is kept because
        # it is part of the per-step log line.
        action_hold_reward = 0

        _msg = []
        _action = action[0]

        # Settle the open positions and count the ones that stay open.
        open_position = 0
        for position in self.positions:
            if position['Status'] != 0:
                continue
            position_reward, closed, position_msg = self._calculate_reward(position)
            # Accumulate the messages; assigning them would drop the messages
            # of every position except the last one.
            _msg.extend(position_msg)
            if not closed:
                open_position += 1
            reward += position_reward

        action_name = self._get_action_name(_action)

        if open_position < self.max_current_holding and action_name in ('BUY', 'SELL'):
            self.ticket_id += 1

            # Withhold the margin while the position is open; it is returned
            # to the balance when the position is closed.
            self.balance -= self.margin_requirement

            position = {
                "Ticket": self.ticket_id,
                "Symbol": self.symbol_col,
                "ActionTime": _t,
                "Type": action_name,
                "Lot": 1,
                "ActionPrice": _c,
                "SL": self.stop_loss,
                "PT": self.profit_taken,
                "MaxDD": 0,
                "Swap": 0.0,
                "CloseTime": "",
                "ClosePrice": 0.0,
                "Point": self.point,
                "Reward": self.transaction_fee,
                "DateDuration": _day,
                "Status": 0,  # 0 = open
                "PIPS": 0,
                "ActionStep": self.current_step,
                "CloseStep": -1,  # -1 = not closed yet
                "DeltaStep": 0,
                "OpenBal": self.balance,
                "CloseBal": 0,
                "HighestPrice": _c,
                "LowestPrice": _c,
            }
            self.positions.append(position)
            # No transaction-fee penalty is applied to the reward on opening.
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]} {position["Type"]} Rwd:{position["PIPS"]} SL:{position["SL"]} PT:{position["PT"]}')

        reward += action_hold_reward

        # Move to the next bar.
        self.current_step += 1

        terminated = (self.balance <= 0)
        truncated = (self.current_step > self.max_steps)

        obs = self._next_observation()
        _msg.append(f'---idle----step:{self.current_step}, RF:{action_name} Action:{_action} Balance: {self.balance} reward:{reward} total_rewards:{self.ttl_rewards} position_reward:{position_reward} action_hold_reward:{action_hold_reward}')

        # Record the equity first; the drawdown is derived from the curve.
        current_equity = self._calculate_current_equity()
        self.equity_curve.append(current_equity)
        self._calculate_drawdown()

        # Drawdown penalty: the current drawdown (a fraction of the peak
        # equity) scaled by the configured factor.
        drawdown_penalty = self.current_drawdown * self.drawdown_penalty_factor
        reward -= drawdown_penalty
        _msg.append(f'Drawdown Penalty: -{drawdown_penalty:.4f} (DD:{self.current_drawdown:.4f})')

        # The running total is updated only after the penalty was applied.
        self.ttl_rewards += reward

        info = {}

        if terminated or truncated:
            buy_positions = [p for p in self.positions if p["Type"] == "BUY"]
            sell_positions = [p for p in self.positions if p["Type"] == "SELL"]

            buy_count = len(buy_positions)
            sell_count = len(sell_positions)
            total_positions = len(self.positions)

            # A position counts as a win when it closed with positive PIPS.
            buy_wins = sum(1 for p in buy_positions if p["PIPS"] > 0)
            sell_wins = sum(1 for p in sell_positions if p["PIPS"] > 0)

            buy_win_rate = buy_wins / buy_count if buy_count > 0 else 0
            sell_win_rate = sell_wins / sell_count if sell_count > 0 else 0

            _m = f'--- Positions: {total_positions} (Buy:{buy_count}, Sell:{sell_count}) | '
            _m += f'WinRates: Buy:{buy_win_rate:.1%}, Sell:{sell_win_rate:.1%} | '
            _m += f'TotalRewards: {self.ttl_rewards} Balance: {self.balance}'

            logger.info(_m)
            _msg.append(_m)

            if self.logger_show:
                for _m in _msg:
                    logger.info(_m)

            info["info"] = _msg
            info["sharpe"] = self._calculate_sharpe()
            info["drawdown"] = self.max_drawdown
            info["current_equity"] = current_equity
            info["peak_equity"] = self.peak_equity
            info["equity_curve_length"] = len(self.equity_curve)

        return obs, reward, terminated, truncated, info




    def _calculate_reward(self, position):
        _o, _h, _l, _c, _t, _day    =   self.data.iloc[self.current_step][['open', 'high', 'low', 'close', 'time', 'day']]
        _msg                        =   []

        entry_price                 =   position['ActionPrice']
        direction                   =   position['Type']
        profit_target_price         =   entry_price + position['PT']/ self.point if direction == 'BUY' else entry_price - position['PT']/self.point
        stop_loss_price             =   entry_price + position['SL']/ self.point if direction == 'BUY' else entry_price - position['SL']/self.point
        closed                      =   False
        close_position_reward       =   0.0
        good_position_reward        =   0.0

        # Check for stoploss hit
        if (direction == 'BUY' and _l <= stop_loss_price) or (direction == 'SELL' and _h >= stop_loss_price):
            close_position_reward   =   position['SL'] # position sl က minus value ဖြစ်တယ်

            position['CloseTime']   =   _t
            position['ClosePrice']  =   stop_loss_price
            position['Status']      =   1   # Status က open ဆို 0 close ဆို 1
            position['CloseStep']   =   self.current_step
            position['PIPS']        =   close_position_reward - self.transaction_fee
            position['DeltaStep']   =   self.current_step - position['ActionStep']
            position['Info']        =   f'{profit_target_price:.5f} | {stop_loss_price:.5f}'

            self.balance            +=  self.margin_requirement + position['PIPS'] # return 100 is margin hold
            position['CloseBal']    =   self.balance
            closed                  =   True
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: Rwd:{position["PIPS"]}, SL:{position["SL"]}, DeltaStep:{position["DeltaStep"]}')

        elif (direction == 'BUY' and _h >= profit_target_price) or (direction == 'SELL' and _l <= profit_target_price):
            close_position_reward   =    position['PT'] # position tp က plus value ဖြစ်တယ်

            position['CloseTime']   =   _t
            position['ClosePrice']  =   profit_target_price
            position['Status']      =   2   # Status က open ဆို 0 close ဆို 1
            position['CloseStep']   =   self.current_step
            position['PIPS']        =   close_position_reward - self.transaction_fee
            position['DeltaStep']   =   self.current_step - position['ActionStep']
            position['Info']        =   f'{profit_target_price:.5f} | {stop_loss_price:.5f}'

            self.balance            +=  self.margin_requirement + position['PIPS'] # return 100 is margin hold
            position['CloseBal']    =   self.balance
            closed                  =   True
            _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: Rwd:{position["PIPS"]}, SL:{position["SL"]}, DeltaStep:{position["DeltaStep"]}')

        else:
            if self.current_step + 5 + self.sequence_length >= len(self.data):
                close_position_reward   =   (_c - position["ActionPrice"] if direction == 'BUY' else position["ActionPrice"] - _c)* self.point

                position['CloseTime']   =   _t
                position['ClosePrice']  =   _c
                position['Status']      =   3   # Status က open ဆို 0 close ဆို 1, force close 2
                position['CloseStep']   =   self.current_step
                position['PIPS']        =   close_position_reward - self.transaction_fee
                position['DeltaStep']   =   self.current_step - position['ActionStep']
                position['Info']        =   f'{profit_target_price:.5f} | {stop_loss_price:.5f}'
                self.balance            +=  self.margin_requirement + position["PIPS"] # return 100 is margin hold
                position['CloseBal']    =   self.balance

                closed                  =   True
                _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: Rwd:{position["PIPS"]}, Cls:End, DeltaStep:{position["DeltaStep"]}')

            else:
                # =========================================================================
                # Real Trailing Stop Logic (အမြင့်ဆုံး ရောက်ဖူးသော ဈေးနှုန်းကို မှတ်တမ်းတင်ခြင်း)
                # =========================================================================
                # 1. Highest/Lowest Price Update

                if direction == "BUY":
                  # Buy position အတွက် အမြင့်ဆုံး ရောက်ဖူးသော ဈေးနှုန်းကို မှတ်တမ်းတင်
                  if _c > position["HighestPrice"]:
                      position["HighestPrice"] = _c

                  # 2. New SL Target Price (Trailing Price) ကို တွက်ချက်ခြင်း
                  # New_SL_Price = HighestPrice - (Trailing Distance Pips ကို Price Change သို့ ပြောင်း)
                  trailing_price = position["HighestPrice"] - self.trailing_distance / self.point

                  # 3. SL ကို အဆင့်မြှင့်တင်ခြင်း
                  # လက်ရှိ SL ထက် ပိုကောင်းမှသာ ရွေ့ပါ
                  if trailing_price > stop_loss_price:

                      stop_loss_price = trailing_price
                      # SL_Price အသစ်ကို Points သို့ ပြန်ပြောင်းပြီး position['SL'] ကို အပ်ဒိတ်လုပ်ပါ
                      position["SL"] = (stop_loss_price - entry_price) * self.point
                      if position["SL"] > 0:
                          position["SL"]    =   -abs(position["SL"])
                      trailing_happened = True
                  else:
                      trailing_happened = False


                elif direction == "SELL":
                  # Sell position အတွက် အနိမ့်ဆုံး ရောက်ဖူးသော ဈေးနှုန်းကို မှတ်တမ်းတင်
                  if _c < position["LowestPrice"]:
                      position["LowestPrice"] = _c

                  # New SL Target Price (Trailing Price) ကို တွက်ချက်ခြင်း
                  trailing_price = position["LowestPrice"] + self.trailing_distance / self.point

                  # SL ကို အဆင့်မြှင့်တင်ခြင်း
                  if trailing_price < stop_loss_price:
                      stop_loss_price = trailing_price
                      # SL_Price အသစ်ကို Points သို့ ပြန်ပြောင်းပြီး position['SL'] ကို အပ်ဒိတ်လုပ်ပါ
                      position["SL"] = (entry_price - stop_loss_price) * self.point
                      if position["SL"] > 0:
                          position["SL"]    =   -abs(position["SL"])
                      trailing_happened = True
                  else:
                      trailing_happened = False

                # =========================================================================
                # Reward Logic (Trailing လုပ်ခြင်းအတွက် Bonus ပေးခြင်း)
                # =========================================================================
                # Reward Sign ကို ယခင်အတိုင်း တွက်ပါ။
                delta = _c - entry_price
                if direction == "BUY":
                    reward_sign = 1 if delta >= 0 else -1
                elif direction == "SELL":
                    reward_sign = -1 if delta >= 0 else 1

                good_position_reward = reward_sign * self.good_position_reward_scale

                # Trailing အမှန်တကယ် ဖြစ်သွားမှသာ Bonus Reward ကို ပေးပါ
                if trailing_happened:
                    good_position_reward += 0.001

                position['Info']        =   f'{profit_target_price:.5f} | {stop_loss_price:.5f}'
                position['CloseBal']    =   self.balance
                _msg.append(f'Step:{self.current_step} Tkt:{position["Ticket"]}: NO_Close, PT:{position["PT"]}, SL:{position["SL"]}')

        return close_position_reward + good_position_reward, closed, _msg


    def _calculate_sharpe(self, risk_free_rate=0.0):
        """Calculate Sharpe ratio for the current episode"""
        if len(self.equity_curve) < 2:
            return 0.0

        returns = np.diff(self.equity_curve) / self.equity_curve[:-1]

        if np.std(returns) == 0:
            return 0.0

        sharpe = (np.mean(returns) - risk_free_rate) / np.std(returns)
        return float(sharpe * np.sqrt(288))  # Annualized (5-min bars → 288/day)

    def _calculate_drawdown(self):
        """Update max drawdown during episode"""
        current_equity          =   self.equity_curve[-1]
        self.peak_equity        =   max(self.peak_equity, current_equity)
        self.current_drawdown   =   (self.peak_equity - current_equity) / self.peak_equity
        self.max_drawdown       =   max(self.max_drawdown, self.current_drawdown)


    def _calculate_current_equity(self):
        """Calculate total current equity (balance + unrealized P/L)"""
        total_equity = self.balance  # Start with cash balance

        # Add unrealized P/L from open positions
        for position in self.positions:
            if position['Status'] == 0:  # Only open positions
                current_price = self.data.iloc[self.current_step]["close"]
                entry_price = position['ActionPrice']

                if position['Type'] == 'BUY':
                    unrealized_pnl = (current_price - entry_price) * self.point
                else:  # Sell
                    unrealized_pnl = (entry_price - current_price) * self.point

                total_equity += unrealized_pnl

        return total_equity

    def render(self, mode='human', title=None, **kwargs):
        """Write the transaction log through ``render_to_file``.

        Args:
            mode: ``'human'`` writes the log file and also prints it;
                ``'file'`` only writes the log file. Any other value does
                nothing.
            title: Accepted but not used.
            **kwargs: Accepted but not used.
        """
        if mode not in ('human', 'file'):
            return

        # The log file mirrors the CSV path with "split/" swapped for "log/".
        render_to_file(
            log_header=True,
            log_filename=self.csv_file.replace("split/", "log/"),
            printout=(mode == 'human'),
            balance=self.balance,
            balance_initial=self.balance_initial,
            transaction_close_this_step=self.positions,
            done_information=False,
        )

In [None]:
import os
import datetime
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import PPO
import torch.nn as nn
from stable_baselines3.common.utils import set_random_seed
# Assume logger is defined elsewhere, e.g., import logging; logger = logging.getLogger(__name__)

# Base seed; single_csv_training adds its week number to it.
BASE_SEED = 42
# NOTE(review): not referenced elsewhere in the visible code; the training
# calls pass number_envs explicitly.
number_envs = 4
# Set the global seed through Stable-Baselines3.
set_random_seed(BASE_SEED)


def single_csv_training(csv_file, env_config_file, asset, model_name='', cf=None, number_envs=1, week_num=0,
                        total_timesteps=100000):
    """Train (or continue training) a PPO agent on one CSV file and save it.

    Args:
        csv_file: Path of the training CSV. The model is saved next to it with
            ``"split/"`` replaced by ``"model/"`` and the suffix
            ``"_single_test.zip"``.
        env_config_file: Path of the configuration file; only used to build
            the configuration when ``cf`` is not given.
        asset: Symbol name passed to the environment.
        model_name: Path of a saved model to continue from. An empty string
            creates a new model.
        cf: Configuration object. When ``None`` it is built with
            ``EnvConfig(env_config_file)``.
        number_envs: Number of parallel environments.
        week_num: Added to ``BASE_SEED`` so each incremental run uses a
            different seed.
        total_timesteps: Number of timesteps passed to ``model.learn``.
    """
    if cf is None:
        cf = EnvConfig(env_config_file)

    # TensorBoard root directory and the name of this run inside it.
    base_log_dir = "/content/drive/MyDrive/data/log"
    run_name = f"{asset}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
    os.makedirs(base_log_dir, exist_ok=True)

    features = cf.env_parameters("observation_list")
    sequence_length = cf.env_parameters("backward_window")
    print(features)
    lr_schedule = linear_schedule(1e-4, 5e-6)
    policy_kwargs = dict(
        # Custom feature extractor defined earlier in the notebook.
        features_extractor_class=CustomCombinedExtractor,
        features_extractor_kwargs=dict(sequence_length=sequence_length),
        # Separate hidden layers for the policy (pi) and value (vf) networks.
        net_arch=dict(pi=[256, 256], vf=[512, 256]),
        activation_fn=nn.ReLU,
        ortho_init=False,
    )

    # One factory per parallel environment.
    env_fns = [
        lambda: ForexTradingEnv(
            csv_file,
            cf,
            asset,
            features=features,
            sequence_length=sequence_length,
            logger_show=True
        )
        for _ in range(number_envs)
    ]
    env = SubprocVecEnv(env_fns)

    # Always shut the worker processes down, even when training fails;
    # otherwise every call leaves ``number_envs`` processes behind.
    try:
        # Vary the seed per week for incremental training.
        varied_seed = BASE_SEED + week_num
        env.seed(varied_seed)

        if model_name:
            model = PPO.load(model_name, env=env, learning_rate=lr_schedule)
        else:
            model = PPO(
                'MlpPolicy',
                env,
                # "auto" picks the GPU when one is available and falls back to
                # the CPU otherwise; a hard-coded "cuda" fails on CPU runtimes.
                device='auto',
                verbose=1,
                tensorboard_log=base_log_dir,
                vf_coef=0.5,
                target_kl=0.02,
                normalize_advantage=True,
                policy_kwargs=policy_kwargs,
                learning_rate=lr_schedule,
                max_grad_norm=0.3,
                seed=varied_seed,
                # Rollout length per environment before each policy update.
                n_steps=1024,
                batch_size=256,
                n_epochs=5,
                ent_coef=0.005,
                clip_range=0.01,
                gamma=0.99,
                # PPO always uses GAE; only its lambda is configurable. The
                # constructor has no ``use_gae`` argument.
                gae_lambda=0.95,
            )

        logger.info("Starting model training...")
        callback = TrainingMetricsCallback()
        model.learn(
            total_timesteps=total_timesteps,
            callback=callback,
            tb_log_name=run_name,
            # Keep counting timesteps when continuing from an existing model.
            reset_num_timesteps=not model_name,
        )
        logger.info("Model training complete")

        model_filename = csv_file.replace("split/", "model/").replace(".csv", "_single_test.zip")
        model_dir = os.path.dirname(model_filename)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        model.save(model_filename)
    finally:
        env.close()

In [None]:
%load_ext tensorboard
log_dir = '/content/drive/MyDrive/data/log'

%tensorboard --logdir $log_dir

In [None]:
asset = "EURUSD"
env_config_file = '/content/drive/MyDrive/configure.json'
cf = EnvConfig(env_config_file)
split_cfg = cf.data_processing_parameters("train_eval_split")
base_path = split_cfg["base_path"].format(symbol=asset)
# Single quotes inside the f-string: reusing the enclosing double quote in a
# replacement field is a SyntaxError before Python 3.12.
csv_file = f"{base_path}/{split_cfg['train_dir']}/{asset}_2022_12.csv"
# An empty model_name trains a new model; set it to a saved .zip to continue training.
model_name = ''
single_csv_training(csv_file=csv_file, env_config_file=env_config_file, asset=asset, model_name=model_name, cf=cf, number_envs=4, week_num=1)


In [None]:
# Continue from the week-12 model. Single quotes inside the f-string keep the
# line valid on Python versions before 3.12.
csv_file = f"{base_path}/{split_cfg['train_dir']}/{asset}_2022_13.csv"
model_name = f'/content/drive/MyDrive/data/model/{asset}/train/{asset}_2022_12_single_test.zip'
single_csv_training(csv_file=csv_file, env_config_file=env_config_file, asset=asset, model_name=model_name, cf=cf, number_envs=4, week_num=2)



In [None]:
# Continue from the week-13 model. Single quotes inside the f-string keep the
# line valid on Python versions before 3.12.
csv_file = f"{base_path}/{split_cfg['train_dir']}/{asset}_2022_14.csv"
model_name = f'/content/drive/MyDrive/data/model/{asset}/train/{asset}_2022_13_single_test.zip'
single_csv_training(csv_file=csv_file, env_config_file=env_config_file, asset=asset, model_name=model_name, cf=cf, number_envs=4, week_num=3)



In [None]:
# Continue from the week-14 model. Single quotes inside the f-string keep the
# line valid on Python versions before 3.12.
csv_file = f"{base_path}/{split_cfg['train_dir']}/{asset}_2022_15.csv"
model_name = f'/content/drive/MyDrive/data/model/{asset}/train/{asset}_2022_14_single_test.zip'
single_csv_training(csv_file=csv_file, env_config_file=env_config_file, asset=asset, model_name=model_name, cf=cf, number_envs=4, week_num=4)


In [None]:
# Continue from the week-15 model. Single quotes inside the f-string keep the
# line valid on Python versions before 3.12.
csv_file = f"{base_path}/{split_cfg['train_dir']}/{asset}_2022_16.csv"
model_name = f'/content/drive/MyDrive/data/model/{asset}/train/{asset}_2022_15_single_test.zip'
single_csv_training(csv_file=csv_file, env_config_file=env_config_file, asset=asset, model_name=model_name, cf=cf, number_envs=4, week_num=5)



In [None]:
# Continue from the week-16 model. Single quotes inside the f-string keep the
# line valid on Python versions before 3.12.
csv_file = f"{base_path}/{split_cfg['train_dir']}/{asset}_2022_17.csv"
model_name = f'/content/drive/MyDrive/data/model/{asset}/train/{asset}_2022_16_single_test.zip'
single_csv_training(csv_file=csv_file, env_config_file=env_config_file, asset=asset, model_name=model_name, cf=cf, number_envs=4, week_num=6)
