In [16]:
import os
import pandas as pd
import numpy as np
from numba import njit
from hmmlearn.hmm import GaussianHMM
import matplotlib.pyplot as plt

1-1. Standardize MT5 Data

In [None]:
def fix_data(folder_path, fixed_path):
    """Standardize every MT5 CSV export in ``folder_path`` and save it to ``fixed_path``.

    For each ``*.csv`` file (read as tab-separated text) the function:

    * drops the ``<VOL>`` column and renames ``<TICKVOL>`` to ``Vol``;
    * builds a single ``Datetime`` column from ``<DATE>`` (plus ``<TIME>``
      when that column is present) and removes the source columns;
    * moves ``Datetime`` to the front and relabels all columns, by position,
      to ``Datetime, Open, High, Low, Close, Vol, Spread``;
    * writes the result, without the index, under the same file name in
      ``fixed_path`` and prints a confirmation line.

    Parameters
    ----------
    folder_path : str
        Directory containing the raw exports. Non-``.csv`` entries are skipped.
    fixed_path : str
        Output directory; created if it does not exist.

    Raises
    ------
    KeyError
        If a file has no ``<VOL>`` or ``<DATE>`` column.
    ValueError
        If a file does not end up with exactly seven columns, or a date/time
        value does not match the expected ``%Y.%m.%d`` / ``%H:%M:%S`` format.
    """
    # Final column labels. They are applied positionally, so the remaining
    # source columns must already be in Open/High/Low/Close/Vol/Spread order.
    standard_columns = ['Datetime', 'Open', 'High', 'Low', 'Close', 'Vol', 'Spread']

    # exist_ok avoids the separate check-then-create step.
    os.makedirs(fixed_path, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, filename)
        fixed_file_path = os.path.join(fixed_path, filename)

        # The exports are read as tab-separated values.
        df = pd.read_csv(file_path, sep='\t')

        # Drop <VOL>; <TICKVOL> becomes the 'Vol' column.
        df = df.drop(columns=['<VOL>']).rename(columns={'<TICKVOL>': 'Vol'})

        if '<TIME>' in df.columns:
            # Files with a <TIME> column: merge date and time into one timestamp.
            df['Datetime'] = pd.to_datetime(
                df['<DATE>'] + ' ' + df['<TIME>'], format='%Y.%m.%d %H:%M:%S'
            )
            df = df.drop(columns=['<DATE>', '<TIME>'])
        else:
            # Files without <TIME>: the date alone is the timestamp.
            df['Datetime'] = pd.to_datetime(df['<DATE>'], format='%Y.%m.%d')
            df = df.drop(columns=['<DATE>'])

        # Put 'Datetime' first, then apply the standard labels by position.
        df = df[['Datetime'] + [col for col in df.columns if col != 'Datetime']]
        df.columns = standard_columns

        df.to_csv(fixed_file_path, index=False)

        print(f"Processed and saved: {filename}")

# Standardize the raw MT5 exports: read from the source folder, write to the output folder.
folder_path, fixed_path = '../data/mt5', '../data/gold'
fix_data(folder_path, fixed_path)

Processed and saved: Daily_201708100000_202503140000.csv
Processed and saved: H1_201708100000_202503142200.csv
Processed and saved: H4_201708100000_202503142000.csv
Processed and saved: M15_202012161445_202503131615.csv
Processed and saved: M2_202408190410_202503142258.csv
Processed and saved: M5_202310121825_202503142255.csv


1-2. Add Features

In [26]:
def calculate_atr(df, period=14):
    """Return the Average True Range of ``df`` as a Series.

    The true range of each row is the largest of: High - Low,
    |High - previous Close| and |Low - previous Close|. The ATR is the
    simple rolling mean of the true range over ``period`` rows, so the
    first ``period - 1`` values are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'High', 'Low' and 'Close' columns.
    period : int, default 14
        Rolling window length.
    """
    prev_close = df['Close'].shift()

    # One column per true-range candidate; the row-wise max picks the largest.
    # The first row has no previous close, so only High - Low contributes there.
    candidates = pd.concat(
        [
            df['High'] - df['Low'],
            (df['High'] - prev_close).abs(),
            (df['Low'] - prev_close).abs(),
        ],
        axis=1,
    )

    return candidates.max(axis=1).rolling(window=period).mean()

def add_features(df):
    """Add calendar, volatility and trend columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same
    DataFrame that was passed in. Columns written:

    * 'Datetime'  - converted with ``pd.to_datetime``
    * 'DayOfWeek' - ``.dt.dayofweek`` of 'Datetime' (0 = Monday, 6 = Sunday)
    * 'ATR'       - ``calculate_atr(df)`` with its default period
    * 'EMA_20', 'EMA_50', 'EMA_100', 'EMA_200' - exponential moving
      averages of 'Close' (``adjust=False``)
    """
    timestamps = pd.to_datetime(df['Datetime'])
    df['Datetime'] = timestamps
    df['DayOfWeek'] = timestamps.dt.dayofweek

    df['ATR'] = calculate_atr(df)

    # One EMA column per span, created in ascending span order.
    for span in (20, 50, 100, 200):
        df[f'EMA_{span}'] = df['Close'].ewm(span=span, adjust=False).mean()

    return df

# Load the standardized H1 file and compute the feature columns.
# Forward slashes are used because they resolve on Windows, Linux and macOS;
# the earlier backslash-separated path only worked on Windows.
df = pd.read_csv('../data/gold/H1_201708100000_202503142200.csv')
# add_features writes the new columns into `df` itself and returns that same frame.
features = add_features(df)
print(features.tail(10))

                 Datetime     Open     High      Low    Close    Vol  Spread  \
44864 2025-03-14 13:00:00  2993.40  2999.19  2993.28  2993.43   7649      10   
44865 2025-03-14 14:00:00  2993.42  2999.64  2993.29  2993.87   8761      10   
44866 2025-03-14 15:00:00  2993.85  2996.13  2983.46  2985.73  11013      10   
44867 2025-03-14 16:00:00  2985.73  2993.40  2982.06  2985.42  10756      10   
44868 2025-03-14 17:00:00  2985.40  2986.46  2978.50  2984.54   8762      10   
44869 2025-03-14 18:00:00  2984.55  2990.07  2982.92  2989.79   6906      11   
44870 2025-03-14 19:00:00  2989.81  2990.35  2986.44  2986.60   5383      11   
44871 2025-03-14 20:00:00  2986.67  2987.06  2983.14  2983.35   4636      11   
44872 2025-03-14 21:00:00  2983.36  2986.03  2982.12  2984.01   4735      11   
44873 2025-03-14 22:00:00  2984.02  2988.43  2981.75  2984.24   2029      11   

       DayOfWeek       ATR       EMA_20       EMA_50      EMA_100      EMA_200  
44864          4  7.377143  2982.89609