In [1]:
import pandas as pd
import numpy as np
import talib
import lightgbm as lgb
import matplotlib.pyplot as plt
import pickle
import time
import mplfinance as mpf
from ipywidgets import interact, widgets
import sys
import os

# Add the path to the system path
lib_path = os.path.abspath(os.path.join('lib'))
sys.path.append(lib_path)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def load_data(file_names):
    dfs = []
    for file_name in file_names:
        df = pd.read_csv(file_name, index_col="date")
        dfs.append(df)
    return dfs

def predict(data_row):
    if "5T_target" in data_row.index:
        data_row = data_row.drop("5T_target")
    
    # Reshape the data_row into the correct format
    data_row = data_row.values.reshape(1, -1)
    prediction_proba = model.predict(data_row)
    predicted_class = [1 if prob > 0.5 else 0 for prob in prediction_proba]

    return predicted_class[0]

# feature engineering
def feature_engineering(df, prefix):
    open = df[f'{prefix}_open'].values
    high = df[f'{prefix}_high'].values
    low = df[f'{prefix}_low'].values
    close = df[f'{prefix}_close'].values
    # volume = df[f'{prefix}_volume'].values
    hilo = (high + low) / 2

    rsi = talib.RSI(close, timeperiod=8) # default = 14
    df[f'{prefix}_RSI'] = rsi
    df[f'{prefix}_RSI_BB_UPPER'], df[f'{prefix}_RSI_BB_MIDDLE'], df[f'{prefix}_RSI_BB_LOWER'] = talib.BBANDS(rsi, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    
    df[f'{prefix}_RSI_ST'] = talib.RSI(close)/close
    df[f'{prefix}_RSI_LOG'] = log_transform_feature(talib.RSI(close))
    df[f'{prefix}_MACD'], _, _ = talib.MACD(close)
    df[f'{prefix}_MACD_ST'], _, _ = talib.MACD(close)/close
    df[f'{prefix}_ATR'] = talib.ATR(high, low, close)
    df[f'{prefix}_ADX'] = talib.ADX(high, low, close, timeperiod=14)
    df[f'{prefix}_ADXR'] = talib.ADXR(high, low, close, timeperiod=14)
    
    df[f'{prefix}_SMA20'] = talib.SMA(close, timeperiod=20)
    df[f'{prefix}_SMA50'] = talib.SMA(close, timeperiod=50)
    df[f'{prefix}_SMA100'] = talib.SMA(close, timeperiod=100)
    
    df[f'{prefix}_BB_UPPER'], df[f'{prefix}_BB_MIDDLE'], df[f'{prefix}_BB_LOWER'] = talib.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    df[f'{prefix}_BBANDS_upperband'] = (df[f'{prefix}_BB_UPPER'] - hilo) / close
    df[f'{prefix}_BBANDS_middleband'] = (df[f'{prefix}_BB_MIDDLE'] - hilo) / close
    df[f'{prefix}_BBANDS_lowerband'] = (df[f'{prefix}_BB_LOWER'] - hilo) / close
    # df[f'{prefix}_BBANDS_bandwidth'] = (df[f'{prefix}_BB_UPPER'] - df[f'{prefix}_BB_LOWER']) / df[f'{prefix}_BB_MIDDLE']
    # df[f'{prefix}_BBANDS_squeeze'] = (df[f'{prefix}_BBANDS_bandwidth'] < df[f'{prefix}_BBANDS_bandwidth'].rolling(window=20).mean()).astype(int)

    df[f'{prefix}_STOCH_K'], df[f'{prefix}_STOCH_D'] = talib.STOCH(high, low, close)/close
    df[f'{prefix}_MON'] = talib.MOM(close, timeperiod=5)
    # df[f'{prefix}_OBV'] = talib.OBV(close, volume)

    df[f'{prefix}_support'] = df[f'{prefix}_low'].rolling(window=20, min_periods=1).min()
    df[f'{prefix}_resistance'] = df[f'{prefix}_high'].rolling(window=20, min_periods=1).max()

    df = df.fillna(method='ffill')

    return df

def log_transform_feature(X):
    X[X <= 0] = np.finfo(float).eps
    return np.log(X)

def create_label(df, prefix, lookbehind=1):
    df[f'{prefix}_target'] = (df[f'{prefix}_close'] > df[f'{prefix}_close'].shift(lookbehind)).astype(int)
    df = df.fillna(method='ffill')
    return df

def price_relation(df, short_prefix, long_prefix):
    short_close = df[f'{short_prefix}_close']
    long_support = df[f'{long_prefix}_support']
    long_resistance = df[f'{long_prefix}_resistance']
    df[f'{short_prefix}_close_to_{long_prefix}_support'] = (short_close - long_support) / long_support
    df[f'{short_prefix}_close_to_{long_prefix}_resistance'] = (short_close - long_resistance) / long_resistance
    return df

def summarize_trade_results(trade_results):
    num_trades = len(trade_results['profits'])
    num_wins = sum(1 for x in trade_results['profits'] if x > 0)
    num_losses = num_trades - num_wins

    total_return = sum(trade_results['profits'])

    average_win = sum(x for x in trade_results['profits'] if x > 0) / num_wins if num_wins > 0 else 0
    average_loss = sum(x for x in trade_results['profits'] if x < 0) / num_losses if num_losses > 0 else 0

    profit_factor = -average_win / average_loss if average_loss != 0 else 0

    long_win_rate = sum(1 for x in trade_results['long_profits'] if x > 0) / len(trade_results['long_profits']) if trade_results['long_profits'] else 0
    short_win_rate = sum(1 for x in trade_results['short_profits'] if x > 0) / len(trade_results['short_profits']) if trade_results['short_profits'] else 0

    # Create cumulative return series
    cumulative_returns = np.cumsum(trade_results['profits'])
    cumulative_long_returns = np.cumsum(trade_results['long_profits'])
    cumulative_short_returns = np.cumsum(trade_results['short_profits'])

    # Calculate maximum drawdown using numpy
    cumulative_max = np.maximum.accumulate(cumulative_returns)
    non_zero_max = cumulative_max > 0
    drawdowns = np.full_like(cumulative_returns, fill_value=0)
    drawdowns[non_zero_max] = 1 - cumulative_returns[non_zero_max] / cumulative_max[non_zero_max]
    max_drawdown = np.max(drawdowns)
    maximum_single_trade_loss = min(trade_results['profits']) if trade_results['profits'] else 0

    # Print the statistics
    print(f"Total Return: {total_return:.2f}")
    print(f"Total Trade Num: {len(trade_results['profits'])}")
    print(f"Max Drawdown: {max_drawdown * 100:.2f}%")
    print(f"Max Single Trade Loss: {maximum_single_trade_loss:.2f}")
    print(f"Profit Factor: {profit_factor:.2f}")
    print(f"Long Trade Num: {len(trade_results['long_profits'])}")
    print(f"Long Win Rate: {long_win_rate * 100:.2f}%")
    print(f"Short Trade Num: {len(trade_results['short_profits'])}")
    print(f"Short Win Rate: {short_win_rate * 100:.2f}%")

    # Plot the results
    plt.figure(figsize=(10,6))
    plt.plot(cumulative_returns, label='Total Returns')
    plt.title("Cumulative Total Returns")
    plt.xlabel("Trade")
    plt.ylabel("Cumulative Return")
    plt.grid(True)
    plt.show()
    
    plt.figure(figsize=(10,6))
    plt.plot(cumulative_long_returns, label='Long Returns', color='blue')
    plt.plot(cumulative_short_returns, label='Short Returns', color='red')
    plt.title("Cumulative Long and Short Returns")
    plt.xlabel("Trade")
    plt.ylabel("Cumulative Return")
    plt.legend()
    plt.grid(True)
    plt.show()

def calc_trade_fee(price):
    fee_rate = 0.001
    return price * fee_rate

def resample_data(df):    
    # Convert the index to datetime
    df.index = pd.to_datetime(df.index)

    # Resample data to different timeframes
    timeframes = ['1T', '5T', '15T', '30T']

    resampled_dfs = []

    for tf in timeframes:
        resampled = df.resample(tf).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last'})
        resampled.dropna(inplace=True)
        resampled.columns = [f"{tf}_{col}" for col in resampled.columns]  # df.columns -> resampled.columns
        prefix = resampled.columns[0].split('_')[0]
        processed_df = feature_engineering(resampled, prefix)
        processed_df = create_label(processed_df, prefix)
        resampled_dfs.append(processed_df)

    # Combine resampled data
    combined_resampled = pd.concat(resampled_dfs, axis=1).fillna(0)
    return combined_resampled


def initPortfolio():
    portfolio = {
        'position': None,  # "long" or "short"
        'entry_price': None,
        'entry_point': 0
    }
    return portfolio

if __name__ == "__main__":
    start_time = time.time() 

    file_names = ["test_combined_data.csv"]
    dfs = load_data(file_names)

    # Assuming you have only one DataFrame in the list
    combined_df = dfs[0]
    combined_df = resample_data(combined_df)

    # add feature support and resistance
    combined_df = price_relation(combined_df, '1T', '5T')
    combined_df = price_relation(combined_df, '5T', '15T')
    combined_df = price_relation(combined_df, '5T', '30T')

    model_path = "model/5m_model.pkl"
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    # Initialize trade_results
    trade_results = {
        'profits': [],
        'long_profits': [],
        'short_profits': [],
    }

    # Initialize portfolio state
    portfolio = initPortfolio()

    # Loop through the data
    for i in range(0, len(combined_df)):
        profit = 0

        # Get the current row of data
        data_row = combined_df.iloc[i]
        prev_row = combined_df.iloc[i-1]

        # Retrieve the RSI and Bollinger Bands from the row
        rsi = data_row['1T_RSI']
        rsi_prev = prev_row['1T_RSI']
        rsi_upper = data_row['1T_RSI_BB_UPPER']
        rsi_lower = data_row['1T_RSI_BB_LOWER']
        bb_upper = data_row['1T_BB_UPPER']
        bb_lower = data_row['1T_BB_LOWER']
        bb_middle = data_row['1T_BB_MIDDLE']

        # Make a prediction with the model
        y_pred = predict(data_row)

        # Exit
        if portfolio['position'] is not None:
            if portfolio['position'] == 'long':
                if data_row['1T_high'] >= bb_upper:

                    profit = data_row['1T_close'] - portfolio['entry_price']
                    trade_results['profits'].append(profit)
                    trade_results['long_profits'].append(profit)
                    portfolio = initPortfolio()
                
                else:
                    trade_results['profits'].append(0)
            elif portfolio['position'] == 'short':
                if data_row['1T_low'] <= bb_lower:
                    
                    profit = portfolio['entry_price'] - data_row['1T_close']
                    trade_results['profits'].append(profit)
                    trade_results['short_profits'].append(profit)
                    portfolio = initPortfolio()

                else:
                    trade_results['profits'].append(0)
            else:
                trade_results['profits'].append(0)
        
        # Short Entry
        # elif rsi_prev > rsi_upper and rsi < rsi_upper:
        #     # if y_pred == 0:
        #         trade_results['profits'].append(0)
        #         portfolio = {
        #             'position': 'short',
        #             'entry_price': data_row['1T_close'],
        #             'entry_point': i
        #         }

        # Long Entry
        elif rsi_prev < rsi_lower and rsi > rsi_lower:
            # if y_pred == 1:
                trade_results['profits'].append(0)
                portfolio = {
                    'position': 'long',
                    'entry_price': data_row['1T_close'],
                    'entry_point': i
                }

        else:
            trade_results['profits'].append(0)

    summarize_trade_results(trade_results)

    # plot ====
    def plot_candlestick(start, end):
        start = pd.to_datetime(start)
        end = pd.to_datetime(end)
        
        plot_df = combined_df.loc[start:end]
        ohlc_df = plot_df[['1T_open', '1T_high', '1T_low', '1T_close']]
        ohlc_df.columns = ['open', 'high', 'low', 'close']
        
        mpf.plot(ohlc_df, type='line', style='yahoo', volume=False, tight_layout=True, warn_too_much_data=7000000)

    start_date = widgets.DatePicker(
        description='Start Date',
        value=pd.to_datetime(combined_df.index.min()),
        disabled=False
    )

    end_date = widgets.DatePicker(
        description='End Date',
        value=pd.to_datetime(combined_df.index.max()),
        disabled=False
    )

    interact(plot_candlestick, start=start_date, end=end_date)
    # plot ====

    elapsed_time = time.time() - start_time
    minutes, seconds = divmod(elapsed_time, 60)
    print(f"Execution time: {int(minutes)} minutes {seconds:.2f} seconds")

  df[f'{short_prefix}_close_to_{long_prefix}_support'] = (short_close - long_support) / long_support
  df[f'{short_prefix}_close_to_{long_prefix}_resistance'] = (short_close - long_resistance) / long_resistance
  df[f'{short_prefix}_close_to_{long_prefix}_support'] = (short_close - long_support) / long_support
  df[f'{short_prefix}_close_to_{long_prefix}_resistance'] = (short_close - long_resistance) / long_resistance
  df[f'{short_prefix}_close_to_{long_prefix}_support'] = (short_close - long_support) / long_support
  df[f'{short_prefix}_close_to_{long_prefix}_resistance'] = (short_close - long_resistance) / long_resistance
[LightGBM] [Fatal] The number of features in data (125) is not the same as it was in training data (95).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.


LightGBMError: The number of features in data (125) is not the same as it was in training data (95).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.