In [1]:
import os
import pandas as pd
import numpy as np
import talib
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime, time, timedelta

pd.set_option('display.max_columns', None)

def load_data(file_names):
    dfs = []
    for file_name in file_names:
        df = pd.read_csv(file_name, index_col="date")
        dfs.append(df)
    return dfs

def feature_engineering(df):
    open = df['open'].values
    high = df['high'].values
    low = df['low'].values
    close = df['close'].values
    # volume = df['_volume'].values
    hilo = (high + low) / 2

    rsi = talib.RSI(close, timeperiod=8) # default = 14
    df['RSI'] = rsi
    df['RSI_BB_UPPER'], df['_RSI_BB_MIDDLE'], df['_RSI_BB_LOWER'] = talib.BBANDS(rsi, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    
    df['_RSI_ST'] = talib.RSI(close)/close
    df['_RSI_LOG'] = log_transform_feature(talib.RSI(close))
    df['_MACD'], _, _ = talib.MACD(close)
    df['_MACD_ST'], _, _ = talib.MACD(close)/close
    df['_ATR'] = talib.ATR(high, low, close)
    df['_ADX'] = talib.ADX(high, low, close, timeperiod=14)
    df['_ADXR'] = talib.ADXR(high, low, close, timeperiod=14)
    
    df['_SMA20'] = talib.SMA(close, timeperiod=20)
    df['_SMA50'] = talib.SMA(close, timeperiod=50)
    df['_SMA100'] = talib.SMA(close, timeperiod=100)
    
    df['_BB_UPPER'], df['_BB_MIDDLE'], df['_BB_LOWER'] = talib.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    df['_BBANDS_upperband'] = (df['_BB_UPPER'] - hilo) / close
    df['_BBANDS_middleband'] = (df['_BB_MIDDLE'] - hilo) / close
    df['_BBANDS_lowerband'] = (df['_BB_LOWER'] - hilo) / close
    # df['_BBANDS_bandwidth'] = (df['_BB_UPPER'] - df['_BB_LOWER']) / df['_BB_MIDDLE']
    # df['_BBANDS_squeeze'] = (df['_BBANDS_bandwidth'] < df['_BBANDS_bandwidth'].rolling(window=20).mean()).astype(int)

    df['_STOCH_K'], df['_STOCH_D'] = talib.STOCH(high, low, close)/close
    df['_MON'] = talib.MOM(close, timeperiod=5)
    # df['_OBV'] = talib.OBV(close, volume)

    df['_support'] = df['_low'].rolling(window=20, min_periods=1).min()
    df['_resistance'] = df['_high'].rolling(window=20, min_periods=1).max()

    df = df.fillna(method='ffill')

    return df

def log_transform_feature(X):
    X[X <= 0] = np.finfo(float).eps
    return np.log(X)

def price_relation(df, short_prefix, long_prefix):
    short_close = df[f'{short_prefix}_close']
    long_support = df[f'{long_prefix}_support']
    long_resistance = df[f'{long_prefix}_resistance']
    df[f'{short_prefix}_close_to_{long_prefix}_support'] = (short_close - long_support) / long_support
    df[f'{short_prefix}_close_to_{long_prefix}_resistance'] = (short_close - long_resistance) / long_resistance
    return df

def create_label(df, prefix, lookbehind=1):
    df['_target'] = (df['_close'] > df['_close'].shift(lookbehind)).astype(int)
    df = df.fillna(method='ffill')
    return df

def plot_learning_curve(evals_result):
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    ax.plot(np.arange(len(evals_result['training']['binary_error'])),
            evals_result['training']['binary_error'], label='Training')
    ax.plot(np.arange(len(evals_result['valid_1']['binary_error'])),
            evals_result['valid_1']['binary_error'], label='Validation')
    ax.set_title('Learning Curve')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Binary Error')
    ax.legend()
    plt.show()

def plot_feature_importance(importances, feature_names):
    importance = pd.DataFrame({"Feature": feature_names,
                               "Importance": importances})
    importance.sort_values(by="Importance", ascending=False, inplace=True)
    plt.figure(figsize=(15, 30))
    sns.barplot(x="Importance", y="Feature", data=importance)
    plt.title("Feature Importance")
    plt.tight_layout()
    plt.show()

def train_and_evaluate(df, n_splits=2):
    features = df.drop('15T_target', axis=1)
    labels = df['15T_target']

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    feature_importances = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test)

        params = {
            'objective': 'binary',
            'metric': 'binary_error',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9
        }

        evals_result = {}

        model = lgb.train(
            params=params,
            train_set=train_data,
            valid_sets=[train_data, test_data],
            num_boost_round=10000,
            callbacks=[
                lgb.callback.early_stopping(10),
                lgb.callback.log_evaluation(period=100),
                lgb.callback.record_evaluation(evals_result)
            ],
        )

        y_pred = model.predict(X_test)
        y_pred = np.round(y_pred).astype(int)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        print(f"Accuracy: {accuracy}")
        print(classification_report(y_test, y_pred))
        
        plot_learning_curve(evals_result)
        feature_importances.append(model.feature_importance())
    
    mean_accuracy = np.mean(accuracies)
    print(f"Mean accuracy: {mean_accuracy}")

    mean_importance = np.mean(feature_importances, axis=0)
    plot_feature_importance(mean_importance, features.columns)

    return model, evals_result

if __name__ == "__main__":
    file_names = ["combined_data.csv"]
    dfs = load_data(file_names)

    # Assuming you have only one DataFrame in the list
    combined_df = dfs[0]
    combined_df.index = pd.to_datetime(combined_df.index)

    # Apply feature engineering and label creation directly on the original data
    combined_df = feature_engineering(combined_df, '1T')
    combined_df = create_label(combined_df, '1T')

    model, evals_result = train_and_evaluate(combined_df)

    model_path = os.path.join("model", "simple_1m_model.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    print(f"Model saved to {model_path}")

KeyError: '1T_open'