In [1]:
import os
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime, time, timedelta
import pytz

from feature_engineer import FeatureEngineer

In [2]:
def train_and_evaluate(df, n_splits=5):
    features = df.drop('target', axis=1)
    labels = df['target']

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    feature_importances = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test)

        params = {
            'objective': 'binary',
            'metric': 'binary_error',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9
        }

        evals_result = {}

        model = lgb.train(
            params=params,
            train_set=train_data,
            valid_sets=[train_data, test_data],
            num_boost_round=10000,
            callbacks=[
                lgb.callback.early_stopping(10),
                lgb.callback.log_evaluation(period=100),
                lgb.callback.record_evaluation(evals_result)
            ],
        )

        y_pred = model.predict(X_test)
        y_pred = np.round(y_pred).astype(int)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        print(f"Accuracy: {accuracy}")
        print(classification_report(y_test, y_pred))
        
        plot_learning_curve(evals_result)
        feature_importances.append(model.feature_importance())
    
    mean_accuracy = np.mean(accuracies)
    print(f"Mean accuracy: {mean_accuracy}")

    mean_importance = np.mean(feature_importances, axis=0)
    plot_feature_importance(mean_importance, features.columns)

    return model, evals_result

def create_label(df, lookbehind=1):
    df['target'] = (df['close'] > df['close'].shift(lookbehind)).astype(int)
    df = df.fillna(method='ffill')
    return df

def plot_learning_curve(evals_result):
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    ax.plot(np.arange(len(evals_result['training']['binary_error'])),
            evals_result['training']['binary_error'], label='Training')
    ax.plot(np.arange(len(evals_result['valid_1']['binary_error'])),
            evals_result['valid_1']['binary_error'], label='Validation')
    ax.set_title('Learning Curve')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Binary Error')
    ax.legend()
    plt.show()

def plot_feature_importance(importances, feature_names):
    importance = pd.DataFrame({"Feature": feature_names,
                               "Importance": importances})
    importance.sort_values(by="Importance", ascending=False, inplace=True)
    plt.figure(figsize=(15, 30))
    sns.barplot(x="Importance", y="Feature", data=importance)
    plt.title("Feature Importance")
    plt.tight_layout()
    plt.show()

In [3]:
df = pd.read_csv('../csv/BTCUSDT_1m_20210801_20221231.csv')
fe = FeatureEngineer(df)
df = fe.feature_engineering(df)
display(df)

Unnamed: 0,date,open,high,low,close,volume,RSI_ST,MACD,MACD_ST,ATR,...,MON,OBV,High_Close_Comparison,consecutive_up,consecutive_down,double_top,double_bottom,ascending_trendline,descending_trendline,triangle_pattern
0,1627755600000,41306.41,41350.00,41306.41,41335.68,0.002180,68.860868,31.000378,31.000378,31.109313,...,70.38,-0.442464,1,2,0,0,0,4.335178e+04,3.912580e+04,0
1,1627755720000,41344.84,41384.23,41344.84,41384.23,0.145714,73.288201,38.847941,38.847941,32.355076,...,95.56,-0.296750,1,3,0,0,0,4.370691e+04,3.880490e+04,0
2,1627755780000,41384.23,41384.23,41377.79,41377.81,0.001080,71.833764,44.041468,44.041468,30.503999,...,82.45,-0.297830,0,0,1,0,0,4.371975e+04,3.879206e+04,0
3,1627755840000,41377.81,41416.00,41377.81,41407.93,0.051798,74.400584,50.011308,50.011308,31.052999,...,122.80,-0.246032,1,1,0,1,0,4.405504e+04,3.848854e+04,0
4,1627755900000,41407.93,41407.93,41388.26,41389.59,0.001933,70.205262,52.655585,52.655585,30.239928,...,83.18,-0.247965,0,0,1,0,0,4.406946e+04,3.847412e+04,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740596,1672413540000,16508.20,16512.86,16498.28,16508.61,15.606035,80.867625,16.851140,16.851140,17.342422,...,56.93,-8254.604605,0,14,0,0,0,5.003070e+06,-4.970132e+06,0
740597,1672413600000,16508.61,16523.98,16504.91,16504.91,18.356253,77.945828,18.792983,18.792983,17.465821,...,49.90,-8272.960858,0,0,1,0,0,5.003077e+06,-4.970139e+06,0
740598,1672413660000,16504.91,16513.00,16502.58,16510.18,7.890198,79.103896,20.520603,20.520603,16.962548,...,48.55,-8265.070660,0,1,0,0,0,5.003084e+06,-4.970145e+06,0
740599,1672413720000,16510.18,16517.09,16499.36,16517.09,9.953238,80.546334,22.191525,22.191525,17.017366,...,22.48,-8255.117422,1,2,0,0,0,4.855675e+06,-4.822733e+06,0


In [4]:
model, evals_result = train_and_evaluate(df)

KeyError: "['target'] not found in axis"