In [1]:
import os
import pandas as pd
import numpy as np
import talib
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# データの読み込み
def load_data(file_names):
    dfs = []
    for file_name in file_names:
        df = pd.read_csv(file_name)
        dfs.append(df)
    return dfs

In [3]:
# 特徴量エンジニアリング
def feature_engineering(df):
    open = df['open'].values
    high = df['high'].values
    low = df['low'].values
    close = df['close'].values
    volume = df['volume'].values

    # TA-Libを使用して一般的なテクニカル指標を計算
    df['RSI'] = talib.RSI(close)
    df['MACD'], _, _ = talib.MACD(close)
    df['ATR'] = talib.ATR(high, low, close)
    df['ADX'] = talib.ADX(high, low, close)
    df['SMA'] = talib.SMA(close)
    df['BB_UPPER'], df['BB_MIDDLE'], df['BB_LOWER'] = talib.BBANDS(close)

    # 欠損値の削除
    df = df.dropna()
    df = df.reset_index(drop=True)

    return df

In [7]:
# ラベルデータ作成
def create_label(df, lookahead=1):
    df['target'] = (df['close'].shift(-lookahead) > df['close']).astype(int)
    df = df.dropna()
    return df

In [8]:
# 学習と評価
def train_and_evaluate(df):
    features = df.drop('target', axis=1)
    labels = df['target']
    
    # timestamp列をDatetime型に変換
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # timestamp列をインデックスに設定
    df = df.set_index('timestamp')
    
    print(df.dtypes)

    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)

    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9
    }

    verbose_eval = 0  # この数字を1にすると学習時のスコア推移がコマンドライン表示される

    model = lgb.train(
        params=params,
        train_set=train_data,
        valid_sets=[train_data, test_data],
        num_boost_round=10000,  # 最大学習サイクル数。early_stopping使用時は大きな値を入力
        callbacks=[lgb.early_stopping(stopping_rounds=10, 
                verbose=True), # early_stopping用コールバック関数
                lgb.log_evaluation(verbose_eval)] # コマンドライン出力用コールバック関数
    )

    y_pred = model.predict(X_test)
    y_pred = np.round(y_pred).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

    return model

In [9]:
if __name__ == "__main__":
    file_names = [
        "data/BTCUSD_15m_20210801_20211231.csv", 
        "data/BTCUSD_1h_20210801_20211231.csv", 
        "data/BTCUSD_4h_20210801_20211231.csv"]
    dfs = load_data(file_names)

    # 各タイムフレームのデータに対して特徴量エンジニアリングとラベル作成を行う
    processed_dfs = []
    for df in dfs:
        processed_df = feature_engineering(df)
        processed_df = create_label(processed_df)
        processed_dfs.append(processed_df)

    # 複数のタイムフレームのデータを結合（インデックスが一致するように注意）
    combined_df = pd.concat(processed_dfs, axis=1).dropna()
    display(combined_df)

    # モデルの学習と評価を行う
    model = train_and_evaluate(combined_df)
 
    # モデルを保存する
    model_path = os.path.join("model", "model.pkl")
    joblib.dump(model, model_path)

Unnamed: 0,timestamp,open,high,low,close,volume,RSI,MACD,ATR,ADX,...,volume.1,RSI.1,MACD.1,ATR.1,ADX.1,SMA,BB_UPPER,BB_MIDDLE,BB_LOWER,target
0,1627774200000,41796.87,41908.72,41714.43,41794.32,0.873083,55.441502,131.711757,172.466386,27.472171,...,21.549914,57.019532,-131.022970,1037.249518,24.843619,39396.676667,41791.413460,40260.346,38729.278540,0.0
1,1627775100000,41794.32,41837.48,41322.13,41447.67,4.987125,43.706032,97.899487,196.958073,26.388406,...,19.129770,55.285221,-48.150814,1013.320267,23.523107,39361.436667,41049.255767,40606.786,40164.316233,1.0
2,1627776000000,41447.67,41537.68,41146.99,41471.68,4.528700,44.581035,72.208056,210.796068,26.165502,...,42.918174,64.244558,146.994958,1071.828819,22.568489,39390.152333,42223.561408,40899.474,39575.386592,1.0
3,1627776900000,41471.68,41646.94,41422.70,41638.83,0.097362,50.365107,64.590462,211.756349,25.276055,...,33.835406,67.497790,358.360734,1101.593903,22.747542,39445.473333,43381.314174,41307.422,39233.529826,0.0
4,1627777800000,41638.83,41697.96,41549.33,41686.15,0.150340,51.895777,61.661002,207.247324,24.145421,...,18.851379,66.871069,514.123656,1063.561482,22.913806,39544.833333,43860.950841,41829.472,39797.993159,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
962,1628643600000,45733.99,45776.15,45607.33,45609.53,0.381589,53.688718,74.458403,219.453481,11.577857,...,138.907750,47.673857,106.248639,752.678343,17.127152,42509.531000,43201.416575,42744.930,42288.443425,0.0
963,1628644500000,45609.53,45890.64,45580.54,45821.40,0.786056,60.165205,87.291942,225.928232,11.994144,...,247.716618,41.022625,25.983293,760.287033,17.495688,42513.939333,43013.363014,42507.534,42001.704986,1.0
964,1628645400000,45821.40,45928.28,45698.36,45727.84,1.277983,56.413399,88.888454,226.213359,12.598503,...,402.430348,53.154117,48.511545,817.010102,16.725390,42562.432333,43269.546940,42585.286,41901.025060,1.0
965,1628646300000,45727.84,45781.19,45651.95,45773.46,0.171445,57.795388,92.765510,219.286690,12.757616,...,128.335931,53.251464,66.414745,803.264380,15.870366,42575.708333,43483.452164,42689.678,41895.903836,0.0


ValueError: cannot assemble with duplicate keys