In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Root directory holding all BTCUSDT data, located under the user's home directory.
DATA_PATH = Path.home().joinpath("data", "BTCUSDT")

In [3]:
# Directory that contains the raw transaction CSV files.
transac_path = DATA_PATH.joinpath("transactions")

In [6]:
# Read every transaction CSV (in sorted path order) and stack them into a
# single frame indexed by the parsed "Datetime" column.
csv_files = sorted(transac_path.glob("*.csv"))
df = pd.concat(
    pd.read_csv(csv_path, index_col="Datetime", parse_dates=True)
    for csv_path in csv_files
)
df

Unnamed: 0_level_0,side,size,price
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-01 09:00:03+09:00,Buy,0.001,28922.000
2021-01-01 09:00:04+09:00,Buy,10.313,28922.000
2021-01-01 09:00:04+09:00,Sell,0.001,28921.500
2021-01-01 09:00:05+09:00,Buy,0.016,28922.000
2021-01-01 09:00:05+09:00,Sell,0.003,28921.500
...,...,...,...
2022-08-29 08:59:57+09:00,Buy,0.331,19547.625
2022-08-29 08:59:57+09:00,Sell,0.007,19547.250
2022-08-29 08:59:58+09:00,Buy,1.382,19548.000
2022-08-29 08:59:58+09:00,Sell,0.497,19547.500


In [9]:
# Persist the combined transactions frame as a pickle under DATA_PATH.
df.to_pickle(DATA_PATH.joinpath("transaction.pkl"))

In [17]:
from scripts.extract_features import attach_features

In [18]:
# Output directories for the plain OHLCV bars and for the bars with features
# attached; both are created up front (including parents) if missing.
ohlcv_path = DATA_PATH / "ohlcv"
ohlcv_with_features_path = DATA_PATH / "ohlcv_with_features"
for out_dir in (ohlcv_path, ohlcv_with_features_path):
    out_dir.mkdir(parents=True, exist_ok=True)

In [22]:
# Build OHLCV bars from the pickled transactions for every resampling rule,
# save them, then attach features and save the enriched bars as well.
df = pd.read_pickle(DATA_PATH / "transaction.pkl")
# Resample rules; each string is also used as the stem of the output file name.
rules = ["1T", "5T", "15T", "1H", "4H", "1D"]
for rule in rules:
    # Open/high/low/close come from the trade price, Volume from the summed trade size.
    ohlcv: pd.DataFrame = df["price"].resample(rule).ohlc()
    ohlcv["Volume"] = df["size"].resample(rule).sum()
    ohlcv = ohlcv.rename(columns=str.capitalize)
    # Fill missing values forward, then backward for any leading gap.
    # .ffill()/.bfill() replace fillna(method=...), which pandas has deprecated.
    # NOTE(review): forward-filling copies the previous bar's own Open/High/Low
    # into a missing bar rather than its Close — confirm this is intended.
    ohlcv = ohlcv.ffill().bfill()
    ohlcv.to_pickle(ohlcv_path / f"{rule}.pkl")

    ohlcv_with_features = attach_features(ohlcv)
    ohlcv_with_features.to_pickle(ohlcv_with_features_path / f"{rule}.pkl")

In [25]:
# Save the 1-minute bars as one CSV file per calendar month.
ohlcv = pd.read_pickle(ohlcv_path / "1T.pkl")
ohlcv["Year"] = ohlcv.index.year
ohlcv["Month"] = ohlcv.index.month
bar_cols = ["Open", "High", "Low", "Close", "Volume"]
# groupby yields only the (year, month) pairs that actually contain rows,
# so no explicit emptiness check is needed.
for (year, month), df_month in ohlcv.groupby(["Year", "Month"]):
    df_month[bar_cols].to_csv(ohlcv_path / f"{year}-{month}.csv")

In [27]:
# divide into train and test with scale
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# For every rule: split the feature frame into train/test without shuffling,
# standardise the "feature_" columns with a scaler fitted on the training part
# only, then persist both parts and the fitted scaler.
for rule in rules:
    df = pd.read_pickle(ohlcv_with_features_path / f"{rule}.pkl")
    fe_cols = list(filter(lambda name: name.startswith("feature_"), df.columns))
    print(fe_cols)

    # shuffle=False: the rows are split in their existing order.
    df_train, df_test = train_test_split(df, test_size=0.1, shuffle=False)

    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(df_train[fe_cols])
    df_train[fe_cols] = pd.DataFrame(train_scaled, columns=fe_cols, index=df_train.index)
    # The test part reuses the statistics learned from the training part.
    test_scaled = scaler.transform(df_test[fe_cols])
    df_test[fe_cols] = pd.DataFrame(test_scaled, columns=fe_cols, index=df_test.index)

    df_train.to_pickle(ohlcv_with_features_path / f"{rule}_train.pkl")
    df_test.to_pickle(ohlcv_with_features_path / f"{rule}_test.pkl")
    with (ohlcv_with_features_path / f"{rule}_scaler.pkl").open("wb") as f:
        pickle.dump(scaler, f)

['feature_candle_value', 'feature_candle_value_mean_10', 'feature_candle_value_mean_20', 'feature_candle_value_mean_5', 'feature_exceed_high_10', 'feature_exceed_high_20', 'feature_exceed_high_3', 'feature_exceed_high_5', 'feature_exceed_low_10', 'feature_exceed_low_20', 'feature_exceed_low_3', 'feature_exceed_low_5', 'feature_gap_ma_10', 'feature_gap_ma_20', 'feature_gap_ma_3', 'feature_gap_ma_5', 'feature_log_return', 'feature_log_return_mean_10', 'feature_log_return_mean_20', 'feature_log_return_mean_5', 'feature_lower_shadow', 'feature_lower_shadow_mean_10', 'feature_lower_shadow_mean_20', 'feature_lower_shadow_mean_5', 'feature_price_momentum_10', 'feature_price_momentum_20', 'feature_price_momentum_3', 'feature_price_momentum_5', 'feature_range', 'feature_real_body', 'feature_real_body_mean_10', 'feature_real_body_mean_20', 'feature_real_body_mean_5', 'feature_shadow_range', 'feature_shadow_range_mean_10', 'feature_shadow_range_mean_20', 'feature_shadow_range_mean_5', 'feature_tr