In [1]:
import os
import pandas as pd

In [2]:
# Path to the raw candle/indicator CSV, relative to the notebook's working
# directory (5-minute BTC/USDT data, judging by the file name).
DATA_PATH = "framework/data/BTC_USDT_5m_raw.csv"

# Fail fast with a clear message. `isfile` (rather than `exists`) also rejects
# a directory sitting at this path, which would otherwise get past the guard
# and surface as a less obvious error from inside `pd.read_csv`.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")

print(f"Loading data from {DATA_PATH}...")
df = pd.read_csv(DATA_PATH)

# Column labels are coerced to str so the summary line cannot raise if the
# header ever yields non-string labels; for string labels the text is unchanged.
print(f"Data Rows: {len(df)}, Columns: {', '.join(map(str, df.columns))}")

Loading data from framework/data/BTC_USDT_5m_raw.csv...
Data Rows: 105082, Columns: timestamp, open, high, low, close, volume, rel_body, upper_wick, lower_wick, ebsw, macd, macd_signal, macd_hist, rsi, sqz, sqz_on, sqz_off, sqz_no, pvo, pvo_hist, pvo_signal, bop, willr, st, st_direction, vwap, dd, dd_pct, dd_log, log_return, entropy, adx, adx_osc, aroon_up, aroon_down, aroon, chop, psar, psar_direction, vortex_p, vortex_m, vortex, natr, bb_lower, bb_mid, bb_upper, bb_width, bb_pct, ui, cmf, efi, mfi, obv


In [3]:
from sklearn.model_selection import train_test_split

# Order-preserving 70 / 15 / 15 partition. With shuffle=False the rows are cut
# in their existing order: the first 70% become train, and the remaining 30%
# (`temp_df`) is halved into validation and test.
# NOTE(review): this is only a chronological split if the CSV rows are already
# sorted by time -- confirm against the data source.
train_df, temp_df = train_test_split(df, test_size=0.3, shuffle=False)
val_df, test_df = train_test_split(temp_df, test_size=0.5, shuffle=False)

# Re-index every split from 0 (the old index is discarded); per the original
# note, the downstream environment needs this to work correctly.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

# Summary statistics for each split, in train -> val -> test order.
for split_label, split_df in (
    ("Train Set:", train_df),
    ("Val Set:", val_df),
    ("Test Set:", test_df),
):
    print(split_label)
    print(split_df.describe())

Train Set:
                open           high            low          close  \
count   73557.000000   73557.000000   73557.000000   73557.000000   
mean   104312.384348  104386.745268  104237.276893  104312.584006   
std     12446.694223   12437.410469   12454.854089   12446.673597   
min     74610.000000   74987.410000   74508.000000   74610.000000   
25%     94540.010000   94600.000000   94480.000000   94540.010000   
50%    107963.120000  108016.200000  107889.990000  107963.470000   
75%    114071.990000  114136.600000  114003.820000  114071.990000   
max    126011.180000  126199.630000  125868.130000  126011.180000   

             volume      rel_body    upper_wick    lower_wick          ebsw  \
count  73557.000000  73557.000000  7.355700e+04  7.355700e+04  73557.000000   
mean      69.822572      0.000003  3.205470e-04  3.281300e-04      0.003916   
std      107.873589      0.001335  5.132153e-04  5.312521e-04      0.830586   
min        0.909580     -0.073382  0.000000e+00  0.

In [4]:
from framework.analysis.technical_indicators import TechnicalIndicators

# The scalers are fitted on the training split only; the same fitted object is
# then used to normalize all three splits.
ti = TechnicalIndicators()
ti.fit_scalers(train_df)

# NOTE(review): the split names are rebound to their normalized versions, so
# this cell is not idempotent -- re-running it without first re-running the
# split cell would fit and normalize on already-normalized frames.
train_df, val_df, test_df = (
    ti.normalize_indicators(part) for part in (train_df, val_df, test_df)
)

# Summary statistics of the normalized splits, in train -> val -> test order.
for split_label, split_df in (
    ("Train Set:", train_df),
    ("Val Set:", val_df),
    ("Test Set:", test_df),
):
    print(split_label)
    print(split_df.describe())

Train Set:
           rel_body    upper_wick    lower_wick          ebsw          macd  \
count  73557.000000  73557.000000  73557.000000  73557.000000  73557.000000   
mean       0.000445      0.318476      0.315052      0.003916     -0.012484   
std        0.997205      0.861742      0.856190      0.830586      0.994285   
min       -3.000000     -0.365936     -0.368647     -1.000000     -3.000000   
25%       -0.500247     -0.365714     -0.368428     -0.907893     -0.493679   
50%        0.000000      0.000000      0.000000      0.015398      0.000000   
75%        0.499753      0.634286      0.631572      0.909926      0.506321   
max        3.000000      3.000000      3.000000      0.999997      3.000000   

          macd_hist           rsi           sqz        sqz_on           pvo  \
count  73557.000000  73557.000000  73557.000000  73557.000000  73557.000000   
mean       0.013863     -0.020916     -0.002743      0.125644      0.086938   
std        0.971097      0.259825      0