In [1]:
# Colab-only setup: mount the user's Google Drive at /content/drive so the
# raw CSV can be read from it and the preprocessing outputs written back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller

In [3]:
# Project folder on the mounted Drive. The trailing slash matters: every path
# in this notebook is built by appending a file name directly to `root`.
root = '/content/drive/MyDrive/Bitcoin/'
raw_csv_path = root + 'btc_1d_data_2018_to_2025.csv'
df = pd.read_csv(raw_csv_path)

# Quick sanity report on the raw frame: size, covered period, and per-column
# null counts. 'Open time' has not been parsed to datetimes yet at this point,
# so min()/max() operate on the values exactly as read from the CSV.
open_time_raw = df['Open time']
missing_per_column = df.isnull().sum()
print(f"Original shape: {df.shape}")
print(f"Date range: {open_time_raw.min()} to {open_time_raw.max()}")
print(f"Missing values:\n{missing_per_column}")

Original shape: (2881, 12)
Date range: 2018-01-01 00:00:00.000000 UTC to 2025-11-20 00:00:00.000000 UTC
Missing values:
Open time                       0
Open                            0
High                            0
Low                             0
Close                           0
Volume                          0
Close time                      0
Quote asset volume              0
Number of trades                0
Taker buy base asset volume     0
Taker buy quote asset volume    0
Ignore                          0
dtype: int64


In [4]:
# Parse the timestamp column, promote it to the index, and order the rows
# chronologically in one chained expression.
df = (
    df.assign(**{"Open time": pd.to_datetime(df["Open time"])})
      .set_index("Open time")
      .sort_index()
)

# If a timestamp occurs more than once, keep only its first row.
repeated_timestamp = df.index.duplicated(keep='first')
df = df[~repeated_timestamp]

print("Data loaded:", df.shape)

Data loaded: (2881, 11)


In [5]:
# Log return of each row relative to the previous row: ln(Close_t / Close_{t-1}).
# The first row has no predecessor, so its value is NaN.
close = df["Close"]
df["log_return"] = np.log(close.div(close.shift(1)))

# Augmented Dickey-Fuller test on the non-null returns; a p-value below 0.05
# is reported as "stationary".
adf_result = adfuller(df["log_return"].dropna())
adf_stat, adf_pvalue = adf_result[0], adf_result[1]
is_stationary = adf_pvalue < 0.05
print(f"ADF Statistic: {adf_stat:.4f}")
print(f"p-value: {adf_pvalue:.4f}")
print(f"Stationary: {'YES' if is_stationary else 'NO'}")

ADF Statistic: -25.8745
p-value: 0.0000
Stationary: YES


In [6]:
returns = df["log_return"]

# Rolling standard deviation of returns over 7/14/30 rows. Each series is
# shifted down one row so the value stored at a row is computed only from
# returns of earlier rows (no leakage of the current return).
for window in (7, 14, 30):
    df[f"vol_{window}"] = returns.rolling(window).std().shift(1)

# Exponentially weighted standard deviation (span 7), lagged the same way.
df["ewm_vol_7"] = returns.ewm(span=7).std().shift(1)

# Modelling target: the current return expressed in units of the lagged
# exponentially weighted volatility.
df["log_return_norm"] = returns / df["ewm_vol_7"]

In [7]:
# Smoothed returns: rolling mean over 3 and 5 rows, each lagged by one row so
# the current return is never part of its own feature.
for window in (3, 5):
    df[f"ma_return_{window}"] = df["log_return"].rolling(window).mean().shift(1)

In [8]:
# Raw returns from one, two and three rows back, stored as lag_1..lag_3.
past_returns = df["log_return"]
for lag in range(1, 4):
    df[f"lag_{lag}"] = past_returns.shift(lag)

In [9]:
# Short-term vs long-term volatility (both inputs are already lagged).
df["vol_ratio"] = df["vol_7"].div(df["vol_30"])

# Relative change in traded volume, lagged one row, plus the rolling
# 5-row standard deviation of that lagged series.
volume_pct_lagged = df["Volume"].pct_change().shift(1)
df["volume_change"] = volume_pct_lagged
df["volume_vol_5"] = volume_pct_lagged.rolling(5).std()

# Price-action descriptors of the previous row: intrabar range relative to
# the open, close/open ratio, and the 3-row percentage change of the close.
bar_range = df["High"].sub(df["Low"]).div(df["Open"])
df["high_low_range"] = bar_range.shift(1)
df["close_open_ratio"] = df["Close"].div(df["Open"]).shift(1)
df["momentum_3"] = df["Close"].pct_change(periods=3).shift(1)

In [10]:
# Model inputs, grouped by theme. The concatenation order below defines the
# column order of the scaled feature matrix built later in the notebook.
lagged_return_cols = ['lag_1', 'lag_2', 'lag_3']
volatility_cols = ['vol_7', 'vol_14', 'vol_30', 'vol_ratio', 'ewm_vol_7']
smoothed_return_cols = ['ma_return_3', 'ma_return_5']
volume_cols = ['volume_change', 'volume_vol_5']
price_action_cols = ['high_low_range', 'close_open_ratio', 'momentum_3']

base_features = (
    lagged_return_cols
    + volatility_cols
    + smoothed_return_cols
    + volume_cols
    + price_action_cols
)

# Drop every row that has a NaN in any column; the rolling/shifted features
# leave the earliest rows incomplete.
df = df.dropna()
print("Features created:", len(base_features))

Features created: 15


In [11]:
# Chronological split by row position, no shuffling: the first 80% of rows
# form the training set and the remaining rows the test set.
train_size = int(0.8 * len(df))
train_df, test_df = df.iloc[:train_size], df.iloc[train_size:]

In [12]:
# Fit the standardiser on training rows only, so test-period statistics never
# influence the scaling parameters.
scaler = StandardScaler().fit(train_df[base_features])

# Apply the fitted scaler to the full frame and keep the result as a
# DataFrame aligned on the same index, one "<feature>_scaled" column each.
scaled_cols = [name + "_scaled" for name in base_features]
df_scaled = pd.DataFrame(
    scaler.transform(df[base_features]),
    index=df.index,
    columns=scaled_cols,
)

# Add the scaled columns to the main frame next to the unscaled ones.
df[scaled_cols] = df_scaled

In [13]:
def create_sequences(data, target, seq=7):
    """Build sliding-window samples for a sequence model.

    Window ``i`` consists of rows ``data[i : i + seq]`` and is paired with
    ``target[i + seq]``, i.e. the target of the row immediately after the
    window.

    Parameters
    ----------
    data : array-like, shape (n_rows, ...)
        Per-row feature values.
    target : array-like, first dimension at least ``n_rows``
        Per-row target values, indexed the same way as ``data``.
    seq : int, default 7
        Number of consecutive rows in each window.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows - seq, seq, ...)
    y : numpy.ndarray, shape (n_rows - seq, ...)

    When ``n_rows <= seq`` no window fits; the arrays are then empty but keep
    their full dimensionality (``X.shape == (0, seq, ...)``) so that callers
    indexing the window or feature axis do not fail on short inputs.
    """
    data = np.asarray(data)
    target = np.asarray(target)

    n_windows = max(len(data) - seq, 0)
    if n_windows == 0:
        # np.array([]) would collapse to shape (0,); build the shape explicitly.
        empty_X = np.empty((0, seq) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    X = np.array([data[start : start + seq] for start in range(n_windows)])
    y = np.array([target[start + seq] for start in range(n_windows)])
    return X, y

# Window length (in rows) used for the LSTM inputs throughout the notebook.
seq_len = 7

In [14]:
# Window the scaled features over the whole frame; each window is paired with
# the volatility-normalised return of the row that follows it.
feature_matrix = df[scaled_cols].values
target_vector = df["log_return_norm"].values
X_all, y_all = create_sequences(feature_matrix, target_vector, seq=seq_len)

print("All sequences:", X_all.shape, y_all.shape)

All sequences: (2843, 7, 15) (2843,)


In [15]:
# Sequence k is paired with the target at row k + seq_len, so the first
# (train_size - seq_len) sequences are exactly those whose target row falls
# inside the training rows; everything after that belongs to the test set.
train_seq_size = train_size - seq_len

X_train_lstm, X_test_lstm = X_all[:train_seq_size], X_all[train_seq_size:]
y_train_lstm, y_test_lstm = y_all[:train_seq_size], y_all[train_seq_size:]

for label, X_part, y_part in (
    ("Train sequences:", X_train_lstm, y_train_lstm),
    ("Test sequences:", X_test_lstm, y_test_lstm),
):
    print(label, X_part.shape, y_part.shape)

Train sequences: (2273, 7, 15) (2273,)
Test sequences: (570, 7, 15) (570,)


In [16]:
# All artefacts go into <root>preprocessed/. Create the folder up front:
# neither DataFrame.to_csv nor np.save creates missing parent directories, so
# on a fresh Drive the first write would otherwise fail.
out_dir = f"{root}preprocessed/"
os.makedirs(out_dir, exist_ok=True)

# Slice `df` again instead of writing train_df/test_df: those frames were cut
# before the "*_scaled" columns were attached to `df`, so they lack the
# columns that feature_info['scaled_features'] refers to. Same row split,
# now with the scaled columns included.
df.iloc[:train_size].to_csv(f"{out_dir}btc_train.csv")
df.iloc[train_size:].to_csv(f"{out_dir}btc_test.csv")

# Windowed LSTM inputs/targets, one .npy file per array.
np.save(f"{out_dir}X_train_lstm.npy", X_train_lstm)
np.save(f"{out_dir}y_train_lstm.npy", y_train_lstm)
np.save(f"{out_dir}X_test_lstm.npy", X_test_lstm)
np.save(f"{out_dir}y_test_lstm.npy", y_test_lstm)

# Fitted scaler (train-only statistics), so the identical transform can be
# applied to new data later.
joblib.dump(scaler, f"{out_dir}scaler.pkl")

# Metadata a downstream notebook needs to interpret the saved files.
feature_info = {
    'base_features': base_features,
    'scaled_features': scaled_cols,
    'seq_length': seq_len,
    'target': 'log_return_norm'
}
joblib.dump(feature_info, f"{out_dir}feature_info.pkl")

print("\nPreprocessing complete!")
print("Saved: train/test CSVs, LSTM sequences, scaler, feature metadata.")


Preprocessing complete!
Saved: train/test CSVs, LSTM sequences, scaler, feature metadata.
