# SENG 474 Project - Initial Model

## Initial Model

In [15]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix


# Filepath
filepath = "Kraken_OHLCVT/XBTUSD_15.csv"

# Threshold for determining if a coin went up
threshold = 0

#############################################
# 1) Load CSV & Rename columns
#############################################
df = pd.read_csv(filepath)
df.columns = ["Timestamp", "Open", "High", "Low", "Close", "Volume", "Trades"]

#############################################
# 2) Define feature-engineering functions
#############################################
def add_datetime_features(df):
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="s")  # Convert to datetime

    df["Weekday"] = df["Timestamp"].dt.weekday  # 0=Mon, 6=Sun
    df["Day"] = df["Timestamp"].dt.day
    df["Year"] = df["Timestamp"].dt.year.astype(float)

    # Time of day
    df["TOD"] = df["Timestamp"].dt.hour + df["Timestamp"].dt.minute / 60.0

    # Cyclical encoding for Month
    month = df["Timestamp"].dt.month
    df["Month_Sin"] = np.sin(2 * np.pi * month / 12.0)
    df["Month_Cos"] = np.cos(2 * np.pi * month / 12.0)

    # Cyclical encoding for TOD (24 hours in a day)
    df["TOD_Sin"] = np.sin(2 * np.pi * df["TOD"] / 24.0)
    df["TOD_Cos"] = np.cos(2 * np.pi * df["TOD"] / 24.0)

    # Drop the original columns
    df.drop(columns=["Timestamp", "TOD"], inplace=True)
    return df

def compute_rsi(series, period=14):
    delta = series.diff().dropna()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / (loss + 1e-10)  # Avoid division by zero
    return 100 - (100 / (1 + rs))

def add_features(df, threshold):
    df["SMA_10"] = df["Close"].rolling(window=10).mean()
    df["SMA_50"] = df["Close"].rolling(window=50).mean()
    df["RSI_14"] = compute_rsi(df["Close"])
    df["Return"] = df["Close"].pct_change()

    # Bollinger Bands
    df["Middle_Band"] = df["Close"].rolling(window=20).mean()
    stddev = df["Close"].rolling(window=20).std()
    df["Upper_Band"] = df["Middle_Band"] + (2 * stddev)
    df["Lower_Band"] = df["Middle_Band"] - (2 * stddev)

    # Return_Signal in {-1, 0, 1}
    df["Return_Signal"] = df["Return"].apply(
        lambda x: 1 if x > threshold else (0 if x >= 0 else -1)
    )

    # Add time-based features (which won't leak future data if done carefully)
    df = add_datetime_features(df)
    return df

#############################################
# 3) Split into Train/Val/Test BEFORE fitting scaler
#############################################
# e.g., 70% train, 15% val, 15% test
df = df.reset_index(drop=True)
n = len(df)
train_end = int(0.7 * n)
val_end = int(0.85 * n)

train_df = df.iloc[:train_end].copy()
val_df = df.iloc[train_end:val_end].copy()
test_df = df.iloc[val_end:].copy()

#############################################
# 4) Add features to the full dataframe,
#    after split to void leakage
#############################################
train_df = add_features(train_df, threshold)
val_df = add_features(val_df, threshold)
test_df = add_features(test_df, threshold)

#############################################
# 5) Fill NaNs using means of respective sets
#############################################
train_means = train_df.mean(numeric_only=True)
train_df.fillna(train_means, inplace=True)
val_df.fillna(train_means, inplace=True)
test_df.fillna(train_means, inplace=True)


#############################################
# 6) Scale using only the TRAIN set
#############################################
scaler = MinMaxScaler(feature_range=(0, 1))

# Exclude the label "Return_Signal" from scaling
exclude_cols = ["Return_Signal", "Month_Sin", "Month_Cos", "TOD_Sin", "TOD_Cos"]
features_to_scale = [col for col in train_df.columns if col not in exclude_cols]

# Fit scaler on train
scaler.fit(train_df[features_to_scale])

# Transform train, val, test
train_df[features_to_scale] = scaler.transform(train_df[features_to_scale])
val_df[features_to_scale]  = scaler.transform(val_df[features_to_scale])
test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])

# Save scaler
joblib.dump(scaler, "scaler.pkl")

#############################################
# 7) Reorder columns so that Return_Signal is last
#############################################
def reorder_columns(df):
    cols = list(df.columns)
    cols.remove("Return_Signal")
    cols.append("Return_Signal")
    return df[cols]

train_df = reorder_columns(train_df)
val_df = reorder_columns(val_df)
test_df = reorder_columns(test_df)

#############################################
# 8) Convert to NumPy & Shift labels {-1,0,1} -> {0,1,2}
#############################################
def to_numpy_and_shift_labels(df):
    data = df.to_numpy()
    label_idx = df.columns.get_loc("Return_Signal")

    # Single pass: -1 -> 0, 0 -> 1, 1 -> 2
    data[:, label_idx] = np.where(
        data[:, label_idx] == -1, 0,
        np.where(
            data[:, label_idx] == 0, 1,
            2
        )
    )
    return data


train_data = to_numpy_and_shift_labels(train_df)
val_data = to_numpy_and_shift_labels(val_df)
test_data = to_numpy_and_shift_labels(test_df)

#############################################
# 9) Create sequences (X, y) from each split
#############################################
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        # X: the past seq_length rows, all columns except the label
        X.append(data[i : i + seq_length, :-1])
        # y: the label in the next row
        y.append(data[i + seq_length, -1])
    return np.array(X), np.array(y)

seq_length = 30
X_train, y_train = create_sequences(train_data, seq_length)
X_val,   y_val   = create_sequences(val_data,   seq_length)
X_test,  y_test  = create_sequences(test_data,  seq_length)

#############################################
# 10) Build LSTM Classification Model
#############################################
num_features = X_train.shape[2]

model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(seq_length, num_features)),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation="relu"),
    Dense(3, activation="softmax")  # 3 classes: 0=down, 1=neutral, 2=up
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # for integer labels
    metrics=["accuracy"]
)

#############################################
# 11) Early Stopping & Training
#############################################
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1
)

#############################################
# 11) Evaluate on Test Set
#############################################
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Classification report and confusion matrix
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred_classes))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))


  super().__init__(**kwargs)


Epoch 1/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 8ms/step - accuracy: 0.4877 - loss: 0.7809 - val_accuracy: 0.5019 - val_loss: 0.7235
Epoch 2/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 8ms/step - accuracy: 0.4918 - loss: 0.7764 - val_accuracy: 0.5019 - val_loss: 0.7251
Epoch 3/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 8ms/step - accuracy: 0.4941 - loss: 0.7752 - val_accuracy: 0.4920 - val_loss: 0.7239
Epoch 4/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 8ms/step - accuracy: 0.4954 - loss: 0.7739 - val_accuracy: 0.5019 - val_loss: 0.7238
Epoch 5/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 8ms/step - accuracy: 0.4913 - loss: 0.7738 - val_accuracy: 0.5019 - val_loss: 0.7261
Epoch 6/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 8ms/step - accuracy: 0.4933 - loss: 0.7728 - val_accuracy: 0.5019 - val_loss: 0.7239
[1m1489/1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

In [14]:
from collections import Counter

print("Train distribution:", Counter(y_train))
print("Validation distribution:", Counter(y_val))
print("Test distribution:", Counter(y_test))

Train distribution: Counter({2.0: 110151, 0.0: 106575, 1.0: 5679})
Validation distribution: Counter({2.0: 23907, 0.0: 23470, 1.0: 258})
Test distribution: Counter({2.0: 23769, 0.0: 22951, 1.0: 915})


We retry with class weights:

In [60]:
print("Classification Report:")
print(classification_report(y_test, y_pred_classes))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98     45837
         2.0       0.00      0.00      0.00      1798

    accuracy                           0.96     47635
   macro avg       0.48      0.50      0.49     47635
weighted avg       0.93      0.96      0.94     47635

Confusion Matrix:
[[45837     0]
 [ 1798     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

## Regression version

In [17]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Filepath
filepath = "Kraken_OHLCVT/XBTUSD_15.csv"

#############################################
# 1) Load CSV & Rename columns
#############################################
df = pd.read_csv(filepath)
df.columns = ["Timestamp", "Open", "High", "Low", "Close", "Volume", "Trades"]

#############################################
# 2) Define feature-engineering functions
#############################################
def add_datetime_features(df):
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="s")  # Convert to datetime

    df["Weekday"] = df["Timestamp"].dt.weekday  # 0=Mon, 6=Sun
    df["Day"] = df["Timestamp"].dt.day
    df["Year"] = df["Timestamp"].dt.year.astype(float)

    # Time of day in hours + fraction
    df["TOD"] = df["Timestamp"].dt.hour + df["Timestamp"].dt.minute / 60.0

    # Cyclical encoding for Month (12 months)
    month = df["Timestamp"].dt.month
    df["Month_Sin"] = np.sin(2 * np.pi * month / 12.0)
    df["Month_Cos"] = np.cos(2 * np.pi * month / 12.0)

    # Cyclical encoding for TOD (24 hours)
    df["TOD_Sin"] = np.sin(2 * np.pi * df["TOD"] / 24.0)
    df["TOD_Cos"] = np.cos(2 * np.pi * df["TOD"] / 24.0)

    # Drop the original Timestamp & TOD columns
    df.drop(columns=["Timestamp", "TOD"], inplace=True)
    return df

def compute_rsi(series, period=14):
    delta = series.diff().dropna()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / (loss + 1e-10)  # Avoid division by zero
    return 100 - (100 / (1 + rs))

def add_features(df):
    # Basic indicators
    df["SMA_10"] = df["Close"].rolling(window=10).mean()
    df["SMA_50"] = df["Close"].rolling(window=50).mean()
    df["RSI_14"] = compute_rsi(df["Close"])
    df["Return"] = df["Close"].pct_change()

    # Bollinger Bands
    df["Middle_Band"] = df["Close"].rolling(window=20).mean()
    stddev = df["Close"].rolling(window=20).std()
    df["Upper_Band"] = df["Middle_Band"] + (2 * stddev)
    df["Lower_Band"] = df["Middle_Band"] - (2 * stddev)

    # Add time-based features
    df = add_datetime_features(df)

    # For regression: predict the *next-step Close* price
    # Shift the 'Close' by -1 step to get the future price
    df["Target"] = df["Close"].shift(-1)

    return df

#############################################
# 3) Split into Train/Val/Test BEFORE feature engineering
#############################################
# We'll do a chronological split: 70% / 15% / 15%
df = df.reset_index(drop=True)
n = len(df)
train_end = int(0.7 * n)
val_end = int(0.85 * n)

train_df = df.iloc[:train_end].copy()
val_df = df.iloc[train_end:val_end].copy()
test_df = df.iloc[val_end:].copy()

#############################################
# 4) Add features
#############################################
train_df = add_features(train_df)
val_df = add_features(val_df)
test_df = add_features(test_df)

#############################################
# 5) Fill NaNs (rolling features & shift cause NaNs)
#############################################
train_means = train_df.mean(numeric_only=True)
train_df.fillna(train_means, inplace=True)
val_df.fillna(train_means, inplace=True)
test_df.fillna(train_means, inplace=True)

#############################################
# 6) Scale using only the TRAIN set
#############################################
scaler = MinMaxScaler(feature_range=(0, 1))

# Include 'Target' in your scaling:
exclude_cols = ["Month_Sin", "Month_Cos", "TOD_Sin", "TOD_Cos"]  # remove 'Target' from exclude
features_to_scale = [col for col in train_df.columns if col not in exclude_cols]

# Then fit/transform just like normal
scaler.fit(train_df[features_to_scale])
train_df[features_to_scale] = scaler.transform(train_df[features_to_scale])
val_df[features_to_scale]   = scaler.transform(val_df[features_to_scale])
test_df[features_to_scale]  = scaler.transform(test_df[features_to_scale])

# Now your 'Target' is in [0,1], so MSE will be in a more typical range.


# Transform train, val, test
train_df[features_to_scale] = scaler.transform(train_df[features_to_scale])
val_df[features_to_scale]  = scaler.transform(val_df[features_to_scale])
test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])

joblib.dump(scaler, "scaler.pkl")

#############################################
# 7) Reorder columns so 'Target' is last
#############################################
def reorder_columns(df):
    cols = list(df.columns)
    cols.remove("Target")
    cols.append("Target")
    return df[cols]

train_df = reorder_columns(train_df)
val_df = reorder_columns(val_df)
test_df = reorder_columns(test_df)

#############################################
# 8) Convert to NumPy
#############################################
def to_numpy_array(df):
    return df.to_numpy()

train_data = to_numpy_array(train_df)
val_data   = to_numpy_array(val_df)
test_data  = to_numpy_array(test_df)

#############################################
# 9) Create sequences (X, y) for regression
# X is the past seq_length rows; y is the next-step 'Target'
#############################################
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        # X: the past seq_length rows, all columns except the last ('Target')
        X.append(data[i : i + seq_length, :-1])
        # y: the 'Target' in the next row
        y.append(data[i + seq_length, -1])
    return np.array(X), np.array(y)

seq_length = 30
X_train, y_train = create_sequences(train_data, seq_length)
X_val,   y_val   = create_sequences(val_data,   seq_length)
X_test,  y_test  = create_sequences(test_data,  seq_length)

#############################################
# 10) Build LSTM Regression Model
#############################################
num_features = X_train.shape[2]

model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(seq_length, num_features)),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation="relu"),
    Dense(1)  # single value output for regression
])

model.compile(
    optimizer="adam",
    loss="mse",       # MSE loss for regression
    metrics=["mae"]   # Track MAE as well
)

#############################################
# 11) Early Stopping & Training
#############################################
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1
)

#############################################
# 12) Evaluate on Test Set (Regression)
#############################################
mse, mae = model.evaluate(X_test, y_test, verbose=1)
print("Test MSE:", mse)
print("Test MAE:", mae)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute metrics manually
mse_manual = mean_squared_error(y_test, y_pred)
mae_manual = mean_absolute_error(y_test, y_pred)

print("Manual MSE:", mse_manual)
print("Manual MAE:", mae_manual)


  super().__init__(**kwargs)


Epoch 1/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 8ms/step - loss: 0.0013 - mae: 0.0077 - val_loss: 1.6653e-08 - val_mae: 1.2903e-04
Epoch 2/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 8ms/step - loss: 8.0495e-08 - mae: 8.0409e-05 - val_loss: 5.3759e-10 - val_mae: 2.3112e-05
Epoch 3/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 8ms/step - loss: 7.6138e-08 - mae: 6.7580e-05 - val_loss: 3.4308e-09 - val_mae: 5.8543e-05
Epoch 4/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 8ms/step - loss: 9.3232e-09 - mae: 6.6717e-05 - val_loss: 6.6360e-11 - val_mae: 7.9322e-06
Epoch 5/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 8ms/step - loss: 6.0842e-09 - mae: 5.5658e-05 - val_loss: 9.1056e-10 - val_mae: 3.0118e-05
Epoch 6/50
[1m6951/6951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 8ms/step - loss: 4.6130e-09 - mae: 4.3524e-05 - val_loss: 4.6326e-10 - va