In [4]:
import sys
import os

# Add the directory containing preprocessor.py to the Python path
script_dir = r"C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles"
sys.path.append(script_dir)

# Import the preprocessor module
import preprocessor

# Define file paths
input_filepath = r"C:\Users\abrau\uvic\seng474\project\CryptoAI\Kraken_OHLCVT\XBTUSD_60.csv"
output_directory = r"C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles"
output_filepath = os.path.join(output_directory, "XBTUSD_60_with_features.csv")

# Process the file using the preprocessor module
preprocessor.process_file(input_filepath, output_filepath)

Processed file saved to: C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles\XBTUSD_60_with_features.csv


In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

# Load the training data
train_filepath = r'C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles\XBTUSD_60_with_features_train.csv'
train_df = pd.read_csv(train_filepath)

# Drop the timestamp column
train_df = train_df.drop(columns=['Timestamp'])

# Replace infinite values with NaN
train_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values
train_df.dropna(inplace=True)

# Extract all features (excluding timestamp)
train_data = train_df.values  # Use all columns

# Normalize data
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data)

# Create sequences for LSTM
sequence_length = 50  # Use past 50 timesteps to predict next
X_train, y_train = [], []
for i in range(len(train_data_scaled) - sequence_length):
    X_train.append(train_data_scaled[i:i+sequence_length])
    y_train.append(train_data_scaled[i+sequence_length, 3])  # Predict closing price (column index 3)

X_train, y_train = np.array(X_train), np.array(y_train)

# Define LSTM model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(sequence_length, train_data.shape[1])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Print accuracy metrics
for key in history.history.keys():
    print(f"{key}: {history.history[key][-1]}")

  super().__init__(**kwargs)


Epoch 1/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 73ms/step - loss: 8.7435e-04 - mae: 0.0166 - mse: 8.7435e-04 - val_loss: 0.0014 - val_mae: 0.0312 - val_mse: 0.0014
Epoch 2/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 68ms/step - loss: 1.9173e-04 - mae: 0.0077 - mse: 1.9173e-04 - val_loss: 0.0026 - val_mae: 0.0462 - val_mse: 0.0026
Epoch 3/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 53ms/step - loss: 1.3583e-04 - mae: 0.0065 - mse: 1.3583e-04 - val_loss: 0.0030 - val_mae: 0.0472 - val_mse: 0.0030
Epoch 4/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 54ms/step - loss: 1.1537e-04 - mae: 0.0058 - mse: 1.1537e-04 - val_loss: 0.0024 - val_mae: 0.0434 - val_mse: 0.0024
Epoch 5/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 67ms/step - loss: 9.5595e-05 - mae: 0.0054 - mse: 9.5595e-05 - val_loss: 0.0027 - val_mae: 0.0484 - val_mse: 0.0027
Epoch 6/20
[1m17

In [7]:
# Load the test data
test_filepath = r'C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles\XBTUSD_60_with_features_test.csv'
test_df = pd.read_csv(test_filepath)

# Drop the timestamp column
test_df = test_df.drop(columns=['Timestamp'])

# Replace infinite values with NaN
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values
test_df.dropna(inplace=True)

# Extract all features (excluding timestamp)
test_data = test_df.values  # Use all columns

# Normalize test data using the same scaler as training data
test_data_scaled = scaler.transform(test_data)

# Create sequences for LSTM
X_test, y_test = [], []
for i in range(len(test_data_scaled) - sequence_length):
    X_test.append(test_data_scaled[i:i+sequence_length])
    y_test.append(test_data_scaled[i+sequence_length, 3])  # Predict closing price (column index 3)

X_test, y_test = np.array(X_test), np.array(y_test)

# Evaluate the model on the test set
test_loss, test_mae, test_mse = model.evaluate(X_test, y_test, verbose=1)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test MSE: {test_mse:.4f}")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Optionally, you can compare the predicted vs actual values
for i in range(10):  # Print the first 10 predictions
    print(f"Predicted: {y_pred[i][0]:.4f}, Actual: {y_test[i]:.4f}")

[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.0045 - mae: 0.0495 - mse: 0.0045

Test Loss: 0.0160
Test MAE: 0.0990
Test MSE: 0.0160
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step
Predicted: 0.2460, Actual: 0.2773
Predicted: 0.2459, Actual: 0.2776
Predicted: 0.2469, Actual: 0.2775
Predicted: 0.2464, Actual: 0.2782
Predicted: 0.2474, Actual: 0.2780
Predicted: 0.2469, Actual: 0.2777
Predicted: 0.2463, Actual: 0.2782
Predicted: 0.2473, Actual: 0.2779
Predicted: 0.2466, Actual: 0.2778
Predicted: 0.2468, Actual: 0.2777


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the test data
test_filepath = r'C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles\XBTUSD_60_with_features_test.csv'
test_df = pd.read_csv(test_filepath)

# Drop the timestamp column
test_df = test_df.drop(columns=['Timestamp'])

# Replace infinite values with NaN
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values
test_df.dropna(inplace=True)

# Extract all features (excluding timestamp)
test_data = test_df.values  # Use all columns

# Normalize test data using the same scaler as training data
test_data_scaled = scaler.transform(test_data)

# Create sequences for LSTM
X_test, y_test = [], []
for i in range(len(test_data_scaled) - sequence_length):
    X_test.append(test_data_scaled[i:i+sequence_length])
    y_test.append(test_data_scaled[i+sequence_length, 3])  # Predict closing price (column index 3)

X_test, y_test = np.array(X_test), np.array(y_test)

# Evaluate the model on the test set
test_loss, test_mae, test_mse = model.evaluate(X_test, y_test, verbose=1)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test MSE: {test_mse:.4f}")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Reshape y_test and y_pred for inverse_transform
# y_test and y_pred are 1D arrays, so we need to reshape them to 2D arrays
y_test_reshaped = y_test.reshape(-1, 1)
y_pred_reshaped = y_pred.reshape(-1, 1)

# Inverse transform the scaled values to original units
y_test_original = scaler.inverse_transform(y_test_reshaped)
y_pred_original = scaler.inverse_transform(y_pred_reshaped)

# Recalculate MAE in original units
mae_original = np.mean(np.abs(y_test_original - y_pred_original))
print(f"\nMAE in original units: {mae_original:.4f}")

# Optionally, you can compare the predicted vs actual values in original units
for i in range(10):  # Print the first 10 predictions
    print(f"Predicted: {y_pred_original[i][0]:.4f}, Actual: {y_test_original[i][0]:.4f}")

[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.0045 - mae: 0.0495 - mse: 0.0045

Test Loss: 0.0160
Test MAE: 0.0990
Test MSE: 0.0160
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


ValueError: non-broadcastable output operand with shape (17035,1) doesn't match the broadcast shape (17035,19)