In [123]:
import sys
import os

# Make the directory that holds preprocessor.py importable.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
script_dir = r"C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles"
if script_dir not in sys.path:
    sys.path.append(script_dir)

# Project-local module, resolved through the sys.path entry added above.
# NOTE(review): Python caches imports, so edits to preprocessor.py are not
# picked up by re-running this cell without a kernel restart or a reload.
import preprocessor

# Source CSV and the destination for the processed copy.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
input_filepath = r"C:\Users\abrau\uvic\seng474\project\CryptoAI\Kraken_OHLCVT\XBTUSD_60.csv"
output_directory = r"C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles"
output_filepath = os.path.join(output_directory, "XBTUSD_60_with_features.csv")

# Hand both paths to the preprocessor; what it computes is defined in
# preprocessor.py, which is not visible from this notebook.
preprocessor.process_file(input_filepath, output_filepath)

Processed file saved to: C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles\XBTUSD_60_with_features.csv


In [126]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

# Train a stacked-LSTM regressor that predicts the next step's value of one
# column from the previous `sequence_length` rows of all columns.
#
# Names defined here and reused by later cells: scaler, sequence_length, model
# (train_df / train_data / X_train / y_train / history are also left in scope).

# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout and
# batch shuffling are repeatable across runs.
# NOTE(review): some GPU kernels are still non-deterministic unless op
# determinism is enabled separately.
tf.keras.utils.set_random_seed(42)

# Load the training data
train_filepath = r'C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles\XBTUSD_60_with_features_train.csv'

# Read, drop the timestamp column, turn +/-inf into NaN and drop incomplete
# rows. Built as one expression (no inplace=True) so re-running the cell
# always starts from the file on disk.
train_df = (
    pd.read_csv(train_filepath)
    .drop(columns=['Timestamp'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Every remaining column is used as a model input feature.
train_data = train_df.values

# Scale each column to [0, 1]; the fitted scaler is reused on the test data.
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data)

# Windowing parameters.
sequence_length = 50  # Use past 50 timesteps to predict next
# Positional index of the prediction target after 'Timestamp' is dropped.
# assumes this column is the closing price — TODO confirm against the CSV header
target_column = 3

n_sequences = len(train_data_scaled) - sequence_length
if n_sequences <= 0:
    # Fail early with a readable message instead of an obscure shape error
    # from model.fit on empty arrays.
    raise ValueError(
        f"Need more than {sequence_length} usable rows to build a training "
        f"sequence, got {len(train_data_scaled)}"
    )

# X_train[i] holds rows i .. i+sequence_length-1; y_train[i] is the target
# column of row i+sequence_length (the step right after the window).
X_train = np.array(
    [train_data_scaled[i:i + sequence_length] for i in range(n_sequences)]
)
y_train = np.array(train_data_scaled[sequence_length:, target_column])

# Define LSTM model.
# The input shape is declared with an explicit Input layer; passing
# input_shape= to the first LSTM layer is what triggered the Keras warning
# previously emitted by this cell.
model = Sequential([
    tf.keras.Input(shape=(sequence_length, train_data.shape[1])),
    LSTM(50, return_sequences=True),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile the model.
# 'mse' duplicates the loss but is kept on purpose: the evaluation cell
# unpacks three values (loss, mae, mse) from model.evaluate.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])

# Train the model. validation_split holds out the last 20% of the sequences
# (taken before shuffling), i.e. the most recent windows.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Print the final-epoch value of every tracked metric (in scaled units).
for key in history.history.keys():
    print(f"{key}: {history.history[key][-1]}")

  super().__init__(**kwargs)


Epoch 1/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 30ms/step - loss: 0.0011 - mae: 0.0184 - mse: 0.0011 - val_loss: 7.5452e-04 - val_mae: 0.0215 - val_mse: 7.5452e-04
Epoch 2/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 27ms/step - loss: 1.7862e-04 - mae: 0.0079 - mse: 1.7862e-04 - val_loss: 0.0011 - val_mae: 0.0292 - val_mse: 0.0011
Epoch 3/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 24ms/step - loss: 1.2929e-04 - mae: 0.0067 - mse: 1.2929e-04 - val_loss: 0.0044 - val_mae: 0.0592 - val_mse: 0.0044
Epoch 4/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 28ms/step - loss: 1.1209e-04 - mae: 0.0060 - mse: 1.1209e-04 - val_loss: 0.0031 - val_mae: 0.0488 - val_mse: 0.0031
Epoch 5/20
[1m1706/1706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 30ms/step - loss: 8.7264e-05 - mae: 0.0054 - mse: 8.7264e-05 - val_loss: 0.0016 - val_mae: 0.0348 - val_mse: 0.0016
Epoch 6/20
[1m1706/

In [127]:
# Evaluate the trained model on the held-out test file.
#
# Relies on names created by the training cell: scaler, sequence_length,
# model and train_df. Run that cell first.

# Load the test data
test_filepath = r'C:\Users\abrau\uvic\seng474\project\CryptoAI\AndrewFiles\XBTUSD_60_with_features_test.csv'

# Same cleaning as the training data: drop the timestamp column, turn +/-inf
# into NaN and drop incomplete rows. Built as one expression (no inplace=True)
# so re-running the cell always starts from the file on disk.
test_df = (
    pd.read_csv(test_filepath)
    .drop(columns=['Timestamp'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# The scaler and the model are positional: a reordered or renamed column
# would be scaled with another feature's min/max without any error.
if list(test_df.columns) != list(train_df.columns):
    raise ValueError(
        "Test columns do not match training columns: "
        f"{list(test_df.columns)} vs {list(train_df.columns)}"
    )

# Extract all features (excluding timestamp)
test_data = test_df.values  # Use all columns

# Normalize test data using the same scaler as training data.
# Values outside the training range map outside [0, 1]; that is expected.
test_data_scaled = scaler.transform(test_data)

n_test_sequences = len(test_data_scaled) - sequence_length
if n_test_sequences <= 0:
    # Fail early with a readable message instead of an obscure shape error
    # from model.evaluate on empty arrays.
    raise ValueError(
        f"Need more than {sequence_length} usable rows to build a test "
        f"sequence, got {len(test_data_scaled)}"
    )

# Create sequences for LSTM: X_test[i] holds rows i .. i+sequence_length-1,
# y_test[i] is column index 3 of the row right after that window (the same
# target column the training cell uses).
X_test = np.array(
    [test_data_scaled[i:i + sequence_length] for i in range(n_test_sequences)]
)
y_test = np.array(test_data_scaled[sequence_length:, 3])

# Evaluate the model on the test set (returns loss plus the two compiled
# metrics, all in scaled units).
test_loss, test_mae, test_mse = model.evaluate(X_test, y_test, verbose=1)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test MSE: {test_mse:.4f}")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Compare the first few predictions with the actual (scaled) values.
# Bounded by the number of predictions so a short test set cannot raise
# IndexError.
for i in range(min(10, len(y_pred))):
    print(f"Predicted: {y_pred[i][0]:.4f}, Actual: {y_test[i]:.4f}")

[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.0037 - mae: 0.0464 - mse: 0.0037

Test Loss: 0.0125
Test MAE: 0.0885
Test MSE: 0.0125
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step
Predicted: 0.2453, Actual: 0.2773
Predicted: 0.2455, Actual: 0.2776
Predicted: 0.2464, Actual: 0.2775
Predicted: 0.2462, Actual: 0.2782
Predicted: 0.2471, Actual: 0.2780
Predicted: 0.2468, Actual: 0.2777
Predicted: 0.2464, Actual: 0.2782
Predicted: 0.2471, Actual: 0.2779
Predicted: 0.2467, Actual: 0.2778
Predicted: 0.2463, Actual: 0.2777
