In [None]:
# Cell 0: Imports and Setup
import os
import warnings

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) log verbosity.
# It has to be in the environment BEFORE `import tensorflow` runs, otherwise
# the messages emitted while the library loads are not filtered.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error
from IPython.display import display

# Suppress Python-level warnings to keep the notebook output readable.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# numerical warnings; narrow it when debugging.
warnings.filterwarnings('ignore')

# Configure matplotlib for notebook environment
plt.style.use('default')

# Import our integrated utilities
from src import (
    BinanceDataOrganizer, DataConfig, GroupedScaler,
    create_lstm_model, evaluate_model
)

# Print library versions so a saved run records the environment it used.
print("✅ All imports successful!")
print(f"📊 TensorFlow version: {tf.__version__}")
print(f"📊 NumPy version: {np.__version__}")
print(f"📊 Pandas version: {pd.__version__}")


## 1. Configuration and Data Setup

Configure the data parameters and create the BinanceDataOrganizer instance.


In [None]:
# Cell 1: Configuration
# Every value a reader might want to tune lives in this cell.

# Data parameters
SYMBOL = "BTCUSDT"
TIMEFRAME = "5m"
START_DATE = "2021-01"    # yyyy-mm format
END_DATE = "2021-01"      # yyyy-mm format
SEQUENCE_LENGTH = 30      # Number of timesteps to look back
PREDICTION_LENGTH = 10    # Number of future timesteps to predict
TRAIN_SPLIT = 0.8         # 80% for training, 20% for testing

# Model parameters
LSTM_UNITS = 100
DROPOUT_RATE = 0.2
EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# Reproducibility: weight initialisation, dropout and batch shuffling are all
# random, so seed both NumPy and TensorFlow before any model is built.
# (Some GPU kernels are still non-deterministic; results may differ slightly.)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Bundle the data parameters into the project's DataConfig object, which the
# organizer created in the next cell consumes.
config = DataConfig(
    symbol=SYMBOL,
    timeframe=TIMEFRAME,
    start_time=START_DATE,
    end_time=END_DATE,
    sequence_length=SEQUENCE_LENGTH,
    prediction_length=PREDICTION_LENGTH,
    train_split=TRAIN_SPLIT
)

# Echo the values back from the config object (not from the constants) so the
# printout reflects what DataConfig actually stored.
print(f"🔧 Configuration created:")
print(f"   Symbol: {config.symbol}")
print(f"   Timeframe: {config.timeframe}")
print(f"   Period: {config.start_time} to {config.end_time}")
print(f"   Sequence Length: {config.sequence_length}")
print(f"   Prediction Length: {config.prediction_length}")
print(f"   Train Split: {config.train_split}")
print(f"\n📊 Input Structure: OHLCV only")
print(f"   Features: 5 (Open, High, Low, Close, Volume)")
print(f"   Targets: 5 (Open, High, Low, Close, Volume)")


In [None]:
# Cell 2: Create and Initialize Organizer
# Build the organizer from the configuration defined in the previous cell.
organizer = BinanceDataOrganizer(config)

print("🚀 Starting complete data processing pipeline...")

# process_all() reports failure through a falsy return value. Stop right away
# in that case so no later cell runs against a half-initialised organizer.
# RuntimeError (a subclass of Exception) is raised instead of a bare Exception
# so the failure type is specific while existing `except Exception` handlers
# keep working.
if not organizer.process_all():
    print("❌ Data processing failed!")
    raise RuntimeError("Failed to process data")

print("✅ Data processing completed successfully!")

# Get feature information (also reused by the final summary cell)
feature_info = organizer.get_feature_info()
print(f"\n📊 Feature Information:")
print(f"   Total features: {feature_info['num_features']}")
print(f"   Sequence length: {feature_info['sequence_length']}")
print(f"   Prediction length: {feature_info['prediction_length']}")
print(f"   Data shape: {feature_info['data_shape']}")
print(f"   Total sequences: {feature_info['total_sequences']}")
print(f"\n🔧 Feature columns: {feature_info['feature_columns']}")

# Display data summary
data_summary = organizer.get_data_summary()
print(f"\n📊 Data Summary:")
print(f"   Total rows: {data_summary['total_rows']}")
print(f"   Date range: {data_summary['date_range']}")
print(f"   Columns: {data_summary['columns']}")


## 2. Data Visualization and Analysis

Visualize the loaded data and analyze its characteristics.


In [None]:
# Cell 3: Data Visualization
print("📊 Visualizing loaded data...")

# Fetch the unscaled arrays; the result is indexed by split name below.
unscaled_data = organizer.get_unscaled_data('all')

# Shapes of the four arrays, reported in a fixed order.
print(f"\n📈 Data Statistics:")
for split_key in ('X_train', 'y_train', 'X_test', 'y_test'):
    print(f"   {split_key} shape: {unscaled_data[split_key].shape}")

# First training window, printed one feature column at a time.
print(f"\n📊 Sample Input Data (OHLCV):")
first_window = unscaled_data['X_train'][0]
print(f"   First sequence shape: {first_window.shape}")
for channel_idx, channel_name in enumerate(('Open', 'High', 'Low', 'Close', 'Volume')):
    print(f"   First sequence ({channel_name}): {first_window[:, channel_idx]}")

# First two training targets.
print(f"\n📊 Sample Target Data (OHLCV):")
train_targets = unscaled_data['y_train']
for ordinal, row_idx in (("First", 0), ("Second", 1)):
    print(f"   {ordinal} target: {train_targets[row_idx]}")

# Overall structure: train and test windows together give the total count.
print(f"\n📊 Data Structure:")
print(f"   Input features: 5 (Open, High, Low, Close, Volume)")
print(f"   Target values: 5 (Open, High, Low, Close, Volume)")
print(f"   Sequence length: {config.sequence_length} timesteps")
n_train_windows = len(unscaled_data['X_train'])
n_test_windows = len(unscaled_data['X_test'])
print(f"   Total sequences: {n_train_windows + n_test_windows}")


## 2.1. Simplified Data Structure Explanation

The data has been simplified to use only essential OHLCV data. The model uses OHLCV values as both input features and target predictions.


In [None]:
# Cell 3.1: Demonstrate Simplified Data Structure
print("📊 SIMPLIFIED DATA STRUCTURE DEMONSTRATION")
print("=" * 50)

# 1) Columns of the organizer's raw data frame.
print(f"\n1️⃣ Essential Data Columns:")
raw_column_names = list(organizer.raw_data.columns)
print(f"   {raw_column_names}")

# 2) + 3) Inputs and targets share the same four-line layout, so both
# sections are driven from one table of (header, key, meaning, footer).
section_specs = (
    ("2️⃣ Input Sequences (X)", 'X_train',
     "[samples, timesteps, features]",
     "Features: 5 (Open, High, Low, Close, Volume)"),
    ("3️⃣ Target Sequences (y)", 'y_train',
     "[samples, prediction_values]",
     "Values: 5 (Open, High, Low, Close, Volume)"),
)
for section_header, array_key, axes_meaning, footer in section_specs:
    print(f"\n{section_header}:")
    print(f"   Shape: {unscaled_data[array_key].shape}")
    print(f"   Meaning: {axes_meaning}")
    print(f"   {footer}")

# 4) One concrete input window together with its target.
print(f"\n4️⃣ Example Sequence:")
for role, array_key in (("Input", 'X_train'), ("Target", 'y_train')):
    print(f"   {role} (OHLCV): {unscaled_data[array_key][0]}")

# 5) The same window again, split by feature column.
print(f"\n5️⃣ Price Pattern Analysis:")
sample_sequence = unscaled_data['X_train'][0]
column_labels = ("Open prices", "High prices", "Low prices", "Close prices", "Volume")
for column_idx, column_label in enumerate(column_labels):
    print(f"   {column_label}: {sample_sequence[:, column_idx]}")

print(f"\n✅ This simplified structure uses OHLCV patterns to predict future OHLCV values!")


## 3. Model Training with Integrated Normalization

Train the LSTM model using the integrated normalization system.


In [None]:
# Cell 4: Get Scaled Data and Train Model
# Ask the organizer for the normalised arrays.
print("🔢 Getting scaled data with integrated normalization...")
scaled_data = organizer.get_scaled_data('all')

# Unpack the four arrays in one statement; key order matches the name order.
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = (
    scaled_data[array_key]
    for array_key in ('X_train_scaled', 'X_test_scaled',
                      'y_train_scaled', 'y_test_scaled')
)

# Quick sanity report: shapes and value ranges of the training arrays.
print(f"✅ Scaled data ready:")
print(f"   X_train_scaled: {X_train_scaled.shape}")
print(f"   y_train_scaled: {y_train_scaled.shape}")
x_low, x_high = X_train_scaled.min(), X_train_scaled.max()
print(f"   X_train range: {x_low:.4f} to {x_high:.4f}")
y_low, y_high = y_train_scaled.min(), y_train_scaled.max()
print(f"   y_train range: {y_low:.4f} to {y_high:.4f}")

# Keep the fitted scalers; later cells use scaler_y to map predictions back
# to original units.
scalers = organizer.get_scalers()
scaler_X = scalers['X']  # exposes get_feature_groups(); presumably a GroupedScaler
scaler_y = scalers['y']  # presumably a MinMaxScaler (per the original note) - TODO confirm

# Show which feature names belong to which normalisation group.
feature_groups = scaler_X.get_feature_groups()
print(f"\n📊 Feature Groups:")
for grp_label, grp_members in feature_groups.items():
    print(f"   {grp_label.capitalize()}: {grp_members}")

# Recap of the tensor layout that the model cell relies on.
print(f"\n📊 Simplified Input Structure:")
print(f"   Input shape: {X_train_scaled.shape}")
print(f"   Features per timestep: {X_train_scaled.shape[2]} (OHLCV)")
print(f"   Target shape: {y_train_scaled.shape}")
print(f"   Target values: 5 (Open, High, Low, Close, Volume)")
print(f"   Sequence length: {X_train_scaled.shape[1]} timesteps")


In [None]:
# Cell 5: Create and Train LSTM Model
# Echo the hyper-parameters before building so the run is self-describing.
print(f"🏗️ Creating LSTM model...")
print(f"   LSTM units: {LSTM_UNITS}")
print(f"   Dropout rate: {DROPOUT_RATE}")
print(f"   Epochs: {EPOCHS}")
print(f"   Batch size: {BATCH_SIZE}")
print(f"   Input shape: {X_train_scaled.shape[1:]} (timesteps, features)")
print(f"   Output shape: {y_train_scaled.shape[1]} (OHLCV values)")
print(f"   Prediction length: {PREDICTION_LENGTH}")

# The model consumes one window at a time: (timesteps, features).
window_shape = (X_train_scaled.shape[1], X_train_scaled.shape[2])
builder_kwargs = dict(
    lstm_units=LSTM_UNITS,
    dropout_rate=DROPOUT_RATE,
    learning_rate=LEARNING_RATE,
    prediction_length=PREDICTION_LENGTH,
)
model = create_lstm_model(input_shape=window_shape, **builder_kwargs)

# Report model size and layer layout.
n_parameters = model.count_params()
print(f"✅ Model created with {n_parameters:,} parameters")
print(f"\n📋 Model Architecture:")
model.summary()


In [None]:
# Cell 6: Train the Model
print(f"🚀 Starting model training...")

# Fraction of the TRAINING windows held out for validation. The test set must
# not be used here: EarlyStopping (with restore_best_weights) and
# ReduceLROnPlateau both tune training against the validation loss, so
# validating on the test data would leak it into model selection and make the
# metrics reported by the evaluation cell optimistically biased.
# Keras takes the validation samples from the END of the arrays before
# shuffling, which keeps the hold-out chronologically after the fitted part.
VALIDATION_SPLIT = 0.1

training_callbacks = [
    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True
    ),
    # Halve the learning rate after 5 stagnant epochs, down to 1e-7.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', patience=5, factor=0.5, min_lr=1e-7
    ),
]

# Train the model; `history` is consumed by the plotting and summary cells.
history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_split=VALIDATION_SPLIT,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    callbacks=training_callbacks
)

print(f"\n✅ Training completed!")


In [None]:
# Cell 7: Plot Training History
print("📈 Plotting training history...")

# One row, two panels: loss on the left, MAE on the right. Both panels follow
# the same recipe, so they are drawn from a small table of
# (axes, history key, display label).
history_fig, (loss_panel, mae_panel) = plt.subplots(1, 2, figsize=(12, 4))
panel_specs = (
    (loss_panel, 'loss', 'Loss'),
    (mae_panel, 'mae', 'MAE'),
)
for panel, metric_key, metric_label in panel_specs:
    training_curve = history.history[metric_key]
    validation_curve = history.history['val_' + metric_key]
    panel.plot(training_curve, label=f'Training {metric_label}')
    panel.plot(validation_curve, label=f'Validation {metric_label}')
    panel.set_title(f'Model {metric_label}')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(metric_label)
    panel.legend()

plt.tight_layout()
plt.show()


## 4. Model Evaluation and Advanced Features

Evaluate the trained model and demonstrate advanced features of the BinanceDataOrganizer.


In [None]:
# Cell 8: Model Evaluation
print(f"📊 Evaluating model performance...")

# Run the project's evaluation helper on the held-out test arrays.
evaluation_results = evaluate_model(
    model=model, X_test=X_test_scaled, y_test=y_test_scaled, scaler_y=scaler_y
)

# Headline metrics, driven by (label, result key, format spec, suffix).
print(f"\n📈 Evaluation Results:")
metric_rows = (
    ("Test Loss (MSE)", 'test_loss', ".6f", ""),
    ("Test MAE", 'test_mae', ".6f", ""),
    ("Test MAPE", 'test_mape', ".2f", "%"),
    ("RMSE", 'rmse', ".6f", ""),
)
for metric_label, result_key, number_format, suffix in metric_rows:
    print(f"   {metric_label}: {evaluation_results[result_key]:{number_format}}{suffix}")

# Keep predictions and ground truth for the plotting cells that follow.
y_pred = evaluation_results['predictions']
y_true = evaluation_results['y_true_original']

# Shape and range overview of both arrays.
print(f"\n📊 Prediction Statistics:")
print(f"   Predictions shape: {y_pred.shape}")
print(f"   True values shape: {y_true.shape}")
pred_low, pred_high = y_pred.min(), y_pred.max()
print(f"   Prediction range: ${pred_low:.2f} - ${pred_high:.2f}")
true_low, true_high = y_true.min(), y_true.max()
print(f"   True value range: ${true_low:.2f} - ${true_high:.2f}")


In [None]:
# Cell 9: Plot Predictions vs Actual
print("📈 Plotting predictions vs actual...")
target_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

# 2x3 grid flattened so each target gets one panel.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# zip() stops at the shorter of the two sequences, so a target is only drawn
# when a panel exists for it.
for column_idx, (panel, col) in enumerate(zip(axes, target_columns)):
    actual_values = y_true[:, column_idx]
    predicted_values = y_pred[:, column_idx]
    panel.scatter(actual_values, predicted_values, alpha=0.6)

    # Dashed red identity line spanning the actual-value range: points on it
    # are perfect predictions.
    diagonal = [actual_values.min(), actual_values.max()]
    panel.plot(diagonal, diagonal, 'r--', lw=2)

    panel.set_xlabel(f'Actual {col}')
    panel.set_ylabel(f'Predicted {col}')
    panel.set_title(f'{col} - Predictions vs Actual')
    panel.grid(True)

# Drop the last panel when the grid has more slots than targets.
if len(axes) > len(target_columns):
    fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()


In [None]:
# Cell 10: Plot Prediction Errors
print("📈 Plotting prediction errors...")

# Signed error: positive means the model over-predicted.
errors = y_pred - y_true

# Same 2x3 layout as the scatter plots; target_columns comes from that cell.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# zip() pairs each target with a panel and stops when either runs out.
for column_idx, (panel, col) in enumerate(zip(axes, target_columns)):
    column_errors = errors[:, column_idx]
    panel.hist(column_errors, bins=30, alpha=0.7, edgecolor='black')
    panel.set_xlabel(f'Error ({col})')
    panel.set_ylabel('Frequency')
    panel.set_title(f'{col} - Error Distribution')
    panel.grid(True)
    # Zero-error reference line.
    panel.axvline(0, color='red', linestyle='--', alpha=0.7)

# Drop the last panel when the grid has more slots than targets.
if len(axes) > len(target_columns):
    fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()


## 5. Advanced Features and On-Demand Processing

Demonstrate advanced features like on-demand data generation and future predictions.


In [None]:
# Cell 11: On-Demand Data Generation
print("🔄 Demonstrating on-demand data generation...")

# Window of interest: one calendar day inside the configured month.
start_time = "2021-01-15 00:00:00"
end_time = "2021-01-15 23:59:59"

print(f"📅 Requesting data for range: {start_time} to {end_time}")

range_data = organizer.get_data_in_range(
    start_time=start_time, end_time=end_time, scaled=True
)

# The organizer returns None when nothing is available for the range.
if range_data is None:
    print("⚠️ No data available for the specified range")
else:
    range_inputs = range_data['X_scaled']
    print(f"✅ Range data generated:")
    print(f"   Available keys: {list(range_data.keys())}")
    print(f"   X shape: {range_inputs.shape}")
    print(f"   y shape: {range_data['y_scaled'].shape}")

    # Run the trained model on the requested windows.
    range_predictions = model.predict(range_inputs, verbose=0)
    print(f"   Predictions shape: {range_predictions.shape}")

    # Show at most the first three predictions, numbered from 1.
    print(f"\n📊 Sample Predictions:")
    for sample_number, sample_prediction in enumerate(range_predictions[:3], start=1):
        print(f"   Sample {sample_number}: {sample_prediction}")


In [None]:
# Cell 12: Future Predictions
print("🔮 Making future predictions...")

# Use the last sequence from test data for prediction.
# Slicing with [-1:] keeps the leading batch axis that model.predict expects.
last_sequence = X_test_scaled[-1:]  # Shape: (1, sequence_length, features)
print(f"   Using last sequence shape: {last_sequence.shape}")

# Make prediction
future_prediction = model.predict(last_sequence, verbose=0)
print(f"   Prediction shape: {future_prediction.shape}")

# Inverse transform prediction to original scale
future_prediction_original = scaler_y.inverse_transform(future_prediction)
print(f"   Original scale prediction shape: {future_prediction_original.shape}")

target_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

# Volume is a traded quantity rather than a price, so only the four price
# columns carry a "$" prefix.
unit_prefix = {col: ('' if col == 'Volume' else '$') for col in target_columns}

# Display prediction results
print(f"\n📈 Future Prediction (Next Candle):")
for i, col in enumerate(target_columns):
    print(f"   {col}: {unit_prefix[col]}{future_prediction_original[0, i]:.2f}")

# Compare with last known values.
# NOTE(review): y_test_scaled[-1] is the target paired with the window used
# above, so the "changes" below are effectively prediction-vs-actual
# differences for that sample rather than a move from the preceding candle.
# Confirm this is the intended comparison.
last_known = scaler_y.inverse_transform(y_test_scaled[-1:])[0]
print(f"\n📊 Last Known Values:")
for i, col in enumerate(target_columns):
    print(f"   {col}: {unit_prefix[col]}{last_known[i]:.2f}")

# Calculate changes
print(f"\n📊 Predicted Changes:")
for i, col in enumerate(target_columns):
    change = future_prediction_original[0, i] - last_known[i]
    if last_known[i] == 0:
        # A zero reference (possible for Volume) has no meaningful relative
        # change; report it explicitly instead of printing inf/nan.
        print(f"   {col}: {change:+.2f} (n/a)")
    else:
        change_pct = (change / last_known[i]) * 100
        print(f"   {col}: {change:+.2f} ({change_pct:+.2f}%)")


## 6. Simplified Method Summary

Summary of the simplified OHLCV-only approach (Open, High, Low, Close, Volume as both inputs and targets).


In [None]:
# Cell 13: Simplified Method Summary
print("📊 SIMPLIFIED METHOD SUMMARY")
print("=" * 50)

# Shape of the problem as configured in this notebook.
print(f"\n✅ Current Implementation:")
print(f"   Input features: 5 (Open, High, Low, Close, Volume)")
print(f"   Target values: 5 (Open, High, Low, Close, Volume)")
print(f"   Sequence length: {config.sequence_length} timesteps")
print(f"   Data shape: X={X_train_scaled.shape}, y={y_train_scaled.shape}")

# Value ranges of the scaled training arrays.
print(f"\n📊 Feature Statistics:")
for array_label, scaled_array in (("X", X_train_scaled), ("y", y_train_scaled)):
    print(f"   {array_label} range: {scaled_array.min():.4f} to {scaled_array.max():.4f}")

# Talking points, emitted one bullet per entry.
print(f"\n🎯 Benefits of Simplified Approach:")
benefit_bullets = (
    "Reduced complexity - only essential OHLCV data",
    "Price-based patterns - OHLCV captures market dynamics",
    "Memory efficient - minimal feature engineering",
    "Fast training - fewer parameters to optimize",
    "Easy to understand - clear input/output relationship",
)
for bullet_text in benefit_bullets:
    print(f"   ✅ {bullet_text}")

# Metrics carried over from the evaluation cell:
# (label, result key, format spec, suffix).
print(f"\n📈 Model Performance:")
performance_rows = (
    ("Test Loss", 'test_loss', ".6f", ""),
    ("Test MAE", 'test_mae', ".6f", ""),
    ("Test MAPE", 'test_mape', ".2f", "%"),
    ("RMSE", 'rmse', ".6f", ""),
)
for metric_label, result_key, number_format, suffix in performance_rows:
    print(f"   {metric_label}: {evaluation_results[result_key]:{number_format}}{suffix}")


## 7. Final Summary and Results

Display comprehensive results and summary of the prediction system.


In [None]:
# Cell 14: Final Summary
# Recap of the whole run, built from objects created in earlier cells
# (config, feature_info, model, history, evaluation_results, feature_groups).
print("🎉 COMPREHENSIVE CRYPTOCURRENCY PREDICTION COMPLETED!")
print("=" * 70)

# EarlyStopping can end training before EPOCHS is reached, so report the
# number of epochs actually recorded in the training history alongside the
# configured maximum.
epochs_run = len(history.history['loss'])

print(f"\n📊 FINAL RESULTS:")
print(f"   Symbol: {config.symbol}")
print(f"   Timeframe: {config.timeframe}")
print(f"   Data period: {config.start_time} to {config.end_time}")
print(f"   Total sequences: {feature_info['total_sequences']}")
print(f"   Model parameters: {model.count_params():,}")
print(f"   Training epochs: {epochs_run} (max {EPOCHS})")
print(f"   Final test loss: {evaluation_results['test_loss']:.6f}")
print(f"   Final test MAE: {evaluation_results['test_mae']:.6f}")
print(f"   Final test MAPE: {evaluation_results['test_mape']:.2f}%")

# List every feature, grouped the same way the input scaler groups them.
print(f"\n🔧 FEATURES USED:")
for group_name, features in feature_groups.items():
    print(f"   {group_name.capitalize()}: {len(features)} features")
    for feature in features:
        print(f"     - {feature}")

# NOTE(review): no visible cell saves or reloads the model, so the
# "Model persistence" bullet below may overstate what was demonstrated.
print(f"\n💡 ADVANCED FEATURES DEMONSTRATED:")
print(f"   ✅ Integrated data management with BinanceDataOrganizer")
print(f"   ✅ On-demand data generation and processing")
print(f"   ✅ Grouped normalization for different feature types")
print(f"   ✅ Memory-efficient training and evaluation")
print(f"   ✅ Comprehensive visualization and analysis")
print(f"   ✅ Model persistence and configuration management")
print(f"   ✅ Advanced plotting and error analysis")

print(f"\n🎯 The comprehensive system provides a complete solution")
print(f"   for cryptocurrency prediction with advanced normalization,")
print(f"   memory-efficient data management, and comprehensive analysis!")

print(f"\n✅ All tasks completed successfully!")
