In [None]:
# Cell 0: Imports and Setup
import os
import warnings

# TF_CPP_MIN_LOG_LEVEL has to be exported BEFORE TensorFlow is imported: the
# native library emits its start-up log messages while it is being loaded, so
# setting the variable after `import tensorflow` does not silence them.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error
from IPython.display import display

# Suppress warnings.
# NOTE(review): this silences every Python warning for the whole kernel
# session (including deprecation and numerical warnings); consider narrowing
# it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Configure matplotlib for notebook environment
plt.style.use('default')

# Import our integrated utilities
from src import (
    BinanceDataOrganizer, DataConfig, GroupedScaler,
    create_lstm_model, evaluate_model,
    get_memory_usage, print_memory_stats,
    plot_training_history, plot_predictions_vs_actual,
    plot_price_data, plot_prediction_errors, plot_feature_analysis,
    # Legacy utilities for comparison
    download_crypto_data, scale_data, scale_time_series_data_grouped,
    predict_with_grouped_scaler
)

print("✅ All imports successful!")
print(f"📊 Memory usage: {get_memory_usage():.1f} MB")


## 1. Configuration and Data Setup

Configure the data parameters and create the BinanceDataOrganizer instance.


In [None]:
# Cell 1: Configuration
# Every value a reader might want to tune lives in this cell.

# Reproducibility: seed the NumPy and TensorFlow generators before any data
# is processed or any model weights are initialised, so that
# Restart Kernel -> Run All yields comparable results between runs.
# NOTE(review): some GPU kernels are non-deterministic, so results may still
# differ slightly on GPU even with fixed seeds.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Data parameters
SYMBOL = "BTCUSDT"
TIMEFRAME = "5m"
START_DATE = "2021-01-01"
END_DATE = "2021-01-31"  # Full month for better training
SEQUENCE_LENGTH = 20  # Number of timesteps to look back
PREDICTION_LENGTH = 1  # Number of future timesteps to predict
MAX_ROWS = 10000  # Maximum rows to load for memory efficiency
TRAIN_SPLIT = 0.8  # 80% for training, 20% for testing

# Model parameters
LSTM_UNITS = 100
DROPOUT_RATE = 0.2
EPOCHS = 10
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# Bundle the data parameters into the DataConfig consumed by BinanceDataOrganizer
config = DataConfig(
    symbol=SYMBOL,
    timeframe=TIMEFRAME,
    start_time=START_DATE,
    end_time=END_DATE,
    sequence_length=SEQUENCE_LENGTH,
    prediction_length=PREDICTION_LENGTH,
    max_rows=MAX_ROWS,
    train_split=TRAIN_SPLIT
)

# Echo the values back from the config object so the reader sees what the
# organizer will actually use.
print("🔧 Configuration created:")
print(f"   Symbol: {config.symbol}")
print(f"   Timeframe: {config.timeframe}")
print(f"   Period: {config.start_time} to {config.end_time}")
print(f"   Sequence Length: {config.sequence_length}")
print(f"   Prediction Length: {config.prediction_length}")
print(f"   Max Rows: {config.max_rows}")
print(f"   Train Split: {config.train_split}")
print(f"   Random Seed: {SEED}")


In [None]:
# Cell 2: Create and Initialize Organizer
# Build the organizer from the configuration defined in the previous cell.
organizer = BinanceDataOrganizer(config)

print("🚀 Starting complete data processing pipeline...")

# process_all() (load + create features) reports failure through a falsy
# return value. Abort immediately with a specific exception type so later
# cells never run against missing data.
if not organizer.process_all():
    print("❌ Data processing failed!")
    raise RuntimeError("Failed to process data")

print("✅ Data processing completed successfully!")

# Summarise what the organizer produced; `feature_info` is reused by later cells.
feature_info = organizer.get_feature_info()
print("\n📊 Feature Information:")
print(f"   Total features: {feature_info['num_features']}")
print(f"   Sequence length: {feature_info['sequence_length']}")
print(f"   Prediction length: {feature_info['prediction_length']}")
print(f"   Data shape: {feature_info['data_shape']}")
print(f"   Total sequences: {feature_info['total_sequences']}")
print(f"\n🔧 Feature columns: {feature_info['feature_columns']}")


## 2. Data Visualization and Analysis

Visualize the loaded data and analyze its characteristics.


In [None]:
# Cell 3: Data Visualization
print("📊 Visualizing loaded data...")

# Fetch the raw (unscaled) data so prices are plotted in their original units.
unscaled_data = organizer.get_unscaled_data('all')

if 'data' not in unscaled_data:
    print("⚠️ Original data not available for plotting")
else:
    original_data = unscaled_data['data']
    chart_title = f"{config.symbol} Price Data ({config.start_time} to {config.end_time})"
    plot_price_data(original_data, chart_title)

    # Headline statistics of the loaded candles
    print("\n📈 Data Statistics:")
    print(f"   Total rows: {len(original_data):,}")

    time_index = original_data.index
    print(f"   Date range: {time_index[0]} to {time_index[-1]}")

    close_series = original_data['Close']
    print(f"   Price range: ${close_series.min():.2f} - ${close_series.max():.2f}")

    volume_series = original_data['Volume']
    print(f"   Volume range: {volume_series.min():.2f} - {volume_series.max():.2f}")

print("📊 Feature analysis will be shown after data scaling...")


## 3. Model Training with Integrated Normalization

Train the LSTM model using the integrated normalization system.


In [None]:
# Cell 4: Get Scaled Data and Train Model
# Requesting the scaled data fits the scalers if they have not been fitted yet.
print("🔢 Getting scaled data with integrated normalization...")
scaled_data = organizer.get_scaled_data('all')

# Unpack the four splits in one go (train/test inputs, then train/test targets).
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = (
    scaled_data[split_key]
    for split_key in ('X_train_scaled', 'X_test_scaled', 'y_train_scaled', 'y_test_scaled')
)

print("✅ Scaled data ready:")
for split_label, split_array in (('X_train_scaled', X_train_scaled),
                                 ('y_train_scaled', y_train_scaled)):
    print(f"   {split_label}: {split_array.shape}")
for split_label, split_array in (('X_train', X_train_scaled),
                                 ('y_train', y_train_scaled)):
    print(f"   {split_label} range: {split_array.min():.4f} to {split_array.max():.4f}")

# Keep the fitted scalers around; later cells need scaler_y to map
# predictions back to the original scale.
scalers = organizer.get_scalers()
scaler_X, scaler_y = scalers['X'], scalers['y']  # GroupedScaler, MinMaxScaler

# Show which features were scaled together
feature_groups = scaler_X.get_feature_groups()
print("\n📊 Feature Groups:")
for group_label, group_members in feature_groups.items():
    print(f"   {group_label.capitalize()}: {group_members}")

# Hand the column list and grouping to the feature-analysis plot
print("\n📊 Feature Analysis:")
feature_info_with_groups = dict(
    feature_columns=feature_info['feature_columns'],
    feature_groups=feature_groups,
)
plot_feature_analysis(feature_info_with_groups)


In [None]:
# Cell 5: Create and Train LSTM Model
print("🏗️ Creating LSTM model...")
for setting_label, setting_value in (
    ("LSTM units", LSTM_UNITS),
    ("Dropout rate", DROPOUT_RATE),
    ("Epochs", EPOCHS),
    ("Batch size", BATCH_SIZE),
):
    print(f"   {setting_label}: {setting_value}")

# The network input shape is the two trailing axes of the training tensor
# (axis 0 is the sample axis).
window_shape = tuple(X_train_scaled.shape[1:3])
model = create_lstm_model(
    input_shape=window_shape,
    lstm_units=LSTM_UNITS,
    dropout_rate=DROPOUT_RATE
)

# Adam optimizer, MSE loss, with MAE and MAPE tracked as additional metrics
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=adam_optimizer, loss='mse', metrics=['mae', 'mape'])

print(f"✅ Model created with {model.count_params():,} parameters")
print("\n📋 Model Architecture:")
model.summary()


In [None]:
# Cell 6: Train the Model
print("🚀 Starting model training...")
print(f"   Memory usage before training: {get_memory_usage():.1f} MB")

# Hold out part of the TRAINING data for validation instead of validating on
# the test set. EarlyStopping (with restore_best_weights) and
# ReduceLROnPlateau both make decisions from the validation loss, so
# validating on the test set would leak it into model selection and make the
# later test metrics optimistic.
# Keras takes the validation samples from the END of the arrays, before
# shuffling. Assumes X_train_scaled is in chronological order, so this is the
# most recent slice of the training period — TODO confirm against the organizer.
VALIDATION_SPLIT = 0.1

# A patience equal to (or larger than) the epoch budget can never trigger.
# Keep the original ceilings (10 and 5) for long runs, but scale the patience
# down when EPOCHS is small so the callbacks can actually fire.
early_stopping_patience = min(10, max(1, EPOCHS // 2))
reduce_lr_patience = min(5, max(1, EPOCHS // 4))

# Train the model; `history` is reused by the plotting and summary cells.
history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_split=VALIDATION_SPLIT,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            patience=early_stopping_patience, restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            patience=reduce_lr_patience, factor=0.5, min_lr=1e-7
        )
    ]
)

print("\n✅ Training completed!")
print(f"   Memory usage after training: {get_memory_usage():.1f} MB")


In [None]:
# Cell 7: Plot Training History
# Visualize the training run using the `history` object returned by
# model.fit() in the training cell. Presumably this draws per-epoch
# loss/metric curves — see plot_training_history in `src` for the exact panels.
print("📈 Plotting training history...")
plot_training_history(history)


## 4. Model Evaluation

Evaluate the trained model on the test set and visualize its predictions and prediction errors.


In [None]:
# Cell 8: Model Evaluation
print("📊 Evaluating model performance...")

# Score the model on the test split; scaler_y is passed so the helper can
# report predictions in the original scale.
evaluation_results = evaluate_model(
    model=model,
    X_test=X_test_scaled,
    y_test=y_test_scaled,
    scaler_y=scaler_y
)

# (label, result key, format spec, suffix) for each headline metric
metric_rows = (
    ("Test Loss (MSE)", 'test_loss', ".6f", ""),
    ("Test MAE", 'test_mae', ".6f", ""),
    ("Test MAPE", 'test_mape', ".2f", "%"),
    ("RMSE", 'rmse', ".6f", ""),
)
print("\n📈 Evaluation Results:")
for metric_label, metric_key, metric_spec, metric_suffix in metric_rows:
    print(f"   {metric_label}: {evaluation_results[metric_key]:{metric_spec}}{metric_suffix}")

# Keep the prediction / ground-truth arrays handy for the plotting cells
y_pred, y_true = evaluation_results['predictions'], evaluation_results['y_true_original']

print("\n📊 Prediction Statistics:")
print(f"   Predictions shape: {y_pred.shape}")
print(f"   True values shape: {y_true.shape}")
print(f"   Prediction range: ${y_pred.min():.2f} - ${y_pred.max():.2f}")
print(f"   True value range: ${y_true.min():.2f} - ${y_true.max():.2f}")


In [None]:
# Cell 9: Plot Predictions vs Actual
print("📈 Plotting predictions vs actual...")

# Names of the predicted columns, in the order used for plotting
target_columns = "Open High Low Close Volume".split()

predicted_values = evaluation_results['predictions']
actual_values = evaluation_results['y_true_original']
plot_predictions_vs_actual(predicted_values, actual_values, target_columns)


In [None]:
# Cell 10: Plot Prediction Errors
print("📈 Plotting prediction errors...")

# Same inputs as the predictions-vs-actual plot: predictions first, ground
# truth second, then the column names defined in the previous cell.
error_plot_inputs = [
    evaluation_results[result_key]
    for result_key in ('predictions', 'y_true_original')
]
plot_prediction_errors(*error_plot_inputs, target_columns)


## 5. Advanced Features and On-Demand Processing

Demonstrate advanced features like on-demand data generation and future predictions.


In [None]:
# Cell 11: On-Demand Data Generation
print("🔄 Demonstrating on-demand data generation...")

# One full day inside the configured data period
start_time, end_time = "2021-01-15 00:00:00", "2021-01-15 23:59:59"
print(f"📅 Requesting data for range: {start_time} to {end_time}")

# Ask the organizer for already-scaled sequences restricted to that window
range_data = organizer.get_data_in_range(start_time=start_time, end_time=end_time, scaled=True)

if range_data is None:
    print("⚠️ No data available for the specified range")
else:
    print("✅ Range data generated:")
    print(f"   Available keys: {[*range_data.keys()]}")
    print(f"   X shape: {range_data['X_scaled'].shape}")
    print(f"   y shape: {range_data['y_scaled'].shape}")

    # Run the trained model on just this slice
    range_predictions = model.predict(range_data['X_scaled'], verbose=0)
    print(f"   Predictions shape: {range_predictions.shape}")


In [None]:
# Cell 12: Future Predictions
print("🔮 Making future predictions...")

target_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
# Volume is a quantity, not a price, so it must not be printed with a "$".
currency_prefix = {col: ("" if col == 'Volume' else "$") for col in target_columns}

# Use the last sequence from test data for prediction
last_sequence = X_test_scaled[-1:]  # Shape: (1, sequence_length, features)
print(f"   Using last sequence shape: {last_sequence.shape}")

# Make prediction
future_prediction = model.predict(last_sequence, verbose=0)
print(f"   Prediction shape: {future_prediction.shape}")

# Inverse transform prediction to original scale
future_prediction_original = scaler_y.inverse_transform(future_prediction)
print(f"   Original scale prediction shape: {future_prediction_original.shape}")

# Display prediction results
print("\n📈 Future Prediction (Next Candle):")
for i, col in enumerate(target_columns):
    print(f"   {col}: {currency_prefix[col]}{future_prediction_original[0, i]:.2f}")

# Baseline for the "change" figures.
# X_test_scaled[k] is paired with y_test_scaled[k] (that is how the model is
# evaluated), so y_test_scaled[-1] is the ground truth of the candle being
# predicted here, not a previously known value. Comparing against it would
# report the prediction error rather than a predicted change. The most recent
# candle known at prediction time is the target of the preceding window.
# Assumes consecutive test windows are one candle apart — TODO confirm
# against BinanceDataOrganizer.
if len(y_test_scaled) < 2:
    print("\n⚠️ Need at least two test samples to compare with the last known values")
else:
    last_known = scaler_y.inverse_transform(y_test_scaled[-2:-1])[0]
    print("\n📊 Last Known Values:")
    for i, col in enumerate(target_columns):
        print(f"   {col}: {currency_prefix[col]}{last_known[i]:.2f}")

    # Calculate changes relative to the last known candle
    print("\n📊 Predicted Changes:")
    for i, col in enumerate(target_columns):
        change = future_prediction_original[0, i] - last_known[i]
        if last_known[i] == 0:
            # A zero baseline (e.g. a candle without volume) has no meaningful percentage
            print(f"   {col}: {change:+.2f} (n/a)")
        else:
            change_pct = (change / last_known[i]) * 100
            print(f"   {col}: {change:+.2f} ({change_pct:+.2f}%)")


## 6. Legacy Method Comparison

Compare the results with the legacy method for validation.


In [None]:
# Cell 13: Legacy Method Comparison
print("🔄 Comparing with legacy method...")

# Download data using legacy method.
# NOTE(review): the dates are passed as "YYYY MM DD" (dashes replaced by
# spaces) — presumably the format download_crypto_data expects; confirm.
legacy_data = download_crypto_data(
    symbol=SYMBOL,
    interval=TIMEFRAME,
    data_from=START_DATE.replace('-', ' '),
    data_to=END_DATE.replace('-', ' ')
)

if legacy_data is not None:
    print(f"✅ Legacy data downloaded: {legacy_data.shape}")

    # Work on a copy so the downloaded frame stays untouched
    legacy_features = legacy_data.copy()

    # Convert index to datetime if it's not already
    if not isinstance(legacy_features.index, pd.DatetimeIndex):
        legacy_features.index = pd.to_datetime(legacy_features.index)

    # Derived features computed from the candle columns and the timestamp
    legacy_features['Minutes_of_day'] = legacy_features.index.hour * 60 + legacy_features.index.minute
    legacy_features['Price_Range'] = legacy_features['High'] - legacy_features['Low']
    legacy_features['Price_Change'] = legacy_features['Close'] - legacy_features['Open']
    legacy_features['Price_Change_Pct'] = (legacy_features['Price_Change'] / legacy_features['Open']) * 100

    feature_columns = ['Quote asset volume', 'Number of trades', 'Taker buy base asset volume',
                      'Taker buy quote asset volume', 'Minutes_of_day', 'Price_Range',
                      'Price_Change', 'Price_Change_Pct']
    target_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    # Extract the column subsets ONCE. Selecting them inside the loop copies
    # the whole frame on every iteration (quadratic in the number of rows).
    feature_values = legacy_features[feature_columns].values
    target_values = legacy_features[target_columns].values

    # Window i uses rows [i - SEQUENCE_LENGTH, i) as input and rows
    # [i, i + PREDICTION_LENGTH) as (flattened) target.
    window_ends = range(SEQUENCE_LENGTH, len(legacy_features) - PREDICTION_LENGTH + 1)
    X_legacy = np.array([feature_values[i - SEQUENCE_LENGTH:i] for i in window_ends])
    y_legacy = np.array([target_values[i:i + PREDICTION_LENGTH].ravel() for i in window_ends])

    print(f"✅ Legacy sequences created: X={X_legacy.shape}, y={y_legacy.shape}")

    # Compare data shapes
    print("\n📊 Data Comparison:")
    print(f"   Advanced method - X: {X_train_scaled.shape}, y: {y_train_scaled.shape}")
    print(f"   Legacy method - X: {X_legacy.shape}, y: {y_legacy.shape}")

    if X_legacy.size == 0 or y_legacy.size == 0:
        # min()/max() of an empty array raises, so skip the statistics
        print("⚠️ Not enough legacy rows to build a sequence; skipping statistics comparison")
    else:
        # Compare feature statistics. The legacy arrays are built from the
        # raw frame (unscaled) while the advanced arrays are scaled, so the
        # ranges are expected to differ.
        print("\n📊 Feature Statistics Comparison:")
        print(f"   Advanced method - X range: {X_train_scaled.min():.4f} to {X_train_scaled.max():.4f}")
        print(f"   Legacy method - X range: {X_legacy.min():.4f} to {X_legacy.max():.4f}")
        print(f"   Advanced method - y range: {y_train_scaled.min():.4f} to {y_train_scaled.max():.4f}")
        print(f"   Legacy method - y range: {y_legacy.min():.4f} to {y_legacy.max():.4f}")

else:
    print("⚠️ Legacy data download failed")


## 7. Final Summary and Results

Display comprehensive results and summary of the prediction system.


In [None]:
# Cell 14: Final Summary
print("🎉 COMPREHENSIVE CRYPTOCURRENCY PREDICTION COMPLETED!")
print("=" * 70)

# Keras records one entry in `history.epoch` per epoch that actually ran.
# With EarlyStopping among the callbacks this can be smaller than the
# configured EPOCHS, so report the real count.
epochs_run = len(history.epoch)

print("\n📊 FINAL RESULTS:")
print(f"   Symbol: {config.symbol}")
print(f"   Timeframe: {config.timeframe}")
print(f"   Data period: {config.start_time} to {config.end_time}")
print(f"   Total sequences: {feature_info['total_sequences']}")
print(f"   Model parameters: {model.count_params():,}")
print(f"   Training epochs: {epochs_run} (configured: {EPOCHS})")
print(f"   Final test loss: {evaluation_results['test_loss']:.6f}")
print(f"   Final test MAE: {evaluation_results['test_mae']:.6f}")
print(f"   Final test MAPE: {evaluation_results['test_mape']:.2f}%")

# List every feature under the scaling group it belongs to
print("\n🔧 FEATURES USED:")
for group_name, features in feature_groups.items():
    print(f"   {group_name.capitalize()}: {len(features)} features")
    for feature in features:
        print(f"     - {feature}")

print("\n💡 ADVANCED FEATURES DEMONSTRATED:")
for highlight in (
    "Integrated data management with BinanceDataOrganizer",
    "On-demand data generation and processing",
    "Grouped normalization for different feature types",
    "Memory-efficient training and evaluation",
    "Comprehensive visualization and analysis",
    "Model persistence and configuration management",
    "Legacy method comparison and validation",
    "Advanced plotting and error analysis",
):
    print(f"   ✅ {highlight}")

print("\n🎯 The comprehensive system provides a complete solution")
print("   for cryptocurrency prediction with advanced normalization,")
print("   memory-efficient data management, and comprehensive analysis!")

print(f"\n📈 Final memory usage: {get_memory_usage():.1f} MB")
print("\n✅ All tasks completed successfully!")
