# 🚀 Crypto Prediction Model Training - New Approach

This notebook trains an LSTM model using the **expanded range scaling approach** with OHLC-only data.

## Key Features:
- **OHLC-only data** (Volume removed)
- **Expanded range scaling** with clipping filter
- **Independent sequence scaling** for better time series handling
- **Combined input+output visualization** for continuous charting
- **Updated model architecture** for HLC prediction (Open derived from previous Close)

## Approach:
1. Load and prepare OHLC data
2. Apply expanded range scaling (input min/2 to input max*2)
3. Train LSTM model on scaled sequences
4. Evaluate and visualize results


In [None]:
# Cell 1: Imports and Setup
import os
import warnings

# TF_CPP_MIN_LOG_LEVEL only takes effect if it is set before TensorFlow is
# imported, so it has to be assigned ahead of `import tensorflow` below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error
from IPython.display import display

# Suppress warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and TensorFlow global RNGs so that weight initialisation and
# shuffling are repeatable across Restart & Run All.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Configure matplotlib for notebook environment
plt.style.use('default')

# Import our updated utilities
from src import (
    BinanceDataOrganizer,
    create_lstm_model, evaluate_model, predict_next_candle, add_open_to_predictions,
    draw_candlestick_chart, plot_combined_input_output_charts, plot_sample_data_comparison,
    plot_training_history, plot_predictions_vs_actual,
    production_config, test_config
)

print("✅ Imports successful - New approach ready!")


## 1. Configuration Selection

Choose between production and test configurations:

- **Production Config**: Full-scale deployment with larger models and more data
- **Test Config**: Fast execution for development and testing


In [None]:
# Cell 2: Configuration Selection
# Set to 'production' for full-scale deployment; any other value selects the test config.
CONFIG_MODE = 'test'

use_production = CONFIG_MODE == 'production'
config = production_config if use_production else test_config
print("🚀 PRODUCTION mode" if use_production else "⚡ TEST mode")

# One-glance summary of the selected configuration
config_summary = [
    f"Config: {config.symbol} {config.timeframe} | {config.start_date} to {config.end_date}",
    f"Model: {config.lstm_units} units, {config.epochs} epochs, {config.sequence_length}→{config.prediction_length}",
    "Features: OHLC only (Volume removed)",
    "Scaling: Expanded range approach with clipping filter",
]
print("\n".join(config_summary))


## 2. Data Loading and Scaling

Load cryptocurrency data and apply the new expanded range scaling approach.


In [None]:
# Cell 3: Data Loading and Scaling
# Builds the data organizer from `config`, pulls the unscaled and scaled
# train/test splits, and reports their shapes and value ranges.
print("📊 LOADING DATA WITH NEW EXPANDED RANGE SCALING")
print("=" * 60)

# Create data organizer
organizer = BinanceDataOrganizer(config)

# Get unscaled data for analysis
unscaled_data = organizer.get_unscaled_split_data()
X_train_unscaled = unscaled_data['input_train']
y_train_unscaled = unscaled_data['output_train']
X_test_unscaled = unscaled_data['input_test']
y_test_unscaled = unscaled_data['output_test']

print(f"1️⃣ Unscaled Data Shapes:")
print(f"   X_train: {X_train_unscaled.shape}")
print(f"   y_train: {y_train_unscaled.shape}")
print(f"   X_test: {X_test_unscaled.shape}")
print(f"   y_test: {y_test_unscaled.shape}")

# Get scaled data using new approach
scaled_data = organizer.get_scaled_data()
X_train_scaled = scaled_data['input_train_scaled']
y_train_scaled = scaled_data['output_train_scaled']
X_test_scaled = scaled_data['input_test_scaled']
y_test_scaled = scaled_data['output_test_scaled']

print(f"\n2️⃣ Scaled Data Shapes:")
print(f"   X_train: {X_train_scaled.shape}")
print(f"   y_train: {y_train_scaled.shape}")
print(f"   X_test: {X_test_scaled.shape}")
print(f"   y_test: {y_test_scaled.shape}")

print(f"\n3️⃣ Scaling Ranges:")
print(f"   X_train range: {X_train_scaled.min():.6f} to {X_train_scaled.max():.6f}")
print(f"   y_train range: {y_train_scaled.min():.6f} to {y_train_scaled.max():.6f}")

# Report each array separately: the two checks cover different data, so each
# line names the array it tested and the mark reflects the actual result
# instead of always showing a success tick.
x_train_in_unit_range = bool(np.all(X_train_scaled >= 0) and np.all(X_train_scaled <= 1))
y_train_in_unit_range = bool(np.all(y_train_scaled >= 0) and np.all(y_train_scaled <= 1))
x_mark = "✅" if x_train_in_unit_range else "❌"
y_mark = "✅" if y_train_in_unit_range else "❌"
print(f"   {x_mark} X_train in 0-1 range: {x_train_in_unit_range}")
print(f"   {y_mark} y_train in 0-1 range: {y_train_in_unit_range}")

print(f"\n✅ Data loading and scaling completed successfully!")


## 3. Data Visualization

Visualize the unscaled and scaled data using candlestick charts.


In [None]:
# Cell 4: Data Visualization
print("📊 VISUALIZING DATA WITH CANDLESTICK CHARTS")
print("=" * 60)

# Join the first training window's input and output, once for the unscaled
# data and once for the scaled data.
first_sequence_unscaled, first_sequence_scaled = (
    organizer.combine_input_output_for_chart(inputs[0], outputs[0])
    for inputs, outputs in (
        (X_train_unscaled, y_train_unscaled),
        (X_train_scaled, y_train_scaled),
    )
)

total_timesteps = len(first_sequence_unscaled)
print(
    f"📈 Creating candlestick charts for first sequence ({total_timesteps} timesteps)...",
    f"   Input timesteps: {config.sequence_length}",
    f"   Output timesteps: {config.prediction_length}",
    f"   Total timesteps: {total_timesteps}",
    sep="\n",
)

# Both plotting helpers take the same (unscaled, scaled) pair
chart_pair = (first_sequence_unscaled, first_sequence_scaled)

# Display combined charts
plot_combined_input_output_charts(*chart_pair, config, 'Training Data Visualization')

# Display sample data comparison
plot_sample_data_comparison(*chart_pair, config, num_samples=10)

print("\n✅ Data visualization completed!")


## 4. Model Creation and Training

Create and train the LSTM model using the new approach.


In [None]:
# Cell 5: Model Creation
print("🤖 CREATING LSTM MODEL")
print("=" * 40)

# Model input shape comes from axes 1 and 2 of the scaled training inputs
# (sequence_length, features)
n_timesteps, n_features = X_train_scaled.shape[1], X_train_scaled.shape[2]
input_shape = (n_timesteps, n_features)

# Hyperparameters are read from the selected config
model_kwargs = {
    'input_shape': input_shape,
    'lstm_units': config.lstm_units,
    'dropout_rate': config.dropout_rate,
    'learning_rate': config.learning_rate,
    'prediction_length': config.prediction_length,
}
model = create_lstm_model(**model_kwargs)

print(
    "✅ Model created successfully!",
    f"   Input shape: {input_shape}",
    f"   Output size: {config.prediction_length * 3} (HLC per timestep)",
    f"   Total parameters: {model.count_params():,}",
    "   Architecture: LSTM → LSTM → Dense → Dense",
    sep="\n",
)

# Display model summary
print("\n📋 Model Summary:")
model.summary()


In [None]:
# Cell 6: Model Training
print("🚀 TRAINING LSTM MODEL")
print("=" * 40)

print(
    "Training configuration:",
    f"   Epochs: {config.epochs}",
    f"   Batch size: {config.batch_size}",
    f"   Patience: {config.patience}",
    f"   Learning rate: {config.learning_rate}",
    sep="\n",
)

print("\n🔥 Starting training...")

# Both callbacks watch the validation loss.
# NOTE(review): validation_data below is the test split, so early stopping and
# LR scheduling are tuned on the same data that Cell 7 later evaluates on.
# Consider holding out a separate validation slice of the training data.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=config.patience,
    restore_best_weights=True,
)
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    patience=config.lr_patience,
    factor=config.lr_factor,
    min_lr=config.min_lr,
)

# Train the model
history = model.fit(
    X_train_scaled,
    y_train_scaled,
    validation_data=(X_test_scaled, y_test_scaled),
    epochs=config.epochs,
    batch_size=config.batch_size,
    verbose=1,
    callbacks=[early_stopping, lr_on_plateau],
)

# Values recorded for the last epoch that ran
last_epoch = {key: history.history[key][-1] for key in ('loss', 'val_loss', 'mae', 'val_mae')}
print(
    "\n✅ Training completed successfully!",
    f"   Final training loss: {last_epoch['loss']:.6f}",
    f"   Final validation loss: {last_epoch['val_loss']:.6f}",
    f"   Final training MAE: {last_epoch['mae']:.6f}",
    f"   Final validation MAE: {last_epoch['val_mae']:.6f}",
    sep="\n",
)

# Display training history
print("\n📊 Training History Charts:")
plot_training_history(history)


## 5. Model Evaluation and Testing

Evaluate the trained model on the test data and compare its predictions with the actual values.


In [None]:
# Cell 7: Model Evaluation
print("📊 EVALUATING TRAINED MODEL")
print("=" * 40)

# Score the model on the scaled test split
print("1️⃣ Evaluating on test data...")
test_metrics = evaluate_model(model, X_test_scaled, y_test_scaled)

print(
    "Test Results:",
    f"   MSE: {test_metrics['test_loss']:.6f}",
    f"   MAE: {test_metrics['test_mae']:.6f}",
    f"   MAPE: {test_metrics['test_mape']:.2f}%",
    f"   RMSE: {test_metrics['rmse']:.6f}",
    sep="\n",
)

# Predictions and targets returned by evaluate_model, reused for plotting
predictions_scaled = test_metrics['predictions']
output_true_scaled = test_metrics['output_true']

print(
    "\n2️⃣ Prediction Analysis:",
    f"   Prediction shape: {predictions_scaled.shape}",
    f"   Prediction range: {predictions_scaled.min():.6f} to {predictions_scaled.max():.6f}",
    f"   True output range: {output_true_scaled.min():.6f} to {output_true_scaled.max():.6f}",
    sep="\n",
)

# Plot predictions vs actual
print("\n3️⃣ Predictions vs Actual Comparison:")
feature_names = ['High', 'Low', 'Close']
plot_predictions_vs_actual(
    predictions_scaled,
    output_true_scaled,
    feature_names,
    max_samples=1000,
)

print("\n✅ Model evaluation completed!")


## 6. Prediction Visualization

Visualize model predictions using candlestick charts.


In [None]:
# Cell 8: Prediction Visualization
print("📈 VISUALIZING MODEL PREDICTIONS")
print("=" * 50)

# Pick one test window; slicing (rather than indexing) keeps the batch axis
test_idx = 0
test_sequence = X_test_scaled[test_idx:test_idx + 1]
test_output = y_test_scaled[test_idx]

print(
    "1️⃣ Test Sequence Analysis:",
    f"   Test sequence shape: {test_sequence.shape}",
    f"   Test output shape: {test_output.shape}",
    sep="\n",
)

# Make prediction
prediction_scaled = predict_next_candle(model, test_sequence)
print(f"   Prediction shape: {prediction_scaled.shape}")

# The prediction has no Open column; add one for charting, seeded with
# column 3 (Close) of the last input candle
last_close = test_sequence[0, -1, 3]
prediction_with_open = add_open_to_predictions(prediction_scaled, last_close)
print(f"   Prediction with Open shape: {prediction_with_open.shape}")

# Reshape prediction to (prediction_length, 4) for charting
prediction_length = config.prediction_length
prediction_reshaped = prediction_with_open.reshape(prediction_length, 4)
print(f"   Prediction reshaped: {prediction_reshaped.shape}")

# Drop the batch axis and append the prediction for a continuous chart
input_sequence = test_sequence[0]
combined_prediction = np.vstack([input_sequence, prediction_reshaped])

print("\n2️⃣ Creating Prediction Charts:")

# Chart 1: Input + True Output
print("   📊 Input + True Output Chart:")
true_output_with_open = organizer.add_open_to_output(
    input_sequence, test_output, config.prediction_length
)
combined_true = np.vstack([input_sequence, true_output_with_open])
y_axis_label = 'Scaled Value'
draw_candlestick_chart(combined_true, 'Input + True Output (Ground Truth)', y_axis_label)

# Chart 2: Input + Model Prediction
print("   📊 Input + Model Prediction Chart:")
draw_candlestick_chart(combined_prediction, 'Input + Model Prediction (Combined)', y_axis_label)

print(
    "\n3️⃣ Prediction Analysis:",
    f"   Input range: {input_sequence.min():.6f} to {input_sequence.max():.6f}",
    f"   Prediction range: {prediction_reshaped.min():.6f} to {prediction_reshaped.max():.6f}",
    f"   True output range: {true_output_with_open.min():.6f} to {true_output_with_open.max():.6f}",
    sep="\n",
)

# Mean absolute error over every cell of the two arrays (Open column included)
absolute_errors = np.abs(prediction_reshaped - true_output_with_open)
prediction_mae = np.mean(absolute_errors)
print(f"   Prediction MAE: {prediction_mae:.6f}")

print("\n✅ Prediction visualization completed!")


## 7. Model Performance Summary

Display comprehensive model performance metrics and analysis.


In [None]:
# Cell 9: Model Performance Summary
# Recaps the training history, test metrics, architecture, data sizes and
# scaling ranges produced by the earlier cells. It only reads existing
# notebook variables and prints; nothing is recomputed or retrained.
print("📊 MODEL PERFORMANCE SUMMARY")
print("=" * 50)

# Training performance (values recorded for the last epoch that ran)
print("1️⃣ Training Performance:")
print(f"   Final Training Loss: {history.history['loss'][-1]:.6f}")
print(f"   Final Validation Loss: {history.history['val_loss'][-1]:.6f}")
print(f"   Final Training MAE: {history.history['mae'][-1]:.6f}")
print(f"   Final Validation MAE: {history.history['val_mae'][-1]:.6f}")

# Test performance
print(f"\n2️⃣ Test Performance:")
print(f"   Test MSE: {test_metrics['test_loss']:.6f}")
print(f"   Test MAE: {test_metrics['test_mae']:.6f}")
print(f"   Test MAPE: {test_metrics['test_mape']:.2f}%")
print(f"   Test RMSE: {test_metrics['rmse']:.6f}")

# Model architecture
print(f"\n3️⃣ Model Architecture:")
print(f"   Input Shape: {input_shape}")
print(f"   Output Size: {config.prediction_length * 3} (HLC per timestep)")
print(f"   Total Parameters: {model.count_params():,}")
print(f"   LSTM Units: {config.lstm_units}")
print(f"   Dropout Rate: {config.dropout_rate}")

# Data characteristics
print(f"\n4️⃣ Data Characteristics:")
print(f"   Training Samples: {X_train_scaled.shape[0]:,}")
print(f"   Test Samples: {X_test_scaled.shape[0]:,}")
print(f"   Sequence Length: {config.sequence_length}")
print(f"   Prediction Length: {config.prediction_length}")
print(f"   Features: OHLC only (Volume removed)")
print(f"   Scaling: Expanded range approach")

# Scaling analysis
print(f"\n5️⃣ Scaling Analysis:")
print(f"   Input Range: {X_train_scaled.min():.6f} to {X_train_scaled.max():.6f}")
print(f"   Output Range: {y_train_scaled.min():.6f} to {y_train_scaled.max():.6f}")
# "Both" has to cover the inputs and the outputs; checking only the inputs
# would report True even when the scaled targets fall outside [0, 1].
inputs_in_unit_range = np.all(X_train_scaled >= 0) and np.all(X_train_scaled <= 1)
outputs_in_unit_range = np.all(y_train_scaled >= 0) and np.all(y_train_scaled <= 1)
print(f"   Both in 0-1 Range: {bool(inputs_in_unit_range and outputs_in_unit_range)}")

print(f"\n✅ Model training and evaluation completed successfully!")
print(f"   The model is ready for production use with the new expanded range scaling approach.")
