# Bitcoin Price Prediction - Model Training

This notebook trains a Random Forest model and an LSTM model for Bitcoin price prediction, evaluates and compares their performance, and saves the trained models to disk.

In [None]:
# Third-party data/plotting stack and standard-library helpers used by the cells below
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised
# during training — consider narrowing the filter if results look odd.
warnings.filterwarnings('ignore')

# Add the project root to Python path
# (resolved as the parent of the current working directory, so the imports
# below only work when the notebook runs from a direct sub-directory of the
# project — presumably `notebooks/`; TODO confirm)
sys.path.append(os.path.join(os.getcwd(), '..'))

# Project-local configuration dictionaries, data utilities and model wrappers
from config.config import DATA_CONFIG, FEATURE_CONFIG, MODEL_CONFIG, LSTM_CONFIG, RF_CONFIG, PATHS
from utils.data_loader import BitcoinDataLoader
from utils.feature_engineering import FeatureEngineer
from models.random_forest import BitcoinRandomForest
from models.lstm_model import BitcoinLSTM
from models.model_evaluation import ModelEvaluator

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic (not plain Python): render matplotlib figures inline in the notebook
%matplotlib inline

## 1. Load Feature-Engineered Data

In [None]:
# Load data with features.
# Prefer the cached, pre-processed CSV; if it cannot be read, rebuild the
# features from the raw data with the project's loader and feature engineer.
try:
    # Try to load pre-processed features
    df_features = pd.read_csv('../data/bitcoin_features.csv', index_col='date', parse_dates=True)
except (OSError, ValueError) as load_error:
    # OSError covers a missing or unreadable file; ValueError covers pandas'
    # empty-data / parser errors and a missing 'date' column. Anything else
    # (KeyboardInterrupt, programming errors, ...) is deliberately NOT
    # swallowed, unlike the previous bare `except:`.
    print(f"Could not load cached features ({type(load_error).__name__}: {load_error})")
    # Generate features from scratch
    print("üîÑ Generating features from scratch...")
    data_loader = BitcoinDataLoader(DATA_CONFIG)
    df = data_loader.load_data()
    feature_engineer = FeatureEngineer(FEATURE_CONFIG)
    df_features = feature_engineer.add_technical_indicators(df)
else:
    # Only reached when the CSV was read without error
    print("‚úÖ Loaded pre-processed features from file")

print("üìä Data Shape:", df_features.shape)
# The later cells expect a 'target' column; report whether it is present
print("üéØ Target variable present:", 'target' in df_features.columns)

# Show basic info
df_features.head()

## 2. Prepare Data for Modeling

In [None]:
# Inspect the 'target' column (skipped entirely when the column is absent):
# histogram on the left, values over the frame's index on the right,
# followed by summary statistics.
if 'target' in df_features.columns:
    target_values = df_features['target']
    fig, (hist_ax, series_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: distribution of the target
    hist_ax.hist(target_values, bins=50, alpha=0.7, color='skyblue')
    hist_ax.set_title('Target Variable Distribution', fontweight='bold')
    hist_ax.set_xlabel('Target (Price Change %)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.grid(True, alpha=0.3)

    # Right panel: target plotted against the DataFrame index
    series_ax.plot(df_features.index, target_values, alpha=0.7)
    series_ax.set_title('Target Variable Over Time', fontweight='bold')
    series_ax.set_xlabel('Date')
    series_ax.set_ylabel('Target (Price Change %)')
    series_ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    print("üìà Target Statistics:")
    print(target_values.describe())

## 3. Train Random Forest Model

In [None]:
print("üå≤ Training Random Forest Model...")
print("=" * 50)

# Initialize and train Random Forest
rf_model = BitcoinRandomForest({**MODEL_CONFIG, **RF_CONFIG})
rf_metrics, rf_y_test, rf_y_pred = rf_model.train(df_features)

print("\n‚úÖ Random Forest Training Complete!")
print("üìä Performance Metrics:")
for metric, value in rf_metrics.items():
    print(f"  {metric.upper()}: {value:.4f}")

# Show feature importance
if hasattr(rf_model, 'feature_importance') and rf_model.feature_importance is not None:
    print(f"\nüîù Top 10 Most Important Features:")
    top_features = rf_model.feature_importance.head(10)
    for _, row in top_features.iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")

## 4. Train LSTM Model

In [None]:
print("\nüß† Training LSTM Model...")
print("=" * 50)

# Initialize and train LSTM
lstm_model = BitcoinLSTM({**MODEL_CONFIG, **LSTM_CONFIG})
lstm_metrics, lstm_history, lstm_y_test, lstm_y_pred = lstm_model.train(df_features)

print("\n‚úÖ LSTM Training Complete!")
print("üìä Performance Metrics:")
for metric, value in lstm_metrics.items():
    print(f"  {metric.upper()}: {value:.4f}")

## 5. Model Comparison and Evaluation

In [None]:
print("üìä Model Comparison")
print("=" * 50)

# Initialize evaluator
evaluator = ModelEvaluator(PATHS['plots_dir'])

# Compare models
models_metrics = {
    'Random Forest': rf_metrics,
    'LSTM': lstm_metrics
}

metrics_df = evaluator.compare_models(models_metrics)

# Plot predictions comparison
if rf_y_test is not None and lstm_y_test is not None:
    evaluator.plot_predictions_comparison(rf_y_test, rf_y_pred, lstm_y_test, lstm_y_pred)

# Plot feature importance
if hasattr(rf_model, 'feature_importance') and rf_model.feature_importance is not None:
    evaluator.plot_feature_importance(rf_model.feature_importance)

# Plot training history for LSTM
if lstm_history is not None:
    evaluator.plot_training_history(lstm_history)

## 6. Detailed Performance Analysis

In [None]:
# Create detailed performance analysis: a 2x3 grid of diagnostic plots for
# the Random Forest and LSTM predictions on their respective test sets.

# Flatten every series to a 1-D NumPy array first. If one side were 2-D
# (for instance predictions shaped (n, 1) against targets shaped (n,)),
# NumPy broadcasting would silently turn the residual subtraction into an
# (n, n) matrix and corrupt the residual and error-distribution panels.
rf_actual = np.asarray(rf_y_test).ravel()
rf_predicted = np.asarray(rf_y_pred).ravel()
lstm_actual = np.asarray(lstm_y_test).ravel()
lstm_predicted = np.asarray(lstm_y_pred).ravel()

# Residual = actual - predicted, element by element
rf_residuals = rf_actual - rf_predicted
lstm_residuals = lstm_actual - lstm_predicted


def _plot_actual_vs_predicted(position, actual, predicted, color, title):
    """Scatter predicted against actual values with a dashed y = x reference line."""
    plt.subplot(2, 3, position)
    plt.scatter(actual, predicted, alpha=0.6, color=color)
    value_range = [actual.min(), actual.max()]
    plt.plot(value_range, value_range, 'r--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(title, fontweight='bold')
    plt.grid(True, alpha=0.3)


def _plot_residuals(position, predicted, residuals, color, alpha, title):
    """Scatter residuals against predictions with a dashed zero line."""
    plt.subplot(2, 3, position)
    plt.scatter(predicted, residuals, alpha=alpha, color=color)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title(title, fontweight='bold')
    plt.grid(True, alpha=0.3)


plt.figure(figsize=(15, 10))

# 1. Prediction vs Actual (Random Forest)
_plot_actual_vs_predicted(1, rf_actual, rf_predicted, 'blue', 'Random Forest: Actual vs Predicted')

# 2. Prediction vs Actual (LSTM)
_plot_actual_vs_predicted(2, lstm_actual, lstm_predicted, 'green', 'LSTM: Actual vs Predicted')

# 3. Residuals (Random Forest)
_plot_residuals(3, rf_predicted, rf_residuals, 'blue', 0.6, 'Random Forest: Residuals')

# 4. Residuals (LSTM)
_plot_residuals(4, lstm_predicted, lstm_residuals, 'green', 0.5, 'LSTM: Residuals')

# 5. Error Distribution
plt.subplot(2, 3, 5)
plt.hist(rf_residuals, bins=30, alpha=0.7, color='blue', label='Random Forest', density=True)
plt.hist(lstm_residuals, bins=30, alpha=0.7, color='green', label='LSTM', density=True)
plt.xlabel('Prediction Error')
plt.ylabel('Density')
plt.title('Error Distribution', fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)

# 6. Time Series of Predictions (first 100 points)
plt.subplot(2, 3, 6)
# Bound the window by every plotted series: the previous code only looked at
# the Random Forest test set, so a shorter LSTM prediction array made
# plt.plot fail with an x/y length mismatch.
n_points = min(100, len(rf_actual), len(rf_predicted), len(lstm_predicted))
indices = range(n_points)
# NOTE(review): the 'Actual' line is the Random Forest test target. If the
# LSTM test set starts at a different row (e.g. because of sequence
# windowing), its predictions are not aligned with this line — confirm.
plt.plot(indices, rf_actual[:n_points], label='Actual', color='black', linewidth=2)
plt.plot(indices, rf_predicted[:n_points], label='RF Predicted', color='blue', linestyle='--')
plt.plot(indices, lstm_predicted[:n_points], label='LSTM Predicted', color='green', linestyle='--')
plt.xlabel('Time Index')
plt.ylabel('Target Value')
plt.title('Predictions Over Time', fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Save Trained Models

In [None]:
print("üíæ Saving Trained Models...")

# Create models directory if it doesn't exist
os.makedirs(PATHS['models_dir'], exist_ok=True)

# Save Random Forest model
rf_model_path = os.path.join(PATHS['models_dir'], 'random_forest_model.joblib')
rf_model.save_model(rf_model_path)

# Save LSTM model
lstm_model_path = os.path.join(PATHS['models_dir'], 'lstm_model.h5')
lstm_scaler_path = os.path.join(PATHS['models_dir'], 'lstm_scaler.joblib')
lstm_model.save_model(lstm_model_path, lstm_scaler_path)

print("‚úÖ Models saved successfully!")
print(f"üìÅ Models location: {PATHS['models_dir']}")

## 8. Model Performance Summary

In [None]:
print("üéØ FINAL MODEL PERFORMANCE SUMMARY")
print("=" * 60)

# Create performance comparison table
performance_data = []
for model_name, metrics in models_metrics.items():
    performance_data.append({
        'Model': model_name,
        'MAE': f"{metrics['mae']:.4f}%",
        'RMSE': f"{metrics['rmse']:.4f}%",
        'R¬≤': f"{metrics['r2']:.4f}",
        'MSE': f"{metrics['mse']:.6f}"
    })

performance_df = pd.DataFrame(performance_data)
print("\nüìä Performance Comparison:")
display(performance_df)

# Determine best model
best_model = max(models_metrics.items(), key=lambda x: x[1]['r2'])
print(f"\nüèÜ Best Performing Model: {best_model[0]}")
print(f"   R¬≤ Score: {best_model[1]['r2']:.4f}")
print(f"   MAE: {best_model[1]['mae']:.4f}%")

# Key insights
print("\nüí° Key Insights:")
print("‚Ä¢ LSTM generally performs better for time series data due to sequence learning")
print("‚Ä¢ Random Forest provides good interpretability through feature importance")
print("‚Ä¢ Both models capture meaningful patterns in Bitcoin price movements")
print("‚Ä¢ Model performance is affected by market volatility regimes")

print("\n‚úÖ Model training and evaluation complete!")

## Next Steps

1. **Model Deployment**: Use the saved models for real-time predictions
2. **Hyperparameter Tuning**: Further optimize model parameters
3. **Ensemble Methods**: Combine both models for improved performance
4. **Feature Engineering**: Experiment with additional features
5. **Model Monitoring**: Set up performance tracking over time

The trained models are now ready for making Bitcoin price predictions!