In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Crypto Price Prediction with Hybrid Model\n",
    "\n",
    "This notebook demonstrates how to use both the original XGBoost model and the new hybrid model (LSTM + XGBoost) for crypto price prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from model import CryptoPricePredictor\n",
    "from hybrid_model import HybridStockPredictor\n",
    "import time\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Preprocess Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def load_data(data_path='data/book_updates.csv'):\n",
    "    \"\"\"Read order-book snapshots from a CSV file and append derived columns.\n",
    "\n",
    "    Args:\n",
    "        data_path: Location of the CSV. The file must provide COLLECTION_TIME,\n",
    "            BID_PRICE_1, ASK_PRICE_1, BID_SIZE_1..5 and ASK_SIZE_1..5.\n",
    "\n",
    "    Returns:\n",
    "        The loaded DataFrame with COLLECTION_TIME parsed to datetime and the\n",
    "        extra columns mid_price, spread, total_bid_size, total_ask_size and\n",
    "        order_book_imbalance.\n",
    "    \"\"\"\n",
    "    depth_levels = range(1, 6)\n",
    "    bid_size_cols = [f'BID_SIZE_{level}' for level in depth_levels]\n",
    "    ask_size_cols = [f'ASK_SIZE_{level}' for level in depth_levels]\n",
    "\n",
    "    book = pd.read_csv(data_path)\n",
    "    book['COLLECTION_TIME'] = pd.to_datetime(book['COLLECTION_TIME'])\n",
    "\n",
    "    # Level-1 prices drive both the mid price and the quoted spread\n",
    "    best_bid = book['BID_PRICE_1']\n",
    "    best_ask = book['ASK_PRICE_1']\n",
    "    book['mid_price'] = (best_bid + best_ask) / 2\n",
    "    book['spread'] = best_ask - best_bid\n",
    "\n",
    "    # Sum the size columns of the five listed levels on each side\n",
    "    book['total_bid_size'] = book[bid_size_cols].sum(axis=1)\n",
    "    book['total_ask_size'] = book[ask_size_cols].sum(axis=1)\n",
    "\n",
    "    # Bid-side share of the combined size\n",
    "    combined_size = book['total_bid_size'] + book['total_ask_size']\n",
    "    book['order_book_imbalance'] = book['total_bid_size'] / combined_size\n",
    "\n",
    "    return book\n",
    "\n",
    "# Read the order book CSV from the default path and report how many rows were loaded\n",
    "df = load_data()\n",
    "print(f\"Loaded {len(df)} rows of data\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Train and Evaluate Both Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def train_and_evaluate_models(df, train_size=0.8, lstm_epochs=50, lstm_batch_size=32):\n",
    "    \"\"\"Fit the XGBoost baseline and the LSTM+XGBoost hybrid, then score both on a hold-out.\n",
    "\n",
    "    The frame is split by row position (no shuffling): the first ``train_size``\n",
    "    fraction of rows is used for fitting and the remaining rows for evaluation.\n",
    "\n",
    "    Args:\n",
    "        df: Feature frame containing at least a ``mid_price`` column, which is\n",
    "            used as the ground truth when scoring.\n",
    "        train_size: Fraction of rows, counted from the top, used for training.\n",
    "        lstm_epochs: Epoch count forwarded to ``HybridStockPredictor.train_lstm``.\n",
    "        lstm_batch_size: Batch size forwarded to ``HybridStockPredictor.train_lstm``.\n",
    "\n",
    "    Returns:\n",
    "        A dict with keys ``'xgb'`` and ``'hybrid'``. Each value holds\n",
    "        ``'predictions'``, ``'mse'``, ``'r2'``, ``'train_time'`` (seconds) and\n",
    "        ``'model'`` (the fitted predictor instance, so callers can inspect it).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the split would leave the training or the test\n",
    "            partition empty.\n",
    "    \"\"\"\n",
    "    # Keep the fraction and the derived row count under separate names\n",
    "    split_idx = int(len(df) * train_size)\n",
    "    if split_idx <= 0 or split_idx >= len(df):\n",
    "        raise ValueError(\n",
    "            f'train_size={train_size!r} leaves an empty train or test split '\n",
    "            f'for {len(df)} rows'\n",
    "        )\n",
    "    train_data = df.iloc[:split_idx]\n",
    "    test_data = df.iloc[split_idx:]\n",
    "\n",
    "    # NOTE(review): both predictors receive the same model file path; if either\n",
    "    # class persists its booster there, one may overwrite the other - confirm.\n",
    "    xgb_model = CryptoPricePredictor(model_path='models/xgboost_model.json')\n",
    "    hybrid_model = HybridStockPredictor(xgb_model_path='models/xgboost_model.json')\n",
    "\n",
    "    # Train XGBoost model (perf_counter is monotonic, so the interval cannot go negative)\n",
    "    print(\"Training XGBoost model...\")\n",
    "    xgb_start_time = time.perf_counter()\n",
    "    xgb_model.update_model(train_data)\n",
    "    xgb_train_time = time.perf_counter() - xgb_start_time\n",
    "\n",
    "    # Train hybrid model: the timer covers both the LSTM and the XGBoost stage\n",
    "    print(\"\\nTraining hybrid model...\")\n",
    "    hybrid_start_time = time.perf_counter()\n",
    "\n",
    "    print(\"Training LSTM feature extractor...\")\n",
    "    hybrid_model.train_lstm(train_data, epochs=lstm_epochs, batch_size=lstm_batch_size)\n",
    "\n",
    "    print(\"Training XGBoost part...\")\n",
    "    hybrid_model.train_xgboost(train_data)\n",
    "\n",
    "    hybrid_train_time = time.perf_counter() - hybrid_start_time\n",
    "\n",
    "    # Make predictions on the held-out rows\n",
    "    print(\"\\nMaking predictions...\")\n",
    "    xgb_pred = xgb_model.predict(test_data)\n",
    "    hybrid_pred = hybrid_model.predict(test_data)\n",
    "\n",
    "    # NOTE(review): assumes each predict() yields one value per test row, aligned\n",
    "    # with test_data - confirm against the model classes.\n",
    "    actual = test_data['mid_price'].values\n",
    "\n",
    "    return {\n",
    "        'xgb': {\n",
    "            'predictions': xgb_pred,\n",
    "            'mse': mean_squared_error(actual, xgb_pred),\n",
    "            'r2': r2_score(actual, xgb_pred),\n",
    "            'train_time': xgb_train_time,\n",
    "            'model': xgb_model\n",
    "        },\n",
    "        'hybrid': {\n",
    "            'predictions': hybrid_pred,\n",
    "            'mse': mean_squared_error(actual, hybrid_pred),\n",
    "            'r2': r2_score(actual, hybrid_pred),\n",
    "            'train_time': hybrid_train_time,\n",
    "            'model': hybrid_model\n",
    "        }\n",
    "    }\n",
    "\n",
    "# Fit both models on the loaded frame and keep their metrics for the later cells\n",
    "results = train_and_evaluate_models(df)\n",
    "\n",
    "# Report the same three metrics for each model; only the heading differs\n",
    "print(\"\\nResults:\")\n",
    "for heading, result_key in ((\"XGBoost Model:\", 'xgb'), (\"\\nHybrid Model:\", 'hybrid')):\n",
    "    summary = results[result_key]\n",
    "    print(heading)\n",
    "    print(f\"MSE: {summary['mse']:.6f}\")\n",
    "    print(f\"R2 Score: {summary['r2']:.6f}\")\n",
    "    print(f\"Training Time: {summary['train_time']:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Visualize Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def plot_predictions(actual, xgb_pred, hybrid_pred):\n",
    "    \"\"\"Overlay the actual price series with both models' predictions on one chart.\n",
    "\n",
    "    Args:\n",
    "        actual: Observed prices for the evaluation window.\n",
    "        xgb_pred: Predictions from the XGBoost model.\n",
    "        hybrid_pred: Predictions from the hybrid (LSTM+XGBoost) model.\n",
    "    \"\"\"\n",
    "    fig, ax = plt.subplots(figsize=(15, 7))\n",
    "\n",
    "    # (values, legend label, line color, opacity), drawn in this order\n",
    "    series_specs = (\n",
    "        (actual, 'Actual', 'black', 0.7),\n",
    "        (xgb_pred, 'XGBoost', 'blue', 0.5),\n",
    "        (hybrid_pred, 'Hybrid (LSTM+XGBoost)', 'red', 0.5),\n",
    "    )\n",
    "    for values, label, color, alpha in series_specs:\n",
    "        ax.plot(values, label=label, color=color, alpha=alpha)\n",
    "\n",
    "    ax.set_title('Actual vs Predicted Prices')\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Price')\n",
    "    ax.legend()\n",
    "    ax.grid(True)\n",
    "    plt.show()\n",
    "\n",
    "# Rebuild the hold-out slice used for scoring. The fraction must match the\n",
    "# train_size used by train_and_evaluate_models (called with its default of 0.8).\n",
    "TEST_START_FRACTION = 0.8\n",
    "test_start = int(len(df) * TEST_START_FRACTION)\n",
    "test_data = df.iloc[test_start:]\n",
    "actual = test_data['mid_price'].values\n",
    "\n",
    "# Draw the observed series against both models' predictions\n",
    "plot_predictions(\n",
    "    actual,\n",
    "    results['xgb']['predictions'],\n",
    "    results['hybrid']['predictions'],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Feature Importance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def plot_feature_importance(model, title):\n",
    "    \"\"\"Draw a bar chart of the importances reported by ``model``.\n",
    "\n",
    "    Args:\n",
    "        model: Object exposing ``get_feature_importance()``; the result is used\n",
    "            through ``keys()``, ``values()`` and ``len()``.\n",
    "        title: Chart title.\n",
    "    \"\"\"\n",
    "    importance = model.get_feature_importance()\n",
    "    positions = range(len(importance))\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(12, 6))\n",
    "    ax.bar(positions, importance.values())\n",
    "    ax.set_xticks(positions)\n",
    "    # Slanted, right-aligned labels keep long feature names readable\n",
    "    ax.set_xticklabels(importance.keys(), rotation=45, ha='right')\n",
    "    ax.set_title(title)\n",
    "    fig.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "# Plot feature importance for both models\n",
    "# NOTE(review): these are newly constructed instances, not the objects fitted\n",
    "# inside train_and_evaluate_models; presumably the constructors load a saved\n",
    "# model from the given path - confirm. Both are given the same path.\n",
    "xgb_model = CryptoPricePredictor(model_path='models/xgboost_model.json')\n",
    "hybrid_model = HybridStockPredictor(xgb_model_path='models/xgboost_model.json')\n",
    "\n",
    "for predictor, chart_title in (\n",
    "    (xgb_model, 'XGBoost Feature Importance'),\n",
    "    (hybrid_model, 'Hybrid Model Feature Importance'),\n",
    "):\n",
    "    plot_feature_importance(predictor, chart_title)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Model Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def compare_models(results):\n",
    "    \"\"\"Tabulate MSE, R2 and training time for both models with a relative improvement.\n",
    "\n",
    "    Args:\n",
    "        results: Mapping with ``'xgb'`` and ``'hybrid'`` entries, each providing\n",
    "            ``'mse'``, ``'r2'`` and ``'train_time'``.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame indexed by metric with ``'Hybrid'`` and ``'XGBoost'`` columns and\n",
    "        an ``'Improvement'`` column: the hybrid model's percentage gain relative\n",
    "        to XGBoost, positive whenever the hybrid is better (lower MSE or\n",
    "        training time, higher R2). A zero XGBoost value yields inf/NaN.\n",
    "    \"\"\"\n",
    "    metrics = ['mse', 'r2', 'train_time']\n",
    "    model_keys = {'XGBoost': 'xgb', 'Hybrid': 'hybrid'}\n",
    "    # Metrics where a larger value is the better outcome\n",
    "    higher_is_better = {'r2'}\n",
    "\n",
    "    comparison = pd.DataFrame([\n",
    "        {'Metric': metric, 'Model': label, 'Value': results[key][metric]}\n",
    "        for label, key in model_keys.items()\n",
    "        for metric in metrics\n",
    "    ])\n",
    "\n",
    "    # Pivot the dataframe for better visualization\n",
    "    comparison_pivot = comparison.pivot(index='Metric', columns='Model', values='Value')\n",
    "\n",
    "    baseline = comparison_pivot['XGBoost']\n",
    "    reduction = baseline - comparison_pivot['Hybrid']\n",
    "    # Flip the sign for higher-is-better metrics so that a positive number\n",
    "    # always means the hybrid model did better\n",
    "    direction = pd.Series(\n",
    "        [-1.0 if metric in higher_is_better else 1.0 for metric in comparison_pivot.index],\n",
    "        index=comparison_pivot.index,\n",
    "    )\n",
    "    # Divide by the magnitude so a negative baseline (possible for R2) does not\n",
    "    # invert the sign a second time\n",
    "    comparison_pivot['Improvement'] = (reduction * direction / baseline.abs() * 100).round(2)\n",
    "\n",
    "    return comparison_pivot\n",
    "\n",
    "# Build the side-by-side metric table from the evaluation results and show it\n",
    "comparison = compare_models(results)\n",
    "print(\"\\nModel Comparison:\")\n",
    "print(comparison)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}