# Traffic Prediction Model

This notebook contains the ML model for predicting traffic patterns.


In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GeoSense Traffic Prediction Model\n",
    "## Exploratory Data Analysis and Model Training\n",
    "\n",
    "This notebook demonstrates:\n",
    "1. Data loading and exploration\n",
    "2. Feature engineering\n",
    "3. Model training and evaluation\n",
    "4. Predictions and visualizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
    "import joblib\n",
    "from datetime import datetime\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Set style\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams['figure.figsize'] = (12, 6)\n",
    "\n",
    "print(\"Libraries imported successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the processed dataset; fall back to a seeded synthetic sample if the file is missing\n",
    "data_path = '../data/processed/traffic_data_processed.csv'\n",
    "\n",
    "try:\n",
    "    df = pd.read_csv(data_path)\n",
    "    print(f\"Data loaded successfully: {len(df)} records\")\n",
    "except FileNotFoundError:\n",
    "    print(\"Processed data not found. Generating synthetic data...\")\n",
    "    np.random.seed(42)  # fixed seed: the fallback data is the same on every run\n",
    "    n_samples = 10000\n",
    "\n",
    "    records = []\n",
    "    for _ in range(n_samples):\n",
    "        # The order of these random draws fixes the generated values; keep it unchanged\n",
    "        hour = np.random.randint(0, 24)\n",
    "        day_of_week = np.random.randint(0, 7)\n",
    "        lat = np.random.uniform(28.4, 28.7)\n",
    "        lon = np.random.uniform(77.0, 77.4)\n",
    "\n",
    "        # Days 5 and 6 count as the weekend; peak hours are 7-9 and 17-19 on the other days\n",
    "        is_weekend = int(day_of_week >= 5)\n",
    "        in_rush_window = 7 <= hour <= 9 or 17 <= hour <= 19\n",
    "        is_peak_hour = int(in_rush_window and not is_weekend)\n",
    "\n",
    "        # Baseline 0.3, raised for peak hours and for weekdays, lowered overnight (22:00-05:59)\n",
    "        base_congestion = 0.3\n",
    "        if is_peak_hour:\n",
    "            base_congestion += 0.4\n",
    "        if not is_weekend:\n",
    "            base_congestion += 0.2\n",
    "        if hour >= 22 or hour <= 5:\n",
    "            base_congestion -= 0.2\n",
    "\n",
    "        # Add Gaussian noise and keep the result inside [0, 1]\n",
    "        noise = np.random.normal(0, 0.1)\n",
    "        congestion_level = np.clip(base_congestion + noise, 0, 1)\n",
    "\n",
    "        records.append(dict(\n",
    "            hour=hour,\n",
    "            day_of_week=day_of_week,\n",
    "            lat=lat,\n",
    "            lon=lon,\n",
    "            is_weekend=is_weekend,\n",
    "            is_peak_hour=is_peak_hour,\n",
    "            congestion_level=congestion_level,\n",
    "        ))\n",
    "\n",
    "    df = pd.DataFrame(records)\n",
    "    print(f\"Synthetic data generated: {len(df)} records\")\n",
    "\n",
    "# Display first few rows\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Exploratory Data Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Structural overview first (shape, dtypes, null counts), then summary statistics\n",
    "print(\"Dataset Shape:\", df.shape)\n",
    "\n",
    "overview = (\n",
    "    (\"\\nColumn Types:\", df.dtypes),\n",
    "    (\"\\nMissing Values:\", df.isnull().sum()),\n",
    ")\n",
    "for heading, report in overview:\n",
    "    print(heading)\n",
    "    print(report)\n",
    "\n",
    "print(\"\\nBasic Statistics:\")\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Congestion level distribution: overall, by hour of day, and by day of week\n",
    "fig, (ax_hist, ax_hour, ax_day) = plt.subplots(1, 3, figsize=(12, 4))\n",
    "\n",
    "ax_hist.hist(df['congestion_level'], bins=30, edgecolor='black', alpha=0.7)\n",
    "ax_hist.set_xlabel('Congestion Level')\n",
    "ax_hist.set_ylabel('Frequency')\n",
    "ax_hist.set_title('Distribution of Congestion Levels')\n",
    "\n",
    "hourly_congestion = df.groupby('hour')['congestion_level'].mean()\n",
    "ax_hour.plot(hourly_congestion.index, hourly_congestion.values, marker='o')\n",
    "ax_hour.set_xlabel('Hour of Day')\n",
    "ax_hour.set_ylabel('Average Congestion')\n",
    "ax_hour.set_title('Average Congestion by Hour')\n",
    "ax_hour.grid(True, alpha=0.3)\n",
    "\n",
    "# day_of_week is encoded 0=Mon ... 6=Sun. Reindexing to all seven days means a weekday\n",
    "# that is absent from the data shows as a gap instead of breaking the fixed-length bar call.\n",
    "days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n",
    "day_positions = list(range(len(days)))\n",
    "day_congestion = df.groupby('day_of_week')['congestion_level'].mean().reindex(day_positions)\n",
    "ax_day.bar(day_positions, day_congestion.values, alpha=0.7)\n",
    "ax_day.set_xticks(day_positions)\n",
    "ax_day.set_xticklabels(days)\n",
    "ax_day.set_xlabel('Day of Week')\n",
    "ax_day.set_ylabel('Average Congestion')\n",
    "ax_day.set_title('Average Congestion by Day')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise correlations between the model inputs and the target, shown as an annotated heatmap\n",
    "corr_columns = ['hour', 'day_of_week', 'lat', 'lon', 'is_weekend', 'is_peak_hour', 'congestion_level']\n",
    "correlation = df[corr_columns].corr()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "# center=0 puts the midpoint of the diverging colour map at zero correlation\n",
    "sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)\n",
    "ax.set_title('Feature Correlation Matrix')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peak hour vs non-peak hour comparison\n",
    "peak_data = df.loc[df['is_peak_hour'] == 1, 'congestion_level']\n",
    "non_peak_data = df.loc[df['is_peak_hour'] == 0, 'congestion_level']\n",
    "\n",
    "fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 5))\n",
    "\n",
    "ax_hist.hist([peak_data, non_peak_data], bins=20, label=['Peak Hours', 'Non-Peak Hours'], alpha=0.7)\n",
    "ax_hist.set_xlabel('Congestion Level')\n",
    "ax_hist.set_ylabel('Frequency')\n",
    "ax_hist.set_title('Congestion: Peak vs Non-Peak Hours')\n",
    "ax_hist.legend()\n",
    "\n",
    "# Tick labels are set on the axis instead of through boxplot(labels=...): Matplotlib 3.9\n",
    "# deprecated that keyword in favour of tick_labels, and this form works on both sides of the rename.\n",
    "ax_box.boxplot([peak_data, non_peak_data])\n",
    "ax_box.set_xticklabels(['Peak', 'Non-Peak'])\n",
    "ax_box.set_ylabel('Congestion Level')\n",
    "ax_box.set_title('Congestion Distribution')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(f\"Peak Hours Average Congestion: {peak_data.mean():.3f}\")\n",
    "print(f\"Non-Peak Hours Average Congestion: {non_peak_data.mean():.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model inputs and the regression target\n",
    "feature_columns = ['hour', 'day_of_week', 'lat', 'lon', 'is_weekend', 'is_peak_hour']\n",
    "target_column = 'congestion_level'\n",
    "\n",
    "# X keeps the columns in feature_columns order; later cells rely on that order\n",
    "X, y = df[feature_columns], df[target_column]\n",
    "\n",
    "print(f\"Features shape: {X.shape}\")\n",
    "print(f\"Target shape: {y.shape}\")\n",
    "print(f\"\\nFeatures: {feature_columns}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Model Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the rows for testing; the seed makes the split reproducible\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42\n",
    ")\n",
    "\n",
    "print(f\"Training set: {len(X_train)} samples\")\n",
    "print(f\"Test set: {len(X_test)} samples\")\n",
    "\n",
    "# Fit the scaler on the training rows only, then apply the same transform to both splits\n",
    "scaler = StandardScaler().fit(X_train)\n",
    "X_train_scaled = scaler.transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train Random Forest\n",
    "print(\"Training Random Forest model...\")\n",
    "\n",
    "# Seeded so repeated runs build the same forest\n",
    "rf_params = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)\n",
    "rf_model = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)\n",
    "print(\"Training completed!\")\n",
    "\n",
    "# Predictions on both splits, compared in the evaluation section\n",
    "y_train_pred_rf, y_test_pred_rf = (\n",
    "    rf_model.predict(split) for split in (X_train_scaled, X_test_scaled)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train Gradient Boosting\n",
    "print(\"Training Gradient Boosting model...\")\n",
    "\n",
    "# Seeded so repeated runs build the same ensemble\n",
    "gb_params = dict(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)\n",
    "gb_model = GradientBoostingRegressor(**gb_params).fit(X_train_scaled, y_train)\n",
    "print(\"Training completed!\")\n",
    "\n",
    "# Predictions on both splits, compared in the evaluation section\n",
    "y_train_pred_gb, y_test_pred_gb = (\n",
    "    gb_model.predict(split) for split in (X_train_scaled, X_test_scaled)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Model Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate metrics\n",
    "def evaluate_model(y_true, y_pred, model_name):\n",
    "    \"\"\"Print and return regression metrics for one set of predictions.\n",
    "\n",
    "    Args:\n",
    "        y_true: Observed target values.\n",
    "        y_pred: Predicted values, aligned with ``y_true``.\n",
    "        model_name: Label used in the header of the printed report.\n",
    "\n",
    "    Returns:\n",
    "        dict with the keys 'mse', 'rmse', 'mae' and 'r2'.\n",
    "    \"\"\"\n",
    "    mse = mean_squared_error(y_true, y_pred)\n",
    "    rmse = np.sqrt(mse)  # same units as the target, unlike MSE\n",
    "    mae = mean_absolute_error(y_true, y_pred)\n",
    "    r2 = r2_score(y_true, y_pred)\n",
    "\n",
    "    report_lines = [\n",
    "        f\"\\n{model_name} Performance:\",\n",
    "        f\"  MSE:  {mse:.4f}\",\n",
    "        f\"  RMSE: {rmse:.4f}\",\n",
    "        f\"  MAE:  {mae:.4f}\",\n",
    "        f\"  R²:   {r2:.4f}\",\n",
    "    ]\n",
    "    print(\"\\n\".join(report_lines))\n",
    "\n",
    "    return dict(mse=mse, rmse=rmse, mae=mae, r2=r2)\n",
    "\n",
    "# Evaluate both models on the training and test splits\n",
    "banner = \"=\" * 60\n",
    "print(banner)\n",
    "print(\"MODEL PERFORMANCE COMPARISON\")\n",
    "print(banner)\n",
    "\n",
    "print(\"\\n--- RANDOM FOREST ---\")\n",
    "rf_train_metrics = evaluate_model(y_train, y_train_pred_rf, \"Random Forest (Train)\")\n",
    "rf_test_metrics = evaluate_model(y_test, y_test_pred_rf, \"Random Forest (Test)\")\n",
    "\n",
    "print(\"\\n--- GRADIENT BOOSTING ---\")\n",
    "gb_train_metrics = evaluate_model(y_train, y_train_pred_gb, \"Gradient Boosting (Train)\")\n",
    "gb_test_metrics = evaluate_model(y_test, y_test_pred_gb, \"Gradient Boosting (Test)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize predictions vs actual for each model on the train and test splits\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "\n",
    "# One entry per panel: (axis, actual values, predictions, title prefix, metrics dict)\n",
    "panels = [\n",
    "    (axes[0, 0], y_train, y_train_pred_rf, 'Random Forest - Training', rf_train_metrics),\n",
    "    (axes[0, 1], y_test, y_test_pred_rf, 'Random Forest - Test', rf_test_metrics),\n",
    "    (axes[1, 0], y_train, y_train_pred_gb, 'Gradient Boosting - Training', gb_train_metrics),\n",
    "    (axes[1, 1], y_test, y_test_pred_gb, 'Gradient Boosting - Test', gb_test_metrics),\n",
    "]\n",
    "\n",
    "for ax, actual, predicted, title, metrics in panels:\n",
    "    ax.scatter(actual, predicted, alpha=0.5, s=10)\n",
    "    # Dashed y = x reference: a point on this line is a perfect prediction\n",
    "    ax.plot([0, 1], [0, 1], 'r--', lw=2)\n",
    "    ax.set_xlabel('Actual Congestion')\n",
    "    ax.set_ylabel('Predicted Congestion')\n",
    "    ax.set_title(f\"{title} (R² = {metrics['r2']:.3f})\")\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importance reported by the fitted Random Forest\n",
    "rf_importance = pd.DataFrame(\n",
    "    {'feature': feature_columns, 'importance': rf_model.feature_importances_}\n",
    ").sort_values('importance', ascending=True)\n",
    "\n",
    "# Ascending order puts the most important feature at the top of the horizontal bars\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "ax.barh(rf_importance['feature'], rf_importance['importance'], alpha=0.8)\n",
    "ax.set_xlabel('Feature Importance')\n",
    "ax.set_title('Random Forest Feature Importance')\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Same table, most important feature first, printed without the index column\n",
    "ranking = rf_importance.sort_values('importance', ascending=False)\n",
    "print(\"\\nFeature Importance Ranking:\")\n",
    "print(ranking.to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Model Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test predictions for different scenarios\n",
    "def classify_congestion(value):\n",
    "    \"\"\"Map a congestion score to a label.\n",
    "\n",
    "    Returns 'Low' below 0.3, 'Moderate' below 0.6, 'High' below 0.8 and 'Severe' otherwise.\n",
    "    \"\"\"\n",
    "    if value < 0.3:\n",
    "        return 'Low'\n",
    "    elif value < 0.6:\n",
    "        return 'Moderate'\n",
    "    elif value < 0.8:\n",
    "        return 'High'\n",
    "    else:\n",
    "        return 'Severe'\n",
    "\n",
    "# day_of_week uses the notebook's 0=Mon ... 6=Sun encoding, so Monday is 0 (1 would be Tuesday)\n",
    "test_scenarios = [\n",
    "    {'hour': 8, 'day_of_week': 0, 'lat': 28.6139, 'lon': 77.2090, 'is_weekend': 0, 'is_peak_hour': 1, 'name': 'Monday 8 AM (Peak)'},\n",
    "    {'hour': 14, 'day_of_week': 0, 'lat': 28.6139, 'lon': 77.2090, 'is_weekend': 0, 'is_peak_hour': 0, 'name': 'Monday 2 PM'},\n",
    "    {'hour': 23, 'day_of_week': 0, 'lat': 28.6139, 'lon': 77.2090, 'is_weekend': 0, 'is_peak_hour': 0, 'name': 'Monday 11 PM'},\n",
    "    {'hour': 10, 'day_of_week': 6, 'lat': 28.6139, 'lon': 77.2090, 'is_weekend': 1, 'is_peak_hour': 0, 'name': 'Sunday 10 AM'},\n",
    "    {'hour': 18, 'day_of_week': 3, 'lat': 28.6139, 'lon': 77.2090, 'is_weekend': 0, 'is_peak_hour': 1, 'name': 'Thursday 6 PM (Peak)'}\n",
    "]\n",
    "\n",
    "# Select the inputs by name, in feature_columns order, so the scaler receives the same\n",
    "# named columns it was fitted on; then score every scenario in a single call per model.\n",
    "scenario_df = pd.DataFrame(test_scenarios)\n",
    "features_scaled = scaler.transform(scenario_df[feature_columns])\n",
    "rf_preds = rf_model.predict(features_scaled)\n",
    "gb_preds = gb_model.predict(features_scaled)\n",
    "\n",
    "print(\"\\n\" + \"=\" * 80)\n",
    "print(\"SAMPLE PREDICTIONS\")\n",
    "print(\"=\" * 80)\n",
    "\n",
    "predictions_data = []\n",
    "\n",
    "for name, rf_pred, gb_pred in zip(scenario_df['name'], rf_preds, gb_preds):\n",
    "    print(f\"\\n{name}:\")\n",
    "    print(f\"  Random Forest: {rf_pred:.3f} ({classify_congestion(rf_pred)})\")\n",
    "    print(f\"  Gradient Boost: {gb_pred:.3f} ({classify_congestion(gb_pred)})\")\n",
    "\n",
    "    predictions_data.append({\n",
    "        'Scenario': name,\n",
    "        'RF_Prediction': rf_pred,\n",
    "        'GB_Prediction': gb_pred\n",
    "    })\n",
    "\n",
    "# Visualize predictions\n",
    "pred_df = pd.DataFrame(predictions_data).set_index('Scenario')\n",
    "\n",
    "pred_df.plot(kind='bar', figsize=(12, 6), alpha=0.8)\n",
    "plt.ylabel('Predicted Congestion Level')\n",
    "plt.title('Model Predictions for Different Scenarios')\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "plt.legend(title='Model')\n",
    "plt.grid(axis='y', alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Save Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save models\n",
    "import os\n",
    "\n",
    "models_dir = '../saved_models'\n",
    "os.makedirs(models_dir, exist_ok=True)  # no error when the directory already exists\n",
    "\n",
    "# The Random Forest is the primary model; the scaler is saved alongside the two models\n",
    "rf_path = os.path.join(models_dir, 'traffic_model.pkl')\n",
    "gb_path = os.path.join(models_dir, 'traffic_model_gb.pkl')\n",
    "scaler_path = os.path.join(models_dir, 'scaler.pkl')\n",
    "\n",
    "artifacts = (\n",
    "    ('Random Forest model', rf_model, rf_path),\n",
    "    ('Gradient Boosting model', gb_model, gb_path),\n",
    "    ('Scaler', scaler, scaler_path),\n",
    ")\n",
    "for label, artifact, path in artifacts:\n",
    "    joblib.dump(artifact, path)\n",
    "    print(f\"{label} saved to: {path}\")\n",
    "\n",
    "print(\"\\n✅ All models saved successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Model Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary\n",
    "rule = '=' * 80\n",
    "feature_list = ', '.join(feature_columns)\n",
    "# Strict comparison on test R²: a tie is reported as Gradient Boosting\n",
    "best_model = 'Random Forest' if rf_test_metrics['r2'] > gb_test_metrics['r2'] else 'Gradient Boosting'\n",
    "\n",
    "summary = f\"\"\"\n",
    "{rule}\n",
    "                          MODEL TRAINING SUMMARY\n",
    "{rule}\n",
    "\n",
    "Dataset:\n",
    "  Total Samples: {len(df):,}\n",
    "  Training Set: {len(X_train):,} samples\n",
    "  Test Set: {len(X_test):,} samples\n",
    "\n",
    "Features:\n",
    "  {feature_list}\n",
    "\n",
    "Random Forest Performance:\n",
    "  Test R²: {rf_test_metrics['r2']:.4f}\n",
    "  Test RMSE: {rf_test_metrics['rmse']:.4f}\n",
    "  Test MAE: {rf_test_metrics['mae']:.4f}\n",
    "\n",
    "Gradient Boosting Performance:\n",
    "  Test R²: {gb_test_metrics['r2']:.4f}\n",
    "  Test RMSE: {gb_test_metrics['rmse']:.4f}\n",
    "  Test MAE: {gb_test_metrics['mae']:.4f}\n",
    "\n",
    "Best Model: {best_model}\n",
    "\n",
    "Model Files Saved:\n",
    "  - traffic_model.pkl (Random Forest)\n",
    "  - traffic_model_gb.pkl (Gradient Boosting)\n",
    "  - scaler.pkl (Feature Scaler)\n",
    "\n",
    "{rule}\n",
    "\"\"\"\n",
    "\n",
    "print(summary)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}