# Smart Bus System - Exploratory Data Analysis and Modeling

This notebook contains exploratory data analysis and machine learning modeling for the Smart Bus System.

## Table of Contents
1. [Data Loading and Overview](#data-loading)
2. [Exploratory Data Analysis](#eda)
3. [Feature Engineering](#feature-engineering)
4. [Model Development](#modeling)
5. [Model Evaluation](#evaluation)
6. [Predictions and Insights](#insights)


In [None]:
# Core data-analysis and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences *every* warning for the whole kernel session,
# including pandas/sklearn deprecation and SettingWithCopy warnings that may
# point at real problems. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Global plotting defaults applied to every figure created below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# scikit-learn pieces for model selection, regressors, preprocessing,
# evaluation metrics and pipelines.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# Make the project's `app` package importable by adding the parent directory
# to the module search path.
# NOTE(review): '..' is resolved against the kernel's working directory, so
# this presumably expects the notebook to run from a folder directly under the
# project root -- confirm.
import sys
sys.path.append('..')
from app import create_app, db
from app.models import Route, Bus, Trip, Event, RouteStop, Prediction

print("Libraries imported successfully!")


## 1. Data Loading and Overview <a id="data-loading"></a>


In [None]:
# Build the application and push an application context so that `db.engine`
# can be used from the notebook. The context is pushed here and not popped in
# this cell, so it stays active for the cells that follow.
app = create_app()
app.app_context().push()

print("Loading data from database...")


def _read_table(table_name, description):
    """Read every row of ``table_name`` through ``db.engine`` into a DataFrame.

    Prints the number of rows loaded, labelled with ``description``, and
    returns the resulting DataFrame.
    """
    frame = pd.read_sql(f'SELECT * FROM {table_name}', db.engine)
    print(f"Loaded {len(frame)} {description}")
    return frame


# One DataFrame per table, loaded in a fixed order so the progress messages
# always appear in the same sequence.
routes_df = _read_table('routes', 'routes')
buses_df = _read_table('buses', 'buses')
trips_df = _read_table('trips', 'trips')
events_df = _read_table('events', 'events')
stops_df = _read_table('route_stops', 'route stops')
predictions_df = _read_table('predictions', 'predictions')

print("\nData loading completed!")


In [None]:
# Display basic information about the datasets: for each one, a schema /
# null-count summary followed by a preview of the first rows.
#
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after every summary.
_overview_frames = [
    ("ROUTES", routes_df),
    ("TRIPS", trips_df),
    ("EVENTS", events_df),
]

for _position, (_label, _frame) in enumerate(_overview_frames):
    _header = f"=== {_label} DATASET ==="
    # Every section after the first is separated from the previous one by a
    # blank line.
    print(_header if _position == 0 else f"\n{_header}")
    _frame.info()
    print("\nFirst few rows:")
    print(_frame.head())


## 2. Exploratory Data Analysis <a id="eda"></a>


In [None]:
# Convert timestamp columns to datetime so they support datetime arithmetic.
# The conversion is written back onto the loaded frames, so any later use of
# these columns sees datetime values.
trips_df['scheduled_start_time'] = pd.to_datetime(trips_df['scheduled_start_time'])
trips_df['actual_start_time'] = pd.to_datetime(trips_df['actual_start_time'])
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'])

# A trip counts as "on time" when it starts within this many minutes of its
# scheduled start, in either direction (early or late).
ON_TIME_THRESHOLD_MINUTES = 5

# Analyze trip delays.
# Only trips with a recorded actual start time can have a delay. The delay
# column is added with .assign(), which returns a new DataFrame; assigning a
# column onto the dropna() result directly can trigger SettingWithCopyWarning
# and leaves it ambiguous whether a copy or a view is being modified.
trips_with_delays = trips_df.dropna(subset=['actual_start_time']).assign(
    delay_minutes=lambda trips: (
        trips['actual_start_time'] - trips['scheduled_start_time']
    ).dt.total_seconds() / 60
)

# Positive values mean the trip started after its scheduled time, negative
# values mean it started before.
delay_series = trips_with_delays['delay_minutes']
on_time_share = (delay_series.abs() <= ON_TIME_THRESHOLD_MINUTES).mean()

print("=== TRIP DELAY ANALYSIS ===")
print(f"Average delay: {delay_series.mean():.2f} minutes")
print(f"Median delay: {delay_series.median():.2f} minutes")
print(f"Standard deviation: {delay_series.std():.2f} minutes")
print(f"On-time percentage (within {ON_TIME_THRESHOLD_MINUTES} minutes): {on_time_share * 100:.1f}%")

# Two side-by-side views of the trip delays: a histogram on the left and a
# box plot on the right. A dashed red line marks zero delay in both panels.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of delays.
hist_ax.hist(trips_with_delays['delay_minutes'], bins=50, alpha=0.7, edgecolor='black')
hist_ax.axvline(0, color='red', linestyle='--', label='On Time')
hist_ax.set_xlabel('Delay (minutes)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Trip Delays')
hist_ax.legend()

# Right panel: box plot of the same values.
box_ax.boxplot(trips_with_delays['delay_minutes'])
box_ax.set_ylabel('Delay (minutes)')
box_ax.set_title('Box Plot of Trip Delays')
box_ax.axhline(0, color='red', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()
