# Singapore Taxi Fare Prediction - ML Model Training

This notebook trains Random Forest and XGBoost models for taxi fare prediction in Singapore.

## What this notebook does:
1. Generates synthetic Singapore taxi trip data
2. Trains Random Forest and XGBoost models
3. Evaluates model performance
4. Saves the trained models to `singapore_taxi_models.pkl` so they can be downloaded for local use


## Step 1: Install Required Packages


In [None]:
# Install required packages
!pip install scikit-learn xgboost pandas numpy matplotlib seaborn plotly


## Step 2: Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import pickle
from datetime import datetime, timedelta
import random
from math import radians, cos, sin, asin, sqrt
import warnings
warnings.filterwarnings('ignore')

print("✅ All libraries imported successfully!")


## Step 3: Generate Synthetic Singapore Taxi Data


In [None]:
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Return the great-circle distance between two points (Haversine formula).

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.
        radius_km: Radius of the sphere in kilometres. Defaults to 6371, the
            Earth radius this notebook has always used.

    Returns:
        The distance as a float, in the same unit as ``radius_km``.
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt()/asin() raise ValueError.
    a = min(1.0, max(0.0, a))

    return 2 * asin(sqrt(a)) * radius_km

print("✅ Haversine function defined!")


In [None]:
def generate_singapore_taxi_data(n_samples=15000, seed=None):
    """Generate synthetic taxi trips inside a Singapore bounding box.

    Pickup and drop-off points are drawn uniformly from the bounding box and
    the trip distance comes from ``haversine_distance``. Candidate trips
    shorter than 0.5 km are rejected and re-drawn, so the result always has
    the requested number of rows. Duration is derived from distance at a
    constant 30 km/h. The fare is base fare + per-km cost + per-minute cost +
    extra-passenger cost, multiplied by a peak-hour surcharge (x1.20), a
    weekend surcharge (x1.10) and a uniform noise factor in [0.95, 1.05].

    Args:
        n_samples: Number of trips (rows) to return.
        seed: Optional seed. ``None`` (the default) draws from the
            module-level ``random`` generator, so a prior ``random.seed(...)``
            call is honoured. Any other value seeds a private
            ``random.Random`` instance, giving a reproducible result without
            touching the global random state.

    Returns:
        pandas.DataFrame with ``n_samples`` rows (none when ``n_samples <= 0``)
        and the columns pickup_lat, pickup_lon, dropoff_lat, dropoff_lon,
        distance_km, duration_minutes, passengers, hour, day_of_week, month,
        is_peak_hour, is_weekend and fare_sgd.
    """
    print(f"🚕 Generating {n_samples} synthetic taxi trips...")

    # The `random` module exposes the same uniform/randint/choices API as a
    # `random.Random` instance, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    # Singapore coordinates bounds
    singapore_bounds = {
        'lat_min': 1.15, 'lat_max': 1.47,
        'lon_min': 103.6, 'lon_max': 104.1
    }

    # Loop-invariant settings, computed once.
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 12, 31)
    day_span = (end_date - start_date).days
    min_distance_km = 0.5
    average_speed_kmh = 30
    base_fare = 3.20

    columns = [
        'pickup_lat', 'pickup_lon', 'dropoff_lat', 'dropoff_lon',
        'distance_km', 'duration_minutes', 'passengers',
        'hour', 'day_of_week', 'month',
        'is_peak_hour', 'is_weekend', 'fare_sgd'
    ]

    data = []

    # Rejection sampling: keep drawing until n_samples trips were accepted.
    while len(data) < n_samples:
        # Generate random pickup and dropoff coordinates within the bounds
        pickup_lat = rng.uniform(singapore_bounds['lat_min'], singapore_bounds['lat_max'])
        pickup_lon = rng.uniform(singapore_bounds['lon_min'], singapore_bounds['lon_max'])
        dropoff_lat = rng.uniform(singapore_bounds['lat_min'], singapore_bounds['lat_max'])
        dropoff_lon = rng.uniform(singapore_bounds['lon_min'], singapore_bounds['lon_max'])

        # Calculate distance using Haversine formula
        distance = haversine_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

        # Reject very short trips and draw again
        if distance < min_distance_km:
            continue

        # Progress message, emitted once per 2000 accepted trips
        if len(data) % 2000 == 0:
            print(f"   Generated {len(data)} trips...")

        # Generate random datetime within 2024
        random_date = start_date + timedelta(
            days=rng.randint(0, day_span),
            hours=rng.randint(0, 23),
            minutes=rng.randint(0, 59)
        )

        # Calendar features
        hour = random_date.hour
        day_of_week = random_date.weekday()  # 0=Monday, 6=Sunday
        month = random_date.month

        # Estimate duration in minutes from the constant average speed
        duration = (distance / average_speed_kmh) * 60

        # Passenger count (1-4), weighted towards single riders
        passengers = rng.choices([1, 2, 3, 4], weights=[0.6, 0.25, 0.1, 0.05])[0]

        # Peak hours: 07:00-09:59 and 18:00-20:59
        is_peak_hour = (7 <= hour <= 9) or (18 <= hour <= 20)

        # Weekend: Saturday or Sunday
        is_weekend = day_of_week in [5, 6]

        # Fare components
        distance_cost = 1.50 * distance
        time_cost = 0.40 * duration
        passenger_cost = (passengers - 1) * 0.50  # per extra passenger
        fare = base_fare + distance_cost + time_cost + passenger_cost

        # Apply surcharges
        if is_peak_hour:
            fare *= 1.20  # 20% peak hour surcharge
        if is_weekend:
            fare *= 1.10  # 10% weekend surcharge

        # Add +/-5% multiplicative noise, then round to cents
        fare *= rng.uniform(0.95, 1.05)
        fare = round(fare, 2)

        data.append({
            'pickup_lat': pickup_lat,
            'pickup_lon': pickup_lon,
            'dropoff_lat': dropoff_lat,
            'dropoff_lon': dropoff_lon,
            'distance_km': round(distance, 2),
            'duration_minutes': round(duration, 1),
            'passengers': passengers,
            'hour': hour,
            'day_of_week': day_of_week,
            'month': month,
            'is_peak_hour': int(is_peak_hour),
            'is_weekend': int(is_weekend),
            'fare_sgd': fare
        })

    print(f"✅ Generated {len(data)} taxi trips!")
    # Passing `columns` keeps the schema even when no rows were generated.
    return pd.DataFrame(data, columns=columns)

# Seed Python's `random` module before generating, so the synthetic dataset
# (and every metric computed from it) is identical on each Restart & Run All.
random.seed(42)

# Generate the data
df = generate_singapore_taxi_data(15000)
print(f"\n📊 Dataset shape: {df.shape}")
print(f"💰 Average fare: ${df['fare_sgd'].mean():.2f} SGD")
print(f"📏 Average distance: {df['distance_km'].mean():.2f} km")


In [None]:
# Show the first rows, then summary statistics, each under its own heading.
for _heading, _table in (
    ("📋 Sample of generated data:", df.head()),
    ("\n📊 Data summary:", df.describe()),
):
    print(_heading)
    display(_table)


## Step 4: Feature Engineering & Model Training


In [None]:
def prepare_features(df):
    """Derive model features from a trip DataFrame without modifying it.

    Adds polynomial terms (squared distance and duration), a
    distance/duration ratio, time-of-day indicator columns and a seasonal
    indicator, then selects the training columns.

    Args:
        df: DataFrame with at least the columns distance_km,
            duration_minutes, passengers, hour, day_of_week, month,
            is_peak_hour, is_weekend and fare_sgd. It is left untouched; the
            derived columns are added to a new frame.

    Returns:
        Tuple ``(X, y, feature_columns)``: the feature DataFrame, the
        ``fare_sgd`` target Series, and the ordered list of feature names.

    Raises:
        KeyError: If any required input column is missing.
    """
    print("🔧 Preparing features for machine learning...")

    required_columns = [
        'distance_km', 'duration_minutes', 'passengers',
        'hour', 'day_of_week', 'month',
        'is_peak_hour', 'is_weekend', 'fare_sgd'
    ]
    missing = [name for name in required_columns if name not in df.columns]
    if missing:
        raise KeyError(f"prepare_features: missing required columns: {missing}")

    hour = df['hour']
    month = df['month']

    # `assign` returns a new DataFrame, so the caller's frame is not mutated.
    enriched = df.assign(
        # Polynomial / interaction features
        distance_squared=df['distance_km'] ** 2,
        duration_squared=df['duration_minutes'] ** 2,
        # The 1e-6 term guards against division by zero for zero-minute trips.
        distance_duration_ratio=df['distance_km'] / (df['duration_minutes'] + 1e-6),
        # Time-of-day buckets: [6,12), [12,18), [18,24), [0,6)
        is_morning=((hour >= 6) & (hour < 12)).astype(int),
        is_afternoon=((hour >= 12) & (hour < 18)).astype(int),
        is_evening=((hour >= 18) & (hour < 24)).astype(int),
        is_night=((hour >= 0) & (hour < 6)).astype(int),
        # Seasonal flag: November through March
        is_rainy_season=((month >= 11) | (month <= 3)).astype(int),
    )

    # Feature columns for training (order matters for the fitted models)
    feature_columns = [
        'distance_km', 'duration_minutes', 'passengers',
        'hour', 'day_of_week', 'month',
        'is_peak_hour', 'is_weekend',
        'distance_squared', 'duration_squared', 'distance_duration_ratio',
        'is_morning', 'is_afternoon', 'is_evening', 'is_night',
        'is_rainy_season'
    ]

    print(f"✅ Created {len(feature_columns)} features")
    return enriched[feature_columns], enriched['fare_sgd'], feature_columns

# Build the feature matrix and target from a copy of the generated trips
X, y, feature_columns = prepare_features(df.copy())
print(
    f"\n📊 Features shape: {X.shape}",
    f"🎯 Target shape: {y.shape}",
    sep="\n",
)

# Hold out 20% of the rows for testing; fixed random_state for a stable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(
    f"\n📊 Data split:",
    f"   Training set: {X_train.shape[0]} samples",
    f"   Test set: {X_test.shape[0]} samples",
    sep="\n",
)


In [None]:
# --- Random Forest -----------------------------------------------------------
print("🌲 Training Random Forest model...")

# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
print("✅ Random Forest trained!")

# --- XGBoost -----------------------------------------------------------------
print("🚀 Training XGBoost model...")

xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
print("✅ XGBoost trained!")


In [None]:
# Score both models on the held-out test set
print("📈 Evaluating model performance...")


def _regression_scores(y_true, y_pred):
    """Return the (RMSE, MAE, R²) triple for one set of predictions."""
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Random Forest evaluation
rf_pred = rf_model.predict(X_test)
rf_rmse, rf_mae, rf_r2 = _regression_scores(y_test, rf_pred)

# XGBoost evaluation
xgb_pred = xgb_model.predict(X_test)
xgb_rmse, xgb_mae, xgb_r2 = _regression_scores(y_test, xgb_pred)

print("\n🎯 Model Performance Results:")
print("=" * 50)
print(
    f"\n🌲 Random Forest:",
    f"   RMSE: ${rf_rmse:.2f} SGD",
    f"   MAE:  ${rf_mae:.2f} SGD",
    f"   R²:   {rf_r2:.4f}",
    sep="\n",
)

print(
    f"\n🚀 XGBoost:",
    f"   RMSE: ${xgb_rmse:.2f} SGD",
    f"   MAE:  ${xgb_mae:.2f} SGD",
    f"   R²:   {xgb_r2:.4f}",
    sep="\n",
)

# Compare each RMSE with the proposal target (RMSE < $3.00)
print("\n🎯 Performance Check:")
print("-" * 30)
print(
    f"✅ Random Forest: RMSE ${rf_rmse:.2f} < $3.00 (Target met!)"
    if rf_rmse < 3.0
    else f"⚠️  Random Forest: RMSE ${rf_rmse:.2f} >= $3.00 (Target not met)"
)
print(
    f"✅ XGBoost: RMSE ${xgb_rmse:.2f} < $3.00 (Target met!)"
    if xgb_rmse < 3.0
    else f"⚠️  XGBoost: RMSE ${xgb_rmse:.2f} >= $3.00 (Target not met)"
)


In [None]:
# Persist both fitted models together with their metadata
print("💾 Saving trained models...")

# Everything the loading side needs: the estimators, the feature order they
# were trained on, and the test-set metrics.
model_data = dict(
    rf_model=rf_model,
    xgb_model=xgb_model,
    feature_columns=feature_columns,
    rf_rmse=rf_rmse,
    xgb_rmse=xgb_rmse,
    rf_mae=rf_mae,
    xgb_mae=xgb_mae,
    rf_r2=rf_r2,
    xgb_r2=xgb_r2,
)

# Write the bundle as a single pickle file
with open('singapore_taxi_models.pkl', 'wb') as bundle_file:
    pickle.dump(model_data, bundle_file)

print(
    "✅ Models saved as 'singapore_taxi_models.pkl'",
    "\n📁 Files created:",
    "   - singapore_taxi_models.pkl (Complete model package)",
    "\n📋 Model Summary:",
    f"   Random Forest RMSE: ${rf_rmse:.2f}",
    f"   XGBoost RMSE: ${xgb_rmse:.2f}",
    f"   Features: {len(feature_columns)}",
    f"   Training samples: {len(X_train)}",
    f"   Test samples: {len(X_test)}",
    sep="\n",
)

print("\n🎉 Training completed successfully!")
print("Your models are ready for deployment! 🚀")


## Step 5: Download Instructions

### How to use the trained models in your local project:

1. **Download the model file**: In the file browser on the left, right-click `singapore_taxi_models.pkl` and choose **Download**
2. **Place it in your project**: Put the file in your `singapore-taxi-fare-prediction` folder
3. **Update your local code**: Use the model loading code below

### Model Loading Code for Local Project:

```python
import pickle
import pandas as pd
import numpy as np

def load_ml_models(path='singapore_taxi_models.pkl'):
    """Load the pickled model bundle produced by the training notebook.

    Args:
        path: Location of the pickle file. Defaults to
            'singapore_taxi_models.pkl' in the current working directory.

    Returns:
        The unpickled object. The training notebook stores a dict with the
        keys 'rf_model', 'xgb_model', 'feature_columns' and the metrics
        'rf_rmse', 'xgb_rmse', 'rf_mae', 'xgb_mae', 'rf_r2', 'xgb_r2'.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load a bundle you created yourself or got from a trusted source.
    # NOTE: unpickling the estimators requires scikit-learn and xgboost to be
    # installed locally, ideally in the versions used for training.
    with open(path, 'rb') as f:
        model_data = pickle.load(f)

    return model_data

# Load models
models = load_ml_models()
rf_model = models['rf_model']
xgb_model = models['xgb_model']
feature_columns = models['feature_columns']

print("✅ Models loaded successfully!")
print(f"Random Forest RMSE: ${models['rf_rmse']:.2f}")
print(f"XGBoost RMSE: ${models['xgb_rmse']:.2f}")
```
