# Singapore Taxi Fare Prediction - ML Model Training

This notebook trains Random Forest and XGBoost models for taxi fare prediction in Singapore.

## What this notebook does:
1. Generates synthetic Singapore taxi trip data
2. Trains Random Forest and XGBoost models
3. Evaluates model performance
4. Saves the trained models to `singapore_taxi_models.pkl` so you can download them for local use


## Step 1: Install Required Packages


In [None]:
# Install required packages
!pip install scikit-learn xgboost pandas numpy matplotlib seaborn plotly


## Step 2: Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import pickle
from datetime import datetime, timedelta
import random
from math import radians, cos, sin, asin, sqrt
import warnings
warnings.filterwarnings('ignore')

print("✅ All libraries imported successfully!")


## Step 3: Generate Synthetic Singapore Taxi Data


In [None]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.

    Returns:
        The Haversine distance on a sphere of radius 6371 km, as a float.
    """
    earth_radius_km = 6371

    # The trigonometric functions below expect radians, not degrees.
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make asin() raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))

    return c * earth_radius_km

print("✅ Haversine function defined!")


In [None]:
def generate_singapore_taxi_data(n_samples=15000, seed=None):
    """Generate synthetic Singapore taxi trip data.

    Pickup and dropoff points are drawn uniformly from a bounding box around
    Singapore. The fare is derived from the Haversine distance, a travel time
    that assumes a 30 km/h average speed, the passenger count, peak-hour and
    weekend surcharges, and a final ±5% multiplicative noise factor.

    Args:
        n_samples: Number of trips to return.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so repeated calls return identical data. When None (the
            default), the module-level ``random`` generator is used.

    Returns:
        pandas.DataFrame with exactly ``n_samples`` rows and the columns
        listed in ``trip_columns`` below.
    """
    print(f"🚕 Generating {n_samples} synthetic taxi trips...")

    # The `random` module and a `random.Random` instance expose the same
    # uniform/randint/choices API, so either can back the generator.
    rng = random if seed is None else random.Random(seed)

    # Singapore coordinates bounds
    singapore_bounds = {
        'lat_min': 1.15, 'lat_max': 1.47,
        'lon_min': 103.6, 'lon_max': 104.1
    }
    min_trip_km = 0.5

    # Date range is loop-invariant, so compute it once.
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 12, 31)
    max_day_offset = (end_date - start_date).days

    trip_columns = [
        'pickup_lat', 'pickup_lon', 'dropoff_lat', 'dropoff_lon',
        'distance_km', 'duration_minutes', 'passengers',
        'hour', 'day_of_week', 'month',
        'is_peak_hour', 'is_weekend', 'fare_sgd'
    ]

    data = []

    for i in range(n_samples):
        if i % 2000 == 0:
            print(f"   Generated {i} trips...")

        # Redraw coordinates until the trip is long enough. Skipping short
        # trips instead would return fewer rows than the caller asked for.
        while True:
            pickup_lat = rng.uniform(singapore_bounds['lat_min'], singapore_bounds['lat_max'])
            pickup_lon = rng.uniform(singapore_bounds['lon_min'], singapore_bounds['lon_max'])
            dropoff_lat = rng.uniform(singapore_bounds['lat_min'], singapore_bounds['lat_max'])
            dropoff_lon = rng.uniform(singapore_bounds['lon_min'], singapore_bounds['lon_max'])

            distance = haversine_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
            if distance >= min_trip_km:
                break

        # Random pickup time within 2024
        random_date = start_date + timedelta(
            days=rng.randint(0, max_day_offset),
            hours=rng.randint(0, 23),
            minutes=rng.randint(0, 59)
        )

        hour = random_date.hour
        day_of_week = random_date.weekday()  # 0=Monday, 6=Sunday
        month = random_date.month

        # Estimate duration in minutes (30 km/h average speed)
        duration = (distance / 30) * 60

        # Passenger count (1-4), weighted towards single riders
        passengers = rng.choices([1, 2, 3, 4], weights=[0.6, 0.25, 0.1, 0.05])[0]

        is_peak_hour = (7 <= hour <= 9) or (18 <= hour <= 20)
        is_weekend = day_of_week in [5, 6]  # Saturday, Sunday

        # Fare components
        base_fare = 3.20
        distance_cost = 1.50 * distance
        time_cost = 0.40 * duration
        passenger_cost = (passengers - 1) * 0.50  # Additional cost per extra passenger

        fare = base_fare + distance_cost + time_cost + passenger_cost

        # Surcharges are multiplicative and stack when both apply
        if is_peak_hour:
            fare *= 1.20  # 20% peak hour surcharge
        if is_weekend:
            fare *= 1.10  # 10% weekend surcharge

        # ±5% noise so the target is not an exact function of the features
        fare *= rng.uniform(0.95, 1.05)
        fare = round(fare, 2)

        data.append({
            'pickup_lat': pickup_lat,
            'pickup_lon': pickup_lon,
            'dropoff_lat': dropoff_lat,
            'dropoff_lon': dropoff_lon,
            'distance_km': round(distance, 2),
            'duration_minutes': round(duration, 1),
            'passengers': passengers,
            'hour': hour,
            'day_of_week': day_of_week,
            'month': month,
            'is_peak_hour': int(is_peak_hour),
            'is_weekend': int(is_weekend),
            'fare_sgd': fare
        })

    print(f"✅ Generated {len(data)} taxi trips!")
    # Explicit columns keep the schema intact even when n_samples is 0.
    return pd.DataFrame(data, columns=trip_columns)

# Seed the generator so the synthetic dataset (and every metric derived from it)
# is identical on each Restart & Run All; 42 matches the random_state used below.
random.seed(42)

# Generate the data
df = generate_singapore_taxi_data(15000)
print(f"\n📊 Dataset shape: {df.shape}")
print(f"💰 Average fare: ${df['fare_sgd'].mean():.2f} SGD")
print(f"📏 Average distance: {df['distance_km'].mean():.2f} km")


In [None]:
# Preview the first rows and the per-column summary statistics
print("📋 Sample of generated data:")
display(df.head())

print("\n📊 Data summary:")
display(df.describe())

# Ranges of the target and the main driver, plus the share of surcharged trips
print("\n📈 Data Distribution:")
fare_min, fare_max = df['fare_sgd'].agg(['min', 'max'])
print(f"Fare range: ${fare_min:.2f} - ${fare_max:.2f} SGD")
distance_min, distance_max = df['distance_km'].agg(['min', 'max'])
print(f"Distance range: {distance_min:.2f} - {distance_max:.2f} km")
for label, flag_column in (("Peak hour", "is_peak_hour"), ("Weekend", "is_weekend")):
    flags = df[flag_column]
    print(f"{label} trips: {flags.sum()} ({flags.mean()*100:.1f}%)")


In [None]:
# Sanity-check the generated data before it is used for training.
# Each assertion encodes an invariant of the generator; a failure here means
# the data (not the model) is wrong.
assert not df.empty, "No taxi trips were generated"
assert df.notna().all().all(), "Generated data contains missing values"
assert df['distance_km'].ge(0.5).all(), "Trips shorter than 0.5 km should not be present"
assert df['fare_sgd'].gt(0).all(), "Fares must be positive"
assert df['passengers'].between(1, 4).all(), "Passenger count must be between 1 and 4"
assert df['hour'].between(0, 23).all(), "Hour must be in the range 0-23"

print("✅ Generated data passed all sanity checks!")


## Step 4: Feature Engineering & Model Training


In [None]:
def prepare_features(df):
    """Prepare features for machine learning.

    Derives polynomial, ratio, time-of-day and seasonal features from the raw
    trip columns. The input frame is not modified: derived columns are added
    to a new DataFrame.

    Args:
        df: DataFrame containing at least 'distance_km', 'duration_minutes',
            'passengers', 'hour', 'day_of_week', 'month', 'is_peak_hour',
            'is_weekend' and the target column 'fare_sgd'.

    Returns:
        Tuple ``(X, y, feature_columns)`` where ``X`` is the feature frame,
        ``y`` is the 'fare_sgd' series and ``feature_columns`` is the ordered
        list of column names in ``X``.

    Raises:
        KeyError: If any required input column is missing.
    """
    print("🔧 Preparing features for machine learning...")

    required_columns = [
        'distance_km', 'duration_minutes', 'passengers',
        'hour', 'day_of_week', 'month',
        'is_peak_hour', 'is_weekend', 'fare_sgd'
    ]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    hour = df['hour']
    month = df['month']

    # assign() returns a new frame, so the caller's DataFrame is never mutated.
    enriched = df.assign(
        distance_squared=df['distance_km'] ** 2,
        duration_squared=df['duration_minutes'] ** 2,
        # Small epsilon guards against division by zero for zero-minute trips
        distance_duration_ratio=df['distance_km'] / (df['duration_minutes'] + 1e-6),
        # Four non-overlapping 6-hour buckets covering the whole day
        is_morning=((hour >= 6) & (hour < 12)).astype(int),
        is_afternoon=((hour >= 12) & (hour < 18)).astype(int),
        is_evening=((hour >= 18) & (hour < 24)).astype(int),
        is_night=((hour >= 0) & (hour < 6)).astype(int),
        # November through March
        is_rainy_season=((month >= 11) | (month <= 3)).astype(int),
    )

    # Feature columns for training (order matters: models are fit on it)
    feature_columns = [
        'distance_km', 'duration_minutes', 'passengers',
        'hour', 'day_of_week', 'month',
        'is_peak_hour', 'is_weekend',
        'distance_squared', 'duration_squared', 'distance_duration_ratio',
        'is_morning', 'is_afternoon', 'is_evening', 'is_night',
        'is_rainy_season'
    ]

    print(f"✅ Created {len(feature_columns)} features")
    return enriched[feature_columns], enriched['fare_sgd'], feature_columns

# Build the feature matrix and target from a copy so `df` keeps only its raw columns
X, y, feature_columns = prepare_features(df.copy())
print(f"\n📊 Features shape: {X.shape}")
print(f"🎯 Target shape: {y.shape}")

# Hold out 20% of the trips for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n📊 Data split:")
print(f"   Training set: {len(X_train)} samples")
print(f"   Test set: {len(X_test)} samples")


In [None]:
# Test predictions with sample data
print("🧪 Testing model predictions with sample data:")
print("=" * 50)

def predict_fare(distance_km, duration_minutes, passengers=1, 
                hour=12, day_of_week=1, month=6, is_peak_hour=False, is_weekend=False):
    """Predict fare using both models."""
    
    # Create feature vector
    features = pd.DataFrame({
        'distance_km': [distance_km],
        'duration_minutes': [duration_minutes],
        'passengers': [passengers],
        'hour': [hour],
        'day_of_week': [day_of_week],
        'month': [month],
        'is_peak_hour': [int(is_peak_hour)],
        'is_weekend': [int(is_weekend)],
        'distance_squared': [distance_km ** 2],
        'duration_squared': [duration_minutes ** 2],
        'distance_duration_ratio': [distance_km / (duration_minutes + 1e-6)],
        'is_morning': [int(6 <= hour < 12)],
        'is_afternoon': [int(12 <= hour < 18)],
        'is_evening': [int(18 <= hour < 24)],
        'is_night': [int(0 <= hour < 6)],
        'is_rainy_season': [int((month >= 11) or (month <= 3))]
    })
    
    # Ensure correct column order
    features = features[feature_columns]
    
    # Make predictions
    rf_pred = rf_model.predict(features)[0]
    xgb_pred = xgb_model.predict(features)[0]
    
    return {
        'Random Forest': round(rf_pred, 2),
        'XGBoost': round(xgb_pred, 2),
        'Average': round((rf_pred + xgb_pred) / 2, 2)
    }

# Test cases
test_cases = [
    {
        'distance_km': 5.0,
        'duration_minutes': 15.0,
        'passengers': 1,
        'description': 'Marina Bay to Sentosa (Normal hours)'
    },
    {
        'distance_km': 10.0,
        'duration_minutes': 25.0,
        'passengers': 2,
        'description': 'Airport to City Center (2 passengers)'
    },
    {
        'distance_km': 3.0,
        'duration_minutes': 8.0,
        'passengers': 1,
        'hour': 8,
        'is_peak_hour': True,
        'description': 'Short trip during peak hours'
    }
]

for i, case in enumerate(test_cases, 1):
    print(f"\nTest {i}: {case['description']}")
    print(f"Distance: {case['distance_km']} km, Duration: {case['duration_minutes']} min")
    
    predictions = predict_fare(
        distance_km=case['distance_km'],
        duration_minutes=case['duration_minutes'],
        passengers=case['passengers'],
        hour=case.get('hour', 12),
        day_of_week=1,
        month=6,
        is_peak_hour=case.get('is_peak_hour', False),
        is_weekend=False
    )
    
    for model, prediction in predictions.items():
        print(f"   {model}: ${prediction} SGD")


In [None]:
# Create visualizations
print("📊 Creating performance visualizations...")

# Create comparison plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Random Forest predictions vs actual
axes[0, 0].scatter(y_test, rf_pred, alpha=0.5, color='blue')
axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0, 0].set_xlabel('Actual Fare (SGD)')
axes[0, 0].set_ylabel('Predicted Fare (SGD)')
axes[0, 0].set_title(f'Random Forest\\nR² = {rf_r2:.4f}')

# XGBoost predictions vs actual
axes[0, 1].scatter(y_test, xgb_pred, alpha=0.5, color='green')
axes[0, 1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0, 1].set_xlabel('Actual Fare (SGD)')
axes[0, 1].set_ylabel('Predicted Fare (SGD)')
axes[0, 1].set_title(f'XGBoost\\nR² = {xgb_r2:.4f}')

# Residuals plot for Random Forest
rf_residuals = y_test - rf_pred
axes[1, 0].scatter(rf_pred, rf_residuals, alpha=0.5, color='blue')
axes[1, 0].axhline(y=0, color='r', linestyle='--')
axes[1, 0].set_xlabel('Predicted Fare (SGD)')
axes[1, 0].set_ylabel('Residuals (SGD)')
axes[1, 0].set_title('Random Forest Residuals')

# Residuals plot for XGBoost
xgb_residuals = y_test - xgb_pred
axes[1, 1].scatter(xgb_pred, xgb_residuals, alpha=0.5, color='green')
axes[1, 1].axhline(y=0, color='r', linestyle='--')
axes[1, 1].set_xlabel('Predicted Fare (SGD)')
axes[1, 1].set_ylabel('Residuals (SGD)')
axes[1, 1].set_title('XGBoost Residuals')

plt.tight_layout()
plt.show()

print("✅ Visualizations created!")


In [None]:
# Feature importance analysis
print("🔍 Analyzing feature importance...")

# Random Forest feature importance
rf_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=True)

# XGBoost feature importance
xgb_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=True)

# Create feature importance plots
fig, axes = plt.subplots(1, 2, figsize=(15, 8))

# Random Forest feature importance
axes[0].barh(rf_importance['feature'], rf_importance['importance'], color='blue', alpha=0.7)
axes[0].set_title('Random Forest Feature Importance')
axes[0].set_xlabel('Importance')

# XGBoost feature importance
axes[1].barh(xgb_importance['feature'], xgb_importance['importance'], color='green', alpha=0.7)
axes[1].set_title('XGBoost Feature Importance')
axes[1].set_xlabel('Importance')

plt.tight_layout()
plt.show()

# Display top features
print("\n🏆 Top 5 Most Important Features:")
print("\nRandom Forest:")
for i, row in rf_importance.tail(5).iterrows():
    print(f"   {row['feature']}: {row['importance']:.4f}")

print("\nXGBoost:")
for i, row in xgb_importance.tail(5).iterrows():
    print(f"   {row['feature']}: {row['importance']:.4f}")

print("\n✅ Feature importance analysis completed!")


In [None]:
# Train Random Forest Model
print("🌲 Training Random Forest model...")

rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)
print("✅ Random Forest trained!")

# Train XGBoost Model
print("🚀 Training XGBoost model...")

xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)
print("✅ XGBoost trained!")


In [None]:
# Evaluate both models
print("📈 Evaluating model performance...")

# Random Forest evaluation
rf_pred = rf_model.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

# XGBoost evaluation
xgb_pred = xgb_model.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

print("\n🎯 Model Performance Results:")
print("=" * 50)
print(f"\n🌲 Random Forest:")
print(f"   RMSE: ${rf_rmse:.2f} SGD")
print(f"   MAE:  ${rf_mae:.2f} SGD")
print(f"   R²:   {rf_r2:.4f}")

print(f"\n🚀 XGBoost:")
print(f"   RMSE: ${xgb_rmse:.2f} SGD")
print(f"   MAE:  ${xgb_mae:.2f} SGD")
print(f"   R²:   {xgb_r2:.4f}")

# Check if models meet the proposal target (RMSE < $3.00)
print("\n🎯 Performance Check:")
print("-" * 30)
if rf_rmse < 3.0:
    print(f"✅ Random Forest: RMSE ${rf_rmse:.2f} < $3.00 (Target met!)")
else:
    print(f"⚠️  Random Forest: RMSE ${rf_rmse:.2f} >= $3.00 (Target not met)")
    
if xgb_rmse < 3.0:
    print(f"✅ XGBoost: RMSE ${xgb_rmse:.2f} < $3.00 (Target met!)")
else:
    print(f"⚠️  XGBoost: RMSE ${xgb_rmse:.2f} >= $3.00 (Target not met)")


In [None]:
# Persist the trained estimators together with everything needed to use them
print("💾 Saving trained models...")

# One bundle: both models, the feature order they expect, and their test metrics
model_data = dict(
    rf_model=rf_model,
    xgb_model=xgb_model,
    feature_columns=feature_columns,
    rf_rmse=rf_rmse,
    xgb_rmse=xgb_rmse,
    rf_mae=rf_mae,
    xgb_mae=xgb_mae,
    rf_r2=rf_r2,
    xgb_r2=xgb_r2,
)

# Serialize the bundle to a single pickle file
with open('singapore_taxi_models.pkl', 'wb') as pkl_file:
    pickle.dump(model_data, pkl_file)

summary_lines = [
    "✅ Models saved as 'singapore_taxi_models.pkl'",
    "\n📁 Files created:",
    "   - singapore_taxi_models.pkl (Complete model package)",
    "\n📋 Model Summary:",
    f"   Random Forest RMSE: ${rf_rmse:.2f}",
    f"   XGBoost RMSE: ${xgb_rmse:.2f}",
    f"   Features: {len(feature_columns)}",
    f"   Training samples: {len(X_train)}",
    f"   Test samples: {len(X_test)}",
    "\n🎉 Training completed successfully!",
    "Your models are ready for deployment! 🚀",
]
print("\n".join(summary_lines))


## Step 5: Download Instructions

### How to use the trained models in your local project:

1. **Download the model file**: Click on `singapore_taxi_models.pkl` in the file browser on the left
2. **Place it in your project**: Put the file in your `singapore-taxi-fare-prediction` folder
3. **Update your local code**: Use the model loading code below

### Model Loading Code for Local Project:

```python
import pickle
import pandas as pd
import numpy as np

def load_ml_models(path='singapore_taxi_models.pkl'):
    """Load the trained ML models.

    Args:
        path: Location of the pickle file. Defaults to
            'singapore_taxi_models.pkl' in the current working directory.

    Returns:
        The unpickled object. For a file written by the training notebook
        this is a dict with the keys 'rf_model', 'xgb_model',
        'feature_columns' and the 'rf_*' / 'xgb_*' metric values.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load model files you created yourself or otherwise fully trust.
    with open(path, 'rb') as f:
        model_data = pickle.load(f)

    return model_data

# Load models
# NOTE: the pickle stores the fitted RandomForestRegressor and XGBRegressor objects,
# so scikit-learn and xgboost must be installed locally for unpickling to work
# (presumably with versions compatible with the ones used for training — verify).
models = load_ml_models()
rf_model = models['rf_model']
xgb_model = models['xgb_model']
# Column order the models were fit on; build prediction inputs in this order
feature_columns = models['feature_columns']

print("✅ Models loaded successfully!")
print(f"Random Forest RMSE: ${models['rf_rmse']:.2f}")
print(f"XGBoost RMSE: ${models['xgb_rmse']:.2f}")
```
