In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import pickle
from datetime import datetime
import math

In [12]:
# Load the dataset
df = pd.read_csv('uber.csv')

# Basic cleaning: drop rows with any missing value, then keep only fares
# strictly between 0 and 200
df = df.dropna()
df = df[(df['fare_amount'] > 0) & (df['fare_amount'] < 200)]  # Remove outliers

# Convert datetime
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Feature engineering - adding demand features
df['hour'] = df['pickup_datetime'].dt.hour
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek  # Monday=0 ... Sunday=6
df['month'] = df['pickup_datetime'].dt.month

# New: Create time-based demand proxies (0/1 integer flags)
# Weekday (Mon-Fri) pickups between 07:00 and 09:59
df['is_morning_rush'] = ((df['hour'] >= 7) & (df['hour'] <= 9) & (df['day_of_week'] >= 0) & (df['day_of_week'] <= 4)).astype(int)
# Weekday (Mon-Fri) pickups between 17:00 and 19:59
df['is_evening_rush'] = ((df['hour'] >= 17) & (df['hour'] <= 19) & (df['day_of_week'] >= 0) & (df['day_of_week'] <= 4)).astype(int)
# Saturday/Sunday pickups at 22:00 or later, or at 03:59 or earlier.
# The whole mask is parenthesised before the cast: attribute access binds
# tighter than `&`, so an unparenthesised `.astype(int)` would apply only to
# the last operand instead of the combined condition.
df['is_weekend_night'] = (
    ((df['hour'] >= 22) | (df['hour'] <= 3))
    & ((df['day_of_week'] == 5) | (df['day_of_week'] == 6))
).astype(int)


In [13]:
# Calculate distance using Haversine formula
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    All four arguments are coordinates in decimal degrees. Every step is a
    NumPy ufunc or arithmetic operator, so scalars and array-likes (NumPy
    arrays, pandas Series) are both accepted; array inputs yield an
    element-wise result.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.

    Returns
    -------
    Distance in kilometers, computed on a sphere of radius 6371 km.
    """
    R = 6371  # Earth radius in kilometers

    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    # Clamp to the valid domain; in-range values are left untouched.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    distance = R * c

    return distance

# Apply distance calculation.
# haversine_distance is built entirely from NumPy ufuncs, so whole columns
# are passed at once; this avoids a Python-level call per row (df.apply with
# axis=1) while producing one distance per row, aligned to df's index.
df['distance_km'] = haversine_distance(
    df['pickup_latitude'], df['pickup_longitude'],
    df['dropoff_latitude'], df['dropoff_longitude']
)

# Remove rows with unrealistic distances (keep 0.1 km < distance < 300 km)
df = df[(df['distance_km'] > 0.1) & (df['distance_km'] < 300)]

In [17]:
# Model inputs, in column order: trip attributes, calendar fields, then the
# time-based demand-proxy flags
features = ['passenger_count', 'distance_km']
features += ['hour', 'day_of_week', 'month']
features += ['is_morning_rush', 'is_evening_rush', 'is_weekend_night']

# Target is the fare; predictors are the selected feature columns
y = df['fare_amount']
X = df[features]

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Baseline XGBoost regressor with a hand-picked set of hyperparameters
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

In [19]:
# Score the fitted model on the held-out test split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error
print(f"RMSE: {rmse:.2f}")

RMSE: 4.18


In [21]:
# Candidate values to search: 3 x 3 x 3 = 27 hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
}
model = xgb.XGBRegressor(random_state=42)

# 5-fold cross-validated grid search on the training split, scored by
# negated mean squared error
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the corresponding estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Hyperparameters: {best_params}")

# Test-set metrics for the selected model
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")


Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
RMSE: 4.18
R-squared: 0.81


In [22]:
# Persist the tuned estimator to disk as a binary pickle.
# NOTE: pickle files should only ever be loaded back from a trusted source.
with open('xgboost_fare_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)