In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os

In [5]:
# Load the modelling dataset. Despite its name, `raw_path` points at the
# processed-data folder (given relative to the current working directory).
raw_path = '../01 - data/02 - processed/'
processed_csv = os.path.join(raw_path, 'processed_data.csv')
df = pd.read_csv(processed_csv)

In [10]:
# Numerical predictors fed to every model, and the column being predicted.
features = ['Trip_Distance_km', 'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes']  # example selection
target = 'Trip_Price'

# Separate X and y
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors to compare (seeded where the estimator is stochastic)
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Train each model, report its test-set metrics, and keep them in `results`
# so the comparison below is done in code rather than by reading the printout.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}

    print(f"\n📊 Model: {name}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2:.2f}")

# Example: manual forecast for a single trip.
# Replace with actual input values.
# NOTE(review): a 1.3-minute duration for a 25 km trip looks implausible —
# confirm these placeholder values against the ranges present in the data.
# Indexing by `features` pins the column order to the one used for fitting,
# regardless of the key order in the dict literal.
sample_input = pd.DataFrame([{
    'Trip_Distance_km': 25,
    'Per_Km_Rate': 12.5,
    'Per_Minute_Rate': 2,
    'Trip_Duration_Minutes': 1.3
}])[features]

# Pick the model with the lowest test RMSE instead of hard-coding a name, so
# the choice follows the metrics if the data or the candidate list changes.
# (On a shared test set, lowest RMSE is also highest R².)
best_name = min(results, key=lambda model_name: results[model_name]["RMSE"])
best_model = models[best_name]
predicted_price = best_model.predict(sample_input)

print(f"\n🚕 Predicted Trip Price: ${predicted_price[0]:.2f}")


📊 Model: Linear Regression
MAE: 9.88
RMSE: 16.94
R²: 0.88

📊 Model: Random Forest
MAE: 6.03
RMSE: 11.97
R²: 0.94

📊 Model: Gradient Boosting
MAE: 5.60
RMSE: 13.26
R²: 0.92

🚕 Predicted Trip Price: $48.80
