VISUALIZATIONS TO CHECK OFF 

Confusion Matrix (Classification Problems)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

Feature Importance Visualization
import matplotlib.pyplot as plt
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(10, 6))
plt.barh(range(len(indices)), importances[indices])
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.title('Feature Importances')
plt.show()

SHAP (SHapley Additive exPlanations):
import shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

Permutation Importance:
from sklearn.inspection import permutation_importance
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
plt.barh(X_test.columns, result.importances_mean)
plt.title("Permutation Importances")
plt.show()

Model Performance Metrics and Comparison
ROC Curve (Receiver Operating Characteristic Curve):
Purpose: Visualize the trade-off between true positive rate and false positive rate.
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'AUC = {auc(fpr, tpr):.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

Precision-Recall Curve:
Purpose: Particularly useful in imbalanced datasets.
from sklearn.metrics import precision_recall_curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

Comparing Model Metrics Side-by-Side:
Purpose: Compare key metrics (accuracy, F1-score, RMSE) across different models in a table.
Example:
python
import pandas as pd
model_performance = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy': [0.85, 0.90, 0.88],
    'Precision': [0.82, 0.91, 0.87],
    'Recall': [0.78, 0.89, 0.85],
    'F1 Score': [0.80, 0.90, 0.86]
})
print(model_performance)

Residual Plots and Prediction vs. Actuals (Regression Problems)
Residual Plot: Shows errors by plotting residuals.
        Example:

        python

    residuals = y_test - y_pred
    sns.scatterplot(x=y_pred, y=residuals)
    plt.xlabel('Predicted')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')

Prediction vs. Actual Plot:

# Results and Analysis

---

## 1. Introduction

- Briefly summarize the goals of this notebook and what you aim to achieve.

## 2. Loading and Preparing Data

In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
# Load the test set results
logistics_results_df = pd.read_csv('../data/processed/engineered_data.csv')
logistics_results_df.head()

Unnamed: 0,Route ID,Delivery Time (hours),Date,Fuel Costs (USD),Delivery Start Time,Distance Traveled (miles),Estimated Distance (miles),Weather Conditions,Traffic Conditions,Driver Ratings,...,Breakdown Repair Costs (USD),Overtime Labor Costs (USD),Fuel Surcharge (USD),Idle Cost (USD),Total Operational Cost (USD),Fuel Cost per Mile,Delivery Efficiency Score,Cost per Mile,Delivery Efficiency,Route Length Category
0,89273afc-bd2f-41ed-a73d-ef0b92913719,6.834667,2023-06-19,860.972014,10:21:08,2600.678278,1347.343378,Light Rain,Severe,2.990486,...,660.421071,0.0,75.199379,0.219246,1729.539837,0.331057,0.040278,0.331057,380.512777,Long
1,78800389-d0f5-41e7-9ab3-4401f6c25c32,5.090882,2023-02-24,847.397208,06:44:56,1493.396998,1610.912518,Clear,Moderate,1.243085,...,0.0,0.0,74.013722,1.420456,1000.309343,0.567429,0.04107,0.567429,293.347423,Long
2,cc5a94e9-d53b-4d84-ba13-19d045c00e21,5.179179,2023-02-12,368.294777,16:45:48,495.929796,585.832494,Clear,Severe,1.334854,...,0.0,0.0,32.167757,2.569551,463.841459,0.742635,0.013087,0.742635,95.754527,Long
3,36b33d73-1c8e-4782-9775-84def74b3cd0,3.271822,2023-09-23,740.660213,01:47:40,2518.656016,2720.531979,Heavy Rain,Moderate,4.355089,...,0.0,0.0,64.691055,1.144174,960.391265,0.29407,0.054048,0.29407,769.80217,Long
4,1139533f-4a9f-4348-b05c-10a55d768e04,7.007333,2023-06-20,323.146483,13:03:09,2466.773297,1489.795422,Clear,Severe,2.61814,...,0.0,0.0,28.224396,0.813373,442.445597,0.131,0.036246,0.131,352.027398,Long


In [2]:
from sklearn.metrics import mean_absolute_error, r2_score

# Calculate and print evaluation metrics
mae = mean_absolute_error(logistics_results_df['Actual'], logistics_results_df['Predicted'])
r2 = r2_score(logistics_results_df['Actual'], logistics_results_df['Predicted'])
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")


KeyError: 'Actual'

In [None]:
import matplotlib.pyplot as plt

# Plot predictions vs actual values
plt.scatter(logistics_results_df['Actual'], logistics_results_df['Predicted'], alpha=0.5)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values")
plt.show()


In [None]:
# Plot a feature's effect (e.g., Cost per Mile) on predicted costs
sns.boxplot(data=logistics_results_df, x='Cost per Mile', y='Predicted')
plt.title("Predicted Costs by Cost per Mile")
plt.show()


# Summary