## Data Loading

In [6]:
import pandas as pd

# Read the pre-balanced train/test splits from the processed-data folder.
train_df = pd.read_csv('../data/processed/train_balanced_data.csv')
test_df = pd.read_csv('../data/processed/test_balanced_data.csv')

# The 'Accident' column is the label; every remaining column is a feature.
# test_df itself is left untouched (drop returns a new frame).
y_test = test_df.loc[:, 'Accident']
X_test = test_df.drop('Accident', axis=1)


## Model Comparison

In [10]:
import joblib
# The metric functions are imported here so this cell runs on a fresh kernel
# (Restart & Run All) instead of relying on an import from another cell.
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Paths to the previously saved models that are compared below
model_paths = {
    'Logistic Regression': '../src/models/logistic_regression_model.joblib',
    'Random Forest': '../src/models/random_forest_model.pkl'
}

# Column-oriented accumulator: each key becomes a DataFrame column and each
# evaluated model appends exactly one value to every list.
metrics = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'AUC': []
}

# Evaluate each model on the held-out test split
for model_name, path in model_paths.items():
    # NOTE: joblib.load unpickles arbitrary Python objects; only load model
    # files that come from a trusted source.
    model = joblib.load(path)

    # Hard class predictions feed the threshold-based metrics.
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for AUC.
    # Assumes a binary target whose positive class is the second entry of
    # model.classes_ -- TODO confirm.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Record one row of results for this model
    metrics['Model'].append(model_name)
    metrics['Accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['Precision'].append(precision_score(y_test, y_pred))
    metrics['Recall'].append(recall_score(y_test, y_pred))
    metrics['F1 Score'].append(f1_score(y_test, y_pred))
    metrics['AUC'].append(roc_auc_score(y_test, y_pred_proba))

# Convert metrics to a DataFrame for easier display; the bare last expression
# renders it as the cell output.
metrics_df = pd.DataFrame(metrics)
metrics_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,AUC
0,Logistic Regression,0.556338,0.677876,0.596573,0.634631,0.566925
1,Random Forest,0.577465,0.651639,0.742991,0.694323,0.526265


## Visualization of Model Comparison

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: one cluster of bars per model, one bar per metric column.
fig, ax = plt.subplots(figsize=(12, 8))
metrics_df.set_index('Model').plot(kind='bar', ax=ax)

# Label the figure so it can be read on its own.
ax.set_title('Model Comparison')
ax.set_ylabel('Score')
# Tilt the model names on the x axis.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(loc='lower right')
plt.show()
