# Electric Vehicle ML Model Experiments

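This notebook trains and evaluates classification models (predicting `is_bev`) and regression models (predicting `Electric Range`) on the feature-engineered EV dataset. Every run is tracked in MLflow, a Random Forest is tuned with grid search, and the best model of each kind is saved to disk.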

In [null]:
import os
import warnings

import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
warnings.filterwarnings('ignore')

In [null]:
# Configure MLflow
# Honor MLFLOW_TRACKING_URI when it is set, so the notebook can be pointed at a
# remote or file-based store without editing code; otherwise keep the local
# development server as the default.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))
mlflow.set_experiment("EV_Model_Experiments")

## 1. Load and Prepare Data

In [null]:
# Load feature-engineered data
df = pd.read_csv('../data/processed/ev_data_features.csv')

# Prediction targets. A target must never appear in the feature matrix of the
# model that predicts it: the model would simply read the answer from its
# input and report (near-)perfect scores without learning anything.
CLASSIFICATION_TARGET = 'is_bev'
REGRESSION_TARGET = 'Electric Range'

# Select features. The same matrix X feeds both the classification and the
# regression experiments, so BOTH targets are left out of it. To use
# 'Electric Range' as a classifier input (or 'is_bev' as a regressor input),
# build a separate feature matrix per task instead of adding it back here.
# NOTE(review): 'has_long_range' and 'manufacturer_avg_range' may be derived
# from 'Electric Range'; if so they still leak the regression target —
# confirm against the feature-engineering step.
feature_cols = [
    'Model Year', 'Base MSRP', 'vehicle_age',
    'is_luxury', 'has_long_range', 'city_ev_density',
    'county_ev_density', 'is_urban', 'manufacturer_market_share',
    'model_popularity', 'manufacturer_avg_range'
]

# Prepare features and targets from rows that are complete in every feature
# and in both targets, so X and the two target vectors stay row-aligned.
model_df = df[feature_cols + [CLASSIFICATION_TARGET, REGRESSION_TARGET]].dropna()
X = model_df[feature_cols]
y_classification = model_df[CLASSIFICATION_TARGET]  # Classification target
y_regression = model_df[REGRESSION_TARGET]  # Regression target

# Guard against the targets being re-added to the feature list later on.
assert not {CLASSIFICATION_TARGET, REGRESSION_TARGET} & set(X.columns), \
    "prediction target leaked into the feature matrix"

print(f"Features shape: {X.shape}")
print(f"Classification target distribution:\n{y_classification.value_counts()}")

## 2. Train-Test Split

In [null]:
# Hold-out fraction and seed shared by both experiments
split_options = {'test_size': 0.2, 'random_state': 42}

# Classification: stratify on the label so both parts keep the class balance
clf_parts = train_test_split(
    X, y_classification, stratify=y_classification, **split_options
)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = clf_parts

# Regression: plain random split
reg_parts = train_test_split(X, y_regression, **split_options)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = reg_parts

# Report the size of each part (taken from the classification split)
for part_label, part in (("Training", X_train_clf), ("Test", X_test_clf)):
    print(f"{part_label} set size: {len(part)}")

## 3. Classification Experiments

In [null]:
def evaluate_classifier(model, X_train, X_test, y_train, y_test, model_name):
    """Fit a classifier, score it on the test split, and record the run in MLflow.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the reported metrics.
        model_name: Used as the MLflow run name and as the ``model_type`` parameter.

    Returns:
        Tuple ``(model, accuracy)``: the fitted estimator and its test accuracy.
    """
    with mlflow.start_run(run_name=model_name):
        # Fit on the training split, then predict the held-out split
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Test-set metrics; precision/recall/F1 use binary averaging
        accuracy = accuracy_score(y_test, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, predictions, average='binary'
        )

        # 5-fold cross-validation on the training split only
        fold_scores = cross_val_score(model, X_train, y_train, cv=5)
        cv_mean, cv_std = fold_scores.mean(), fold_scores.std()

        # Run parameters
        run_params = (
            ("model_type", model_name),
            ("n_features", X_train.shape[1]),
        )
        for param_key, param_value in run_params:
            mlflow.log_param(param_key, param_value)

        # Run metrics, logged one by one in a fixed order
        run_metrics = (
            ("accuracy", accuracy),
            ("precision", precision),
            ("recall", recall),
            ("f1_score", f1),
            ("cv_mean", cv_mean),
            ("cv_std", cv_std),
        )
        for metric_key, metric_value in run_metrics:
            mlflow.log_metric(metric_key, metric_value)

        # Store the fitted estimator as a run artifact
        mlflow.sklearn.log_model(model, "model")

        # Console report
        print(f"\n{model_name} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"CV Score: {cv_mean:.4f} (+/- {cv_std:.4f})")

        return model, accuracy

In [null]:
# Candidate classifiers, keyed by the display name used for the MLflow run.
# All of them share the same seed so the comparison is repeatable.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42)
}

# Evaluate each candidate in insertion order; evaluate_classifier returns
# (fitted model, test accuracy), which is stored under the matching keys.
results = {
    clf_name: dict(zip(
        ('model', 'accuracy'),
        evaluate_classifier(
            estimator, X_train_clf, X_test_clf, y_train_clf, y_test_clf, clf_name
        ),
    ))
    for clf_name, estimator in classifiers.items()
}

## 4. Hyperparameter Tuning

In [null]:
# Search space for the Random Forest grid search (3 values per parameter)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

with mlflow.start_run(run_name="RF_GridSearch"):
    # Exhaustive 5-fold search over the grid, using every available core
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train_clf, y_train_clf)

    # Winning configuration and its mean cross-validated accuracy
    best_params, best_score = grid_search.best_params_, grid_search.best_score_

    # Accuracy of the refitted best estimator on the held-out split
    y_pred = grid_search.predict(X_test_clf)
    test_accuracy = accuracy_score(y_test_clf, y_pred)

    # Record the outcome of the search in MLflow
    mlflow.log_params(best_params)
    for metric_key, metric_value in (
        ("best_cv_score", best_score),
        ("test_accuracy", test_accuracy),
    ):
        mlflow.log_metric(metric_key, metric_value)
    mlflow.sklearn.log_model(grid_search.best_estimator_, "best_model")

    # Console report
    print("Best Parameters:")
    for param, value in best_params.items():
        print(f"{param}: {value}")
    print(f"\nBest CV Score: {best_score:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

## 5. Regression Experiments

In [null]:
def evaluate_regressor(model, X_train, X_test, y_train, y_test, model_name):
    """Train and evaluate a regressor with MLflow tracking.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_train, y_train: Training features and target values.
        X_test, y_test: Held-out features and target values used for the metrics.
        model_name: Display name; the MLflow run is named
            ``"<model_name>_Regression"`` and the name is logged as ``model_type``.

    Returns:
        Tuple ``(model, r2)``: the fitted estimator and its test-set R2 score.
    """

    with mlflow.start_run(run_name=f"{model_name}_Regression"):
        # Train model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Log parameters — the same keys evaluate_classifier records, so
        # classification and regression runs can be filtered the same way.
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("n_features", X_train.shape[1])

        # Log metrics
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2_score", r2)

        # Log model
        mlflow.sklearn.log_model(model, "model")

        print(f"\n{model_name} Regression Results:")
        print(f"RMSE: {rmse:.2f}")
        print(f"MAE: {mae:.2f}")
        print(f"R2 Score: {r2:.4f}")

        return model, r2

In [null]:
# Train regression models
# The regressor classes come from the notebook's top import cell, so this cell
# has no import of its own and every dependency is visible in one place.
regressors = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42)
}

# Evaluate every candidate on the regression split and keep the fitted model
# together with its test-set R2 score.
regression_results = {}
for name, reg in regressors.items():
    model, r2 = evaluate_regressor(
        reg, X_train_reg, X_test_reg, y_train_reg, y_test_reg, name
    )
    regression_results[name] = {'model': model, 'r2': r2}

## 6. Feature Importance Analysis

In [null]:
# Feature importances of the Random Forest classifier.
# The model is picked by name, not by score: the highest-scoring classifier is
# not guaranteed to expose `feature_importances_` (LogisticRegression does not).
best_model = results['Random Forest']['model']

# Take the feature names from the frame the model was fitted on, so names and
# importances always line up one-to-one.
feature_importance = pd.DataFrame({
    'feature': X_train_clf.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

# Plot feature importance
top_features = feature_importance.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
# barh draws the first row at the bottom; flip the axis so the most important
# feature is at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_title('Top 10 Feature Importances')
fig.tight_layout()
plt.show()

print("Top 5 Most Important Features:")
print(feature_importance.head())

## 7. Model Comparison Visualization

In [null]:
# Compare classification and regression models side by side
fig, (ax_clf, ax_reg) = plt.subplots(1, 2, figsize=(12, 5))

# Classification comparison
clf_scores = [results[name]['accuracy'] for name in results]
ax_clf.bar(list(results), clf_scores, color='skyblue')
ax_clf.set_ylabel('Accuracy')
ax_clf.set_title('Classification Model Comparison')
ax_clf.tick_params(axis='x', labelrotation=45)
# Zoom in to [0.8, 1.0] when every model scores that high, but lower the floor
# when a score falls below it — a fixed floor would silently hide that bar.
ax_clf.set_ylim(min([0.8] + [score - 0.05 for score in clf_scores]), 1.0)

# Regression comparison
reg_scores = [regression_results[name]['r2'] for name in regression_results]
ax_reg.bar(list(regression_results), reg_scores, color='lightcoral')
ax_reg.set_ylabel('R2 Score')
ax_reg.set_title('Regression Model Comparison')
ax_reg.tick_params(axis='x', labelrotation=45)
# Same idea with a 0.5 default floor; R2 can even be negative for a poor fit.
ax_reg.set_ylim(min([0.5] + [score - 0.05 for score in reg_scores]), 1.0)

fig.tight_layout()
plt.show()

## 8. Save Best Models

In [null]:
# Directory that receives the serialized models. It is created on demand:
# joblib.dump does not create missing parent directories and would raise
# FileNotFoundError on a fresh checkout.
model_dir = '../models/saved'
os.makedirs(model_dir, exist_ok=True)

# Save best classification model (highest test accuracy)
best_clf = max(results.items(), key=lambda x: x[1]['accuracy'])
joblib.dump(best_clf[1]['model'], os.path.join(model_dir, 'best_classifier.pkl'))
print(f"Best classifier saved: {best_clf[0]} with accuracy {best_clf[1]['accuracy']:.4f}")

# Save best regression model (highest test R2)
best_reg = max(regression_results.items(), key=lambda x: x[1]['r2'])
joblib.dump(best_reg[1]['model'], os.path.join(model_dir, 'best_regressor.pkl'))
print(f"Best regressor saved: {best_reg[0]} with R2 score {best_reg[1]['r2']:.4f}")

## 9. Model Experiment Summary

In [null]:
# Separators reused throughout the report
banner = "=" * 60
rule = "-" * 30

print(banner)
print("EXPERIMENT SUMMARY")
print(banner)

# One entry per model family:
# (section heading, results dict, metric label, key of the score in each result)
sections = (
    ("\nClassification Models:", results, "Accuracy", 'accuracy'),
    ("\nRegression Models:", regression_results, "R2 Score", 'r2'),
)
for heading, scores_by_model, metric_label, score_key in sections:
    print(heading)
    print(rule)
    for name, result in scores_by_model.items():
        print(f"{name:20} {metric_label}: {result[score_key]:.4f}")

# Winners selected when the best models were saved
print("\nBest Models:")
print(rule)
print(f"Classification: {best_clf[0]}")
print(f"Regression: {best_reg[0]}")
print(banner)