In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
import xgboost as xgb
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import plot_importance as xgb_plot_importance
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [None]:
# Load the trips table from all_trips.csv (path is relative to the working directory).
trips_df = pd.read_csv('all_trips.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
trips_df.info()

In [None]:
# Pairwise correlations between the numeric trip columns (features and fuel columns),
# rendered as an annotated heatmap on a fixed [-1, 1] colour scale.
numerical_columns = [
    'duration_min', 'mean_speed', 'mean_acceleration', 'max_acceleration',
    'mean_weight', 'elev_gain', 'distance_travelled', 'total_fuel_used_lit',
    'lit_per_100km',
]
correlation_matrix = trips_df.loc[:, numerical_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    ax=ax,
)
ax.set_title("Correlation Matrix")
plt.show()

# Check the cross-validated MAE separately on each data-source subset (before merging)

In [None]:
# Baseline check: 5-fold cross-validated MAE of a small Random Forest, fitted
# separately on the rows of each `source` value.
baseline_features = [
    'duration_min', 'mean_speed', 'mean_acceleration', 'max_acceleration',
    'mean_weight', 'elev_gain', 'distance_travelled',
]

for source in ('30min', '2hr'):
    subset = trips_df.loc[trips_df['source'] == source]

    # 5-fold CV needs at least five rows; report and move on otherwise.
    if len(subset) >= 5:
        X = subset[baseline_features]
        y = subset['total_fuel_used_lit']

        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regressor', RandomForestRegressor(n_estimators=50, random_state=42)),
        ])

        # The scorer returns negated MAE, so flip the sign before printing.
        scores = cross_val_score(
            pipeline, X, y,
            cv=5,
            scoring='neg_mean_absolute_error',
        )
        print(f"{source} MAE:", -np.mean(scores))
    else:
        print(f"Skipping {source} — too few samples ({len(subset)}).")

In [None]:
def plot_feature_distributions(X_train, X_test, y_train, y_test, feature_cols, target_col, source_col=None):
    """Compare the train and test splits.

    Prints the normalised value counts of ``source_col`` for both splits
    (only when ``source_col`` is given) and draws overlaid density histograms
    of the target for train and test.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Feature frames; must contain ``source_col`` when it is given.
    y_train, y_test : array-like
        Target values of each split.
    feature_cols : list of str
        Accepted for interface compatibility; not used by the current body.
    target_col : str
        Target name, used only in the plot title.
    source_col : str, optional
        Column whose per-split proportions are printed.
    """
    if source_col:
        print(f"\nSource distribution by '{source_col}':")
        for heading, frame in (("Train:", X_train), ("\nTest:", X_test)):
            print(heading)
            print(frame[source_col].value_counts(normalize=True).sort_index())

    plt.figure(figsize=(5, 3))
    for values, colour, label in ((y_train, 'blue', 'Train'), (y_test, 'red', 'Test')):
        sns.histplot(values, color=colour, label=label, stat='density', bins=30)
    plt.title(f'Distribution of Target ({target_col}) in Train vs Test')
    plt.legend()
    plt.show()

In [None]:
def train_and_evaluate(X_train, X_test, y_train, y_test, name, model_tuple,
                       scoring='neg_mean_squared_error', cv=5):
    """Grid-search one model, report its test MAE and plot diagnostics.

    Parameters
    ----------
    X_train, X_test : DataFrame or array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Matching targets.
    name : str
        Display name used in printed output and plot titles.
    model_tuple : tuple
        ``(estimator, param_grid)``; ``param_grid`` uses the estimator's own
        parameter names (it is re-prefixed here when a pipeline is built).
    scoring : str, default 'neg_mean_squared_error'
        Metric used by the grid search to pick the best parameters. Note the
        function *reports* MAE; pass ``'neg_mean_absolute_error'`` to select
        on the same metric that is reported.
    cv : int, default 5
        Number of cross-validation folds for the grid search.

    Returns
    -------
    float
        Mean absolute error of the best estimator on the test split.
    """
    model, param_grid = model_tuple

    # Linear / margin-based models are scale-sensitive, so wrap them in a
    # pipeline that standardises the features; the other models are fitted as-is.
    models_needing_scaling = (LinearRegression, Ridge, Lasso, ElasticNet, LinearSVR)
    if isinstance(model, models_needing_scaling):
        estimator = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', model)
        ])
        # Grid keys must address the pipeline step that holds the model.
        param_grid = {f'regressor__{k}': v for k, v in param_grid.items()}
    else:
        estimator = model

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1,
        error_score='raise'
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)

    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"{name} MAE: {mae:.4f}")

    plot_actual_vs_predicted(y_test, y_pred, name)

    # Label importances with the columns the model was actually fitted on.
    # Only fall back to the notebook-level `feature_cols` when the training
    # data carries no column names (e.g. a plain array).
    if hasattr(X_train, 'columns'):
        fitted_feature_names = list(X_train.columns)
    else:
        fitted_feature_names = list(feature_cols)

    final_model = best_model.named_steps['regressor'] if isinstance(best_model, Pipeline) else best_model
    plot_feature_importance(final_model, fitted_feature_names, name)

    return mae


In [None]:
def plot_feature_importance(best_model, feature_cols, name, colors='#8DA0CB'):
    """Plot feature importances (or signed coefficients) of a fitted model.

    The plot type is chosen in this order: XGBoost gain importance for an
    ``XGBRegressor``, ``feature_importances_`` for other tree-based models,
    ``coef_`` for linear models. Models exposing none of these only get a
    printed notice and no figure is created.

    Parameters
    ----------
    best_model : estimator
        Fitted model (the bare regressor, not a pipeline).
    feature_cols : sequence of str
        Feature names, in the column order the model was fitted on.
    name : str
        Model name used in the title / notice.
    colors : color or sequence of colors, optional
        Bar colour(s); a falsy value selects a default palette.
    """
    # Decide what can be plotted before opening a figure, so an unsupported
    # model does not leave an empty figure behind.
    if isinstance(best_model, xgb.XGBRegressor):
        plot_kind = 'xgb'
    elif hasattr(best_model, 'feature_importances_'):
        plot_kind = 'importances'
    elif hasattr(best_model, 'coef_'):
        plot_kind = 'coefficients'
    else:
        print(f"Model {name} does not support feature importances.")
        return

    fig, ax = plt.subplots(figsize=(8, 4))

    if plot_kind == 'xgb':
        # Pass `ax` explicitly: without it plot_importance opens its own
        # figure, leaving the one created above blank.
        xgb.plot_importance(best_model, ax=ax, importance_type='gain', xlabel='Average Gain',
                            height=0.5, grid=False, values_format="{v:.0f}",
                            color=colors if colors else 'C0')
        ax.set_title(f'{name}: Feature Importance (XGBoost)')

    elif plot_kind == 'importances':
        importances = best_model.feature_importances_
        indices = np.argsort(importances)[::-1]
        feature_names = np.array(feature_cols)[indices]
        bar_colors = colors if colors else sns.color_palette("Set2", len(feature_names))
        ax.barh(feature_names, importances[indices], height=0.5, color=bar_colors)
        ax.set_xlabel('Feature Importance')
        ax.set_title(f'{name}: Feature Importance')

    else:
        # ravel() tolerates estimators that store coefficients as shape (1, n_features).
        importances = np.ravel(best_model.coef_)
        indices = np.argsort(importances)
        feature_names = np.array(feature_cols)[indices]
        bar_colors = colors if colors else sns.color_palette("coolwarm", len(feature_names))
        ax.barh(feature_names, importances[indices], height=0.5, color=bar_colors)
        ax.set_xlabel('Coefficient Value (Signed)')
        ax.set_title(f'{name}: Signed Coefficient Importance')

    fig.tight_layout()
    plt.show()

In [None]:
def plot_actual_vs_predicted(y_test, y_pred, name):
    """Draw actual and predicted values as two lines over the test-sample position.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Model predictions, in the same order as ``y_test``.
    name : str
        Model name shown in the title.
    """
    sample_positions = range(len(y_test))

    fig, ax = plt.subplots(figsize=(5, 3))
    series = (
        (y_test, '#66C2A5', 'Actual'),
        (y_pred, 'orange', 'Predicted'),
    )
    for values, colour, label in series:
        ax.plot(sample_positions, values, color=colour, label=label, lw=2)

    ax.set_title(f'{name}: Actual vs Predicted')
    ax.set_xlabel('Example')
    ax.set_ylabel('Fuel Used (liters)')
    ax.legend(loc='upper left')
    ax.grid(False)
    fig.tight_layout()
    plt.show()

In [None]:
def plot_mae_comparison(results, title):
    """Bar chart of MAE per model, sorted from best (lowest) to worst.

    Parameters
    ----------
    results : dict
        Maps model name -> MAE.
    title : str
        Figure title.

    Returns
    -------
    None
        If ``results`` is empty a notice is printed and nothing is drawn.
    """
    if not results:
        print("No MAE results to plot.")
        return

    ranked = sorted(results.items(), key=lambda item: item[1])
    model_names = [model for model, _ in ranked]
    mae_values = [mae for _, mae in ranked]
    largest = max(mae_values)

    plt.figure(figsize=(10, 6))
    bars = plt.bar(model_names, mae_values, color=sns.color_palette("Accent", len(model_names)))
    plt.title(title, fontsize=14)
    plt.ylabel('Mean Absolute Error', fontsize=12)
    plt.xticks(rotation=45, ha='right', fontsize=10)

    # Offset the value labels relative to the data scale: a fixed absolute
    # offset pushes them far above the axes when the MAE values are small.
    label_offset = 0.01 * largest
    for bar, mae in zip(bars, mae_values):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_offset,
                 f'{mae:.2f}', ha='center', va='bottom', fontsize=9)

    # Leave headroom for the labels; skip when every MAE is 0 to avoid a
    # degenerate (0, 0) axis range.
    if largest > 0:
        plt.ylim(0, largest * 1.15)
    plt.tight_layout()
    plt.show()

In [None]:
# Candidate regressors and the hyper-parameter grid searched for each one.
# Every value is an (estimator, param_grid) tuple. Grid keys are the estimator's
# own parameter names. Every estimator that accepts a seed is seeded with 42 so
# reruns are comparable.
algorithms = {
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {
            'n_estimators': [100, 200],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'max_features': ['sqrt', 'log2']
        }
    ),
    'XGBoost': (
        xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
        {
            'n_estimators': [50, 100],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    ),
    'Ridge Regression': (
        Ridge(),
        {
            'alpha': [0.01, 0.1, 1.0, 10.0]
        }
    ),
    'Lasso Regression': (
        Lasso(max_iter=10000),
        {
            'alpha': [0.01, 0.1, 1.0, 10.0]
        }
    ),
    'Gradient Boosting': (
        GradientBoostingRegressor(random_state=42),
        {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5, 7]
        }
    ),
    'SVR': (
        # Seeded like the other stochastic estimators; LinearSVR was the only
        # one left unseeded.
        LinearSVR(random_state=42),
        {
            'C': [0.1, 1],
            'epsilon': [0.01, 0.1],
            'max_iter': [1000, 5000]
        }
    ),
    'Decision Tree': (
        DecisionTreeRegressor(random_state=42),
        {
            'max_depth': [3, 5, 10, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    ),
    'Linear Regression': (
        LinearRegression(),
        {}  # nothing to tune; the grid search then just cross-validates the single default configuration
    ),
    'Elastic Net': (
        ElasticNet(max_iter=10000),
        {
            'alpha': [0.01, 0.1, 1.0],
            'l1_ratio': [0.2, 0.5, 0.8]
        }
    )
}


# Model inputs, the optional stratification column, and the targets that are
# each modelled in turn.
feature_cols = ['duration_min', 'mean_speed', 'mean_acceleration', 'max_acceleration',
                'mean_weight', 'elev_gain', 'distance_travelled']
source_col = 'source'
target_cols = ['total_fuel_used_lit', 'lit_per_100km']

In [None]:
def split_data(target, random_state=42, test_size=0.2, source_col=None):
    """Split ``trips_df`` into train/test sets for one target and plot the split.

    Parameters
    ----------
    target : str
        Name of the target column in ``trips_df``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    source_col : str, optional
        When given, the split is stratified on this ``trips_df`` column and
        its train/test proportions are printed with the distribution report.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The feature frames never contain ``source_col``.
    """
    X = trips_df[feature_cols].copy()
    y = trips_df[target]
    source = trips_df[source_col] if source_col else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=source
    )

    if source_col:
        # The report needs the source labels, but the model inputs must not
        # contain them, so attach the column to report-only copies.
        report_train = X_train.assign(**{source_col: source.loc[X_train.index]})
        report_test = X_test.assign(**{source_col: source.loc[X_test.index]})
    else:
        report_train, report_test = X_train, X_test

    plot_feature_distributions(report_train, report_test, y_train, y_test,
                               feature_cols, target, source_col=source_col)

    if source_col:
        # errors='ignore': the column is only present when it was also listed
        # in feature_cols; previously this drop raised KeyError otherwise.
        X_train = X_train.drop(columns=[source_col], errors='ignore')
        X_test = X_test.drop(columns=[source_col], errors='ignore')

    return X_train, X_test, y_train, y_test

In [None]:
# For each target: split once, grid-search every configured algorithm on that
# same split, then compare the resulting test MAEs in one bar chart.
for target in target_cols:
    X_train, X_test, y_train, y_test = split_data(target)

    results = {}
    for name, model in algorithms.items():
        # `model` is the (estimator, param_grid) tuple stored in `algorithms`.
        mae = train_and_evaluate(X_train, X_test, y_train, y_test, name, model)
        results[name] = mae

    plot_mae_comparison(results, title=f"MAE for target: {target}")