In [1]:
# Import library
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Lasso library
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# RandomForest 
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the dataset CSV into a DataFrame and summarise its columns.
# low_memory=False makes pandas read the file in a single pass so each
# column's dtype is inferred from all of its values instead of chunk by chunk.
# NOTE(review): the saved output of this cell echoes the read_csv line the way
# a Python warning does (presumably a mixed-type DtypeWarning) — confirm it no
# longer appears after a Restart & Run All.
df = pd.read_csv("dataset.csv", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91794 entries, 0 to 91793
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Car_Id                     91794 non-null  int64  
 1   Person_Id                  91794 non-null  int64  
 2   Trip                       91794 non-null  int64  
 3   GPS_Time                   91794 non-null  object 
 4   Device_Time                91794 non-null  object 
 5   GPS_Long                   91794 non-null  float64
 6   GPS_Lat                    91794 non-null  float64
 7   GPS_Speed_Ms               91794 non-null  float64
 8   GPS_HDOP                   91794 non-null  int64  
 9   GPS_Bearing                91794 non-null  float64
 10  Gx                         91794 non-null  float64
 11  Gy                         91794 non-null  float64
 12  Gz                         91794 non-null  float64
 13  G_Calibrated               91794 non-null  flo

  df = pd.read_csv("dataset.csv")


In [None]:
# Columns removed before modelling.
# NOTE(review): confirm every name against df.info(); the saved ranking outputs
# in this notebook list "Device_Trip_Dist_Km", not "Device_Trip_Distance_Km".
columns_to_drop = [
    "Person_Id",
    "Trip",
    "Car_Id",
    "Device_Cost_Km_Trip",
    "KPL_Instant",
    "Speed_RPM_Relation",
    "Device_Trip_Distance_Km",
]

# Keep numeric columns only, treat +/-inf as missing, then drop every row that
# has a missing value. Replacing inf first lets a single dropna() cover both.
df = (
    df.select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Drop only the requested columns that are actually present and report the
# rest, so a misspelled name is visible instead of raising KeyError and so the
# cell can be re-run (on a re-run every name is reported as already absent).
missing_columns = [col for col in columns_to_drop if col not in df.columns]
if missing_columns:
    print(f"Warning: columns not found, nothing dropped for: {missing_columns}")
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])


In [5]:
# Separate the regression target from the predictor columns.
target_column = "OBD_CO2_gkm_Instant"
y = df[target_column].to_numpy()
X = df.drop(columns=[target_column])

In [6]:
# Standardize the features: learn the scaling on X, then apply it to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Fit a Lasso model on the standardized features and use the absolute value of
# each coefficient as that feature's importance.
lasso = Lasso(alpha=0.05).fit(X_scaled, y)
importance = lasso.coef_
feature_importance = pd.Series(np.abs(importance), index=X.columns)

In [None]:
# Rank the features from largest to smallest absolute Lasso coefficient
feature_importance_sorted = feature_importance.sort_values(ascending=False)
print(
    "\n===== LASSO Feature Ranking (Highest → Lowest Importance) =====",
    feature_importance_sorted,
    sep="\n",
)


===== LASSO Feature Ranking (Highest → Lowest Importance) =====
OBD_Speed_Km                 159.206847
OBD_Fuel_Flow_CCmin          151.121002
OBD_Engine_RPM                75.168528
OBD_Engine_Load               42.290711
KPL_Instant                   32.054430
Speed_RPM_Relation            28.549007
OBD_KPL_Instant               27.251528
OBD_Air_Pedal                 22.905260
Air_Drag_Force                20.997452
Acceleration_kmhs             11.732849
OBD_CO2_gkm_Average           11.031319
Car_Id                        10.449633
OBD_Adapter_Voltage            5.825275
Device_Fuel_Remaining          5.745388
GPS_HDOP                       5.392540
Reaction_Time                  4.394773
OBD_Trip_KPL_Average           4.394708
GPS_Speed_Km                   4.009265
OBD_Engine_Coolant_Temp_C      3.715442
Gz                             3.558256
Trip                           2.840212
Device_Trip_Dist_Km            1.869367
G_Calibrated                   1.275935
Gy             

In [8]:
# Random Forest feature importance.
# oob_score=True additionally scores every row using only the trees whose
# bootstrap sample did not contain it, which gives a generalisation estimate
# without a separate hold-out split.
forest = RandomForestRegressor(
    n_estimators=100, random_state=42, n_jobs=-1, oob_score=True
)
forest.fit(X_scaled, y)

# feature_importances_ are non-negative and sum to 1, so no abs() is needed.
importances = forest.feature_importances_
rf_feature_importance = pd.Series(importances, index=X.columns)
rf_feature_importance_sorted = rf_feature_importance.sort_values(ascending=False)
print("\n===== Random Forest Feature Ranking (Highest → Lowest Importance) =====")
print(rf_feature_importance_sorted)

# R^2 on the rows the forest was fitted on is optimistically biased; it is kept
# for reference (same variable name as before) and labelled as such.
r2_score = forest.score(X_scaled, y)
# Out-of-bag R^2 is the figure to quote as the model's quality.
# NOTE(review): if rows within a trip are strongly correlated, even the OOB
# estimate may be optimistic — consider a split grouped by trip.
oob_r2 = forest.oob_score_
print(f"\nRandom Forest R^2 Score (training data, optimistic): {r2_score:.4f}")
print(f"Random Forest R^2 Score (out-of-bag estimate): {oob_r2:.4f}")



===== Random Forest Feature Ranking (Highest → Lowest Importance) =====
OBD_KPL_Instant              0.467684
OBD_Fuel_Flow_CCmin          0.406568
Air_Drag_Force               0.035041
OBD_Speed_Km                 0.032293
KPL_Instant                  0.030099
Acceleration_kmhs            0.008590
Speed_RPM_Relation           0.003155
Device_Trip_Dist_Km          0.002039
OBD_Engine_RPM               0.001223
OBD_Engine_Load              0.001102
GPS_Speed_Ms                 0.001037
Gx                           0.000850
G_Calibrated                 0.000775
Gz                           0.000754
GPS_Speed_Km                 0.000720
Device_Fuel_Remaining        0.000677
OBD_Air_Pedal                0.000674
OBD_Trip_KPL_Average         0.000673
OBD_CO2_gkm_Average          0.000641
GPS_Long                     0.000560
OBD_Engine_Coolant_Temp_C    0.000554
GPS_Bearing                  0.000519
Gy                           0.000507
GPS_Altitude_M               0.000477
Device_Baromete