In [64]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Lasso regression and feature scaling
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# RandomForest 
from sklearn.ensemble import RandomForestRegressor

In [50]:
# Load the raw dataset from the working directory.
# low_memory=False makes pandas parse the file in one pass so each column gets
# a single inferred dtype; the chunked default can produce mixed-type object
# columns and a dtype warning on wide files with text mixed into numeric fields.
df = pd.read_csv("dataset.csv", low_memory=False)

# Summarize column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91794 entries, 0 to 91793
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Car_Id                     91794 non-null  int64  
 1   Person_Id                  91794 non-null  int64  
 2   Trip                       91794 non-null  int64  
 3   GPS_Time                   91794 non-null  object 
 4   Device_Time                91794 non-null  object 
 5   GPS_Long                   91794 non-null  float64
 6   GPS_Lat                    91794 non-null  float64
 7   GPS_Speed_Ms               91794 non-null  float64
 8   GPS_HDOP                   91794 non-null  int64  
 9   GPS_Bearing                91794 non-null  float64
 10  Gx                         91794 non-null  float64
 11  Gy                         91794 non-null  float64
 12  Gz                         91794 non-null  float64
 13  G_Calibrated               91794 non-null  flo

  df = pd.read_csv("dataset.csv")


In [47]:
# Inspect the distinct raw values stored in Device_Cost_Km_Inst. The recorded
# output shows they are strings, including a non-numeric '∞' marker.
device_cost_col = df["Device_Cost_Km_Inst"]
device_cost_col.unique()

array(['0.11', '0.03', '0.08', '0.15', '0.16', '0.06', '0.04', '0.09',
       '0.19', '1.08', '0.71', '∞', '0.42', '0.36', '0.34', '0.22',
       '0.17', '0.2', '0.21', '0.12', '0.1', '0.07', '0.05', '0.02',
       '0.51', '3', '0.39', '0.27', '0.18', '0.14', '0.13', '0.24',
       '0.25', '0.33', '1.41', '1.04', '0.45', '1.6', '1.63', '1.46',
       '0.64', '0.37', '0.32', '0.23', '0.63', '1', '0.3', '2.99', '0.43',
       '0.26', '0.4', '0.69', '1.71', '1.21', '0.49', '0.74', '0.31',
       '0.35', '0.56', '0.81', '0.86', '0.66', '0.55', '1.39', '0.38',
       '3.99', '0.41', '1.53', '2.25', '0.84', '0.28', '2.02', '1.34',
       '1.17', '0.7', '0.59', '0.95', '0.92', '1.18', '0.65', '0.5',
       '0.75', '0.77', '2.11', '0.29', '0.48', '0.47', '2.8', '2.48',
       '1.22', '1.01', '1.2', '0.91', '0.85', '2.07', '2.83', '0.68',
       '1.97', '3.21', '1.59', '0.82', '0.73', '0.46', '0.52', '0.8',
       '0.44', '1.06', '1.88', '0.54', '1.66', '1.54', '1.67', '1.23',
       '0.83', '0

In [None]:
# Keep only the numeric columns and discard every row that contains a missing
# or infinite value. Written as one non-mutating chain (no inplace=True) so the
# cell produces the same frame each time it is re-run.
# NOTE(review): text columns are removed by select_dtypes before the inf
# replacement runs, so string markers such as the '∞' seen in
# Device_Cost_Km_Inst are never converted -- confirm that dropping those
# columns (rather than parsing them) is intended.
df = (
    df.select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)  # treat +/-inf as missing
    .dropna()
)

# Display (rows, columns) of the cleaned frame.
df.shape

(54839, 40)

In [53]:
# Separate the regression target from the feature columns.
# NOTE(review): every other numeric column stays in X, including
# identifier-like fields such as Car_Id and Trip -- confirm that these should
# be treated as predictive features.
target_col = "OBD_CO2_gkm_Instant"
X = df.drop(columns=[target_col])
y = df[target_col].values

In [54]:
# Standardize the features: learn the per-column scaling statistics from X,
# then apply that scaling to the same matrix.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [61]:
# Fit a Lasso regression on the standardized features and use the magnitude of
# each fitted coefficient as that feature's importance score.
lasso = Lasso(alpha=0.05).fit(X_scaled, y)
importance = lasso.coef_

# Label each coefficient with its column name, then take absolute values so
# the sign of the effect does not influence the ranking.
coef_by_feature = pd.Series(importance, index=X.columns)
feature_importance = coef_by_feature.abs()

In [66]:
# Order the features from the largest to the smallest importance score and
# print the ranking under a header line.
feature_importance_sorted = feature_importance.sort_values(ascending=False)
print(
    "\n===== LASSO Feature Ranking (Highest → Lowest Importance) =====",
    feature_importance_sorted,
    sep="\n",
)


===== LASSO Feature Ranking (Highest → Lowest Importance) =====
OBD_Speed_Km                 159.206847
OBD_Fuel_Flow_CCmin          151.121002
OBD_Engine_RPM                75.168528
OBD_Engine_Load               42.290711
KPL_Instant                   32.054430
Speed_RPM_Relation            28.549007
OBD_KPL_Instant               27.251528
OBD_Air_Pedal                 22.905260
Air_Drag_Force                20.997452
Acceleration_kmhs             11.732849
OBD_CO2_gkm_Average           11.031319
Car_Id                        10.449633
OBD_Adapter_Voltage            5.825275
Device_Fuel_Remaining          5.745388
GPS_HDOP                       5.392540
Reaction_Time                  4.394773
OBD_Trip_KPL_Average           4.394708
GPS_Speed_Km                   4.009265
OBD_Engine_Coolant_Temp_C      3.715442
Gz                             3.558256
Trip                           2.840212
Device_Trip_Dist_Km            1.869367
G_Calibrated                   1.275935
Gy             

In [65]:
# Fit a random forest on the same standardized features and rank the columns
# by the forest's feature_importances_ attribute.
forest = RandomForestRegressor(
    n_estimators=100,
    random_state=42,  # fixed seed so the ranking is repeatable
    n_jobs=-1,        # use all available cores
).fit(X_scaled, y)
importances = forest.feature_importances_

# Attach column names to the scores, then sort from highest to lowest.
rf_scores = pd.Series(importances, index=X.columns)
rf_feature_importance = rf_scores.abs()
rf_feature_importance_sorted = rf_feature_importance.sort_values(ascending=False)
print(
    "\n===== Random Forest Feature Ranking (Highest → Lowest Importance) =====",
    rf_feature_importance_sorted,
    sep="\n",
)


===== Random Forest Feature Ranking (Highest → Lowest Importance) =====
OBD_KPL_Instant              0.467362
OBD_Fuel_Flow_CCmin          0.406489
Air_Drag_Force               0.035330
OBD_Speed_Km                 0.031827
KPL_Instant                  0.030428
Acceleration_kmhs            0.008484
Speed_RPM_Relation           0.003133
Device_Trip_Dist_Km          0.001881
OBD_Engine_RPM               0.001184
GPS_Speed_Ms                 0.001011
OBD_Engine_Load              0.000983
Gx                           0.000942
Gz                           0.000836
G_Calibrated                 0.000766
OBD_Trip_KPL_Average         0.000691
GPS_Speed_Km                 0.000672
OBD_CO2_gkm_Average          0.000664
Device_Fuel_Remaining        0.000633
GPS_Bearing                  0.000603
OBD_Engine_Coolant_Temp_C    0.000595
OBD_Air_Pedal                0.000580
GPS_Long                     0.000573
Gy                           0.000507
GPS_Altitude_M               0.000478
Device_Baromete