In [42]:
import pandas as pd
import numpy as np

## Loading Data

In [43]:
# Load the raw table. YEAR is parsed as a datetime so the year can be pulled
# out with `.dt.year` once the columns have been renamed.
df = pd.read_csv('../data/fuel_consumption.csv', parse_dates=['YEAR'])

# Replace the one-letter fuel codes with readable names.
df['FUEL'] = df['FUEL'].replace({'X': 'Regular gasoline', 'Z': 'Premium gasoline', 'D': 'Diesel', 'E': 'Ethanol (E85)', 'N': 'Natural Gas'})

# The trailing digits of the transmission code are the number of gears.
# Codes without trailing digits get NaN (the original note counted 816 cars
# with a continuously variable transmission and therefore no gear count).
df['GEARS'] = df['TRANSMISSION'].str.extract(r'(\d+)$', expand=False)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the digits in place and make the
# code -> label mapping below match nothing.
df['TRANSMISSION'] = df['TRANSMISSION'].str.replace(r'\d+$', '', regex=True)
df['TRANSMISSION'] = df['TRANSMISSION'].replace({'A': 'Automatic', 'AM': 'Automated manual', 'AS': 'Automatic with select shift', 'AV': 'Continuously variable', 'M': 'Manual'})

# Drop the mpg column, then rename FUEL CONSUMPTION to CITY (L/100 km).
df = df.drop(columns=['COMB (mpg)'])
df = df.rename(columns={'FUEL CONSUMPTION': 'CITY (L/100 km)'})
df['MAKE'] = df['MAKE'].str.capitalize()

# Make the vehicle classes uniform: normalise the casing, then collapse every
# class containing one of the patterns below into a single label. The patterns
# are applied in this order, each on the result of the previous one.
df['VEHICLE CLASS'] = df['VEHICLE CLASS'].str.capitalize()
for pattern, vehicle_class in (
    ('Pickup truck', 'Pickup truck'),
    ('Station wagon', 'Station wagon'),
    ('Suv', 'SUV'),
    ('Van', 'Van'),
):
    df.loc[df['VEHICLE CLASS'].str.contains(pattern), 'VEHICLE CLASS'] = vehicle_class

# Give every column its display name.
df = df.rename(columns={
    'YEAR': 'Release year',
    'GEARS': 'Gears',
    'VEHICLE CLASS': 'Vehicle class',
    'MAKE': 'Make',
    'MODEL': 'Model',
    'ENGINE SIZE': 'Engine size (L)',
    'CYLINDERS': 'Cylinders',
    'TRANSMISSION': 'Transmission',
    'FUEL': 'Fuel',
    'CITY (L/100 km)': 'City (L/100 km)',
    'COMB (L/100 km)': 'Mixed consumption (L/100 km)',
    'HWY (L/100 km)': 'Highway (L/100 km)',
    'EMISSIONS': 'CO2 emissions (g/km)',
})
# Keep only the year number of the parsed date.
df['Release year'] = df['Release year'].dt.year

# Features (X) and the four regression targets (Y).
X = df[['Make', 'Release year', 'Vehicle class', 'Fuel', 'Transmission', 'Gears', 'Engine size (L)', 'Cylinders']]
Y = df[['CO2 emissions (g/km)', 'Mixed consumption (L/100 km)', 'City (L/100 km)', 'Highway (L/100 km)']]

  df['TRANSMISSION'] = df['TRANSMISSION'].str.replace(r'\d+$', '')


## Preprocessing

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split the feature columns by dtype. 'number' matches every numeric width, so
# an integer column that is not int64 (e.g. the int32 that `.dt.year` returns
# on pandas >= 2.0) is still scaled instead of being silently discarded by the
# ColumnTransformer, which drops any column it is not told about.
numerical = X.select_dtypes(include='number').columns.tolist()
categorical = X.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categorical columns and standardise the numerical ones.
# handle_unknown='ignore' makes the encoder emit an all-zero group, rather
# than raise, for a category that was not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('numerical', StandardScaler(), numerical),
    ]
)

X_fitted = preprocessor.fit_transform(X)
X_fitted.shape

(22556, 87)

## Mutual Information

In [45]:
from sklearn.feature_selection import mutual_info_regression

# Target column -> name of the column that holds its mutual-information scores.
mi_dict = {
    'CO2 emissions (g/km)': 'MI_co2',
    'Mixed consumption (L/100 km)': 'MI_mixed',
    'City (L/100 km)': 'MI_city',
    'Highway (L/100 km)': 'MI_highway',
}

# One score vector per target, evaluated in the order of the mi_dict entries.
mi_co2, mi_mixed, mi_city, mi_highway = (
    mutual_info_regression(X_fitted, Y[target]) for target in mi_dict
)

# Names of the encoded columns: categorical block first, numerical block second,
# i.e. the order in which the transformers are listed in the preprocessor.
cat_col, num_col = (
    preprocessor.named_transformers_[step].get_feature_names_out()
    for step in ('categorical', 'numerical')
)
col = np.concatenate((cat_col, num_col))

# One row per encoded feature, one score column per target.
mi_targets = pd.DataFrame({
    'Features': col,
    'MI_co2': mi_co2,
    'MI_mixed': mi_mixed,
    'MI_city': mi_city,
    'MI_highway': mi_highway,
})

In [46]:
import altair as alt
    
def plot_mi(mi_targets, target):
    """Bar chart of the ten features with the highest mutual information.

    Parameters
    ----------
    mi_targets : DataFrame with a 'Features' column and one score column per
        target, named as in the module-level `mi_dict`.
    target : key of `mi_dict` selecting which score column to plot.

    Returns
    -------
    The Altair bar chart, with bars ordered by decreasing score.
    """
    score_col = mi_dict[target]
    top_features = mi_targets.sort_values(by=score_col, ascending=False).head(10)
    bars = alt.Chart(top_features).mark_bar()
    return bars.encode(
        y=alt.Y('Features:N', sort='-x'),
        x=alt.X(f'{score_col}:Q', title=f'Mutual Information Score for {target}'),
    )

# Interactive variant (presumably for a Streamlit front end -- confirm):
# choice = st.selectbox('Select a target', ('CO2 emissions (g/km)', 'Mixed consumption (L/100 km)', 'City (L/100 km)', 'Highway (L/100 km)'))
# In the notebook a target is drawn at random instead.
choice = np.random.choice(list(mi_dict))
plot_mi(mi_targets, choice)

## Building Pipeline

In [47]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll.base import scope

In [48]:
from tempfile import mkdtemp
# Fresh temporary directory, passed to the pipeline as its `memory` location.
cachedir = mkdtemp()

# Preprocessing followed by an XGBoost regressor with default settings;
# the hyper-parameters are set later through `pipeline.set_params`.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor()),
    ],
    memory=cachedir,
)

In [51]:
# Search distributions, keyed by the bare XGBRegressor argument name.
regressor_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 18, 1)),
    'n_estimators': hp.choice('n_estimators', np.arange(100, 1000, 100)),
    'learning_rate': hp.choice('learning_rate', np.geomspace(0.01, 0.3, 10)),
    'gamma': hp.uniform ('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.4, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
}

# The keys carry the 'regressor__' prefix so that a sample can be passed
# straight to `pipeline.set_params`, which routes it to the 'regressor' step.
space = {f'regressor__{name}': dist for name, dist in regressor_space.items()}

def objective(space):
    """Objective for hyperopt: score one hyper-parameter sample.

    The sample is applied to the module-level `pipeline`, which is then scored
    on the CO2 emissions target with 3-fold cross-validation.

    Parameters
    ----------
    space : dict mapping pipeline parameter names to the sampled values.

    Returns
    -------
    dict with the negated mean R2 under 'loss' (hyperopt minimises the loss)
    and STATUS_OK under 'status'.
    """
    pipeline.set_params(**space)
    fold_scores = cross_val_score(
        pipeline,
        X,
        Y['CO2 emissions (g/km)'],
        scoring='r2',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    r2 = fold_scores.mean()
    print ("SCORE:", r2)
    return dict(loss=-r2, status=STATUS_OK)

In [52]:
# Records every evaluation made during the search.
trials = Trials()

# 100 evaluations of `objective` over `space`, proposed by the TPE algorithm.
result = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=1,
    show_progressbar=True,
)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                 
0.8166062313444541                                     
  1%|          | 1/100 [00:02<03:58,  2.41s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.4s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.808551887524612                                                                 
  2%|▏         | 2/100 [00:04<03:58,  2.44s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.4s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.7936807245507286                                                                
  3%|▎         | 3/100 [00:06<03:07,  1.93s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.8165130975642093                                                                
  4%|▍         | 4/100 [00:10<04:35,  2.87s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.3s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.7869049165837843                                                                
  5%|▌         | 5/100 [00:11<03:17,  2.08s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.7906791538926338                                                                
  6%|▌         | 6/100 [00:12<02:46,  1.77s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.7915963795794521                                                                
  7%|▋         | 7/100 [00:14<03:00,  1.94s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.3s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.7752420712744673                                                                
  8%|▊         | 8/100 [00:18<03:59,  2.60s/trial, best loss: -0.8166062313444541]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.0s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.8224541274381054                                                                
  9%|▉         | 9/100 [00:26<06:29,  4.28s/trial, best loss: -0.8224541274381054]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    8.0s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                            
0.8255409652871831                                                                
 10%|█         | 10/100 [00:36<09:11,  6.13s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   10.2s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8228209859117963                                                                 
 11%|█         | 11/100 [00:41<08:13,  5.55s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.2s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8038030525237487                                                                 
 12%|█▏        | 12/100 [00:53<11:02,  7.53s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   12.0s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8218540268627947                                                                 
 13%|█▎        | 13/100 [00:58<09:45,  6.73s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.9s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8197307605525456                                                                 
 14%|█▍        | 14/100 [01:09<11:38,  8.12s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   11.3s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8188710491860025                                                                 
 15%|█▌        | 15/100 [01:13<09:52,  6.97s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.3s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.7966262399175509                                                                 
 16%|█▌        | 16/100 [01:17<08:22,  5.99s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.7s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8185835280884154                                                                 
 17%|█▋        | 17/100 [01:27<09:59,  7.23s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   10.1s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8150479911485292                                                                 
 18%|█▊        | 18/100 [01:38<11:25,  8.36s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   11.0s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8229405047855476                                                                 
 19%|█▉        | 19/100 [01:49<12:26,  9.22s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   11.2s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8131707914773338                                                                 
 20%|██        | 20/100 [01:53<10:00,  7.50s/trial, best loss: -0.8255409652871831]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.5s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8259066013454098                                                                 
 21%|██        | 21/100 [02:07<12:36,  9.58s/trial, best loss: -0.8259066013454098]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   14.4s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.823819106225918                                                                  
 22%|██▏       | 22/100 [02:17<12:24,  9.54s/trial, best loss: -0.8259066013454098]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.4s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8223130224222666                                                                 
 23%|██▎       | 23/100 [02:26<12:03,  9.39s/trial, best loss: -0.8259066013454098]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.0s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.821960202202904                                                                  
 24%|██▍       | 24/100 [02:32<10:40,  8.42s/trial, best loss: -0.8259066013454098]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    6.2s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8266621848278667                                                                 
 25%|██▌       | 25/100 [02:44<12:01,  9.63s/trial, best loss: -0.8266621848278667]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   12.4s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.825146860369808                                                                  
 26%|██▌       | 26/100 [02:52<11:14,  9.11s/trial, best loss: -0.8266621848278667]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.9s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



SCORE:                                                                             
0.8280510797345363                                                                 
 27%|██▋       | 27/100 [03:03<11:33,  9.50s/trial, best loss: -0.8280510797345363]

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   10.4s finished

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



In [None]:
# Evaluate `space` at the assignment returned by fmin, giving the concrete
# parameter values keyed by pipeline parameter name ('regressor__...').
best_hyperparams = space_eval(space, result)
# Bare expression so the notebook displays the selected values.
best_hyperparams

{'regressor__colsample_bylevel': 0.6141123893478884,
 'regressor__colsample_bytree': 0.7544294982956999,
 'regressor__gamma': 2.404609402359236,
 'regressor__learning_rate': 0.045341755991148446,
 'regressor__max_depth': 17,
 'regressor__min_child_weight': 0.0,
 'regressor__n_estimators': 900,
 'regressor__reg_alpha': 40.0,
 'regressor__reg_lambda': 0.2587299493111584,
 'regressor__subsample': 0.8780109941605825}

In [None]:
# Apply the selected hyper-parameters and score the pipeline once more.
pipeline.set_params(**best_hyperparams)

# The scores get their own name: `result` still holds the fmin output that the
# `space_eval(space, result)` cell reads, so that cell stays re-runnable.
cv_scores = cross_val_score(pipeline, X, Y['CO2 emissions (g/km)'], verbose = 2, cv=3, scoring='r2', n_jobs=-1)
print(f'R2 Score : {round(cv_scores.mean()*100, 2)}%')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[CV] END .................................................... total time=   7.0s
[CV] END .................................................... total time=   7.0s
[CV] END .................................................... total time=   7.1s
R2 Score : 84.79%


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.1s finished
