In [85]:
import pandas as pd
import numpy as np

In [86]:
df = pd.read_csv('data/fuel_consumption.csv', parse_dates=['YEAR'])
# Change Type of fuel to name
df['FUEL'] = df['FUEL'].replace({'X': 'Regular gasoline', 'Z': 'Premium gasoline', 'D': 'Diesel', 'E': 'Ethanol (E85)', 'N': 'Natural Gas'})
# Extract last caracter of transmission as number of gears
# ie. 816 cars have continuous variable transmission and don't have a number of gears
df['GEARS'] = df['TRANSMISSION'].str.extract(r'(\d+)$', expand=False)
df['TRANSMISSION'] = df['TRANSMISSION'].str.replace(r'\d+$', '')
df['TRANSMISSION'] = df['TRANSMISSION'].replace({'A': 'Automatic', 'AM': 'Automated manual', 'AS': 'Automatic with select shift', 'AV': 'Continuously variable', 'M': 'Manual'})
# Rename FUEL CONSUMPTION to CITY (L/100 km)
df = df.drop(columns=['COMB (mpg)'], axis = 1)
df = df.rename(columns={'FUEL CONSUMPTION': 'CITY (L/100 km)'})
df['MAKE'] = df['MAKE'].str.capitalize()

# Uniformize vehicle class
df['VEHICLE CLASS'] = df['VEHICLE CLASS'].str.capitalize()
df.loc[df['VEHICLE CLASS'].str.contains('Pickup truck'), 'VEHICLE CLASS'] = 'Pickup truck'
df.loc[df['VEHICLE CLASS'].str.contains('Station wagon'), 'VEHICLE CLASS'] = 'Station wagon'
df.loc[df['VEHICLE CLASS'].str.contains('Suv'), 'VEHICLE CLASS'] = 'SUV'
df.loc[df['VEHICLE CLASS'].str.contains('Van'), 'VEHICLE CLASS'] = 'Van'

# rename YEAR, VEHICLE CLASS, MAKE, MODEL, ENGINE SIZE, CYLINDERS, TRANSMISSION, FUEL, CITY (L/100 km), HWY (L/100 km), COMB (L/100 km), CO2 EMISSIONS (g/km)
df = df.rename(columns={'YEAR': 'Release year', 'GEARS' : 'Gears', 'VEHICLE CLASS': 'Vehicle class', 'MAKE': 'Make', 'MODEL': 'Model', 'ENGINE SIZE': 'Engine size (L)', 'CYLINDERS': 'Cylinders', 'TRANSMISSION': 'Transmission', 'FUEL': 'Fuel', 'CITY (L/100 km)': 'City (L/100 km)', 'COMB (L/100 km)': 'Mixed consumption (L/100 km)', 'HWY (L/100 km)': 'Highway (L/100 km)', 'EMISSIONS': 'CO2 emissions (g/km)'})
df['Release year'] = df['Release year'].dt.year
# Target - Features
X = df[['Make', 'Release year', 'Vehicle class', 'Fuel', 'Transmission', 'Gears', 'Engine size (L)', 'Cylinders']]
Y = df[['CO2 emissions (g/km)', 'Mixed consumption (L/100 km)', 'City (L/100 km)', 'Highway (L/100 km)']]

  df['TRANSMISSION'] = df['TRANSMISSION'].str.replace(r'\d+$', '')


In [87]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numerical = X.select_dtypes(include=['int64', 'float64']).columns.values.tolist()
categorical = X.select_dtypes(include=['object']).columns.values.tolist()

# Preprocessing

preprocessor = ColumnTransformer(
transformers = [
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('numerical', StandardScaler(), numerical)
    ]) 

# test preprocessing on X
preprocessor.fit_transform(X).shape

(22556, 87)

In [88]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.decomposition import TruncatedSVD

from tempfile import mkdtemp

In [89]:
pipelines = []
regressors = [Lasso(), RandomForestRegressor(), MultiOutputRegressor(DecisionTreeRegressor()), MultiOutputRegressor(GradientBoostingRegressor())]

cachedir = mkdtemp()

for regressor in regressors:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('svd', TruncatedSVD()),
        ('regressor', regressor)]
        , memory = cachedir)
    pipelines.append(pipeline)

In [90]:
# TruncatedSVD
param_svd = { 
    'svd__n_components': [10, 20, 40, 60, 80]
}
# Lasso
lasso_param = {'regressor__alpha': np.logspace(-6, 0, 10) }
# RandomForestRegressor
RFR_param = {
    'regressor__criterion': ['friedman_mse', 'poisson'],
    'estimator__max_depth': [2, 3, 5, 10, 20],
    'estimator__min_samples_leaf': [5, 10, 20] 
    }
# 'regressor' : MultiOutputRegressor(DecisionTreeRegressor())
DTR_param = {
    'regressor__estimator__criterion': ['friedman_mse', 'poisson'],
    'regressor__estimator__max_depth': [2],
    'regressor__estimator__max_depth': [2, 3, 5, 10, 20],
    'regressor__estimator__min_samples_leaf': [5, 10, 20]
        }
# GradientBoostingRegressor
GBR_param = {
    'regressor__estimator__max_depth': [2, 3, 5, 10, 20, 40], # maximum number of levels allowed in each decision tree
    }
# Merge dictionnaries with svd params
params = [e | param_svd for e in [lasso_param, RFR_param, DTR_param, GBR_param]]


In [91]:
def describe_model(grid):
    display(f'#### {grid.best_estimator_.named_steps["regressor"].__class__.__name__}')
    display(f'Best params : {grid.best_params_}')
    display(f'Average R2 score : {grid.best_score_:.3f}')

In [93]:
grid = RandomizedSearchCV(pipelines[0], param_distributions = lasso_param, cv = 3, n_iter = 3, scoring = 'r2', verbose = 1, refit = True)
grid.fit(X, Y)
describe_model(grid)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


'#### Lasso'

"Best params : {'regressor__alpha': 0.0005}"

'Average R2 score : 0.581'

In [95]:
grid = RandomizedSearchCV(MultiOutputRegressor(pipelines[1]), param_distributions = RFR_param, cv = 3, n_iter = 3, scoring = 'r2', verbose = 1, refit = True)
grid.fit(X, Y)
describe_model(grid)



Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [97]:
# models = []
# for i, (pipeline, param) in enumerate(zip(pipelines, params)):
#     display(f'Running pipeline {i+1}/{len(pipelines)} with {pipeline.named_steps["regressor"]}')
#     # Use RandomizedSearchCV to find the best parameters
#     grid = RandomizedSearchCV(pipeline, param_distributions = param, cv = 3, n_iter = 3, scoring = 'r2', verbose = 1, refit = True)
#     # Transform X and Y to array before fitting
#     grid.fit(X, Y)
#     models.append(grid)
#     describe_model(grid)