In [2]:
from utils import load_car_data

# Path to the fuel-consumption CSV handed to the project loader.
DATA_PATH = "./../data/fuel_consumption.csv"
df = load_car_data(DATA_PATH)

# Input columns, split by kind so each group can get its own preprocessing.
numerical_features = ["release_year", "gears", "engine_size", "cylinders"]
categorical_features = ["make", "vehicle_class", "fuel_type", "transmission_type"]
features = [*numerical_features, *categorical_features]

# All four target columns are fitted together as one multi-column Y.
targets = ["emissions", "fc_mixed", "fc_city", "fc_highway"]

X, Y = df[features], df[targets]

### Model definition

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Numerical columns: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group to its transformer (categorical first, then numerical).
preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_transformer, categorical_features),
        ("numerical", numerical_transformer, numerical_features),
    ]
)

# Full model: preprocessing followed by a Lasso regressor.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", Lasso(max_iter=10000)),
    ]
)

# Candidate values for the Lasso `alpha` parameter, addressed through the
# pipeline step name.
# NOTE(review): the run recorded in this notebook selected alpha=0.1, the
# smallest value listed here — consider extending the grid below 0.1.
param = {"regressor__alpha": [0.1, 1.0, 10.0, 100.0]}

In [4]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over `param`, using all available cores. No `scoring` is
# passed, so candidates are ranked with the pipeline's own `score` method;
# refit=True retrains the best candidate on all of X, Y afterwards.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param,
    cv=5,
    n_jobs=-1,
    refit=True,
)
grid.fit(X, Y)

# Report the winning parameter set and its mean cross-validated score.
print(f"Best parameters : {grid.best_params_}")
print(f"Best score : {grid.best_score_:0.2f}")

Best parameters : {'regressor__alpha': 0.1}
Best score : 0.68


### Save model

In [5]:
from joblib import dump

# Persist the best pipeline found by the grid search. The call stays the last
# expression so its return value is shown as the cell output.
dump(value=grid.best_estimator_, filename="./../data/lasso_regression.pkl")

['./../data/lasso_regression.pkl']