## PROJECT - Deployment ##

# Getaround Pricing Optimization 🚗 #

In [365]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score

from collections import defaultdict
from typing import DefaultDict, List, Dict

In [325]:
pd.options.display.float_format = '{:,.2g}'.format

chart_default_fillcolor='#d6a62b'
chart_default_marker_color='#545350'

In [326]:
PRICING_PROJECT_CSV_FILE_PATH = "data/get_around_pricing_project.csv"


## Utility functions ##

In [368]:
DECIMAL_FORMAT_2 = "%.2f"
def format_2_decimals(number) -> str:
    return (DECIMAL_FORMAT_2 % number)

def decimal_format_str(nb_decimals:int) -> str:
    return "{:."+str(nb_decimals)+"f}"

def format_decimals(number, nb_decimals:int) ->str:
    return decimal_format_str(nb_decimals).format(number)

def get_sup_outlier_threshold_proportion(data:np.ndarray) -> float:
    q1 = np.percentile(data,25)
    q3 = np.percentile(data,75)
    iqr = q3-q1
    upper_fence  = q3 + 1.5*iqr
    return (np.sum(data>=upper_fence)/data.shape[0])

def get_inf_outlier_threshold_proportion(data:np.ndarray) -> float:
    q1 = np.percentile(data,25)
    q3 = np.percentile(data,75)
    iqr = q3-q1
    lower_fence  = q1 - 1.5*iqr
    return (np.sum(data<=lower_fence)/data.shape[0])

def get_outliers_threshold_proportions(dtf:pd.DataFrame, columns:List[str]) -> DefaultDict[str,Dict[str,float]]:
    outliers_dict: DefaultDict[str,Dict[str,float]] = defaultdict(dict)
    for i in range(len(columns)):    
        data_ndarray = dtf[columns[i]].to_numpy()
        outliers_dict[columns[i]]['inf_outliers_proportion'] = get_inf_outlier_threshold_proportion(data_ndarray)
        outliers_dict[columns[i]]['sup_outliers_proportion'] = get_sup_outlier_threshold_proportion(data_ndarray)
    return outliers_dict

def print_dict(dico:DefaultDict[str,Dict[str,float]]):
    entries =((o_key, i_key,val) for o_key, inner_dict in dico.items() for i_key,val in inner_dict.items() )
    for outer_key, inner_key, value in entries:
        print(outer_key, inner_key, value)

In [341]:
pricing_dtf = pd.read_csv(PRICING_PROJECT_CSV_FILE_PATH, delimiter=',', encoding="UTF-8", index_col=0)

## EDA ##

In [342]:
pricing_dtf.head(5)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [343]:
print(f'rows count : {pricing_dtf.shape[0]}')

rows count : 4843


In [344]:
TARGET_COLUMN = 'rental_price_per_day'

### Null values ###

In [345]:
print("Missing / null values for each column")

serie = pricing_dtf.isnull().sum() * 100/pricing_dtf.shape[0]
na_cols_dtf = serie.reset_index( ).rename(columns={"index":"column", 0:"na_prop"}).sort_values(by='na_prop',ascending=True)

na_cols_dtf.sort_values(['na_prop'], ascending = [True], inplace=True)
na_cols_dtf

Missing / null values for each column


Unnamed: 0,column,na_prop
0,model_key,0
1,mileage,0
2,engine_power,0
3,fuel,0
4,paint_color,0
5,car_type,0
6,private_parking_available,0
7,has_gps,0
8,has_air_conditioning,0
9,automatic_car,0


In [346]:
print("Basics statistics: ")
data_desc = pricing_dtf.describe()
display(data_desc)

Basics statistics: 


Unnamed: 0,mileage,engine_power,rental_price_per_day
count,4800.0,4800.0,4800.0
mean,140000.0,130.0,120.0
std,60000.0,39.0,34.0
min,-64.0,0.0,10.0
25%,100000.0,100.0,100.0
50%,140000.0,120.0,120.0
75%,180000.0,140.0,140.0
max,1000000.0,420.0,420.0


We can see above inconsistent data
- mileage 0
- engine_power = 0

Remove rows with inconsistent data

In [347]:
pricing_dtf = pricing_dtf[(pricing_dtf['mileage']>=0.0) & (pricing_dtf['engine_power']>0)]

In [348]:
print(f'rows count : {pricing_dtf.shape[0]}')

rows count : 4841


### Columns data types ###

In [None]:
def get_columns_by_dtype(dataset:pd.DataFrame) -> DefaultDict[str,List[str]]:
    columns_by_dtype = defaultdict(list)
    for col, dtype in dataset.dtypes.to_dict().items():
        columns_by_dtype[str(dtype)].append(col)
    return columns_by_dtype

columns_by_dtype = get_columns_by_dtype(pricing_dtf)

categorical_columns = columns_by_dtype['object']
print('categorical_columns')
print(categorical_columns)

numerical_columns = columns_by_dtype['int64'] + columns_by_dtype['float64'] 
print('numerical_columns')
print(numerical_columns)

binary_columns = columns_by_dtype['bool']
print('binary_columns')
print(binary_columns)

categorical_columns
['model_key', 'fuel', 'paint_color', 'car_type']
numerical_columns
['mileage', 'engine_power', 'rental_price_per_day']
binary_columns
['private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


### Numeric variables ###

#### Distribution of numeric features / Detect outliers ####

In [350]:
# Univariate analysis
# Distribution of each numeric variable
num_features = numerical_columns.copy()
num_features.remove(TARGET_COLUMN)
for i in range(len(num_features)):
    fig = px.box(pricing_dtf[num_features[i]])
    fig.update_traces(fillcolor=chart_default_fillcolor, marker_color=chart_default_marker_color)
    fig.show()

#### Box plot outliers proportions ####

In [369]:
#outliers_dict = get_outliers_threshold_proportions(pricing_dtf, num_features) 
print_dict(get_outliers_threshold_proportions(pricing_dtf, num_features) )

mileage inf_outliers_proportion 0.0
mileage sup_outliers_proportion 0.020243751291055566
engine_power inf_outliers_proportion 0.0004131377814501136
engine_power sup_outliers_proportion 0.12125593885560834


In [370]:
pricing_dtf[(pricing_dtf['mileage']>=1000000.0)]

#pricing_dtf[(pricing_dtf['rental_price_per_day']<=20.0)]

#pricing_dtf[(pricing_dtf['engine_power'] >=300)]

#pricing_dtf[(pricing_dtf['model_key'].isin(['Lamborghini','Maserati','Porsche','Ferrari','Suzuki']))]

#pricing_dtf[(pricing_dtf['model_key'].isin(['Ferrari','Suzuki']))]



Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
3732,Citroën,1000376,90,diesel,black,subcompact,True,False,False,False,False,False,True,37


#### Target variable distribution ####

In [360]:
fig = px.box(pricing_dtf[TARGET_COLUMN], title="Target variable distribution")
fig.update_traces(fillcolor=chart_default_fillcolor, marker_color=chart_default_marker_color)
fig.show()

#### Box plot outliers proportions ####

In [371]:
print_dict(get_outliers_threshold_proportions(pricing_dtf, [TARGET_COLUMN]) )

rental_price_per_day inf_outliers_proportion 0.024994835777731872
rental_price_per_day sup_outliers_proportion 0.04792398264821318


#### Correlation Matrix ####

In [372]:
# Correlation matrix
corr_matrix = pricing_dtf[numerical_columns].corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist()
                                  )


fig.show()

Correlation matrix shows that

 - (numeric features) can be considered independent
 - engine_power enough positively correlated with target
 - mileage moderately correlated with target

### Categorical features ###

#### Distribution of Categorical Variables ####

Attention to

- encoding issues
- rare categories
- if too many unique categories
- outliers
- relationship with target
- if means are clearly distinct between groups

In [373]:
['model_key', 'fuel', 'paint_color', 'car_type']
pd.set_option('display.float_format', '{:.2f}'.format)
TMP_CAT_FEATURE = 'fuel'
print(pricing_dtf[TMP_CAT_FEATURE].value_counts())

pricing_dtf.groupby(TMP_CAT_FEATURE)[TARGET_COLUMN].agg(['mean','median', 'std']).sort_values('mean')

fuel
diesel           4639
petrol            191
hybrid_petrol       8
electro             3
Name: count, dtype: int64


Unnamed: 0_level_0,mean,median,std
fuel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
petrol,110.07,109.0,47.8
diesel,121.52,120.0,32.55
electro,145.0,145.0,1.0
hybrid_petrol,184.75,157.5,62.09


In [374]:
boxplot = px.box(pricing_dtf, y=TARGET_COLUMN, x=TMP_CAT_FEATURE, title="Rental Price distribution by fuel", width=800)
boxplot.update_traces(fillcolor=chart_default_fillcolor, marker_color=chart_default_marker_color)
boxplot.show()

In [375]:
TMP_CAT_FEATURE = 'paint_color'
#print(pricing_dtf[TMP_CAT_FEATURE].value_counts())
pricing_dtf.groupby(TMP_CAT_FEATURE)[TARGET_COLUMN].agg(['mean','median', 'std', 'count']).sort_values('mean')

Unnamed: 0_level_0,mean,median,std,count
paint_color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
green,77.56,83.5,35.13,18
silver,116.11,120.0,36.44,329
blue,117.66,115.0,35.51,710
grey,120.25,118.0,33.63,1175
brown,121.22,118.0,25.67,341
black,122.69,120.0,32.84,1632
red,122.92,115.0,37.31,52
beige,123.98,117.0,21.41,41
white,127.34,122.0,33.47,537
orange,135.67,131.0,22.56,6


In [376]:
TMP_CAT_FEATURE = 'car_type'
#print(pricing_dtf[TMP_CAT_FEATURE].value_counts())
pricing_dtf.groupby(TMP_CAT_FEATURE)[TARGET_COLUMN].agg(['mean','median', 'std', 'count']).sort_values('mean')

Unnamed: 0_level_0,mean,median,std,count
car_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
subcompact,94.82,96.0,22.97,117
estate,111.5,114.0,25.9,1606
hatchback,112.1,109.0,26.67,699
van,116.39,123.0,28.59,44
sedan,122.57,122.0,30.64,1167
convertible,123.38,111.0,44.11,47
suv,141.28,133.0,39.22,1057
coupe,142.79,151.0,42.39,104


In [377]:
boxplot = px.box(pricing_dtf, y=TARGET_COLUMN, x=TMP_CAT_FEATURE, title="Rental Price distribution by cat type", width=800)
boxplot.update_traces(fillcolor=chart_default_fillcolor, marker_color=chart_default_marker_color)
boxplot.show()

In [378]:
TMP_CAT_FEATURE = 'model_key'
pricing_dtf.groupby(TMP_CAT_FEATURE)[TARGET_COLUMN].agg(['mean','median', 'std', 'count']).sort_values('mean')

Unnamed: 0_level_0,mean,median,std,count
model_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mazda,67.0,67.0,,1
Fiat,93.0,93.0,19.8,2
Peugeot,104.92,107.0,25.04,642
Citroën,108.76,114.0,27.06,969
Ford,111.0,109.0,17.22,5
Nissan,111.14,114.0,18.39,274
BMW,117.43,118.0,29.86,827
Renault,120.44,121.0,28.02,915
Mercedes,121.36,120.0,10.77,97
PGO,126.09,132.0,28.2,33


## Pre processing ##

### converting model_key to lower case ###

In [386]:
pricing_dtf['model_key']= pricing_dtf['model_key'].str.lower()

In [387]:
# Separate target variable Y from features X
print("Separating labels from features...")
Y = pricing_dtf.loc[:, TARGET_COLUMN]
X = pricing_dtf.drop(TARGET_COLUMN, axis=1)


Separating labels from features...


In [388]:
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

Dividing into train and test sets...


In [389]:
# Create pipeline for numeric features, no imputer as there is no missing values
# To be used for Linear Regression
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
    ]
)

In [390]:
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(drop='first'))]
)


# and no preprocessing for boolean features

In [391]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

In [392]:
print("Performing preprocessings on train set...")
X_train = preprocessor.fit_transform(X_train)
print("Done")
print("Performing preprocessings on test set...")
X_test = preprocessor.transform(X_test)
print("Done")

Performing preprocessings on train set...
Done
Performing preprocessings on test set...
Done


### Baseline Model -  Linear Regression ###

In [393]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


#### Predict on train and test ####

In [394]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred[:5])
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred[:5])
print()

Predictions on training set...
...Done.
[101.8382533  107.50174591  95.01800584 157.99671063 120.76988201]

Predictions on test set...
...Done.
[ 99.37542883 119.43362459 124.5866309  133.74349018 127.2399187 ]



### Performance assessment ###

In [395]:
# Print R^2 scores
train_r2_score = r2_score(Y_train, Y_train_pred)
test_r2_score = r2_score(Y_test, Y_test_pred)
print("R2 score on training set : ", train_r2_score)
print("R2 score on test set : ", test_r2_score)

R2 score on training set :  0.6863445774305257
R2 score on test set :  0.6187079923137364


### Model 2 RandomForest Regressor ###

#### Pre Processing ####

##### Create new feature : Model_key_count  #####

As model_key comprises approx 30 modalities not well-balanced

In [None]:
#pricing_dtf['model_key_count'] = pricing_dtf['model_key'].map(pricing_dtf['model_key'].value_counts())
#pricing_dtf.head(5)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day,model_key_count
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106,969
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264,969
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101,969
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158,969
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183,969


In [396]:
columns_by_dtype = get_columns_by_dtype(pricing_dtf)

categorical_columns = columns_by_dtype['object']
print('categorical_columns')
print(categorical_columns)

numerical_columns = columns_by_dtype['int64'] + columns_by_dtype['float64'] 
print('numerical_columns')
print(numerical_columns)

binary_columns = columns_by_dtype['bool']
print('binary_columns')
print(binary_columns)

categorical_columns
['model_key', 'fuel', 'paint_color', 'car_type']
numerical_columns
['mileage', 'engine_power', 'rental_price_per_day']
binary_columns
['private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [397]:
MODEL_KEY_COUNT_FEATURE = 'model_key_count'
num_features = numerical_columns.copy()
num_features.remove(TARGET_COLUMN)
num_features.append(MODEL_KEY_COUNT_FEATURE)
print('num_features')
print(num_features)

num_features
['mileage', 'engine_power', 'model_key_count']


In [398]:
binary_features = binary_columns
categorical_features= categorical_columns.copy()
categorical_features.remove('model_key')
print(categorical_features)

['fuel', 'paint_color', 'car_type']


In [399]:
print("Separating labels from features...")
Y = pricing_dtf.loc[:, TARGET_COLUMN]
#X = pricing_dtf.drop({TARGET_COLUMN, 'model_key'}, axis=1)
X = pricing_dtf.drop({TARGET_COLUMN}, axis=1)
X.head(5)

Separating labels from features...


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True
1,citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True
2,citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True
3,citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True
4,citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True


In [400]:
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


Dividing into train and test sets...


#### Custom Transformer for model_key_count ####

In [402]:
from sklearn.base import BaseEstimator, TransformerMixin

class ModelKeyCountEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, column='model_key'):
        self.column = column
        self.count_map_ = None

    def fit(self, X, y=None):
        # Learn mapping from model_key to count
        value_counts = X[self.column].value_counts()
        self.count_map_ = value_counts.to_dict()
        return self

    def transform(self, X):
        X = X.copy()
        # Map model_key to count; unknown keys get 0
        X[self.column + '_count'] = X[self.column].map(self.count_map_).fillna(0)
        X = X.drop(columns=[self.column])  # optionally drop original column
        return X

In [403]:
column_transformer = ColumnTransformer(
    transformers=[
        ("num", 'passthrough', num_features),
        ("cat", OneHotEncoder(), categorical_features),
        ('binary','passthrough', binary_features)
    ]
)

preprocessing_pipeline = Pipeline(
    [
    ('model_key_count', ModelKeyCountEncoder(column='model_key')),
    ('column_transformer', column_transformer)
])

preprocess_and_model_pipeline = Pipeline(
    [
        ('preprocessing', preprocessing_pipeline)
        ,('model',RandomForestRegressor(random_state=42) )
    ]
)

In [None]:
#print("Performing preprocessings on train set...")
#X_train = preprocessor.fit_transform(X_train)
#print("Done")
#print("Performing preprocessings on test set...")
#X_test = preprocessor.transform(X_test)
#print("Done")

Performing preprocessings on train set...
Done
Performing preprocessings on test set...
Done


In [404]:
params_V0 = {
    'model__max_depth': [5, 10, 14],
    'model__min_samples_split': [4, 8],
    'model__n_estimators': [60, 80, 100]
}

params = {
    'model__max_depth': [3, 5, 7, 10],
    'model__min_samples_split': [8,10,20],
    'model__n_estimators': [50, 100],
    'model__max_features': ['sqrt', 'log2']
}

gridsearch = GridSearchCV(
    estimator=preprocess_and_model_pipeline,
    param_grid=params,
    cv=5,
    verbose=2,
    scoring='neg_root_mean_squared_error',  # or another metric depending on your task
    n_jobs=-1
)

gridsearch.fit(X_train, Y_train)
print("Best hyperparameters : ", gridsearch.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best hyperparameters :  {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_split': 8, 'model__n_estimators': 100}


In [None]:
# Perform grid search
""" print("Grid search...")
regressor = RandomForestRegressor()

# Grid of values to be tested
params = {
    'max_depth': [5, 10, 14],
    'min_samples_split': [4, 8],
    'n_estimators': [60, 80, 100]
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3, verbose = 2)
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_) """

Grid search...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END ..max_depth=5, min_samples_split=4, n_estimators=60; total time=   0.1s
[CV] END ..max_depth=5, min_samples_split=4, n_estimators=60; total time=   0.1s
[CV] END ..max_depth=5, min_samples_split=4, n_estimators=60; total time=   0.1s
[CV] END ..max_depth=5, min_samples_split=4, n_estimators=80; total time=   0.2s
[CV] END ..max_depth=5, min_samples_split=4, n_estimators=80; total time=   0.2s
[CV] END ..max_depth=5, min_samples_split=4, n_estimators=80; total time=   0.2s
[CV] END .max_depth=5, min_samples_split=4, n_estimators=100; total time=   0.3s
[CV] END .max_depth=5, min_samples_split=4, n_estimators=100; total time=   0.4s
[CV] END .max_depth=5, min_samples_split=4, n_estimators=100; total time=   0.3s
[CV] END ..max_depth=5, min_samples_split=8, n_estimators=60; total time=   0.1s
[CV] END ..max_depth=5, min_samples_split=8, n_estimators=60; total time=   0.1s
[CV] END ..max_depth=5, min_sampl

#### Performance Assessment ####

In [405]:
# Print R^2 scores
print("RMSE on training set : ", - gridsearch.score(X_train, Y_train))
print("RMSE on test set : ", - gridsearch.score(X_test, Y_test))

# 3. Final evaluation on test set
Y_test_pred = gridsearch.predict(X_test)

print("Train R² Score:", r2_score(Y_train, gridsearch.predict(X_train)))
print("Test R² Score:", r2_score(Y_test, Y_test_pred))

RMSE on training set :  15.73863361197544
RMSE on test set :  18.087896672515036
Train R² Score: 0.7829988862378313
Test R² Score: 0.6868585406363577


#### Monitor residuals ####

In [406]:
residuals = Y_test - Y_test_pred
# Create a DataFrame for plotting
residuals_df = pd.DataFrame({
    'True Values': Y_test,
    'Predictions': Y_test_pred,
    'Residuals': residuals
})

# Scatter plot: True values vs Residuals
fig = px.scatter(
    residuals_df,
    x='True Values',
    y='Residuals',
    title='Residuals vs True Values',
    trendline="ols",
    labels={'True Values': 'Actual', 'Residuals': 'Error'},
    width=800,
    height=500
)

# Add zero line manually
fig.add_hline(y=0, line_dash="dash", line_color="red")

fig.show()

#### Feature importance ####

In [407]:
def get_feature_names_from_column_transformer(ct):
    output_features = []

    for name, transformer, columns in ct.transformers_:
        if transformer == 'drop':
            continue
        elif transformer == 'passthrough':
            output_features.extend(columns)
        else:
            # If the transformer is itself a pipeline
            if hasattr(transformer, 'named_steps'):
                last_step = transformer.named_steps[list(transformer.named_steps)[-1]]
                if hasattr(last_step, 'get_feature_names_out'):
                    names = last_step.get_feature_names_out(columns)
                    output_features.extend(names)
                else:
                    output_features.extend(columns)
            elif hasattr(transformer, 'get_feature_names_out'):
                names = transformer.get_feature_names_out(columns)
                output_features.extend(names)
            else:
                output_features.extend(columns)
    
    return output_features

best_pipeline = gridsearch.best_estimator_
print(type(best_pipeline))
fitted_preprocessor = best_pipeline.named_steps['preprocessing']
fitted_model = best_pipeline.named_steps['model']

column_transformer = fitted_preprocessor.named_steps['column_transformer']

feature_names = get_feature_names_from_column_transformer(column_transformer)

importances = fitted_model.feature_importances_

# Check that lengths match
assert len(feature_names) == len(importances), "Mismatch between features and importances"

feature_importance_dtf = pd.DataFrame(index = feature_names, data = importances, columns=["feature_importances"])
feature_importance_dtf = feature_importance_dtf.sort_values(by = 'feature_importances')

# Plot coefficients
fig = px.bar(feature_importance_dtf, orientation = 'h', title='Feature importance')
fig.update_layout(showlegend = False, 
                  margin = {'l': 120} # to avoid cropping of column names
                  ,height=700
                 )
fig.update_traces(marker=dict(color=chart_default_fillcolor, line=dict(color=chart_default_marker_color, width=1)))

fig.show()


<class 'sklearn.pipeline.Pipeline'>
