In [17]:
import numpy as np
import pandas as pd

import joblib
import plotly.express as px

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, mean_absolute_error
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor


pd.options.display.max_columns = None
pd.options.display.max_rows = None


import warnings
warnings.filterwarnings('ignore')

In [4]:
# Import dataset
print("Loading dataset...")
pricing_df = pd.read_csv("./assets/get_around_pricing_project.csv")  # sep = ";"

print("...Done.")

pd.set_option('display.max_columns', None)

Loading dataset...
...Done.


In [5]:
# Basic stats
print("Number of rows : {}".format(pricing_df.shape[0]))
print("Number of columns : {}".format(pricing_df.shape[1]))
print()

print("Display of dataset: ")
display(pricing_df.head())
print()

print("Basics statistics: ")
data_desc = pricing_df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")

pd.concat([pricing_df.isnull().sum(), 100 * pricing_df.isnull().sum()/len(pricing_df)], axis=1).rename(columns={0:'Missing Records', 1:'Percentage (%)'})


Number of rows : 4843
Number of columns : 15

Display of dataset: 


Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183



Basics statistics: 


Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0



Percentage of missing values: 


Unnamed: 0,Missing Records,Percentage (%)
Unnamed: 0,0,0.0
model_key,0,0.0
mileage,0,0.0
engine_power,0,0.0
fuel,0,0.0
paint_color,0,0.0
car_type,0,0.0
private_parking_available,0,0.0
has_gps,0,0.0
has_air_conditioning,0,0.0


In [6]:
pricing_df.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [7]:
# droping useless features

pricing_df = pricing_df.drop(['Unnamed: 0'], axis=1)
pricing_df.head(1)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106


## Separate target Y from features X

In [8]:
# Separate target variable Y from features X
print("Separating labels from features...")
print('')
features_list = list(pricing_df.columns[:-1])
target_variable = 'rental_price_per_day'

X = pricing_df.loc[:, features_list]
Y = pricing_df.loc[:, target_variable]
print('Fait...')

Separating labels from features...

Fait...


In [9]:
numeric_features = []
categorical_features = []
for i,t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Auto detection des features')
print('Fait...')
print('Features numériques: ', numeric_features)
print('Features catégorielles: ', categorical_features)

Auto detection des features
Fait...
Features numériques:  ['mileage', 'engine_power']
Features catégorielles:  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


## Preprocessor

On  va essayer de travailler les feautures:
    -features numériques : normalisation, mise à l'échelle, remplacement des valeurs manquantes restantes par la médiane des valeurs de colonne (si besoin).
    - features catégorielles : un encodage à chaud car ce sont des valeurs nominales.

In [10]:
# Création des models de preprocessing
print('Création des modèles de prétraitement')
# Création du pipeline pour les features numériques
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # Les valeurs manquantes seront remplacées par la moyenne des colonnes
    ('scaler', StandardScaler())
])

# Création du pipeline pour les features catégoriques
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')) # On ignore les catégories inconnues lors de la transformation
    ])

# Utilisation de ColumnTransformer pour créer un objet preprocessor qui décrit tous les traitements à effectuer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print('\nFait...')

Création des modèles de prétraitement

Fait...


## Defining train/test sets

On va faire une séparation sur l'ensemble de données, 80 % pour les données d'entraînement et 20 % pour les données de test.

In [11]:
# Séparation du jeu de données en train et test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state=0)

# Prétraitement sur X_train
print("\nPrétraitement de X_train...")
X_train = preprocessor.fit_transform(X_train)
print("...Terminé!")
print(X_train[0:5,:]) 

# Prétraitement sur X_test
print("\nPrétraitement de X_test...")
X_test = preprocessor.transform(X_test) 
print("...Terminé!")
print(X_test[0:5,:]) 


Prétraitement de X_train...
...Terminé!
  (0, 0)	-0.13697524611219603
  (0, 1)	0.16120867170881872
  (0, 3)	1.0
  (0, 38)	1.0
  (0, 44)	1.0
  (0, 46)	1.0
  (0, 47)	1.0
  (0, 52)	1.0
  (1, 0)	-0.1601928122265152
  (1, 1)	0.16120867170881872
  (1, 4)	1.0
  (1, 30)	1.0
  (1, 40)	1.0
  (1, 47)	1.0
  (1, 50)	1.0
  (1, 52)	1.0
  (2, 0)	1.1691149778744785
  (2, 1)	0.16120867170881872
  (2, 20)	1.0
  (2, 34)	1.0
  (2, 42)	1.0
  (2, 46)	1.0
  (2, 47)	1.0
  (2, 50)	1.0
  (2, 52)	1.0
  (3, 0)	0.14389943017254048
  (3, 1)	-0.7407589686724106
  (3, 18)	1.0
  (3, 30)	1.0
  (3, 40)	1.0
  (3, 46)	1.0
  (3, 47)	1.0
  (3, 52)	1.0
  (4, 0)	0.510103768430211
  (4, 1)	0.16120867170881872
  (4, 4)	1.0
  (4, 30)	1.0
  (4, 40)	1.0
  (4, 46)	1.0
  (4, 47)	1.0
  (4, 52)	1.0

Prétraitement de X_test...
...Terminé!
  (0, 0)	0.8970234864247778
  (0, 1)	-1.1273165288357947
  (0, 20)	1.0
  (0, 30)	1.0
  (0, 40)	1.0
  (0, 47)	1.0
  (0, 52)	1.0
  (1, 0)	-1.2114345025888595
  (1, 1)	0.16120867170881872
  (1, 3)	1.0
  

## Process evaluation

In [14]:
# Entraînement et évaluation des modèles de régression
# Crée les modèles Regression, XGBoost, Random Forest et KNN


lin_reg_model = LinearRegression()
xgb_reg_model = xgb.XGBRegressor(use_label_encoder=False, eval_metric='rmse')
rf_reg_model = RandomForestRegressor(n_jobs=-1)
knn_reg_model = KNeighborsRegressor(n_neighbors=5)

# Boucle à travers les modèles et effectue l'entraînement et l'évaluation
for reg in [lin_reg_model, xgb_reg_model, rf_reg_model, knn_reg_model]:
    reg.fit(X_train, Y_train)  # Entraîne le modèle sur les données d'entraînement
    y_reg_pred = reg.predict(X_test)  # Fait des prédictions sur les données de test
    mse = mean_squared_error(Y_test, y_reg_pred)  # Calcule l'erreur quadratique moyenne
    rmse = np.sqrt(mse)  # Calcule la racine carrée de l'erreur quadratique moyenne pour obtenir le RMSE
    r2 = r2_score(Y_test, y_reg_pred)  # Calcule le score R2
    mae = mean_absolute_error(Y_test, y_reg_pred)  # Calcule l'erreur absolue moyenne
    explained_var = explained_variance_score(Y_test, y_reg_pred)  # Calcule le score de variance expliquée
   
    
    # Affiche les métriques d'évaluation pour chaque modèle
    print(f"{reg.__class__.__name__} RMSE: {rmse:.2f}")
    print(f"{reg.__class__.__name__} R2 Score: {r2:.2f}")
    print(f"{reg.__class__.__name__} MAE: {mae:.2f}")
    print(f"{reg.__class__.__name__} Explained Variance Score: {explained_var:.2f}")
    print()

LinearRegression RMSE: 18.37
LinearRegression R2 Score: 0.69
LinearRegression MAE: 12.52
LinearRegression Explained Variance Score: 0.69

XGBRegressor RMSE: 16.25
XGBRegressor R2 Score: 0.76
XGBRegressor MAE: 10.64
XGBRegressor Explained Variance Score: 0.76

RandomForestRegressor RMSE: 15.90
RandomForestRegressor R2 Score: 0.77
RandomForestRegressor MAE: 10.39
RandomForestRegressor Explained Variance Score: 0.77

KNeighborsRegressor RMSE: 18.38
KNeighborsRegressor R2 Score: 0.69
KNeighborsRegressor MAE: 12.70
KNeighborsRegressor Explained Variance Score: 0.69



## XGBOOST ET KNN AYANT LES MEILLEURES PERFOMANCES, ON VA CONTINUER AVEC CES DEUX MODELS SUR LA GRIDSEARCH

### 1 -  XGBOOST

In [15]:

# Hyperparamètres pour XGBoost
params_xgb = {
    'max_depth': [8, 10, 12],
    'learning_rate': [0.03, 0.04, 0.05],
    'n_estimators': [180, 185, 190, 250],
    'colsample_bytree': [0.7, 0.75, 0.8, 0.9],
    'subsample': [0.8],
}

# Hyperparamètres pour KNN
params_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# Modèles à boucler
models = {
    'XGBoostRegressor': (xgb_reg_model, params_xgb),
    'KNNRegressor': (knn_reg_model, params_knn),
}

# Boucle sur chaque modèle
for model_name, (model, params) in models.items():
    print(f"Grid search pour {model_name}...")
    
    gridsearch = GridSearchCV(model, param_grid=params, cv=5)
    gridsearch.fit(X_train, Y_train)
    
    print("...Terminé.")
    print(f"Meilleurs hyperparamètres pour {model_name} : ", gridsearch.best_params_)
    print(f"Meilleure précision de validation pour {model_name} : ", gridsearch.best_score_)
    print()


Grid search pour XGBoostRegressor...
...Terminé.
Meilleurs hyperparamètres pour XGBoostRegressor :  {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 250, 'subsample': 0.8}
Meilleure précision de validation pour XGBoostRegressor :  0.7815450459488262

Grid search pour KNNRegressor...
...Terminé.
Meilleurs hyperparamètres pour KNNRegressor :  {'algorithm': 'auto', 'n_neighbors': 9, 'weights': 'distance'}
Meilleure précision de validation pour KNNRegressor :  0.7064015706107486



In [16]:
# Hyperparamètres pour XGBoost
params_xgb = {
    'max_depth': [8, 10, 12],
    'learning_rate': [0.03, 0.04, 0.05],
    'n_estimators': [180, 185, 190, 250],
    'colsample_bytree': [0.7, 0.75, 0.8, 0.9],
    'subsample': [0.8],
}

# Hyperparamètres pour Random Forest
params_rf = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Modèles à boucler
models = {
    'XGBoostRegressor': (xgb_reg_model, params_xgb),
    'RandomForestRegressor': (rf_reg_model, params_rf),
}

# Boucle sur chaque modèle
for model_name, (model, params) in models.items():
    print(f"Grid search pour {model_name}...")
    
    gridsearch = GridSearchCV(model, param_grid=params, cv=5)
    gridsearch.fit(X_train, Y_train)
    
    print("...Terminé.")
    print(f"Meilleurs hyperparamètres pour {model_name} : ", gridsearch.best_params_)
    print(f"Meilleure précision de validation pour {model_name} : ", gridsearch.best_score_)


Grid search pour XGBoostRegressor...
...Terminé.
Meilleurs hyperparamètres pour XGBoostRegressor :  {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 250, 'subsample': 0.8}
Meilleure précision de validation pour XGBoostRegressor :  0.7815450459488262
Grid search pour RandomForestRegressor...
...Terminé.
Meilleurs hyperparamètres pour RandomForestRegressor :  {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Meilleure précision de validation pour RandomForestRegressor :  0.7581060455930309


Nous constatons que les perfomances du model XGBoost et RF sont presques égales, même si celle du RF sont légèrement meilleures, vu que le model RF prend enormemet de temps de calcul et que les résultats ne sont pas si différents de ceux du XGBoost, on va décider de continuer avec le model XGBoost.

In [18]:
column_names = []
for name, step, features_list in preprocessor.transformers_: # loop over steps of ColumnTransformer
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = step.get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
print("Names of columns corresponding to each coefficient: ", column_names)

# Create a pandas DataFrame
feature_importance = pd.DataFrame(index = column_names, data = xgb_reg_model.feature_importances_, columns=["feature_importances"])
feature_importance = feature_importance.sort_values(by = 'feature_importances')

# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(showlegend = False, 
                  margin = {'l': 120} # to avoid cropping of column names
                 )
fig.show()

Names of columns corresponding to each coefficient:  ['mileage', 'engine_power', 'model_key_Audi', 'model_key_BMW', 'model_key_Citroën', 'model_key_Ferrari', 'model_key_Fiat', 'model_key_Ford', 'model_key_Honda', 'model_key_KIA Motors', 'model_key_Lamborghini', 'model_key_Lexus', 'model_key_Maserati', 'model_key_Mercedes', 'model_key_Mitsubishi', 'model_key_Nissan', 'model_key_Opel', 'model_key_PGO', 'model_key_Peugeot', 'model_key_Porsche', 'model_key_Renault', 'model_key_SEAT', 'model_key_Subaru', 'model_key_Suzuki', 'model_key_Toyota', 'model_key_Volkswagen', 'model_key_Yamaha', 'fuel_electro', 'fuel_hybrid_petrol', 'fuel_petrol', 'paint_color_black', 'paint_color_blue', 'paint_color_brown', 'paint_color_green', 'paint_color_grey', 'paint_color_orange', 'paint_color_red', 'paint_color_silver', 'paint_color_white', 'car_type_coupe', 'car_type_estate', 'car_type_hatchback', 'car_type_sedan', 'car_type_subcompact', 'car_type_suv', 'car_type_van', 'private_parking_available_True', 'has_

D'après notre graphique, la valeur qui a la plus influencé notre modèle est la feature has_getaround_connect = True.

## ENREGISTREMENT DU MODELE

In [24]:
from joblib import dump, load

joblib.dump(xgb_reg_model,'api/models/xgb_model.joblib')
joblib.dump(preprocessor,'api/models/preprocessor.joblib')

['api/models/preprocessor.joblib']

On va tester notre modèle

In [22]:
example_input_1 = {
  "model_key": "Toyota","mileage": 25000,"engine_power": 130,"fuel": "diesel","paint_color": "red","car_type": "sedan","private_parking_available": True,
  "has_gps": True,"has_air_conditioning": True,"automatic_car": False,"has_getaround_connect": True,"has_speed_regulator": True,"winter_tires": True
  }

example_input_2 = {
        "model_key" : "Peugeot", "mileage" : 14699, "engine_power" : 100, "fuel" : "diesel", "paint_color" : "black", "car_type" : "sedan", "private_parking_available" : False, "has_gps" : True, "has_air_conditioning" : False, "automatic_car" : False, "has_getaround_connect" : False, "has_speed_regulator" : True, "winter_tires" : True
        }

In [27]:
df_1 = pd.DataFrame(dict(example_input_1), index=[0])
df_2 = pd.DataFrame(dict(example_input_2), index=[0])

df_1


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Toyota,25000,130,diesel,red,sedan,True,True,True,False,True,True,True


In [31]:
# chargement du model et du preprocesseur
model = load('api/models/xgb_model.joblib')
preprocessor = load('api/models/preprocessor.joblib')

# pretraitement
processed_input_1 = preprocessor.transform(df_1)
processed_input_2 = preprocessor.transform(df_2)

# application du model 
prediction_1 = model.predict(processed_input_1)
prediction_2 = model.predict(processed_input_2)


# resultats
rental_price_per_day_1 = {"rental_price_per_day": prediction_1.tolist()[0]}
rental_price_per_day_2 = {"rental_price_per_day": prediction_2.tolist()[0]}

In [32]:

print(rental_price_per_day_1)
print(rental_price_per_day_2)

{'rental_price_per_day': 153.26817321777344}
{'rental_price_per_day': 157.9227294921875}


La difference de prix par rapport aux différents critères en entrée montre que montre predit les tarfis en fonctions des features et de leur valeur.