In [9]:
import pandas as pd
import pickle


from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [10]:
# Load the car dataset from the CSV file under ./src (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='./src/car_dataset_cleaned.csv')

In [11]:
# 'prix' is the regression target; every other column is kept as a feature.
y = df['prix']
X = df.drop(columns=['prix'])

In [12]:
# Numeric columns: each one is rescaled with MinMaxScaler (default range [0, 1]).
numeric_features = [
    'empattement',
    'longueur_voiture',
    'largeur_voiture',
    'hauteur_voiture',
    'poids_vehicule',
    'nombre_cylindres',
    'taille_moteur',
    'taux_alésage',
    'course',
    'taux_compression',
    'chevaux',
    'tour_moteur',
    'consommation_ville',
    'consommation_autoroute',
]
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Categorical columns: one-hot encoded. handle_unknown='ignore' means a category
# not seen during fit does not raise at transform time.
categorical_features = [
    'etat_de_route',
    'marque',
    'modele',
    'carburant',
    'turbo',
    'nombre_portes',
    'type_vehicule',
    'transmission',
    'emplacement_moteur',
    'type_moteur',
    'systeme_carburant',
]
categorical_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its transformer. No `remainder` is given, so the
# ColumnTransformer default applies to any column not listed above.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [13]:
# Random forest regressor; random_state is fixed so repeated runs build the
# same forest.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

# Full estimator: preprocessing followed by the model, so the transformers are
# fitted together with the forest whenever `pipeline.fit` is called.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model),
])

In [14]:
# Hold out 20% of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing steps and the model on the training rows only.
pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows using the pipeline's own `score` method
# (for scikit-learn regressors this is the R^2 coefficient).
score = pipeline.score(X_test, y_test)
print(f"{score}")

0.9371310647051219


In [15]:
# Persist the fitted pipeline (preprocessing + model) to 'model_car.pkl' in the
# current working directory.
# NOTE(review): a pickle should only ever be loaded back from a trusted source.
with open('model_car.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)