# 1.b Comparaison de quelques modèles sur les données simples
Pour les données obtenues avant et après un preprocessing très simple, nous regardons à présent les résultats obtenus pour la prédiction à l'aide de différents modèles.

__Résumé des résultats :__
- Nous obtenons de bien meilleurs résultats après le preprocessing pour tous les modèles comparés.
- Les meilleurs résultats sont obtenus grâce à des algorithmes de type ensemble d'arbres (xgboost, catboost, random forest), en particulier les algorithmes de gradient boosting à l'état de l'art.

In [60]:
import pandas as pd
from pycaret.regression import setup, compare_models, create_model, tune_model, plot_model, predict_model

In [2]:
# Load the three road-count CSV exports (convention, champs, peres).
# The files are semicolon-delimited, hence the explicit separator.
data_convention, data_champs, data_peres = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "../data/comptages-routiers-permanents-convention.csv",
        "../data/comptages-routiers-permanents-champs.csv",
        "../data/comptages-routiers-permanents-peres.csv",
    )
)

In [10]:
# Work on a renamed frame so the occupancy column has a short name
# ("occupation"); `data_convention` itself is not modified by `rename`.
data_convention_occupation = data_convention.rename(
    columns={"Taux d'occupation": "occupation"},
)

In [36]:
# Baseline experiment on the raw columns: the target is "Débit horaire";
# the other target ("occupation") and "Etat trafic" are removed from the
# features before the data is handed to pycaret.
exp_reg_convention_debit_0 = setup(
    data=data_convention_occupation.drop(columns=["occupation", "Etat trafic"]),
    target="Débit horaire",
    experiment_name="convention0",
    session_id=123,  # fixed session id so reruns are comparable
    normalize=True,
    transformation=True,
    transform_target=True,
    combine_rare_levels=True,
    rare_level_threshold=0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Débit horaire
2,Original Data,"(9431, 13)"
3,Missing Values,True
4,Numeric Features,0
5,Categorical Features,9
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(6549, 21)"


In [37]:
# Compare the candidate regressors for the raw-data flow experiment and
# keep the three best ones.
top3_debit_0 = compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,292.562,110024.9384,331.6577,-0.0126,0.9403,1.5333,0.058
catboost,CatBoost Regressor,263.089,99716.4708,299.5502,-0.0227,0.8358,1.3234,1.81
en,Elastic Net,293.3407,111527.2445,333.9169,-0.0264,0.932,1.4815,0.023
br,Bayesian Ridge,293.3407,111527.2539,333.9169,-0.0264,0.932,1.4815,0.041
llar,Lasso Least Angle Regression,293.3407,111527.2498,333.9169,-0.0264,0.932,1.4815,0.027
lasso,Lasso Regression,293.3407,111527.2445,333.9169,-0.0264,0.932,1.4815,0.029
lightgbm,Light Gradient Boosting Machine,293.3407,111527.2498,333.9169,-0.0264,0.932,1.4815,0.038
ridge,Ridge Regression,293.3414,111528.5164,333.9188,-0.0265,0.932,1.4815,0.029
omp,Orthogonal Matching Pursuit,293.342,111529.79,333.9207,-0.0265,0.932,1.4814,0.031
xgboost,Extreme Gradient Boosting,293.342,111529.7867,333.9207,-0.0265,0.932,1.4814,0.165


In [34]:
# Baseline experiment on the raw columns: the target is "occupation";
# the other target ("Débit horaire") and "Etat trafic" are removed from
# the features before the data is handed to pycaret.
exp_reg_convention_occupation_0 = setup(
    data=data_convention_occupation.drop(columns=["Débit horaire", "Etat trafic"]),
    target="occupation",
    experiment_name="convention0",
    session_id=123,  # fixed session id so reruns are comparable
    normalize=True,
    transformation=True,
    transform_target=True,
    combine_rare_levels=True,
    rare_level_threshold=0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,occupation
2,Original Data,"(9431, 13)"
3,Missing Values,True
4,Numeric Features,0
5,Categorical Features,9
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(6549, 21)"


In [35]:
# Compare the candidate regressors for the raw-data occupancy experiment
# and keep the three best ones.
top3_occupation_0 = compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,4.3592,80.0697,8.4669,-0.1186,0.8166,1.8012,1.907
dt,Decision Tree Regressor,4.8915,90.2936,9.4781,-0.1342,0.9117,2.0144,0.031
lightgbm,Light Gradient Boosting Machine,4.8915,90.2922,9.478,-0.1342,0.9117,2.0146,0.034
gbr,Gradient Boosting Regressor,4.8915,90.2936,9.4781,-0.1342,0.9117,2.0144,0.085
et,Extra Trees Regressor,4.8915,90.2936,9.4781,-0.1342,0.9117,2.0144,0.196
rf,Random Forest Regressor,4.8915,90.2949,9.4781,-0.1342,0.9117,2.0143,0.234
omp,Orthogonal Matching Pursuit,4.8915,90.2936,9.4781,-0.1342,0.9117,2.0144,0.057
lr,Linear Regression,4.8915,90.2936,9.4781,-0.1342,0.9117,2.0144,0.056
lar,Least Angle Regression,4.8915,90.2936,9.4781,-0.1342,0.9117,2.0144,0.038
llar,Lasso Least Angle Regression,4.8915,90.2922,9.478,-0.1342,0.9117,2.0146,0.04


In [26]:
def preprocess(df: pd.DataFrame):
    """Build a time-indexed frame of flow, occupancy and calendar features.

    Only "Débit horaire" and "Taux d'occupation" are kept from ``df``.
    "Date et heure de comptage" is parsed as UTC datetimes, used to sort
    the rows chronologically and then set as the index. The timestamp is
    also exposed as a ``datetime`` column, together with ``year``,
    ``month``, ``day``, ``hour`` and ``dayofweek`` columns derived from it.

    Args:
        df: Frame containing at least the three columns named above.
            It is not modified.

    Returns:
        A new DataFrame indexed by "Date et heure de comptage".
    """
    timestamp_col = "Date et heure de comptage"

    frame = df[["Débit horaire", "Taux d'occupation"]].copy()
    frame[timestamp_col] = pd.to_datetime(df[timestamp_col], utc=True)
    frame = frame.sort_values(timestamp_col).set_index(timestamp_col)

    stamps = frame.index
    frame["datetime"] = stamps
    # Calendar components of the timestamp; dayofweek runs from
    # 0 (Monday) to 6 (Sunday).
    for component in ("year", "month", "day", "hour", "dayofweek"):
        frame[component] = getattr(stamps, component)
    return frame

In [47]:
# Preprocess the "convention" data, discard rows with missing values and
# give the two targets short names ("debit", "occupation").
data_convention_prep = (
    preprocess(data_convention)
    .dropna()
    .rename(columns={"Débit horaire": "debit", "Taux d'occupation": "occupation"})
)

In [49]:
# Flow experiment on the preprocessed data: the target is "debit" and
# "occupation" is removed so it cannot be used as a feature.
exp_reg_convention_debit_1 = setup(
    data=data_convention_prep.drop(columns=["occupation"]),
    target="debit",
    experiment_name="convention0",
    session_id=123,  # fixed session id so reruns are comparable
    normalize=True,
    transformation=True,
    transform_target=True,
    combine_rare_levels=True,
    rare_level_threshold=0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,debit
2,Original Data,"(9364, 7)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(6554, 48)"


In [50]:
# Compare the candidate regressors for the preprocessed flow experiment
# and keep the three best ones.
top3_debit_1 = compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,48.0148,4676.779,68.2342,0.9562,0.181,0.1266,0.651
catboost,CatBoost Regressor,49.5078,4924.1639,69.9883,0.9539,0.192,0.1325,3.95
rf,Random Forest Regressor,49.6976,5209.1388,71.9943,0.9513,0.205,0.1433,0.943
lightgbm,Light Gradient Boosting Machine,53.1491,5654.2976,75.0468,0.9471,0.2024,0.1472,0.108
et,Extra Trees Regressor,55.2381,6787.8971,82.1539,0.9365,0.2354,0.16,1.148
dt,Decision Tree Regressor,63.3897,8825.1804,93.7717,0.9174,0.2514,0.169,0.058
gbr,Gradient Boosting Regressor,77.7974,11020.1645,104.834,0.897,0.2783,0.2246,0.403
knn,K Neighbors Regressor,102.6445,18957.2135,137.5437,0.8228,0.3723,0.3238,1.341
ada,AdaBoost Regressor,152.4556,34697.8722,186.1444,0.6757,0.5089,0.5273,0.432
omp,Orthogonal Matching Pursuit,204.15,62420.7161,249.7531,0.416,0.6466,0.7443,0.035


In [51]:
# Train an XGBoost regressor on the current experiment, evaluated with
# 5 cross-validation folds; the fitted model is reused below to predict
# the hourly flow.
xgb_debit = create_model("xgboost", fold=5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,47.6367,4913.6875,70.0977,0.9548,0.18,0.1213
1,50.6769,5391.7954,73.4288,0.9483,0.193,0.1294
2,46.2621,4237.0791,65.0928,0.9597,0.1742,0.1232
3,48.191,4592.5718,67.7685,0.9564,0.1954,0.1318
4,51.2677,5056.6665,71.1102,0.9548,0.1878,0.1382
Mean,48.8069,4838.3601,69.4996,0.9548,0.1861,0.1288
SD,1.8856,395.3665,2.8568,0.0037,0.0079,0.0061


In [61]:
# Predict the hourly flow for every preprocessed row; both targets are
# dropped so that only the time-derived columns are passed to the model.
predict_model(
    xgb_debit,
    data=data_convention_prep.drop(columns=["occupation", "debit"]),
)

Unnamed: 0_level_0,datetime,year,month,day,hour,dayofweek,Label
Date et heure de comptage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-11-01 03:00:00+00:00,2019-11-01 03:00:00+00:00,2019,11,1,3,4,271.402832
2019-11-01 04:00:00+00:00,2019-11-01 04:00:00+00:00,2019,11,1,4,4,257.163116
2019-11-01 05:00:00+00:00,2019-11-01 05:00:00+00:00,2019,11,1,5,4,240.593979
2019-11-01 06:00:00+00:00,2019-11-01 06:00:00+00:00,2019,11,1,6,4,233.072845
2019-11-01 07:00:00+00:00,2019-11-01 07:00:00+00:00,2019,11,1,7,4,479.511536
...,...,...,...,...,...,...,...
2020-11-29 19:00:00+00:00,2020-11-29 19:00:00+00:00,2020,11,29,19,6,562.425354
2020-11-29 20:00:00+00:00,2020-11-29 20:00:00+00:00,2020,11,29,20,6,442.887360
2020-11-29 21:00:00+00:00,2020-11-29 21:00:00+00:00,2020,11,29,21,6,358.330597
2020-11-29 22:00:00+00:00,2020-11-29 22:00:00+00:00,2020,11,29,22,6,232.949326


In [30]:
# Occupancy experiment on the preprocessed data: the target is
# "occupation" and "debit" is removed so it cannot be used as a feature.
exp_reg_convention_occupation_1 = setup(
    data=data_convention_prep.drop(columns=["debit"]),
    target="occupation",
    experiment_name="convention0",
    session_id=123,  # fixed session id so reruns are comparable
    normalize=True,
    transformation=True,
    transform_target=True,
    combine_rare_levels=True,
    rare_level_threshold=0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,occupation
2,Original Data,"(9431, 7)"
3,Missing Values,True
4,Numeric Features,2
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(6558, 49)"


In [31]:
# Compare the candidate regressors for the preprocessed occupancy
# experiment and keep the three best ones.
top3_occupation_1 = compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,1.8142,19.4218,4.3907,0.7682,0.2629,0.2522,0.976
xgboost,Extreme Gradient Boosting,1.8892,19.9596,4.45,0.762,0.2668,0.2567,0.836
catboost,CatBoost Regressor,1.939,21.0455,4.5718,0.7486,0.2682,0.2588,4.224
lightgbm,Light Gradient Boosting Machine,1.9934,21.8785,4.6629,0.7388,0.2783,0.2709,0.168
et,Extra Trees Regressor,2.044,24.4662,4.9314,0.7068,0.3001,0.2979,1.289
dt,Decision Tree Regressor,2.3997,33.964,5.8063,0.5941,0.348,0.3521,0.049
gbr,Gradient Boosting Regressor,2.5989,36.4762,6.0193,0.5652,0.3628,0.3758,0.426
knn,K Neighbors Regressor,2.917,40.5978,6.3502,0.5161,0.4453,0.5185,0.146
br,Bayesian Ridge,3.0149,41.6982,6.4372,0.5028,0.44,0.5383,0.068
ridge,Ridge Regression,3.0152,41.7207,6.4389,0.5025,0.44,0.5383,0.044


Comme la précision est plus élevée pour le débit horaire que pour le taux d'occupation, il peut être intéressant de prédire d'abord le débit horaire, puis d'utiliser cette prédiction pour prédire le taux d'occupation. Attention : si l'on se trompe sur le débit horaire, on augmente encore l'erreur sur la prédiction du taux d'occupation.

Prochaine étape : regarder les résultats obtenus à partir d'un débit horaire prédit.

In [65]:
# Feature table for the two-stage approach: "occupation" stays as the
# target and the observed flow is replaced by the flow predicted by
# `xgb_debit` ("est_debit").
# NOTE(review): `xgb_debit` was fitted on an experiment built from this
# same `data_convention_prep`, so the predictions are largely in-sample;
# confirm this is acceptable before trusting the downstream scores.
data_convention_prep_and_debit = data_convention_prep.drop(columns=["debit"])
data_convention_prep_and_debit["est_debit"] = predict_model(
    xgb_debit,
    data=data_convention_prep.drop(columns=["occupation", "debit"]),
)["Label"]
data_convention_prep_and_debit.head()

Unnamed: 0_level_0,occupation,datetime,year,month,day,hour,dayofweek,est_debit
Date et heure de comptage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-11-01 03:00:00+00:00,1.67722,2019-11-01 03:00:00+00:00,2019,11,1,3,4,271.402832
2019-11-01 04:00:00+00:00,1.41056,2019-11-01 04:00:00+00:00,2019,11,1,4,4,257.163116
2019-11-01 05:00:00+00:00,1.35667,2019-11-01 05:00:00+00:00,2019,11,1,5,4,240.593979
2019-11-01 06:00:00+00:00,1.14056,2019-11-01 06:00:00+00:00,2019,11,1,6,4,233.072845
2019-11-01 07:00:00+00:00,1.85722,2019-11-01 07:00:00+00:00,2019,11,1,7,4,479.511536


In [66]:
# Occupancy experiment using the predicted flow ("est_debit") as an
# additional feature.
# NOTE(review): this rebinds `exp_reg_convention_occupation_1`, which an
# earlier cell already assigns for the experiment without "est_debit";
# a distinct name (e.g. a `_2` suffix, like `top3_occupation_2`) may have
# been intended — confirm before renaming.
exp_reg_convention_occupation_1 = setup(
    data=data_convention_prep_and_debit,
    target="occupation",
    experiment_name="convention0",
    session_id=123,  # fixed session id so reruns are comparable
    normalize=True,
    transformation=True,
    transform_target=True,
    combine_rare_levels=True,
    rare_level_threshold=0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,occupation
2,Original Data,"(9364, 8)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(6554, 49)"


In [67]:
# Compare the candidate regressors for the occupancy experiment that
# includes the predicted flow, and keep the three best ones.
top3_occupation_2 = compare_models(n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,1.7347,19.2062,4.365,0.7639,0.2325,0.1999,4.448
rf,Random Forest Regressor,1.7595,19.2741,4.3666,0.7638,0.2378,0.2077,1.417
et,Extra Trees Regressor,1.7742,19.7006,4.4152,0.7576,0.2431,0.2119,1.486
lightgbm,Light Gradient Boosting Machine,1.7582,19.9494,4.4493,0.7551,0.2343,0.1995,0.123
xgboost,Extreme Gradient Boosting,1.8061,20.6833,4.5269,0.7459,0.2415,0.2078,0.736
knn,K Neighbors Regressor,2.0349,24.3434,4.923,0.7011,0.2754,0.2607,0.145
gbr,Gradient Boosting Regressor,2.0697,27.6937,5.2412,0.6624,0.2705,0.2262,0.518
dt,Decision Tree Regressor,2.3247,32.8445,5.7213,0.593,0.3259,0.3009,0.061
lr,Linear Regression,2.444,33.531,5.7679,0.5921,0.3139,0.3434,1.464
ridge,Ridge Regression,2.4444,33.5579,5.7702,0.5918,0.3139,0.3434,0.041
