In [4]:
import json # will be needed for saving preprocesing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects

# Load the dataset straight from its URL; skipinitialspace=True makes the
# parser skip the spaces that follow each delimiter.
df = pd.read_csv(
    "https://carbubu.fr/includes/essential/source/data_twoyears.csv",
    skipinitialspace=True,
)

# Every column except 'date' is a model input; 'date' is the target.
x_cols = list(df.columns[df.columns != 'date'])
X = df[x_cols]
y = df['date']

# Preview the first rows of the raw data.
df.head()

# Train / test split.
# test_size=0.3 holds out 30% of the rows for testing, so 70% of the dataset
# is used for training. random_state fixes the shuffle so the split is the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

# Impute missing values with the most frequent value of each column.
# mode() may return several rows when values tie, so only the first row is
# kept. The modes are computed on the training split only and kept in
# train_mode so the same fill values can be reused later.
most_frequent = X_train.mode().iloc[0]
train_mode = dict(most_frequent)
print(train_mode)
X_train = X_train.fillna(value=train_mode)

# One LabelEncoder per encoded column, keyed by column name so each fitted
# encoder can be looked up again later.
encoders = {
    column: LabelEncoder()
    for column in ['id', 'gazole', 'sp95', 'sp98', 'e10', 'gplc', 'e85']
}

# Fit each encoder on its training column and replace the column with the
# resulting integer codes.
# NOTE(review): a fitted LabelEncoder raises on values it did not see during
# fit -- confirm that whatever reuses these encoders handles unseen values.
for column, categorical_convert in encoders.items():
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    
# Fit a 100-tree random forest on the preprocessed training data.
# fit() returns the estimator itself, so construction and fitting are chained.
# No random_state is passed, so the fitted model can differ between runs.
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Fit a 100-tree extra-trees ensemble on the same training data.
et = ExtraTreesClassifier(n_estimators=100).fit(X_train, y_train)

# Persist the preprocessing objects (imputation values and fitted label
# encoders) and both trained models (random forest and extra trees) as
# compressed joblib files in the current working directory.
# Each joblib.dump call returns the list of file names it wrote.
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

{'id': 1.0, 'gazole': 1.799, 'sp95': 1.879, 'sp98': 1.99, 'e10': 1.829, 'gplc': 0.989, 'e85': 1.119}


['./extra_trees.joblib']