In [19]:
import pandas as pd
import numpy as np
import bentoml
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.multioutput import MultiOutputRegressor

In [30]:
df= pd.read_csv("modele_df.csv")

In [31]:
X = df[["natural_gas_binary","LargestPropertyUseTypeGFA","PropertyGFATotal","electricity_binary","PropertyGFABuilding",
        "ENERGYSTARScore","building_age","NumberofFloors"]]
y = df[["TotalGHGEmissions","SiteEnergyUse(kBtu)"]]
y_log = np.log1p(y)

In [32]:
X_train,X_test,y_train,y_test = train_test_split(X,y_log,test_size = 0.2, random_state = 42)

In [33]:
base_model = GradientBoostingRegressor(
    n_estimators = 800,
    learning_rate = 0.05,
    max_depth = 2,
    subsample = 0.8,
    max_features = "sqrt"
)

multi_model = MultiOutputRegressor(base_model)

In [34]:
multi_model.fit(X_train,y_train)

0,1,2
,estimator,GradientBoost...subsample=0.8)
,n_jobs,

0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,800
,subsample,0.8
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,2
,min_impurity_decrease,0.0


In [35]:
# pour test metrics 
y_predict_test = multi_model.predict(X_test)
print({"r2_score":r2_score(y_test,y_predict_test),
      "MSE":mean_squared_error(y_test,y_predict_test),
      "MAE":mean_absolute_error(y_test,y_predict_test)})

{'r2_score': 0.7717622146623855, 'MSE': 0.2194831086011999, 'MAE': 0.34224460313234895}


In [36]:
# for train metrics 
y_predict_train = multi_model.predict(X_train)
print({"r2_score":r2_score(y_train,y_predict_train),
      "MSE":mean_squared_error(y_train,y_predict_train),
      "MAE":mean_absolute_error(y_train,y_predict_train)})

{'r2_score': 0.8333150101149609, 'MSE': 0.17093961339194536, 'MAE': 0.30585722288003836}


In [37]:
bentoml.sklearn.save_model("consommation_model",multi_model)


Model(tag="consommation_model:54xigeeogkdkdlg6", path="/var/folders/g4/lxbpq_7j1dbg6bt38hmrfgsr0000gn/T/bentoml-model-consommation_model-adsa56w2")

In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822 entries, 0 to 2821
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   natural_gas_binary         2822 non-null   int64  
 1   LargestPropertyUseTypeGFA  2822 non-null   float64
 2   PropertyGFATotal           2822 non-null   int64  
 3   electricity_binary         2822 non-null   int64  
 4   PropertyGFABuilding        2822 non-null   int64  
 5   ENERGYSTARScore            2822 non-null   float64
 6   building_age               2822 non-null   int64  
 7   NumberofFloors             2822 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 176.5 KB


In [10]:
X

Unnamed: 0,natural_gas_binary,LargestPropertyUseTypeGFA,PropertyGFATotal,electricity_binary,PropertyGFABuilding(s),ENERGYSTARScore,building_age,NumberofFloors
0,1,88434.0,88434,1,88434,60.0,89,12
1,1,83880.0,103566,1,88502,61.0,20,11
2,1,61320.0,61320,1,61320,56.0,90,10
3,1,81352.0,83008,1,83008,27.0,90,11
4,1,102761.0,102761,1,102761,75.0,90,8
...,...,...,...,...,...,...,...,...
2817,1,18261.0,18261,0,18261,75.0,34,1
2818,1,16000.0,16000,0,16000,75.0,12,1
2819,1,7583.0,13157,1,13157,75.0,42,1
2820,1,6601.0,14101,0,14101,75.0,27,1


In [39]:
y_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822 entries, 0 to 2821
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TotalGHGEmissions    2822 non-null   float64
 1   SiteEnergyUse(kBtu)  2822 non-null   float64
dtypes: float64(2)
memory usage: 44.2 KB
