In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

from sklearn.model_selection import learning_curve

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

from sklearn.inspection import permutation_importance
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

import pickle

In [2]:
df = pd.read_csv(r'C:\Users\Admin\Documents\marianneSimplon\simplon\prediction_CO2\data\model.csv',delimiter=',', decimal='.')

In [3]:
df = df.drop(['GHGEmissionsIntensity','Unnamed: 0','ListOfAllPropertyUseTypes','DefaultData','CouncilDistrictCode','Latitude','Longitude','SiteEnergyUseWN(kBtu)','ComplianceStatus','Neighborhood','YearBuilt'], axis=1)

In [4]:
df

Unnamed: 0,BuildingType,PrimaryPropertyType,NumberofFloors,PropertyGFAParking,LargestPropertyUseType,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,SourceEUI(kBtu/sf),SteamUse(kBtu),Electricity(kBtu),TotalGHGEmissions
0,NonResidential,Hotel,12,0,Hotel,Rien,28444.075817,182.500000,2003882.00,3.946027e+06,249.98
1,NonResidential,Hotel,11,15064,Hotel,Parking,15064.000000,176.100006,0.00,3.242851e+06,295.86
2,NonResidential,Hotel,41,196718,Hotel,Rien,28444.075817,241.899994,21566554.00,4.952666e+07,2089.28
3,NonResidential,Hotel,10,0,Hotel,Rien,28444.075817,216.199997,2214446.25,2.768924e+06,286.43
4,NonResidential,Hotel,18,62000,Hotel,Parking,68009.000000,211.399994,0.00,5.368607e+06,505.01
...,...,...,...,...,...,...,...,...,...,...,...
3370,Nonresidential COS,Office,1,0,Office,Rien,28444.075817,161.699997,0.00,5.242709e+05,20.94
3371,Nonresidential COS,Other,1,0,Other - Recreation,Rien,28444.075817,114.199997,0.00,3.965461e+05,32.17
3372,Nonresidential COS,Other,1,0,Other - Recreation,Fitness Center/Health Club/Gym,5574.000000,744.799988,0.00,1.792159e+06,223.54
3373,Nonresidential COS,Mixed Use Property,1,0,Other - Recreation,Fitness Center/Health Club/Gym,6501.000000,105.300003,0.00,3.488702e+05,22.11


In [10]:
df.columns.tolist()

['BuildingType',
 'PrimaryPropertyType',
 'NumberofFloors',
 'PropertyGFAParking',
 'LargestPropertyUseType',
 'SecondLargestPropertyUseType',
 'SecondLargestPropertyUseTypeGFA',
 'SourceEUI(kBtu/sf)',
 'SteamUse(kBtu)',
 'Electricity(kBtu)',
 'TotalGHGEmissions']

# Processing

In [11]:
y = df['TotalGHGEmissions']
X = df.drop('TotalGHGEmissions', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

# Pipelines

In [12]:
numeric_transformer = Pipeline(steps=[
       ('scaler', MinMaxScaler())
])
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first',handle_unknown = 'ignore'))
])

In [13]:
cat = X.select_dtypes(include=["object"])
non_cat = X.select_dtypes(exclude=["object"])
noncatY = df.select_dtypes(exclude=["object"])

# cat['CouncilDistrictCode'] = non_cat['CouncilDistrictCode']

In [14]:
numeric_features = non_cat.columns.values.tolist()
categorical_features = cat.columns.values.tolist()

In [15]:
numeric_features

['NumberofFloors',
 'PropertyGFAParking',
 'SecondLargestPropertyUseTypeGFA',
 'SourceEUI(kBtu/sf)',
 'SteamUse(kBtu)',
 'Electricity(kBtu)']

In [16]:
categorical_features

['BuildingType',
 'PrimaryPropertyType',
 'LargestPropertyUseType',
 'SecondLargestPropertyUseType']

In [69]:
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

In [70]:
preprocess = Pipeline(steps=[('preprocessor', preprocessor)])

In [71]:
training_transformed = preprocess.fit_transform(X_train)
pd.DataFrame(training_transformed)

Unnamed: 0,0
0,"(0, 0)\t0.12698412698412698\n (0, 2)\t0.044..."
1,"(0, 0)\t0.047619047619047616\n (0, 2)\t0.00..."
2,"(0, 0)\t0.047619047619047616\n (0, 2)\t0.04..."
3,"(0, 0)\t0.06349206349206349\n (0, 2)\t0.044..."
4,"(0, 0)\t0.09523809523809523\n (0, 2)\t0.003..."
...,...
2526,"(0, 0)\t0.047619047619047616\n (0, 2)\t0.04..."
2527,"(0, 0)\t0.047619047619047616\n (0, 2)\t0.00..."
2528,"(0, 0)\t0.031746031746031744\n (0, 2)\t0.02..."
2529,"(0, 0)\t0.031746031746031744\n (0, 2)\t0.04..."


# Model Selection

### Regression lineaire

In [72]:
pipe_reglin = Pipeline(steps = [
                ('preprocessor', preprocessor),
                ('linear', LinearRegression())
           ])

In [73]:
pipe_reglin.fit(X_train,y_train)
pipe_reglin.score(X_train,y_train)

0.7338966157679389

In [74]:
pipe_reglin_ypred = pipe_reglin.predict(X_test)
print(f"MSE : {mean_squared_error(y_test, pipe_reglin_ypred)}")
print(f"RMSE : {np.sqrt(mean_squared_error(y_test, pipe_reglin_ypred))}")
print(f"MAE : {mean_absolute_error(y_test, pipe_reglin_ypred)}")

MSE : 80550.45993470763
RMSE : 283.81412920203184
MAE : 105.70958487132911




In [75]:
# Perform Permutation
permutation_score = permutation_importance(pipe_reglin, 
                                           X_test, y_test,
                                           scoring='neg_mean_absolute_error',
                                           random_state=3,
                                           n_repeats=100) 

# Unstack results
importance_df = pd.DataFrame(np.vstack((X.columns,
                                        permutation_score.importances_mean)).T) 

importance_df.columns=['feature','feature importance']

# Order by importance
importance_df.sort_values(by="feature importance", ascending = False) 

















































Unnamed: 0,feature,feature importance
1,PrimaryPropertyType,165.295
4,LargestPropertyUseType,145.204
6,SecondLargestPropertyUseTypeGFA,50.3743
8,SteamUse(kBtu),28.4225
9,Electricity(kBtu),27.8297
0,BuildingType,18.0686
5,SecondLargestPropertyUseType,17.0739
3,PropertyGFAParking,8.98257
2,NumberofFloors,6.55458
7,SourceEUI(kBtu/sf),6.54545


In [76]:
importance_df.to_csv('inportance_df.csv')

'ComplianceStatus','Neighborhood','YearBuilt'

In [79]:
train_sizes, train_scores, test_scores = learning_curve(estimator = pipe_reglin,
                                          X = X,
                                          y = y,
                                          train_sizes = [5,10,20,30,40,50,100,200,300,400],
                                          cv = 5,
                                          scoring='neg_mean_absolute_error',
                                          shuffle = True,
                                          random_state=0)



