# Fit Gaussian Process Regression to Energy efficiency data set

## Load Dependencies

In [31]:
import numpy as np
from scipy.stats import uniform

from sklearn.compose import make_column_transformer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, Matern, RationalQuadratic, RBF
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Load Data

In [2]:
%run ../data/data.py

In [3]:
df = load_energy_data()

In [4]:
df.head()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Relative Compactness       768 non-null    float64
 1   Surface Area               768 non-null    float64
 2   Wall Area                  768 non-null    float64
 3   Roof Area                  768 non-null    float64
 4   Overall Height             768 non-null    float64
 5   Orientation                768 non-null    int64  
 6   Glazing Area               768 non-null    float64
 7   Glazing Area Distribution  768 non-null    int64  
 8   Heating Load               768 non-null    float64
 9   Cooling Load               768 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 60.1 KB


In [6]:
y1 = df['Heating Load']
y2 = df['Cooling Load']
X = df.drop(columns=['Heating Load', 'Cooling Load'])

## Fit Gaussian Process regression

In [27]:
numeric_features = X.select_dtypes('float').columns
categorical_features = X.select_dtypes(np.int64).columns

In [28]:
features_pipeline = make_column_transformer((StandardScaler(), numeric_features), (OneHotEncoder(), categorical_features))

In [29]:
gp = GaussianProcessRegressor()

In [30]:
model = make_pipeline(features_pipeline, gp)

In [35]:
param_grid = {
    'gaussianprocessregressor__kernel': [ConstantKernel(), DotProduct(), Matern(), RationalQuadratic(), RBF(),
    ConstantKernel()*DotProduct(), ConstantKernel()*Matern(), ConstantKernel()*RationalQuadratic(), ConstantKernel()*RBF(),
    ConstantKernel()*RBF()*DotProduct(), ConstantKernel()*RBF()*Matern(), ConstantKernel()*RBF()*RationalQuadratic(), ConstantKernel()*RBF()*RBF()
    ]
}

In [39]:
param_grid = {'gaussianprocessregressor__kernel': [RBF(), ConstantKernel()*RBF()]}

In [40]:
clf = GridSearchCV(model, param_grid, n_jobs=-1, cv=5, verbose=5)

In [41]:
clf.fit(X,y1)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('standardscaler',
                                                                         StandardScaler(),
                                                                         Index(['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area',
       'Overall Height', 'Glazing Area'],
      dtype='object')),
                                                                        ('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         Index(['Orientation', 'Glazing Area Distribution'], dtype='object'))])),
                                       ('gaussianprocessregressor',
                                        GaussianProcessRegressor())]),
             n_jobs=-1,
             param_grid={'gaussianpro

In [42]:
clf.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  Index(['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area',
       'Overall Height', 'Glazing Area'],
      dtype='object')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  Index(['Orientation', 'Glazing Area Distribution'], dtype='object'))])),
                ('gaussianprocessregressor',
                 GaussianProcessRegressor(kernel=RBF(length_scale=1)))])

In [52]:
clf.best_estimator_.steps[1][1].kernel.get_params()

{'length_scale': 1.0, 'length_scale_bounds': (1e-05, 100000.0)}

In [20]:
from sklearn.svm import SVC

In [21]:
Categorical([SVC()])

Categorical(categories=(SVC(),), prior=None)

In [22]:
Categorical([RBF()], name='rbf')

TypeError: unhashable type: 'RBF'

In [23]:
param_distributions = [
    {
        'gaussianprocessregressor__kernel': Categorical([ConstantKernel()]),
        'gaussianprocessregressor__kernel__constant_value': Real(1e-5, 1e5, name='constant_value')
    },
    {
        'gaussianprocessregressor__kernel': Categorical([DotProduct()]),
        'gaussianprocessregressor__kernel__sigma_0': Real(1e-5, 1e5, name='sigma_0')
    },
    {
        'gaussianprocessregressor__kernel': Categorical([Matern()]),
        'gaussianprocessregressor__kernel__length_scale': Real(1e-5, 1e5, name='length_scale'),
        'gaussianprocessregressor__kernel__nu': [0.5, 1.5, 2.5, np.inf]
    },
    {
        'gaussianprocessregressor__kernel': Categorical([RationalQuadratic()]),
        'gaussianprocessregressor__kernel__length_scale': Real(1e-5, 1e5, name='length_scale'),
        'gaussianprocessregressor__kernel__alpha': Real(1e-5, 1e5, name='alpha')
    },
    {
        'gaussianprocessregressor__kernel': Categorical([RBF()]),
        'gaussianprocessregressor__kernel__length_scale': Real(1e-5, 1e5, name='length_scale')
    }
]

TypeError: unhashable type: 'ConstantKernel'

In [24]:
param_distributions = [
    {
        'gaussianprocessregressor__kernel': [ConstantKernel()],
        'gaussianprocessregressor__kernel__constant_value': Real(1e-5, 1e5, name='constant_value')
    },
    {
        'gaussianprocessregressor__kernel': [DotProduct()],
        'gaussianprocessregressor__kernel__sigma_0': Real(1e-5, 1e5, name='sigma_0')
    },
    {
        'gaussianprocessregressor__kernel': [Matern()],
        'gaussianprocessregressor__kernel__length_scale': Real(1e-5, 1e5, name='length_scale'),
        'gaussianprocessregressor__kernel__nu': [0.5, 1.5, 2.5, np.inf]
    },
    {
        'gaussianprocessregressor__kernel': [RationalQuadratic()],
        'gaussianprocessregressor__kernel__length_scale': Real(1e-5, 1e5, name='length_scale'),
        'gaussianprocessregressor__kernel__alpha': Real(1e-5, 1e5, name='alpha')
    },
    {
        'gaussianprocessregressor__kernel': [RBF()],
        'gaussianprocessregressor__kernel__length_scale': Real(1e-5, 1e5, name='length_scale')
    }
]

In [25]:
opt = BayesSearchCV(estimator=model, search_spaces=param_distributions, n_iter=10, n_jobs=-1, cv=5, random_state=142)

TypeError: unhashable type: 'ConstantKernel'

In [71]:
clf.fit(X,y1)



LinAlgError: ("The kernel, RationalQuadratic(alpha=9.88e+04, length_scale=2.77e+04), is not returning a positive definite matrix. Try gradually increasing the 'alpha' parameter of your GaussianProcessRegressor estimator.", '66-th leading minor of the array is not positive definite')

In [55]:
model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  Index(['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area',
       'Overall Height', 'Glazing Area'],
      dtype='object')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  Index(['Orientation', 'Glazing Area Distribution'], dtype='object'))])),
                ('gaussianprocessregressor', GaussianProcessRegressor())])

In [6]:
df['Glaxing Area Distribution'].value_counts()

1    144
2    144
3    144
4    144
5    144
0     48
Name: Glaxing Area Distribution, dtype: int64