# Baseline Model

This notebook fits a linear regression model to the data-driven feature set, giving a baseline performance for comparison.

In [1]:
import numpy as np
import pandas as pd
import datetime
import os
import glob
import json
import matplotlib.pyplot as plt

In [2]:
# Read file locations and file names from the JSON config that lives one
# directory above the notebook's working directory
config_path = os.path.abspath('..')

with open(config_path + '/config-example.json', 'r') as f:
    config = json.load(f)

# Every setting used below sits in the DEFAULT section of the config
default_section = config['DEFAULT']

processing_path = default_section['processing_path']
epc_train_clean_fname = default_section['epc_train_clean_fname']
epc_test_clean_fname = default_section['epc_test_clean_fname']
epc_train_dd_fname = default_section['epc_train_dd_fname']
epc_test_dd_fname = default_section['epc_test_dd_fname']
epc_fname_suffix = default_section['epc_fname_suffix']

In [94]:
# Load the data-driven train and test feature sets; each file name is the
# configured base name followed by the shared suffix
epc_train = pd.read_csv(
    os.path.join(processing_path, epc_train_dd_fname) + epc_fname_suffix,
    header=0,
    sep=',',
)
epc_test = pd.read_csv(
    os.path.join(processing_path, epc_test_dd_fname) + epc_fname_suffix,
    header=0,
    sep=',',
)

  epc_train = pd.read_csv(os.path.join(processing_path,epc_train_dd_fname) + epc_fname_suffix,header = 0,delimiter = ',')
  epc_test = pd.read_csv(os.path.join(processing_path,epc_test_dd_fname) + epc_fname_suffix,header = 0,delimiter = ',')


In [95]:
# Remove identifier/location columns that are not used as model features.
# NOTE(review): 'inspection_year' is dropped from the training frame only -
# confirm the test file does not contain that column.
epc_train = epc_train.drop(columns=['BUILDING_REFERENCE_NUMBER', 'COUNTY', 'inspection_year'])
epc_test = epc_test.drop(columns=['BUILDING_REFERENCE_NUMBER', 'COUNTY'])

### one hot encode categorical values

In [96]:
# List the columns left in the training frame after the drop above
epc_train.columns

Index(['CURRENT_ENERGY_EFFICIENCY', 'TOTAL_FLOOR_AREA', 'MAINS_GAS_FLAG',
       'HEAT_LOSS_CORRIDOR', 'built_form', 'energy_tariff',
       'floor_description', 'floor_level', 'glazed_type',
       'hotwater_description', 'lighting_description', 'mainheat_controls',
       'property_type', 'roof_description', 'transaction_type',
       'walls_description', 'window_description', 'locality', 'extension',
       'floor_height', 'habitable_rooms', 'open_fireplaces'],
      dtype='object')

In [97]:
# for col in ['MAINS_GAS_FLAG','HEAT_LOSS_CORRIDOR','built_form','energy_tariff','floor_description','floor_level',
#             'glazed_type','hotwater_description','lighting_description','mainheat_controls','property_type',
#             'roof_description','transaction_type','walls_description','window_description','locality','extension',
#             'floor_height','habitable_rooms','open_fireplaces']:
#     print(col)
#     for_dummy = epc_train.pop(col)
#     epc_train = pd.concat([epc_train, pd.get_dummies(for_dummy, prefix=col)], axis=1)

In [98]:
# for col in ['MAINS_GAS_FLAG','HEAT_LOSS_CORRIDOR','built_form','energy_tariff','floor_description','floor_level',
#             'glazed_type','hotwater_description','lighting_description','mainheat_controls','property_type',
#             'roof_description','transaction_type','walls_description','window_description','locality','extension',
#             'floor_height','habitable_rooms','open_fireplaces']:
#     print(col)
#     for_dummy = epc_test.pop(col)
#     epc_test = pd.concat([epc_test, pd.get_dummies(for_dummy, prefix=col)], axis=1)

In [99]:
def process_data_for_modelling(training_data, test_data, drop_first=False):

    '''
    Prepares the data for modelling by one hot encoding the categorical
    (object dtype) features of the training and test data.

    The two dataframes are stacked before encoding so that both outputs end up
    with the same dummy columns, even when a category only appears in one of
    them. Neither input dataframe is modified. The name of every encoded
    column is printed.

    Parameters
      training_data: a dataframe of training data
      test_data: a dataframe of test data
      drop_first: if True, the first level of every encoded feature is dropped
        so that the dummy columns of one feature are not perfectly collinear.
        Defaults to False, which keeps one dummy column per level.
    Returns two dataframes (training, test), each indexed from 0
    '''

    n_train = len(training_data)

    # pd.concat builds a new frame, so the callers' dataframes stay untouched
    epc = pd.concat([training_data, test_data]).reset_index(drop=True)

    categorical_cols = list(epc.dtypes[epc.dtypes == 'object'].index)
    for col in categorical_cols:
        print(col)

    if categorical_cols:
        # Encode everything in one call: untouched columns keep their order and
        # the dummy columns are appended in the order of categorical_cols
        epc = pd.get_dummies(epc, columns=categorical_cols, drop_first=drop_first)

    # Split back into train and test by position (training rows were stacked first)
    inputs_train = epc.iloc[:n_train].copy()
    inputs_test = epc.iloc[n_train:].reset_index(drop=True)

    return inputs_train, inputs_test

In [100]:
# One hot encode train and test together; the function prints each column it encodes
inputs_train, inputs_test = process_data_for_modelling(epc_train, epc_test)

MAINS_GAS_FLAG
HEAT_LOSS_CORRIDOR
built_form
energy_tariff
floor_description
floor_level
glazed_type
hotwater_description
lighting_description
mainheat_controls
property_type
roof_description
transaction_type
walls_description
window_description
locality
extension
floor_height
habitable_rooms
open_fireplaces


In [101]:
# Preview the first rows of the encoded training frame (target column still included)
inputs_train.head()

Unnamed: 0,CURRENT_ENERGY_EFFICIENCY,TOTAL_FLOOR_AREA,MAINS_GAS_FLAG_N,MAINS_GAS_FLAG_Y,HEAT_LOSS_CORRIDOR_heated corridor,HEAT_LOSS_CORRIDOR_no corridor,HEAT_LOSS_CORRIDOR_unheated corridor,built_form_Enclosed End-Terrace,built_form_Enclosed Mid-Terrace,built_form_detached,...,floor_height_2.7+,habitable_rooms_1,habitable_rooms_2,habitable_rooms_3,habitable_rooms_4,habitable_rooms_5,habitable_rooms_6+,open_fireplaces_0,open_fireplaces_1,open_fireplaces_2+
0,69,129.0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
1,61,66.0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,29,290.0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,1
3,55,130.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
4,77,99.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [102]:
# Preview the first rows of the encoded test frame (target column still included)
inputs_test.head()

Unnamed: 0,CURRENT_ENERGY_EFFICIENCY,TOTAL_FLOOR_AREA,MAINS_GAS_FLAG_N,MAINS_GAS_FLAG_Y,HEAT_LOSS_CORRIDOR_heated corridor,HEAT_LOSS_CORRIDOR_no corridor,HEAT_LOSS_CORRIDOR_unheated corridor,built_form_Enclosed End-Terrace,built_form_Enclosed Mid-Terrace,built_form_detached,...,floor_height_2.7+,habitable_rooms_1,habitable_rooms_2,habitable_rooms_3,habitable_rooms_4,habitable_rooms_5,habitable_rooms_6+,open_fireplaces_0,open_fireplaces_1,open_fireplaces_2+
0,71,73.0,0,1,0,0,1,0,0,1,...,0,0,0,1,0,0,0,1,0,0
1,65,53.0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,81,122.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,57,78.87,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,64,81.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [59]:
# Separate the regression target (CURRENT_ENERGY_EFFICIENCY) from the feature
# columns, for the training frame and then for the test frame
target_train = inputs_train.loc[:, 'CURRENT_ENERGY_EFFICIENCY']
inputs_train = inputs_train.drop(columns=['CURRENT_ENERGY_EFFICIENCY'])
target_test = inputs_test.loc[:, 'CURRENT_ENERGY_EFFICIENCY']
inputs_test = inputs_test.drop(columns=['CURRENT_ENERGY_EFFICIENCY'])

In [60]:
# Preview the training features now that the target column has been removed
inputs_train.head()

Unnamed: 0,TOTAL_FLOOR_AREA,MAINS_GAS_FLAG_N,MAINS_GAS_FLAG_Y,HEAT_LOSS_CORRIDOR_heated corridor,HEAT_LOSS_CORRIDOR_no corridor,HEAT_LOSS_CORRIDOR_unheated corridor,built_form_Enclosed End-Terrace,built_form_Enclosed Mid-Terrace,built_form_detached,built_form_terraced,...,floor_height_2.7+,habitable_rooms_1,habitable_rooms_2,habitable_rooms_3,habitable_rooms_4,habitable_rooms_5,habitable_rooms_6+,open_fireplaces_0,open_fireplaces_1,open_fireplaces_2+
0,129.0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
1,66.0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
2,290.0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
3,130.0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
4,99.0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0


In [61]:
# Preview the test features now that the target column has been removed
inputs_test.head()

Unnamed: 0,TOTAL_FLOOR_AREA,MAINS_GAS_FLAG_N,MAINS_GAS_FLAG_Y,HEAT_LOSS_CORRIDOR_heated corridor,HEAT_LOSS_CORRIDOR_no corridor,HEAT_LOSS_CORRIDOR_unheated corridor,built_form_Enclosed End-Terrace,built_form_Enclosed Mid-Terrace,built_form_detached,built_form_terraced,...,floor_height_2.7+,habitable_rooms_1,habitable_rooms_2,habitable_rooms_3,habitable_rooms_4,habitable_rooms_5,habitable_rooms_6+,open_fireplaces_0,open_fireplaces_1,open_fireplaces_2+
0,73.0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,53.0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
2,122.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,78.87,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
4,81.0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0


### scale numeric values 

In [62]:
from sklearn.preprocessing import MinMaxScaler
# Module-level scaler that scale_numeric refers to by name; configured with feature_range (0, 1)
scaler = MinMaxScaler(feature_range=(0, 1))

In [63]:
def scale_numeric(df, col, fit=True):

    '''
    Scales the non-null values of the specified column with the module-level
    scaler called scaler. Null values are left as they are.

    Parameters
      df: a dataframe; the column is overwritten on this same object
      col: numeric variable to scale
      fit: if True (default) the scaler is re-fitted on this column before the
        values are transformed. If False the scaler is used as it is, so it
        must already have been fitted - e.g. pass fit=False to scale test data
        with the parameters learned from the training data.
    Returns the same dataframe
    '''

    not_null = df[col].notnull()

    # Nothing to scale; fitting the scaler on zero rows would raise an error
    if not not_null.any():
        return df

    # The scaled values are floats, so make sure the column can hold them
    if not pd.api.types.is_float_dtype(df[col]):
        df[col] = df[col].astype(float)

    values = df.loc[not_null, [col]]
    scaled = scaler.fit_transform(values) if fit else scaler.transform(values)
    df.loc[not_null, [col]] = scaled

    return df

In [64]:
# Fit the scaler on the training data and scale the training column
inputs_train = scale_numeric(inputs_train,'TOTAL_FLOOR_AREA')

# Scale the test column with the parameters learned from the training data.
# Passing the test frame to scale_numeric as well would re-fit the scaler on
# the test set, so the same floor area could map to different scaled values
# in train and test.
test_area_known = inputs_test['TOTAL_FLOOR_AREA'].notnull()
if test_area_known.any():
    inputs_test.loc[test_area_known, ['TOTAL_FLOOR_AREA']] = scaler.transform(
        inputs_test.loc[test_area_known, ['TOTAL_FLOOR_AREA']]
    )

In [73]:
# Preview the training features after TOTAL_FLOOR_AREA has been scaled
inputs_train.head()

Unnamed: 0,TOTAL_FLOOR_AREA,MAINS_GAS_FLAG_N,MAINS_GAS_FLAG_Y,HEAT_LOSS_CORRIDOR_heated corridor,HEAT_LOSS_CORRIDOR_no corridor,HEAT_LOSS_CORRIDOR_unheated corridor,built_form_Enclosed End-Terrace,built_form_Enclosed Mid-Terrace,built_form_detached,built_form_terraced,...,floor_height_2.7+,habitable_rooms_1,habitable_rooms_2,habitable_rooms_3,habitable_rooms_4,habitable_rooms_5,habitable_rooms_6+,open_fireplaces_0,open_fireplaces_1,open_fireplaces_2+
0,0.444828,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
1,0.227586,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
2,1.0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
3,0.448276,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
4,0.341379,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0


In [65]:
# Preview the test features after TOTAL_FLOOR_AREA has been scaled
inputs_test.head()

Unnamed: 0,TOTAL_FLOOR_AREA,MAINS_GAS_FLAG_N,MAINS_GAS_FLAG_Y,HEAT_LOSS_CORRIDOR_heated corridor,HEAT_LOSS_CORRIDOR_no corridor,HEAT_LOSS_CORRIDOR_unheated corridor,built_form_Enclosed End-Terrace,built_form_Enclosed Mid-Terrace,built_form_detached,built_form_terraced,...,floor_height_2.7+,habitable_rooms_1,habitable_rooms_2,habitable_rooms_3,habitable_rooms_4,habitable_rooms_5,habitable_rooms_6+,open_fireplaces_0,open_fireplaces_1,open_fireplaces_2+
0,0.251724,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,0.182759,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
2,0.42069,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.271966,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
4,0.27931,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0


In [66]:
# Preview the first values of the training target
target_train.head()

0    69
1    61
2    29
3    55
4    77
Name: CURRENT_ENERGY_EFFICIENCY, dtype: int64

### Check for missing values

In [67]:
# Share of rows in epc_train with a missing TOTAL_FLOOR_AREA (null count divided by row count)
epc_train[['TOTAL_FLOOR_AREA']].isnull().sum().sort_values(ascending = False) / epc_train.shape[0]

TOTAL_FLOOR_AREA    0.0
dtype: float64

### Train the model

In [68]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [69]:
# Initialise the linear regression model
reg = LinearRegression()
# Train it on the training features against the training target
reg.fit(X=inputs_train, y=target_train)

In [70]:
#View the fitted coefficients
# NOTE(review): the saved output shows a few coefficients of order 1e10 next to
# values of order 1 - possibly caused by collinear one hot columns; confirm.
print('Coefficients: \n', reg.coef_)

#View the intercept
print(reg.intercept_)

Coefficients: 
 [ 2.97001370e+00  4.68188379e-01  1.18058596e+01  1.11238790e+01
  5.02013178e+00  6.48582992e+00  4.62390934e-01  3.60531640e+00
 -1.58840526e+00  1.11032033e+00  6.53507339e-02 -4.01886089e-01
  2.30306608e+00  9.08568264e-01  5.26359804e-01  3.80389850e-01
 -9.75459852e+00 -9.19881323e+00 -7.70260582e+10  5.85010880e+10
  9.96802026e-01 -1.53387770e+00 -4.16896835e+00 -3.35271512e+00
  1.40319403e+00  3.82437251e+00  1.05086866e+00 -9.91911929e+00
  1.30297535e+00  1.00782649e+00 -1.52649651e+01 -2.56960700e+00
 -2.86139309e+00 -1.24367428e+00 -1.26676679e+00 -9.43396211e-01
 -7.40648597e-01  6.74310148e-01  3.63006142e-01  9.62496039e-01
  2.37751531e-01  2.06241767e+00 -3.38046265e+00 -1.23835325e+00
 -2.09669163e-01 -4.39980565e+00  1.71770122e+00 -1.15860326e+01
  3.93445347e+00  2.75814366e+00  3.12227846e+00 -8.79812284e-01
 -8.36172212e-01  2.01588307e+00 -3.91760439e+00 -6.23751850e-01
  4.49085364e+00 -5.10718337e-01  3.59729054e+00 -4.05010064e+10
 -8.75609

In [71]:
# Score the fitted model on the data it was trained on.
# The value printed as 'Variance score' is the R squared from r2_score.
predict_train = reg.predict(inputs_train)
train_r_squared = r2_score(y_true=target_train, y_pred=predict_train)
train_mse = mean_squared_error(y_true=target_train, y_pred=predict_train)
print('Variance score: %.4f' % train_r_squared)
print("Mean squared error: %.4f" % train_mse)

Variance score: 0.6833
Mean squared error: 67.0523


In [75]:
# Score the fitted model on the held-out test data.
# The value printed as 'Variance score' is the R squared from r2_score.
predict_test = reg.predict(inputs_test)
test_r_squared = r2_score(y_true=target_test, y_pred=predict_test)
test_mse = mean_squared_error(y_true=target_test, y_pred=predict_test)
print('Variance score: %.4f' % test_r_squared)
print("Mean squared error: %.4f" % test_mse)

Variance score: -430050539301837.6250
Mean squared error: 91145503376776192.0000
