In [1]:
# basic i/o
import os

# data structures
import numpy as np
import pandas as pd

# cleaning data
from scipy import sparse
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# regressors
from sklearn.ensemble import RandomForestRegressor

# getting rid of annoying warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
FILE_NAME = 'energy.csv'

def load_data(file_name=FILE_NAME, ratio=0.9):
    data = pd.read_csv(file_name).sample(frac=1)
    
    slice_ind = int(data.shape[0]*ratio)
    
    return data.iloc[0:slice_ind], data.iloc[slice_ind:]

In [3]:
_data = load_data(ratio=1)[0]
COST_COLS = _data.iloc[:,644:671].columns
VAR_COLS = _data.columns.difference(COST_COLS)

In [4]:
_data.shape

(5686, 759)

In [5]:
_data[COST_COLS].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 4723 to 5651
Data columns (total 27 columns):
DOLLAREL        5686 non-null float64
DOLELSPH        5686 non-null float64
DOLELCOL        5686 non-null float64
DOLELWTH        5686 non-null float64
DOLELRFG        5686 non-null float64
DOLELRFG1       5686 non-null float64
DOLELRFG2       5686 non-null float64
DOLELFRZ        5686 non-null float64
DOLELCOK        5686 non-null float64
DOLELMICRO      5686 non-null float64
DOLELCW         5686 non-null float64
DOLELCDR        5686 non-null float64
DOLELDWH        5686 non-null float64
DOLELLGT        5686 non-null float64
DOLELTVREL      5686 non-null float64
DOLELTV1        5686 non-null float64
DOLELTV2        5686 non-null float64
DOLELAHUHEAT    5686 non-null float64
DOLELAHUCOL     5686 non-null float64
DOLELEVAPCOL    5686 non-null float64
DOLELCFAN       5686 non-null float64
DOLELDHUM       5686 non-null float64
DOLELHUM        5686 non-null float64
DOLELPLPMP      

In [6]:
_data[VAR_COLS].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 4723 to 5651
Columns: 732 entries, ADQINSUL to ZYEARMADERANGE
dtypes: float64(235), int64(493), object(4)
memory usage: 31.8+ MB


In [7]:
class Preprocessor(object):
    def __init__(self, 
                 cost_cols=COST_COLS,
                 var_cols=VAR_COLS,
                 imputer=SimpleImputer(), 
                 normalizer=MinMaxScaler()):
        
        self.var_cols = VAR_COLS
        self.cost_cols = COST_COLS
        
        self.imputer = imputer
        self.normalizer = normalizer
        self.num_inds = [] # does not include self.cost_cols
        self.cat_inds = []
        
    # separate_num_and_cat: self, pd.DataFrame --> void
    # X = data[self.var_cols]
    def seperate_num_and_cat(self, X):
        # numerical data type
        cont_type = [int, np.int64, float, np.float64]

        for col in X.columns:
            if X[col].dtype in cont_type:
                self.num_inds.append(col)
                #data[col].to_numeric()
            else:
                self.cat_inds.append(col)
                
        self.num_inds.pop(0) # pop DOEID
                
    # indata includes all columns
    # indata is not consumed, a copy is made
    def fit(self, indata):
        data = indata.copy()
        X = data[self.var_cols]
        del X['DOEID']
        
        self.seperate_num_and_cat(X)
        
        X[self.num_inds] = self.imputer.fit_transform(X[self.num_inds])
        X[self.num_inds] = self.normalizer.fit(X[self.num_inds])
    
    # transform: self, pd.DataFrame --> pd.DataFrame
    # X includes all columns
    # X is consumed and modified
    # take log of SalePrice
    def transform(self, X):
        del X['DOEID']
        
        X[self.num_inds] = self.imputer.transform(X[self.num_inds])
        X[self.num_inds] = self.normalizer.transform(X[self.num_inds])
        
        return X
    
    # fit_transform: self, pd.DataFrame --> pd.DataFrame
    # data includes all columns
    # data is consumed and modified
    # take log of SalePrice
    def fit_transform(self, X):
        del X['DOEID']
        
        self.seperate_num_and_cat(X)
        
        X[self.num_inds] = self.imputer.fit_transform(X[self.num_inds])
        X[self.num_inds] = self.normalizer.fit_transform(X[self.num_inds])
        
        return X  


In [8]:
train, test = load_data(ratio=0.9)
preproc = Preprocessor()

train = preproc.fit_transform(train)
test = preproc.transform(test)

X_train = train[preproc.num_inds]
y_train = train[preproc.cost_cols]

X_test = test[preproc.num_inds]
y_test = test[preproc.cost_cols]

In [53]:
rg = RandomForestRegressor(n_estimators=100)

rg.fit(X_train, y_train)
ypred = rg.predict(X_test)

In [54]:
# error has unit in cents
def rms(a,b):
    return np.sqrt(np.sum((a-b) ** 2, axis=0)/a.shape[0])

In [55]:
e = rms(y_test.values, ypred)

In [56]:
e/(y_test).mean()

DOLLAREL        0.231737
DOLELSPH        0.839993
DOLELCOL        0.551527
DOLELWTH        0.703298
DOLELRFG        0.337231
DOLELRFG1       0.239746
DOLELRFG2       0.719688
DOLELFRZ        0.782396
DOLELCOK        1.087714
DOLELMICRO      0.911249
DOLELCW         0.686710
DOLELCDR        0.801651
DOLELDWH        1.162136
DOLELLGT        0.937196
DOLELTVREL      0.639558
DOLELTV1        0.589680
DOLELTV2        0.966701
DOLELAHUHEAT    1.088693
DOLELAHUCOL     0.710305
DOLELEVAPCOL    3.631569
DOLELCFAN       0.801465
DOLELDHUM       2.651550
DOLELHUM        1.510667
DOLELPLPMP      4.609275
DOLELHTBPMP     3.448097
DOLELHTBHEAT    4.290489
DOLELNEC        1.106994
dtype: float64

In [57]:
e

array([0.0388176 , 0.05097551, 0.04122724, 0.05023878, 0.05567021,
       0.05681752, 0.04712786, 0.06156616, 0.06813193, 0.07907828,
       0.0645404 , 0.06188827, 0.08055816, 0.06089894, 0.0564688 ,
       0.05616455, 0.04787119, 0.04309419, 0.02837841, 0.04111706,
       0.0733898 , 0.04845283, 0.05816129, 0.03998211, 0.05612831,
       0.05903695, 0.03294202])

In [58]:
(y_test).mean()

DOLLAREL        0.167507
DOLELSPH        0.060686
DOLELCOL        0.074751
DOLELWTH        0.071433
DOLELRFG        0.165080
DOLELRFG1       0.236990
DOLELRFG2       0.065484
DOLELFRZ        0.078689
DOLELCOK        0.062638
DOLELMICRO      0.086780
DOLELCW         0.093985
DOLELCDR        0.077201
DOLELDWH        0.069319
DOLELLGT        0.064980
DOLELTVREL      0.088294
DOLELTV1        0.095246
DOLELTV2        0.049520
DOLELAHUHEAT    0.039583
DOLELAHUCOL     0.039952
DOLELEVAPCOL    0.011322
DOLELCFAN       0.091570
DOLELDHUM       0.018273
DOLELHUM        0.038500
DOLELPLPMP      0.008674
DOLELHTBPMP     0.016278
DOLELHTBHEAT    0.013760
DOLELNEC        0.029758
dtype: float64