In [1]:
# basic i/o
import os

# data structures
import numpy as np
import pandas as pd

# cleaning data
from scipy import sparse
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# regressors
from sklearn.ensemble import RandomForestRegressor

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas and NumPy runtime
# warnings (e.g. divide-by-zero); consider a narrower, per-category filter.
import warnings
warnings.filterwarnings('ignore')

In [2]:
FILE_NAME = 'energy.csv'

def load_data(file_name=FILE_NAME, ratio=0.9, random_state=None):
    """Read a CSV, shuffle its rows, and split them into two consecutive parts.

    Parameters
    ----------
    file_name : str or path-like
        CSV file handed to ``pd.read_csv``.
    ratio : float
        Fraction of the rows placed in the first returned frame.  Must lie
        in the closed interval [0, 1].
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle (and hence the
        split) can be reproduced.  The default ``None`` keeps the shuffle
        non-deterministic.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The first ``int(n_rows * ratio)`` shuffled rows and the remaining
        rows.  Both keep the row labels of the file they were read from.

    Raises
    ------
    ValueError
        If ``ratio`` is outside [0, 1]; a negative ratio would otherwise
        silently drop rows from the end of the first frame.
    """
    # Validate before touching the file so a bad call fails fast.
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got %r' % (ratio,))

    data = pd.read_csv(file_name).sample(frac=1, random_state=random_state)

    slice_ind = int(data.shape[0] * ratio)

    return data.iloc[:slice_ind], data.iloc[slice_ind:]

In [3]:
# Read the complete table once (ratio=1 puts every row in the first split)
# so its column layout can be inspected.
_data = load_data(ratio=1)[0]

# Positions 644..670 hold the 27 cost fields (the DOL* columns listed by the
# .info() call further down); every remaining column goes into VAR_COLS.
COST_COLS = _data.columns[644:671]
VAR_COLS = _data.columns.difference(COST_COLS)

In [4]:
# Dimensions of the full table as (rows, columns).
_data.shape

(5686, 759)

In [5]:
# Inspect the dtype and non-null count of each cost column.
_data[COST_COLS].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 348 to 1855
Data columns (total 27 columns):
DOLLAREL        5686 non-null float64
DOLELSPH        5686 non-null float64
DOLELCOL        5686 non-null float64
DOLELWTH        5686 non-null float64
DOLELRFG        5686 non-null float64
DOLELRFG1       5686 non-null float64
DOLELRFG2       5686 non-null float64
DOLELFRZ        5686 non-null float64
DOLELCOK        5686 non-null float64
DOLELMICRO      5686 non-null float64
DOLELCW         5686 non-null float64
DOLELCDR        5686 non-null float64
DOLELDWH        5686 non-null float64
DOLELLGT        5686 non-null float64
DOLELTVREL      5686 non-null float64
DOLELTV1        5686 non-null float64
DOLELTV2        5686 non-null float64
DOLELAHUHEAT    5686 non-null float64
DOLELAHUCOL     5686 non-null float64
DOLELEVAPCOL    5686 non-null float64
DOLELCFAN       5686 non-null float64
DOLELDHUM       5686 non-null float64
DOLELHUM        5686 non-null float64
DOLELPLPMP      5

In [6]:
# Summarise dtypes and memory use of the remaining (non-cost) columns.
_data[VAR_COLS].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 348 to 1855
Columns: 732 entries, ADQINSUL to ZYEARMADERANGE
dtypes: float64(235), int64(493), object(4)
memory usage: 31.8+ MB


In [7]:
class Preprocessor(object):
    """Impute and rescale the numeric predictor columns of a data frame.

    The frame handed to ``fit`` / ``transform`` / ``fit_transform`` holds
    every column: the identifier, the predictors and the cost targets.  Only
    predictor columns -- ``var_cols`` minus the identifier and minus
    ``cost_cols`` -- are classified, imputed or scaled.  Cost columns pass
    through unchanged so they can serve as regression targets, and the
    identifier column is dropped from the returned frame.

    Parameters
    ----------
    cost_cols : sequence of column labels, optional
        Target columns.  Defaults to the notebook-level ``COST_COLS``.
    var_cols : sequence of column labels, optional
        Candidate predictor columns.  Defaults to the notebook-level
        ``VAR_COLS``.
    imputer : object with ``fit_transform`` / ``transform``, optional
        Fills missing numeric values.  Defaults to a fresh ``SimpleImputer()``
        per instance, so fitted state is never shared between preprocessors.
    normalizer : object with ``fit_transform`` / ``transform``, optional
        Rescales numeric values.  Defaults to a fresh ``MinMaxScaler()`` per
        instance.
    id_col : str
        Identifier column; it is never treated as a feature.

    Attributes
    ----------
    num_inds : list
        Labels of the numeric predictor columns found by the last fit.
    cat_inds : list
        Labels of the remaining (non-numeric) predictor columns.
    """

    def __init__(self,
                 cost_cols=None,
                 var_cols=None,
                 imputer=None,
                 normalizer=None,
                 id_col='DOEID'):

        # Honour the arguments; fall back to the notebook globals only when
        # the caller did not supply a value.
        self.var_cols = VAR_COLS if var_cols is None else var_cols
        self.cost_cols = COST_COLS if cost_cols is None else cost_cols

        # Build default estimators here rather than in the signature: a
        # default argument is evaluated once and would be shared (together
        # with its fitted state) by every instance.
        self.imputer = SimpleImputer() if imputer is None else imputer
        self.normalizer = MinMaxScaler() if normalizer is None else normalizer

        self.id_col = id_col
        self.num_inds = []  # never includes self.cost_cols or self.id_col
        self.cat_inds = []
        self._fitted = False

    def _feature_columns(self):
        """Return ``var_cols`` without the identifier or any cost column."""
        excluded = set(self.cost_cols)
        excluded.add(self.id_col)
        return [col for col in self.var_cols if col not in excluded]

    def seperate_num_and_cat(self, X):
        """Sort the columns of ``X`` into ``num_inds`` and ``cat_inds``.

        ``X`` is expected to hold predictor columns only.  Integer and float
        columns of any width are numeric; everything else (objects, booleans,
        ...) is categorical.  The identifier column is skipped by name if it
        is present.  Both lists are rebuilt from scratch on every call.
        """
        is_integer = pd.api.types.is_integer_dtype
        is_float = pd.api.types.is_float_dtype

        self.num_inds = []
        self.cat_inds = []

        for col in X.columns:
            if col == self.id_col:
                continue
            dtype = X[col].dtype
            if is_integer(dtype) or is_float(dtype):
                self.num_inds.append(col)
            else:
                self.cat_inds.append(col)

    # Correctly spelled alias; the original (misspelled) name is kept for
    # existing callers.
    separate_num_and_cat = seperate_num_and_cat

    def _process(self, data, fit):
        """Shared implementation behind fit / transform / fit_transform."""
        # DataFrame.drop returns a new frame, so the caller's data is never
        # modified and repeated calls on the same frame are safe.
        out = data.drop(columns=[self.id_col], errors='ignore')

        if fit:
            self.seperate_num_and_cat(out[self._feature_columns()])
        elif not self._fitted:
            raise RuntimeError('Preprocessor must be fitted before transform '
                               'is called.')

        if self.num_inds:
            if fit:
                impute = self.imputer.fit_transform
                scale = self.normalizer.fit_transform
            else:
                impute = self.imputer.transform
                scale = self.normalizer.transform
            out[self.num_inds] = impute(out[self.num_inds])
            out[self.num_inds] = scale(out[self.num_inds])

        if fit:
            self._fitted = True
        return out

    def fit(self, indata):
        """Learn the column split, imputation values and scaling ranges.

        ``indata`` includes all columns and is left unmodified.  Returns
        ``self`` so calls can be chained.
        """
        self._process(indata, fit=True)
        return self

    def transform(self, X):
        """Apply the fitted imputer and scaler to the numeric predictors.

        ``X`` includes all columns.  A new frame is returned without the
        identifier column; cost and categorical columns are passed through
        unchanged.  Raises ``RuntimeError`` if called before fitting.
        """
        return self._process(X, fit=False)

    def fit_transform(self, X):
        """Fit on ``X`` and return its transformed copy (see ``transform``)."""
        return self._process(X, fit=True)


In [8]:
train_raw, test_raw = load_data(ratio=0.9)
preproc = Preprocessor()

# Copy the regression targets out *before* preprocessing so that imputation
# and scaling can never touch them.
y_train = train_raw[preproc.cost_cols].copy()
y_test = test_raw[preproc.cost_cols].copy()

train = preproc.fit_transform(train_raw)
test = preproc.transform(test_raw)

# Guard against target leakage: a cost column must never be fed to the model
# as a feature, even if it ended up in preproc.num_inds.
_target_cols = set(preproc.cost_cols)
feature_cols = [col for col in preproc.num_inds if col not in _target_cols]

X_train = train[feature_cols]
X_test = test[feature_cols]

In [9]:
# A fixed seed makes the fitted tree, and every number derived from it,
# repeatable across kernel restarts.
rg = RandomForestRegressor(n_estimators=1, random_state=0)

rg.fit(X_train, y_train)
ypred = rg.predict(X_test)

KeyboardInterrupt: 

In [None]:
def rel_l1(a, b):
    """Return sum(|a / b - 1|) over all elements, divided by ``len(a)``.

    ``a`` and ``b`` are combined element-wise, so they must support ``/`` with
    each other (e.g. NumPy arrays of matching shape).  Entries where ``b`` is
    zero are not special-cased.  For 2-D input the divisor is the number of
    rows, not the number of elements.
    """
    ratio_gap = np.abs(a / b - 1)
    total_gap = np.sum(ratio_gap)
    return total_gap / len(a)

In [None]:
# Score the held-out predictions: a = true costs, b = predictions, so each
# error term is |y_test / ypred - 1|.
rel_l1(y_test.values, ypred)