In [2]:
%pylab inline
import pandas as pd
import sys
sys.path.append('../src/')
import utils
import encode as enc

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Load the training and test frames through the project's utils helpers.
# NOTE(review): the meaning of the positional arguments ([] / False / None) is
# defined in the utils module (presumably ../src/utils.py), which is not
# visible here -- confirm before changing them.
train = utils.get_train_data([], False)
test = utils.get_test_data(None, [])

## Testing numeric columns with Elastic Net

In [5]:
from sklearn.linear_model import ElasticNet

In [22]:
# Numeric feature columns: every int64/float64 column of the training frame
# except the target, which is removed right after the scan.
NUMERIC_DTYPES = ('int64', 'float64')
ncols = [name for name, dtype in train.dtypes.items() if str(dtype) in NUMERIC_DTYPES]
ncols.remove('SalePrice')
# Columns kept aside under the name `addback` (not referenced by the cells shown here).
addback = ['Id', 'SalePrice']

In [33]:
# Compare the fillna_mode options of enc.fix_numeric on the numeric columns only.
# Each mode starts from fresh copies of train/test so runs do not affect each other.
res = {}
for mode in ('zero', 'mean', 'median'):
    trn = train.copy()
    tst = test.copy()
    trn, tst = enc.fix_numeric(trn, tst, ncols, fillna_mode=mode)
    estimator = ElasticNet(l1_ratio=.73, alpha=.00058)
    cv_frame = trn[ncols + ['SalePrice']]
    res[mode] = utils.run_cross_val(estimator, cv_frame, cv=8)

In [34]:
# Show the utils.run_cross_val result stored for each fillna mode.
res

{'mean': 0.1539203108417157,
 'median': 0.15392614340375607,
 'zero': 0.1531580017962314}

In [35]:
# Compare the scaling options of enc.fix_numeric on the numeric columns only.
# Each mode starts from fresh copies of train/test so runs do not affect each other.
res = {}
for mode in ('normal', 'uniform', 'none'):
    trn = train.copy()
    tst = test.copy()
    trn, tst = enc.fix_numeric(trn, tst, ncols, scaling=mode)
    estimator = ElasticNet(l1_ratio=.73, alpha=.00058)
    cv_frame = trn[ncols + ['SalePrice']]
    res[mode] = utils.run_cross_val(estimator, cv_frame, cv=8)

In [36]:
# Show the utils.run_cross_val result stored for each scaling mode.
res

{'none': 0.15350751154012132,
 'normal': 0.1531580017962314,
 'uniform': 0.15285367502527303}

## Testing categorical columns with Elastic Net

In [38]:
# Categorical columns: every column of the training frame whose dtype prints as 'object'.
ccols = [name for name, dtype in train.dtypes.items() if str(dtype) == 'object']

In [42]:
# Test whether to include null columns (ElasticNet).
# `nc` is forwarded as the fourth positional argument of enc.fix_categorical.
res = {}
for nc in (True, False):
    trn = train.copy()
    tst = test.copy()
    trn, tst = enc.fix_numeric(trn, tst, ncols, scaling='uniform')
    trn, tst = enc.fix_categorical(trn, tst, 'one_hot', nc)
    estimator = ElasticNet(l1_ratio=.73, alpha=.00058)
    res[nc] = utils.run_cross_val(estimator, trn, cv=8)

In [43]:
# Show the utils.run_cross_val result stored for each value of the nc flag.
res

{False: 0.13158821716088398, True: 0.13153808100628639}

In [46]:
# Test whether to one-hot encode (ElasticNet): compare the 'one_hot' and
# 'to_int' options of enc.fix_categorical, both with nullcol=True.
res = {}
for mode in ('one_hot', 'to_int'):
    trn = train.copy()
    tst = test.copy()
    trn, tst = enc.fix_numeric(trn, tst, ncols, scaling='uniform')
    trn, tst = enc.fix_categorical(trn, tst, option=mode, nullcol=True)
    estimator = ElasticNet(l1_ratio=.73, alpha=.00058)
    res[mode] = utils.run_cross_val(estimator, trn, cv=8)

In [47]:
# Show the utils.run_cross_val result stored for each categorical encoding option.
res

{'one_hot': 0.13153808100628639, 'to_int': 0.14958650219254643}

## Testing categorical columns with Random Forest

In [48]:
from sklearn.ensemble import RandomForestRegressor

In [49]:
# Rebuild the list of categorical ('object' dtype) columns of the training frame.
# NOTE(review): this recomputes the same list as the earlier ccols cell.
ccols = []
for name, dtype in zip(train.columns, train.dtypes):
    if str(dtype) == 'object':
        ccols.append(name)

In [50]:
# Test whether to include null columns (random forest, 50 trees).
# `nc` is forwarded as the fourth positional argument of enc.fix_categorical.
res = {}
for nc in (True, False):
    trn = train.copy()
    tst = test.copy()
    trn, tst = enc.fix_numeric(trn, tst, ncols, scaling='uniform')
    trn, tst = enc.fix_categorical(trn, tst, 'one_hot', nc)
    estimator = RandomForestRegressor(n_estimators=50)
    res[nc] = utils.run_cross_val(estimator, trn, cv=8)

In [51]:
# Show the random-forest utils.run_cross_val result stored for each value of the nc flag.
res

{False: 0.14171904077426042, True: 0.14084292696661072}

In [52]:
# Test whether to one-hot encode (random forest, 50 trees): compare the
# 'one_hot' and 'to_int' options of enc.fix_categorical, both with nullcol=True.
res = {}
for mode in ('one_hot', 'to_int'):
    trn = train.copy()
    tst = test.copy()
    trn, tst = enc.fix_numeric(trn, tst, ncols, scaling='uniform')
    trn, tst = enc.fix_categorical(trn, tst, option=mode, nullcol=True)
    estimator = RandomForestRegressor(n_estimators=50)
    res[mode] = utils.run_cross_val(estimator, trn, cv=8)

In [53]:
# Show the random-forest utils.run_cross_val result stored for each categorical encoding option.
res

{'one_hot': 0.14084292696661072, 'to_int': 0.14455541971509378}

## Testing whether we should drop a bunch of columns

In [58]:
# Compare the ElasticNet cross-validation result with and without a hand-picked
# set of columns. res[True] is the run with the columns dropped, res[False] the
# run with every column kept.
drop_cols = ['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
             'Heating', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'Functional',
             'GarageArea', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF',
             'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence',
             'MiscFeature', 'MiscVal']
res = {}
for drp in [True, False]:
    trn, tst = train.copy(), test.copy()
    trn, tst = enc.fix_numeric(trn, tst, ncols, scaling='uniform')
    if drp:
        # Drop from BOTH frames: enc.fix_categorical receives trn and tst together,
        # so they must share one schema. Previously only trn lost these columns.
        # Reassigning (rather than inplace=True) keeps the cell free of in-place mutation.
        trn = trn.drop(drop_cols, axis=1)
        # errors='ignore' so a column absent from the test frame does not
        # introduce a failure the original cell never had.
        tst = tst.drop(drop_cols, axis=1, errors='ignore')
    trn, tst = enc.fix_categorical(trn, tst, option='one_hot', nullcol=True)
    res[drp] = utils.run_cross_val(ElasticNet(l1_ratio=.73, alpha=.00058), trn, cv=8)

In [59]:
# Show the utils.run_cross_val result with the column subset dropped (True) and kept (False).
res

{False: 0.13153808100628639, True: 0.13694566103699585}