In [None]:
%pylab inline
import pandas as pd

In [None]:
from collections import defaultdict

In [None]:
def get_onehots(df, cols):
    """Collect the distinct non-missing values of each requested column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    cols : iterable of str
        Names of the columns whose levels should be recorded.

    Returns
    -------
    dict
        Maps column name -> list of its unique non-null values, in order of
        first appearance. A column that has no non-null values gets no entry.
    """
    levels = {}
    for name in cols:
        observed = list(df[name].dropna().unique())
        # Only create the key when there is something to store, so that
        # all-missing columns stay absent from the result.
        if observed:
            levels.setdefault(name, []).extend(observed)
    return levels

def set_onehots(df, vals, drop=True):
    """Add 0/1 indicator columns to ``df`` (in place) for each recorded level.

    For every column ``c`` in ``vals`` and every level ``v`` in ``vals[c]`` a
    column named ``c + '_' + str(v)`` is written. It holds 1 where
    ``df[c] == v`` and 0 everywhere else; missing values never compare equal,
    so they map to 0 in every indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place. Must contain every column named in ``vals``.
    vals : dict
        Maps column name -> iterable of levels, e.g. the result of
        ``get_onehots``.
    drop : bool, default True
        When true, the source column is removed once its indicators exist.

    Returns
    -------
    None
    """
    for c, levels in vals.items():
        source = df[c]
        for v in levels:
            # Vectorised comparison instead of a per-row Python lambda; the
            # explicit int64 keeps the dtype stable even for an empty frame.
            df[c + '_' + str(v)] = (source == v).astype('int64')
        if drop:
            df.drop(c, inplace=True, axis=1)

In [None]:
train = pd.read_csv('data/train.csv')

# Categorical features are the object-dtype columns.
cats = list(train.select_dtypes(include=['object']).columns)

# Numeric columns that contain missing values. This is an inspection-only
# expression: it sits mid-cell, so its value is not displayed.
set(train.columns[train.isnull().any(axis=0)]) - set(cats)

# Fill at the DataFrame level. `train.LotFrontage.fillna(0, inplace=True)`
# is chained in-place assignment: it warns on pandas 2.x and leaves `train`
# unchanged under Copy-on-Write.
train = train.fillna({'LotFrontage': 0, 'MasVnrArea': 0})

# Record the levels seen in the training data so the same indicator columns
# can be generated for any other frame.
onehotvals = get_onehots(train, cats)

set_onehots(train, onehotvals, drop=True)

In [None]:
# Feature columns: every column of `train` except the target, the row id and
# GarageYrBlt (presumably left out because its missing values are never
# filled -- confirm).
cols = list(train.columns)
for _excluded in ('SalePrice', 'GarageYrBlt', 'Id'):
    cols.remove(_excluded)
len(cols)

In [None]:
from sklearn.model_selection import cross_val_score

def run_cross_val(clf, cv=8):
    """Return the mean cross-validated RMSE of ``clf`` on ``log(SalePrice)``.

    The estimator is scored on the module-level ``train`` frame using the
    feature list ``cols``.

    Parameters
    ----------
    clf : estimator
        Unfitted estimator accepted by ``cross_val_score``.
    cv : int or cross-validation splitter, default 8
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    float
        Mean over folds of the per-fold root-mean-squared error.
    """
    # Forward the caller's `cv` rather than a hard-coded 8.
    scores = cross_val_score(clf, train[cols], log(train["SalePrice"]), cv=cv,
                             scoring="neg_mean_squared_error")
    # Scores are negated MSEs (one per fold); abs() recovers the MSE.
    return sqrt(abs(scores)).mean()

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares on all features.
run_cross_val(clf=LinearRegression())

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees, scored the same way as the other models.
run_cross_val(clf=RandomForestRegressor(n_estimators=100))

In [None]:
from sklearn.linear_model import ElasticNet
# Elastic net that is mostly L1 (l1_ratio=0.95) with a small penalty weight.
run_cross_val(clf=ElasticNet(l1_ratio=.95, alpha=0.0005))

In [None]:
# Preview of a 10-point log-spaced grid between 1 and 3.
logspace(log10(1), log10(3), num=10)

In [None]:
# Sweep ElasticNet's alpha over a log-spaced grid from 3e-4 to 1e-3 and plot
# the mean cross-validated RMSE for each value. (A coarser grid spanning
# 1e-5 .. 3e-2 was presumably tried before narrowing to this range.)
alphas = logspace(log10(.0003), log10(.001), 10)
cvs = [run_cross_val(ElasticNet(alpha=a, l1_ratio=.95)) for a in alphas]
plot(alphas, cvs, 'o')
gca().set_xscale('log')
# Label the axes so the figure can be read on its own.
xlabel('alpha')
ylabel('mean CV RMSE of log(SalePrice)');

In [None]:
# This cell runs before the cell that imports train_test_split and before any
# cell assigns `clf`, so on a fresh kernel it raised NameError. Import and
# define both here so the notebook survives Restart & Run All.
from sklearn.model_selection import train_test_split

# NOTE(review): the model intended here is not visible in the notebook; this
# assumes the ElasticNet settings scored above -- confirm.
clf = ElasticNet(alpha=0.0005, l1_ratio=.95)

# Single random hold-out split (default proportions), fitted on log prices.
trn, tst = train_test_split(train)
clf.fit(trn[cols], log(trn.SalePrice))


In [None]:
# RMSE of the fitted model on the held-out rows, measured in log-price space.
_p = clf.predict(tst[cols])
_log_resid = _p - log(tst.SalePrice.values)
sqrt(mean(_log_resid ** 2))

## Test for Bias vs. Variance

In [None]:
from sklearn.model_selection import train_test_split

def train_test_errors(df, clf, frac=0.5):
    """Fit ``clf`` on one random part of ``df`` and report RMSE on both parts.

    ``df`` is split with ``train_test_split(df, test_size=frac)``. The model
    is fitted on the *second* piece returned (the one sized by ``frac``) and
    evaluated on the first, so ``frac`` acts as the share of rows used for
    fitting. Features are the module-level ``cols``; the target is
    ``log(SalePrice)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cols`` and ``SalePrice``.
    clf : estimator
        Object with ``fit`` and ``predict``; it is refitted by this call.
    frac : float, default 0.5
        Passed as ``test_size`` to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_err, test_err)``: RMSE in log-price space on the fitted part
        and on the held-out part.
    """
    def rmse_on(part):
        # Root-mean-squared error of clf's predictions against log prices.
        residual = clf.predict(part[cols]) - log(part.SalePrice.values)
        return sqrt(mean(residual ** 2))

    held_out, fit_part = train_test_split(df, test_size=frac)
    clf.fit(fit_part[cols], log(fit_part.SalePrice))

    held_out_err = rmse_on(held_out)
    fit_err = rmse_on(fit_part)
    return fit_err, held_out_err

#### Random forest and linear regression are both clearly overfitting our data. 

In [None]:
clf = RandomForestRegressor(n_estimators=100)
# Average (train_err, test_err) over 5 random splits. list() forces the
# result: on Python 3 a bare map() would display only a lazy `<map object>`.
list(map(mean, zip(*[train_test_errors(train, clf) for _ in range(5)])))

In [None]:
clf = LinearRegression()
# Average (train_err, test_err) over 5 random splits. list() forces the
# result: on Python 3 a bare map() would display only a lazy `<map object>`.
list(map(mean, zip(*[train_test_errors(train, clf) for _ in range(5)])))

In [None]:
from sklearn.linear_model import ElasticNet
clf = ElasticNet(l1_ratio=.95, alpha=.0005)
# Average (train_err, test_err) over 5 random splits. list() forces the
# result: on Python 3 a bare map() would display only a lazy `<map object>`.
list(map(mean, zip(*[train_test_errors(train, clf) for _ in range(5)])))

## Build a submission

In [None]:
# Model used for the submission: a fresh, unfitted elastic net.
clf = ElasticNet(alpha=.0005, l1_ratio=.95)

In [None]:
test = pd.read_csv('data/test.csv')
# GarageYrBlt is not in the feature list `cols`, so drop it up front.
test.drop('GarageYrBlt', axis=1, inplace=True)
# Reuse the levels recorded from the training data so the indicator columns
# match the ones the model is fitted on.
set_onehots(test, onehotvals, drop=True)

# Fill at the DataFrame level. `test.X.fillna(0, inplace=True)` is chained
# in-place assignment: it warns on pandas 2.x and leaves `test` unchanged
# under Copy-on-Write.
test = test.fillna(dict.fromkeys([
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
    'GarageArea',
], 0))

# Apart from its first column, the test frame must line up with the training
# feature list, in the same order.
assert cols == list(test.columns)[1:], \
    "test columns (after the first) do not match the training feature list"

In [None]:
# Refit on every training row; the target is the log of the sale price.
clf.fit(X=train[cols], y=log(train['SalePrice']))

In [None]:
# Predictions are in log-price space because the model was fitted on logs.
pred = clf.predict(X=test[cols])

In [None]:
# exp() undoes the log transform so SalePrice is back in original units.
sub = pd.DataFrame({'Id': test['Id'], 'SalePrice': exp(pred)})

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf='data/submission2.csv', index=False)