In [None]:
%pylab inline
import numpy as np
import pandas as pd

In [None]:
from collections import defaultdict

In [None]:
def get_onehots(df, cols):
    """Collect the distinct non-null values of each listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    cols : iterable of str
        Names of the columns to scan.

    Returns
    -------
    dict
        Maps every name in ``cols`` to the list of its unique non-null
        values. A column that holds only nulls maps to an empty list
        instead of being left out, so a consumer iterating over the keys
        (such as ``set_onehots``) still sees the column and can drop it.
    """
    return {c: list(df[c].dropna().unique()) for c in cols}

def set_onehots(df, vals, drop=True):
    """Append 0/1 indicator columns to ``df`` in place.

    For every column ``c`` in ``vals`` and every value ``v`` in ``vals[c]``
    a new column named ``c + '_' + str(v)`` is added. It holds 1 where
    ``df[c] == v`` and 0 everywhere else, so nulls and values that are not
    listed in ``vals[c]`` produce all-zero indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    vals : dict
        Maps a column name to the values that get an indicator column.
    drop : bool, default True
        When true, the source column ``c`` is removed after its indicators
        have been added.

    Returns
    -------
    None
        ``df`` is mutated; nothing is returned.

    Raises
    ------
    KeyError
        If a key of ``vals`` is not a column of ``df``.
    """
    for c, values in vals.items():
        source = df[c]
        for v in values:
            # Vectorised comparison instead of a per-row Python lambda. The
            # explicit int64 cast keeps the dtype numeric even for an empty
            # frame.
            df[c + '_' + str(v)] = (source == v).astype('int64')
        if drop:
            df.drop(c, inplace=True, axis=1)

In [None]:
train = pd.read_csv('data/train.csv')

# Categorical features: every column pandas parsed with the generic
# ``object`` dtype.
cats = [c for c, d in zip(train.columns, train.dtypes) if str(d) == 'object']

# Diagnostic: non-categorical columns that contain nulls. It is bound to a
# name because a bare expression in the middle of a cell is discarded.
numeric_null_cols = set(train.columns[train.isnull().any(axis=0)]) - set(cats)

# Assign the filled column back to the frame. Calling
# ``fillna(..., inplace=True)`` on a selected column is an inplace update of
# an intermediate object, which does not modify ``train`` under pandas
# Copy-on-Write.
for col in ('LotFrontage', 'MasVnrArea'):
    train[col] = train[col].fillna(0)

# Vocabulary of category values observed in the training data.
onehotvals = get_onehots(train, cats)

# Replace each categorical column with its 0/1 indicator columns (in place).
set_onehots(train, onehotvals, drop=True)

In [None]:
# Feature columns: every column of ``train`` except the target, the row
# identifier and GarageYrBlt.
# NOTE(review): GarageYrBlt is presumably excluded because its nulls are
# never filled - confirm.
cols = train.columns.tolist()
for excluded in ('SalePrice', 'GarageYrBlt', 'Id'):
    cols.remove(excluded)
len(cols)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on log(SalePrice), scored by the mean negated
# MSE over 8 cross-validation folds. ``np.log`` is called explicitly instead
# of relying on the bare ``log`` that ``%pylab`` injects into the namespace.
clf = LinearRegression()
cross_val_score(clf, train[cols], np.log(train["SalePrice"]), cv=8, scoring="neg_mean_squared_error").mean()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on log(SalePrice), scored by the mean negated MSE over 8
# cross-validation folds. A fixed ``random_state`` makes the score (and the
# model fitted from this estimator further down) reproducible across
# kernel restarts.
clf = RandomForestRegressor(random_state=0)
cross_val_score(clf, train[cols], np.log(train["SalePrice"]), cv=8, scoring='neg_mean_squared_error').mean()

### Build a submission

In [None]:
test = pd.read_csv('data/test.csv')
test = test.drop(columns=['GarageYrBlt'])

# Encode with the vocabulary learned from the training data so both frames
# end up with the same indicator columns in the same order.
set_onehots(test, onehotvals, drop=True)

# Numeric columns whose nulls are replaced by 0. The filled column is
# assigned back because ``test.<col>.fillna(0, inplace=True)`` is an inplace
# update of an intermediate object, which does not modify ``test`` under
# pandas Copy-on-Write.
zero_fill_cols = [
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
    'GarageArea',
]
for col in zero_fill_cols:
    test[col] = test[col].fillna(0)

# Apart from the leading column, the test frame must line up with the
# training feature list exactly - same names, same order.
assert cols == list(test.columns)[1:], "test columns do not match the training feature columns"

In [None]:
# Fit on the full training set against log(SalePrice). ``np.log`` is called
# explicitly instead of the bare ``log`` injected by ``%pylab``.
clf.fit(train[cols], np.log(train['SalePrice']))

In [None]:
# Predict for the test rows, using the same feature columns (and order) the
# model was fitted on.
test_features = test[cols]
pred = clf.predict(test_features)

In [None]:
# The model was fitted on log(SalePrice), so exponentiate the predictions to
# get prices back. ``np.exp`` is called explicitly instead of the bare
# ``exp`` injected by ``%pylab``.
sub = pd.DataFrame({'Id': test.Id, 'SalePrice': np.exp(pred)})

In [None]:
# Write the submission file without the DataFrame index.
submission_path = 'data/submission1.csv'
sub.to_csv(submission_path, index=False)