We can do better.

So this time:
- Grid search for better parameters.
- Can we make a better RF model? **Yes.**

In [74]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, ElasticNet, Ridge, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from scipy.stats import skew
from sklearn.metrics import mean_squared_error

In [228]:
    #def load_data():
    # Read the train/test CSVs and build one standardized design matrix from
    # both, so dummy columns and scaling statistics are shared between them.
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')

    # pd.concat is used instead of DataFrame.append, which later pandas
    # releases removed; both stack the rows and renumber the index.
    combined = (pd.concat([train, test], ignore_index=True)
                .drop(['Id', 'SalePrice'], axis=1))

    # Numeric columns: log1p-transform those whose skewness (NaNs ignored)
    # exceeds 0.75, leave the others unchanged.
    numerics = (combined.select_dtypes(exclude=['object'])
                .apply(lambda x: np.log1p(x) if skew(x, nan_policy='omit') > 0.75 else x))

    cats = combined.select_dtypes(include=['object']).copy()

    # Columns encoded as ordered integer codes, in the order listed here,
    # rather than as one-hot dummies.
    ordered_levels = {
        "Alley": ["Grvl", "Pave"],
        "BsmtCond": ["Po", "Fa", "TA", "Gd"],
        "BsmtExposure": ["No", "Mn", "Av", "Gd"],
        "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
        "CentralAir": ["N", "Y"],
        "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
        "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "ExterQual": ["Fa", "TA", "Gd", "Ex"],
        "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
        "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
        'Functional': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
        "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "GarageFinish": ["Unf", "RFn", "Fin"],
        "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
        "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
        "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
        "LotShape": ["IR3", "IR2", "IR1", "Reg"],
        "PavedDrive": ["N", "P", "Y"],
        "PoolQC": ["Fa", "Gd", "Ex"],
        "Street": ["Grvl", "Pave"],
        "Utilities": ["NoSeWa", "AllPub"]
    }

    for c in cats.columns:
        if c in ordered_levels:
            # pd.Categorical(...).codes replaces astype('category', categories=...),
            # which later pandas releases removed. Values outside the listed
            # levels (including NaN) get code -1.
            cats[c] = pd.Categorical(cats[c], categories=ordered_levels[c], ordered=True).codes
        else:
            cats[c] = cats[c].astype('category')

    # One-hot encode the remaining (unordered) categorical columns.
    cats = pd.get_dummies(cats)

    combined = pd.concat([numerics, cats], axis=1)

    # Standardize every column; NaNs (missing values, or 0/0 from constant
    # columns) are then set to 0, i.e. to the column mean.
    combined = (combined - combined.mean()) / combined.std()
    combined = combined.fillna(0)

    # The target is log(SalePrice); the first train.shape[0] rows of
    # `combined` are the training rows, the rest are the test rows.
    y = np.log(train['SalePrice'].values)
    X = combined.iloc[:train.shape[0], :]
    X_submission = combined.iloc[train.shape[0]:, :]
    ids_submission = test['Id'].values
    #return y, X, X_submission, ids_submission

In [102]:
# load data
#y, X, X_submission, ids_submission = load_data()

In [103]:
#poly_features = PolynomialFeatures(interaction_only=True)
#X_interactions = poly_features.fit_transform(X)

In [104]:
#0.137498547071

In [105]:
%%time
# Lasso baseline. The alpha given here is only a starting value: the grid
# search tries every alpha listed in `params`.
model = Lasso(random_state=1337, max_iter=5000, alpha=0.005)

# Plain (unshuffled) 10-fold CV. random_state is dropped: without shuffle=True
# it never influenced the splits, and recent scikit-learn releases raise a
# ValueError for that combination.
kf = KFold(n_splits=10)

params = {'alpha': [0.006, 0.005, 0.004]}

# Scores are negated MSE, so larger (closer to zero) is better.
grid = GridSearchCV(model, params, scoring='neg_mean_squared_error', cv=kf)

#score = cross_val_score(model, X_interactions, y, scoring='neg_mean_squared_error', cv=4, n_jobs=4)
#print np.mean(np.sqrt(-score))

CPU times: user 206 µs, sys: 3 µs, total: 209 µs
Wall time: 214 µs


In [57]:
#score: 0.1380

In [59]:
%%time
# Run the grid search, then report the best cross-validated score (square root
# of the negated best_score_) together with the winning parameters.
grid.fit(X, y)
print('score: {:.4f}'.format(np.sqrt(-grid.best_score_)))
print(' '.join(['params:', str(grid.best_params_)]))

score: 0.1398
params: {'alpha': 0.005}
CPU times: user 1.72 s, sys: 144 ms, total: 1.86 s
Wall time: 1.85 s


In [120]:
#rmse: 0.1219

In [156]:
%%time
# Bagged Lasso: 100 Lasso fits, each drawn with max_samples=0.25 of the rows
# and max_features=0.9 of the columns; oob_score=True keeps the out-of-bag
# predictions used below.
model_bg_ls = BaggingRegressor(
    Lasso(fit_intercept=True, alpha=0.0015, random_state=1337, max_iter=5000),
    n_estimators=100,
    max_features=0.9,
    max_samples=0.25,
    oob_score=True,
    random_state=1337,
)
model_bg_ls.fit(X, y)

# RMSE of the out-of-bag predictions against the training targets.
print('rmse: {:.4f}'.format(np.sqrt(mean_squared_error(y, model_bg_ls.oob_prediction_))))

rmse: 0.1220
CPU times: user 1.69 s, sys: 76.8 ms, total: 1.77 s
Wall time: 1.75 s


In [285]:
%matplotlib inline
import matplotlib.pyplot as plt

In [320]:
# Quick check of the two probe values this call yields: -5 and 5 (step 10, stop 6 excluded).
np.arange(-5, 6, 10)

array([-5,  5])

In [332]:


# Estimate each feature's effect on the bagged-Lasso prediction by finite
# differences: with every other feature at 0, move feature i from -5 to 5 and
# divide the change in prediction by the step (10).
n_features = X.shape[1]  # taken from the training matrix instead of a hard-coded 288
coef = np.zeros(n_features)

for i in range(n_features):
    _x = np.zeros((2, n_features))   # row 0 = low probe, row 1 = high probe
    _x[:, i] = [-5, 5]
    _y = model_bg_ls.predict(_x)
    coef[i] = (_y[1] - _y[0]) / 10.

In [339]:
# Same finite-difference probe as for `coef`, but with feature 15 held at 5
# instead of 0. If the model is additive in its inputs, the estimates should
# match `coef` up to floating-point noise.
n_features = X.shape[1]  # taken from the training matrix instead of a hard-coded 288
coef2 = np.zeros(n_features)

for i in range(n_features):
    _x = np.zeros((2, n_features))   # row 0 = low probe, row 1 = high probe
    _x[:, 15] = 5
    _x[:, i] = [-5, 5]               # overrides the baseline when i == 15
    _y = model_bg_ls.predict(_x)
    coef2[i] = (_y[1] - _y[0]) / 10.

In [348]:
# Boolean mask of the positions where the two coefficient estimates are not exactly equal.
index_diffs = np.not_equal(coef, coef2)

In [351]:
# Largest absolute gap between the two coefficient estimates. Taking the
# absolute value also catches negative gaps, and working on the full arrays
# avoids max() failing on an empty selection when the estimates are identical.
np.abs(coef - coef2).max()

1.4224732503009818e-15

In [338]:
# Position and value of the largest estimated coefficient.
max_index = coef.argmax()
print(' '.join([str(max_index), str(coef[max_index])]))

15 0.120579224585


In [278]:
# Dimensions of the most recent probe matrix `_x`.
_x.shape

(10, 288)

In [261]:
# Prediction for a single all-zero row, i.e. every standardized feature at its
# mean. The row width comes from X rather than a hard-coded 288.
model_bg_ls.predict(np.zeros((1, X.shape[1])))

array([ 12.01863626])

In [262]:
# Repeat of the all-zero-row prediction (same call as the previous cell); the
# row width comes from X rather than a hard-coded 288.
model_bg_ls.predict(np.zeros((1, X.shape[1])))

array([ 12.01863626])

In [None]:
#0.1352

In [252]:
%%time
# Bagged decision trees: 1000 trees of max_depth 15, each drawn with
# max_samples=0.9 of the rows and max_features=0.6032 of the columns;
# oob_score=True keeps the out-of-bag predictions used below.
model_bg_dt = BaggingRegressor(
    DecisionTreeRegressor(max_depth=15, random_state=1337),
    n_estimators=1000,
    max_features=0.6032,
    max_samples=0.9,
    oob_score=True,
    random_state=1337,
)
model_bg_dt.fit(X, y)

# RMSE of the out-of-bag predictions against the training targets.
print('rmse: {:.4f}'.format(np.sqrt(mean_squared_error(y, model_bg_dt.oob_prediction_))))

rmse: 0.1352
CPU times: user 15.8 s, sys: 52.6 ms, total: 15.8 s
Wall time: 15.8 s


In [240]:
# NOTE(review): run top to bottom, the most recent visible assignment to `model`
# is a plain Lasso, which has no oob_prediction_ attribute. Confirm which fitted
# bagging model (e.g. model_bg_ls or model_bg_dt) this cell was meant to read.
oob_preds = model.oob_prediction_

0

In [65]:
# NOTE(review): the recorded output shows a BaggingRegressor wrapping
# Lasso(alpha=0.005), but no visible cell above assigns such an estimator to
# `model` -- the defining cell appears to be missing; confirm before relying on it.
model.fit(X,y)

BaggingRegressor(base_estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=5000,
   normalize=False, positive=False, precompute=False, random_state=1337,
   selection='cyclic', tol=0.0001, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=0.9,
         max_samples=0.5, n_estimators=50, n_jobs=1, oob_score=True,
         random_state=1337, verbose=0, warm_start=False)

In [67]:
# NOTE(review): oob_score_ is only available on a fitted bagging estimator built
# with oob_score=True; confirm `model` is one at the point this cell runs.
model.oob_score_

0.88189861526819158

In [51]:
# X_interactions is only built in a commented-out cell earlier in the notebook,
# so this cell raised NameError on a fresh kernel. Rebuild the interaction-only
# polynomial features here, the same way that cell did.
X_interactions = PolynomialFeatures(interaction_only=True).fit_transform(X)

# Cross-validate `model` on the expanded features and report the mean of the
# per-fold RMSEs (scores are negated MSE).
score = cross_val_score(model, X_interactions, y, scoring='neg_mean_squared_error', cv=kf, n_jobs=8)
print('score: {:.4f}'.format(np.mean(np.sqrt(-score))))

score: 0.1366


In [None]:
# Bagging (50 estimators) over a pipeline that expands degree-2 interaction
# terms and then fits a Lasso.
model = BaggingRegressor(
            make_pipeline(
                PolynomialFeatures(degree=2, interaction_only=True),
                Lasso(fit_intercept=True, random_state=1337, max_iter=5000)),

            n_estimators=50,
            random_state=1337)

# Plain (unshuffled) 10-fold CV. random_state is dropped: without shuffle=True
# it never influenced the splits, and recent scikit-learn releases raise a
# ValueError for that combination.
kf = KFold(n_splits=10)

#params: {'max_features': 0.4, 'max_samples': 0.6, 'bootstrap': False, 'bootstrap_features': False}

#{'max_features': 0.3, 'max_samples': 0.6, 'bootstrap': False, 'bootstrap_features': False}

# Candidate values for a grid search over `model`; the
# 'base_estimator__lasso__alpha' key addresses the Lasso step inside the pipeline.
parameters = {
    'base_estimator__lasso__alpha': [0.001],
    'bootstrap': [False],
    'bootstrap_features': [False],
    'max_features': [0.9],
    'max_samples': [0.3]
}

#grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv = kf, n_jobs=8)

In [None]:
#0.1228

In [None]:
%%time
# Fit the current `grid` object, then report the square root of its negated
# best score together with the parameters that achieved it.
grid.fit(X, y)
print('score: {:.4f}'.format(np.sqrt(-grid.best_score_)))
print(' '.join(['params:', str(grid.best_params_)]))

In [None]:
# create submission predictions
# Predict with the fitted grid search (it uses its refit best estimator). The
# `model` object is not fitted by any visible cell after it is last assigned,
# so calling predict on it would fail.
preds_submission = grid.predict(X_submission)

# save submission; the target was log(SalePrice), so exponentiate back to prices
submission = pd.DataFrame({'Id': ids_submission, 'SalePrice': np.exp(preds_submission)})
submission.to_csv('../output/04_gridsearch_RF.csv', index=False)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# cv_results_ lives on the fitted GridSearchCV object, not on the estimator it
# wraps, so read it from `grid`. Scores are plotted in ascending order.
plt.plot(np.sort(grid.cv_results_['mean_test_score']))
plt.xlabel('parameter setting (sorted by score)')
plt.ylabel('mean test score');