In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import xgboost as xgb

import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr

from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, Lasso, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score


%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression/train.csv
/kaggle/input/house-prices-advanced-regression/test.csv


Load data and combine input data into one DataFrame. Preprocessing from apapiu: https://www.kaggle.com/apapiu/regularized-linear-models

In [None]:
train = pd.read_csv("../input/house-prices-advanced-regression/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression/test.csv")

all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))

In [None]:
#log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

all_data = pd.get_dummies(all_data)

#filling NA's with the mean of the column:
all_data = all_data.fillna(all_data.mean())

#creating matrices for sklearn:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

Below follows problem 2.2. Output submitted to Kaggle. Results for alpha = 0.1 is rmse = 0.13565

2.  Follow the data preprocessing steps from https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models.  Then run a ridge regression usingα= 0.1.Make a submission of this prediction, what is the RMSE you get?(Hint:  remember to exponentiate np.expm1(ypred) your predictions).

In [None]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y)

out = pd.DataFrame({"SalePrice":np.expm1(ridge.predict(X_test))})
i = pd.DataFrame({"Id":test["Id"]})
out = pd.concat([i, out], axis=1)
out.to_csv("Ridge0a1.csv", index=False)

Below follows problem 2.3. CV tuned Ridge and Lasso.

Lasso score: 0.12455

3.  Compare a ridge regression and a lasso regression model.  Optimize the alphas using crossvalidation.  What is the best score you can get from a single ridge regression model and froma single lasso model?

In [None]:
model_ridge = RidgeCV(alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]).fit(X_train, y)
print("Ridge rmse mean: ", rmse_cv(model_ridge).mean())
print("Optimal Ridge Alpha: ", model_ridge.alpha_)

model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005]).fit(X_train, y)
print("Lasso rmse mean: ", rmse_cv(model_lasso).mean())
print("Optimal Lasso Alpha: ", model_lasso.alpha_)

Below follows problem 2.4. Lasso coefficients with varying Alpha.

4.  Plot the l0 norm (number of nonzeros) of the coefficients that lasso produces as you vary thestrength of regularization parameter alpha.

In [None]:
alphas = [1, 0.5, 0.25, 0.1, 0.005, 0.001, 0.0005]
coef_alpha = [0, 0, 0, 0, 0, 0, 0]
for i in range(0,7):
    model_las = Lasso(alpha = alphas[i]).fit(X_train, y)
    coef = pd.Series(model_las.coef_, index = X_train.columns)
    coef_alpha[i] = sum(coef != 0)
    
df = pd.DataFrame({"alpha":alphas, "l0_norm":coef_alpha})

print(df)

plt.plot('alpha', 'l0_norm', data = df)
plt.xlabel('alpha')
plt.ylabel('l0_norm')
plt.xscale('log')
plt.show()

5.  Add the outputs of your models as features and train a ridge regression on all the featuresplus the model outputs.  Be careful not to overfit.  What score can you get?

Score: 0.12495

In [None]:
ridge_y = model_ridge.predict(X_train)
lasso_y = model_lasso.predict(X_train)
new_y = pd.DataFrame({"ridge":ridge_y, "lasso":lasso_y})

X_train = pd.concat([X_train, new_y], axis=1)

model_ridge2 = RidgeCV(alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]).fit(X_train, y)
print("Ridge rmse mean: ", rmse_cv(model_ridge2).mean())
print("Optimal Ridge Alpha: ", model_ridge2.alpha_)

ridge_y = model_ridge.predict(X_test)
lasso_y = model_lasso.predict(X_test)
new_y = pd.DataFrame({"ridge":ridge_y, "lasso":lasso_y})

X_test = pd.concat([X_test, new_y], axis=1)

out = pd.DataFrame({"SalePrice":np.expm1(model_ridge2.predict(X_test))})
i = pd.DataFrame({"Id":test["Id"]})
out = pd.concat([i, out], axis=1)
out.to_csv("BootstrapRidge.csv", index=False)

6.  Train a gradient boosting regression, e.g., using XGBoost.  What score can you get just froma single XGB? (you will need to optimize over its parameters).  XGB is a great friend to allgood Kagglers!


Attempts in chronological order:

Untuned: 0.14239
Eta=0.1: 0.13468
Fully tuned: 0.12878

In [None]:
xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=1000, max_depth=6,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27)
xgb_model.fit(X_train, y)

out = pd.DataFrame({"SalePrice":np.expm1(xgb_model.predict(X_test))})
i = pd.DataFrame({"Id":test["Id"]})
out = pd.concat([i, out], axis=1)
out.to_csv("XGB.csv", index=False)

cv_folds=5
early_stopping_rounds=50
    
# get optimal number of estimators
xgb_param = xgb_model.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb_model.get_params()['n_estimators'], nfold=cv_folds, early_stopping_rounds=early_stopping_rounds)
xgb_model.set_params(n_estimators=cvresult.shape[0])

#n_estimators optimal at 186 with eta = 0.1, 92 with eta 0.2
    
xgb_model.fit(X_train, y)

In [None]:
from sklearn.model_selection import GridSearchCV

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

gsearch1 = GridSearchCV(estimator = xgb.XGBRegressor( learning_rate =0.1, n_estimators=186, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='accuracy',n_jobs=-1, cv=5)
gsearch1.fit(X_train, y)
gsearch1.best_params_, gsearch1.best_score_

In [None]:
#best max_depth: 3, min_child_weight:1

gsearch1.best_params_

In [None]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = xgb.XGBRegressor( learning_rate =0.1, n_estimators=186, max_depth=3,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='accuracy',n_jobs=-1, cv=5)
gsearch3.fit(X_train, y)
gsearch3.best_params_

In [None]:
xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=1000, max_depth=3,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27)

# get new optimal number of estimators
xgb_param = xgb_model.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb_model.get_params()['n_estimators'], nfold=cv_folds, early_stopping_rounds=early_stopping_rounds)
xgb_model.set_params(n_estimators=cvresult.shape[0])

# new is 320 estimators

In [None]:
#optimize subsample and colsample_bytree
#value for both is 0.6 from grid search

param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = xgb.XGBRegressor( learning_rate =0.1, n_estimators=320, max_depth=3,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='accuracy',n_jobs=-1, cv=5)
gsearch4.fit(X_train, y)
gsearch4.best_params_, gsearch4.best_score_

In [None]:
out = pd.DataFrame({"SalePrice":np.expm1(gsearch4.predict(X_test))})
i = pd.DataFrame({"Id":test["Id"]})
out = pd.concat([i, out], axis=1)
out.to_csv("XGBtuned.csv", index=False)

7.  Do your best to get the more accurate model.  Try feature engineering and stacking manymodels.  You are allowed to use any public tool in python.  No non-python tools allowed.

Update data preprocessing as shown at https://www.kaggle.com/adamml/how-to-be-in-top-10-for-beginner

Use same model as optimized in step 6. Initial results 0.12911. Used model as shown in link above, hoping for improvement. Score 0.12112, best yet.

In [26]:
train = pd.read_csv("../input/house-prices-advanced-regression/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression/test.csv")

train['SalePrice'] = np.log1p(train['SalePrice'])

y_train = train['SalePrice']
test_id = test['Id']
all_data = pd.concat([train, test], axis=0, sort=False)
all_data = all_data.drop(['Id', 'SalePrice'], axis=1)

Total = all_data.isnull().sum().sort_values(ascending=False)
percent = (all_data.isnull().sum() / all_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([Total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

all_data.drop((missing_data[missing_data['Total'] > 5]).index, axis=1, inplace=True)
print(all_data.isnull().sum().max())

# filling the numeric data
numeric_missed = ['BsmtFinSF1',
                  'BsmtFinSF2',
                  'BsmtUnfSF',
                  'TotalBsmtSF',
                  'BsmtFullBath',
                  'BsmtHalfBath',
                  'GarageArea',
                  'GarageCars']

for feature in numeric_missed:
    all_data[feature] = all_data[feature].fillna(0)
    
    #filling categorical data
categorical_missed = ['Exterior1st',
                  'Exterior2nd',
                  'SaleType',
                  'MSZoning',
                   'Electrical',
                     'KitchenQual']

for feature in categorical_missed:
    all_data[feature] = all_data[feature].fillna(all_data[feature].mode()[0])
    
#Fill in the remaining missing values with the values that are most common for this feature.

all_data['Functional'] = all_data['Functional'].fillna('Typ')

all_data.drop(['Utilities'], axis=1, inplace=True)

numeric_feats = all_data.dtypes[all_data.dtypes != 'object'].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x)).sort_values(ascending=False)
high_skew = skewed_feats[abs(skewed_feats) > 0.5]

for feature in high_skew.index:
    all_data[feature] = np.log1p(all_data[feature])

all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

all_data = pd.get_dummies(all_data)

x_train =all_data[:len(y_train)]
x_test = all_data[len(y_train):]

4


In [21]:
model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=320, max_depth=3,
 min_child_weight=1, gamma=0, subsample=0.6, colsample_bytree=0.6, nthread=4, scale_pos_weight=1,seed=27)

model.fit(x_train, y_train)

print(model.predict(x_test))

out = pd.DataFrame({"SalePrice":np.expm1(model.predict(x_test))})
i = pd.DataFrame({"Id":test["Id"]})
out = pd.concat([i, out], axis=1)
out.to_csv("P7.csv", index=False)

[11.712854 11.997989 12.125802 ... 12.030545 11.648226 12.325117]


In [29]:
the_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, random_state =7, nthread = -1)
the_model.fit(x_train, y_train)

y_predict = np.floor(np.expm1(the_model.predict(x_test)))

x_train["Stack1"] = np.floor(np.expm1(the_model.predict(x_train)))
x_test["Stack1"] = y_predict

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = y_predict
sub.to_csv('mysubmission.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Attempt to stack previous results. Results got worse. Score of 0.12317

In [30]:
the_model.fit(x_train, y_train)
y_predict = np.floor(np.expm1(the_model.predict(x_test)))

x_train["Stack2"] = np.floor(np.expm1(the_model.predict(x_train)))
x_test["Stack2"] = y_predict

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = y_predict
sub.to_csv('stacked.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Stacked XGB best results into best lasso; worse results than XGB, or lasso on its own. Score of 0.12626

In [31]:
model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005]).fit(x_train, y_train)
y_predict = np.floor(np.expm1(model_lasso.predict(x_test)))

x_train["Stack3"] = np.floor(np.expm1(model_lasso.predict(x_train)))
x_test["Stack3"] = y_predict

out = pd.DataFrame({"SalePrice":np.expm1(model_lasso.predict(x_test))})
i = pd.DataFrame({"Id":test["Id"]})
out = pd.concat([i, out], axis=1)
out.to_csv("StackXGBLasso.csv", index=False)

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 221 is different from 222)

All models stacked. Score 0.125.

In [32]:
the_model.fit(x_train, y_train)
y_predict = np.floor(np.expm1(the_model.predict(x_test)))

x_train["Stack4"] = np.floor(np.expm1(the_model.predict(x_train)))
x_test["Stack4"] = y_predict

sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = y_predict
sub.to_csv('superstacked.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
