### Import

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import DataFrame

pd.set_option('display.max_rows', 100)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from xgboost import XGBRegressor, plot_importance
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score, GridSearchCV

%matplotlib inline

### Load data

In [2]:
from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

df_train_raw = pd.read_csv('train.csv')
df_test_raw = pd.read_csv('test.csv')

print(df_train_raw.shape)
print(df_test_raw.shape)

(1460, 81)
(1459, 80)


## Preprocess

In [3]:
Y_train = df_train_raw.SalePrice
df_train = df_train_raw.drop(['Id', 'SalePrice'], axis=1)
df_test = df_test_raw.drop(['Id'], axis=1)
cnt = df_train.count()
dtypes = df_train.dtypes
fields = DataFrame({'cnt': cnt, 'dtype': dtypes})

# drop sparse fields
sparse_cols = fields[fields.cnt < 800].index
df_train = df_train.drop(sparse_cols, axis=1)

fields = fields.drop(sparse_cols)

## Categorical Feature

In [4]:
# obj
fields_obj = fields[fields.dtype=='object']
df_train_obj = df_train[fields_obj.index]
df_test_obj = df_test[fields_obj.index]
fields_obj['nunique'] = df_train_obj.apply(lambda s: s.nunique())

# convert to categorical
for f in fields_obj.index:
    categories = df_train_obj[f].unique()
    df_train_obj[f] = df_train_obj[f].astype('category', categories=categories)
    df_test_obj[f] = df_test_obj[f].astype('category', categories=categories)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Setting NaNs in `categories` is deprecated and will be removed in a future version of pandas.
  return self.make_block(Categorical(self.values, **kwargs))


## Numeric Feature

In [5]:
# num
fields_num = fields[fields.dtype != 'object']
df_train_num = df_train[fields_num.index]
df_test_num = df_test[fields_num.index]
fields_num = fields_num.join(df_train_num.describe().T)

#df_train_num = df_train_num.fillna(df_train_num.mean())



## Integrate

In [6]:
X_train = pd.get_dummies(df_train_obj,  dummy_na=False)
X_test = pd.get_dummies(df_test_obj,  dummy_na=False)

X_train = X_train.join(df_train_num)
X_test = X_test.join(df_test_num)

Setting NaNs in `categories` is deprecated and will be removed in a future version of pandas.
  return Categorical(data, **kwargs)


## Test

In [7]:
model = XGBRegressor()
#model = Ridge()
cv = 10

scores = cross_val_score(model, X_train, Y_train, cv=cv)
print("scores: %s" % scores)
print("Avg score: %f" % np.mean(scores))

scores: [ 0.89861064  0.90099181  0.93349324  0.81662208  0.90338362  0.89130862
  0.89517434  0.90967461  0.89386115  0.87084082]
Avg score: 0.891396


#### Grid Search

In [8]:
model = XGBRegressor()
params = {
    'max_depth': [2,3],
    'n_estimators': [700,800,900,1000],
}

gs = GridSearchCV(model, params, cv=10)
gs.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [700, 800, 900, 1000], 'max_depth': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [9]:
gs.best_params_

{'max_depth': 3, 'n_estimators': 1000}

## Pred

In [10]:
model = XGBRegressor(max_depth=3, n_estimators=1000)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [11]:
df_re = DataFrame({
    "Id": df_test_raw["Id"],
    "SalePrice": Y_pred
})
df_re.to_csv('result.csv', index=False)

df_re.head()

Unnamed: 0,Id,SalePrice
0,1461,127272.015625
1,1462,171133.359375
2,1463,184384.921875
3,1464,196101.4375
4,1465,169194.078125
