In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the training table and discard rows that have no target value.
X = pd.read_csv('train.csv').dropna(subset=['SalePrice'], axis=0)
X_test_full = pd.read_csv('test.csv')

# Pull the target out of the predictor frame: `pop` returns the column
# and removes it from X in a single step.
y = X.pop('SalePrice')

# Hold out 20% of the rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train_full, X_val_full, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Per-column dtypes of the training split, looked up once.
col_dtypes = X_train_full.dtypes

# Text (object) columns with fewer than 10 distinct values in the
# training split; these are the ones that get one-hot encoded later.
low_cardinalty_cols = [
    name for name, kind in col_dtypes.items()
    if kind == 'object' and X_train_full[name].nunique() < 10
]

# Columns stored as 64-bit floats or integers.
numeric_cols = [
    name for name, kind in col_dtypes.items()
    if kind in ['float64', 'int64']
]

# Final feature list: numeric columns first, then the categorical ones.
my_cols = [*numeric_cols, *low_cardinalty_cols]

# Restrict every split to the selected columns, copying so that later
# changes never touch the full frames.
X_train, X_val, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_val_full, X_test_full)
)

# One-hot encode each split independently.
X_train, X_val, X_test = (
    pd.get_dummies(frame) for frame in (X_train, X_val, X_test)
)

# Independent encoding can yield different dummy columns per split, so
# force the validation and test frames onto the training frame's columns
# (join='left' keeps exactly the columns of X_train).
X_train, X_val = X_train.align(X_val, axis=1, join='left')
X_train, X_test = X_train.align(X_test, axis=1, join='left')

In [3]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [4]:
# Baseline model: 1000 boosting rounds at learning rate 0.05, trained on
# the training split only.
my_model_1 = XGBRegressor(learning_rate=0.05, n_estimators=1000)
my_model_1.fit(X_train, y_train)

# Score the baseline on the held-out validation split.
val_preds_1 = my_model_1.predict(X_val)
mae_1 = mean_absolute_error(y_true=y_val, y_pred=val_preds_1)

# Label and value go on separate lines.
print("MAE: ", mae_1, sep="\n")

MAE: 
16375.472442208904


In [6]:
# Second model: same hyper-parameters as the baseline, but fitted with
# early stopping driven by the validation split.
my_model_2 = XGBRegressor(learning_rate=0.05, n_estimators=1000)

# Fit-time options: evaluate MAE on (X_val, y_val), request early
# stopping with a patience of 5 rounds, and silence per-round logging.
early_stop_options = {
    'early_stopping_rounds': 5,
    'eval_set': [(X_val, y_val)],
    'eval_metric': 'mae',
    'verbose': False,
}

# Kept as the cell's final expression so the fitted estimator is displayed.
my_model_2.fit(X_train, y_train, **early_stop_options)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [7]:
# Predict on the aligned test features with my_model_2, the model that
# was fitted with early stopping against the validation split.
test_preds = my_model_2.predict(X_test)

In [11]:
# Build and write the submission table (one row per test record).
#
# test.csv is read without an index column, so X_test.index is only the
# default positional row index, not the record identifiers. Prefer the
# identifiers shipped in the test file and fall back to the index only
# when no such column exists.
# NOTE(review): assumes the identifiers live in a column named 'Id' — confirm.
if 'Id' in X_test_full.columns:
    submission_ids = X_test_full['Id']
else:
    submission_ids = X_test.index

output = pd.DataFrame({'Id': submission_ids,
                       'SalePrice': test_preds})

# index=False keeps the DataFrame's own row index out of the file, so the
# file contains exactly the two columns 'Id' and 'SalePrice'.
# NOTE(review): the file name has no .csv extension — confirm that is intended.
output.to_csv("XGBoost - Submission", index=False)