In [56]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


In [27]:
# Load the training and test data.
X_full = pd.read_csv('train.csv')
X_test_full = pd.read_csv('test.csv')

# Rows without a target cannot be used for supervised training or scoring,
# so drop them before separating the target from the predictors.  X_full is
# rebound to new frames instead of being mutated in place.
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation; random_state pins the split.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0)

# Low-cardinality categorical columns: object dtype with at most 10 distinct
# values in the training split.  The dtype test runs first so nunique() is
# only computed for object columns.
low_car_cols = [cname for cname in X_train_full.columns
                if X_train_full[cname].dtype == 'object'
                and X_train_full[cname].nunique() <= 10]

# Numerical columns (int64 / float64 only).
# NOTE(review): the recorded fit output lists 'Id' among these columns, so the
# row identifier is used as a model feature.  It is kept here because a later
# cell reads X_test.Id when writing the submission.
num_cols = [cname for cname in X_train_full.columns
            if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only the selected columns, in the same order, for every split.
my_cols = num_cols + low_car_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()


In [39]:
# Numerical features: fill missing values with a constant.  No fill_value is
# passed, so the imputer's own default applies (scikit-learn documents this
# as 0 for numeric data).
num_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.  handle_unknown='ignore' keeps categories that were not
# seen during fit from raising at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocess_pipeline = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, low_car_cols),
])

In [49]:
# Temporary pipeline used only to build the eval_set.  The model step of the
# main pipeline receives already-transformed features, so the validation data
# handed to it as eval_set has to go through the same preprocessing.
# Note that this wraps (and therefore fits) the shared preprocess_pipeline
# object on X_train; it is not a separate copy.
eval_set_pipeline = Pipeline([('preprocessor', preprocess_pipeline)])
eval_set_pipeline.fit(X_train)
X_valid_eval = eval_set_pipeline.transform(X_valid.copy())

In [64]:
# Gradient-boosted regressor: a small learning rate paired with a large
# number of boosting rounds.  The fit cell passes early_stopping_rounds, so
# n_estimators acts as an upper bound rather than a fixed count.
house_pre_model = XGBRegressor(
    learning_rate=0.01,
    n_estimators=10000,
    n_jobs=5,
)

In [65]:
# End-to-end estimator: column preprocessing followed by the XGBoost model.
# The step names matter: fit parameters are routed to the model through the
# 'model__' prefix.
house_model = Pipeline([
    ('preprocessor', preprocess_pipeline),
    ('model', house_pre_model),
])

In [66]:
# Extra arguments for the 'model' step; the 'model__' prefix makes the
# pipeline forward them to that step's fit().  The eval_set uses the
# already-preprocessed validation features.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
# estimator constructor rather than in fit() -- confirm against the
# installed version before upgrading.
fit_params = {
    'model__early_stopping_rounds': 7,
    'model__eval_set': [(X_valid_eval, y_valid)],
    'model__verbose': False,
}
house_model.fit(X_train, y_train, **fit_params)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'LowQualFinSF',
                                                   'GrLivArea', 'BsmtFullBath',
                                                   'BsmtHalfBath', 'F...
                              gamma=0, gpu_id=-1

In [67]:
# Predict on the hold-out split.  The raw (untransformed) validation frame is
# passed because the pipeline applies its fitted preprocessing itself.
predictions = house_model.predict(X=X_valid)

In [68]:
# Mean absolute error on the validation split.
# NOTE(review): this split was also passed as the early-stopping eval_set in
# the fit cell, so the figure may be optimistic.
mae = mean_absolute_error(y_true=y_valid, y_pred=predictions)
mae

16793.935199058218

In [69]:
# Root mean squared logarithmic error on the validation split: the square
# root of scikit-learn's mean squared log error.
msle = mean_squared_log_error(y_true=y_valid, y_pred=predictions)
rmsle = np.sqrt(msle)
rmsle

0.13285581649963532

In [61]:
# Predict on the test set with the fitted pipeline.
# NOTE(review): the recorded execution counts for this cell and the save cell
# (61, 63) are lower than the model-definition cell (64), so the stored
# results may come from an earlier fit -- re-run the notebook top to bottom
# before using the submission.
predictions_test = house_model.predict(X=X_test)

In [63]:
# Write the submission file: one row per test record with its Id and the
# predicted SalePrice, without the DataFrame index.
submission_columns = {'Id': X_test['Id'], 'SalePrice': predictions_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)