In [10]:
# Imports

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# imputation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [34]:
# Read the training and test tables, using the "Id" column as the index.
X = pd.read_csv("data/train.csv", index_col="Id")
X_test = pd.read_csv("data/test.csv", index_col="Id")

# Rows without a target cannot be used for training: drop them, then separate
# the target column from the predictors.
X = X.dropna(axis=0, subset=["SalePrice"])
y = X["SalePrice"]
X = X.drop(columns=["SalePrice"])

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Categorical columns: object dtype with fewer than 10 distinct values, so
# that one-hot encoding does not produce an excessive number of columns.
cat_cols = [
    name for name in X.columns
    if X[name].dtype == "object" and X[name].nunique() < 10
]

# Numeric columns: 64-bit integer and float dtypes.
num_cols = [name for name in X.columns if X[name].dtype in ["int64", "float64"]]

# Restrict every split to the selected columns, working on independent copies.
my_cols = [*cat_cols, *num_cols]
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train, X_valid, X_test)
)


In [35]:
# Pipelines

# Preprocessing for numerical data: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# Preprocessing for categorical data: fill missing values with a constant
# placeholder, then one-hot encode. handle_unknown="ignore" keeps transform()
# from failing on categories that were not present when the encoder was fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Bundle preprocessing for numerical and categorical data.
# The categorical branch uses `cat_cols`, the list of low-cardinality object
# columns built when the data was loaded. (The name `low_cardinality` is not
# defined in this notebook and raised NameError on a fresh kernel.)
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

# Define model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4, random_state=0)


In [36]:
# Chain preprocessing and the regressor so that fit/predict apply both in order.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("model", model),
]
pipe = Pipeline(steps=pipeline_steps)


In [49]:
# The pipeline hands `model__eval_set` to the model unchanged, so the
# validation features must be preprocessed by hand. Arguments are evaluated
# before pipe.fit() runs, which means the preprocessor has to be fitted
# explicitly first; otherwise transform() raises NotFittedError on a fresh
# kernel. Fitting on X_train only keeps validation data out of the imputation
# statistics, and pipe.fit() refits the same preprocessor on the same data.
X_valid_prepared = preprocessor.fit(X_train).transform(X_valid)

# NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
# estimator constructor rather than as a fit() argument - confirm against the
# installed version.
pipe.fit(X_train, y_train,
         model__early_stopping_rounds=100,
         model__eval_set=[(X_valid_prepared, y_valid)],
         model__verbose=False)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond',

In [50]:
# Predict the target for the held-out validation split with the fitted pipeline
# (the pipeline applies the preprocessing step to X_valid before the model).
preds = pipe.predict(X_valid)

In [51]:
# Report the mean absolute error of the validation predictions.
validation_mae = mean_absolute_error(y_valid, preds)
print('MAE:', validation_mae)

MAE: 16237.751805971746


In [52]:
# NOTE: X_valid has to be passed through the fitted preprocessor before it can
# be supplied to the model's eval_set, e.g.:

#preprocessor.fit_transform(X_train)
#preprocessor.transform(X_valid)

In [53]:
# Predict the target for the test set with the fitted pipeline; these
# predictions are what gets written to the submission file.
test_preds = pipe.predict(X_test)

In [55]:
# Build the submission table: one row per test Id with its predicted price.
submission_columns = {
    "Id": X_test.index,
    "SalePrice": test_preds,
}
output = pd.DataFrame(submission_columns)

# Write without the positional index; "Id" is already an explicit column.
output.to_csv("output/submission.csv", index=False)