### To-Do
- [x] Remove categorical features with high cardinality
- [x] Check the number of missing values
- [x] Cross-validation
- [x] Use it with less missing categorical columns
#### With low cardinality
- CV MAE: 16476.5398727939
- Kaggle score: 15017.66000
#### With less missing categorical columns
- CV MAE: 16272.231832972175
- Kaggle score: 14984.44846

In [1]:
# modules
import pandas as pd 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib

In [2]:
# get the data
X_full = pd.read_csv('train.csv')
X_test_full = pd.read_csv('test.csv')

# remove rows with missing targets
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# seperate target from predictros
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# Break off validation set from training set
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, test_size=.2,
                                                               random_state=0)

In [4]:
# select columns with low cardinality 
low_cardinality_cols = [col for col in X_train_full.columns
                       if X_train_full[col].nunique() < 10 and
                       X_train_full[col].dtype == 'object']

# check the number of missing values for these cols
missing_low_card_cols = X_train_full[low_cardinality_cols].isnull().sum()
print(missing_low_card_cols[missing_low_card_cols > 0].sort_values(ascending=False))

PoolQC          1164
MiscFeature     1119
Alley           1097
Fence            954
FireplaceQu      551
GarageCond        58
GarageQual        58
GarageFinish      58
GarageType        58
BsmtFinType2      29
BsmtFinType1      28
BsmtExposure      28
BsmtCond          28
BsmtQual          28
MasVnrType         6
Electrical         1
dtype: int64


In [5]:
# all categorical with missing values
all_cat = [col for col in X_train_full.columns
          if X_train_full[col].dtype == 'object']

all_cat_missing = X_train_full[all_cat].isnull().sum()
print(all_cat_missing[all_cat_missing > 0].sort_values(ascending=False))
print(len(all_cat))

PoolQC          1164
MiscFeature     1119
Alley           1097
Fence            954
FireplaceQu      551
GarageCond        58
GarageQual        58
GarageFinish      58
GarageType        58
BsmtFinType2      29
BsmtFinType1      28
BsmtExposure      28
BsmtCond          28
BsmtQual          28
MasVnrType         6
Electrical         1
dtype: int64
43


In [6]:
# select columns with high cardinality 
high_cardinality_cols = [col for col in X_train_full.columns
                       if X_train_full[col].nunique() >9 and
                       X_train_full[col].dtype == 'object']

missing_high_card_cols = X_train_full[high_cardinality_cols].isnull().sum()
print(missing_high_card_cols[missing_high_card_cols > 0].sort_values(ascending=False))
print((high_cardinality_cols))

Series([], dtype: int64)
['Neighborhood', 'Exterior1st', 'Exterior2nd']


In [7]:
# numerical cols 
numerical_cols = [col for col in X_train_full.columns
                 if X_train_full[col].dtype in ['int64', 'float64']]

In [8]:
# keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [9]:
# Pipeline

# transformers
numerical_transformer = SimpleImputer(strategy='median')

categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# bundl categorical and numerical transformers
preprocessing = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, numerical_cols),
    ('categorical', categorical_transformer, low_cardinality_cols)
])

# Model
my_model = XGBRegressor(n_estimators=1000, learning_rate=.01, random_state=0, n_jobs=-1)

# Bundl preprocessing with model
my_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', my_model)
])

In [10]:
# fit the model 
my_pipeline.fit(X_train, y_train)

# get predictions
preds = my_pipeline.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

  if getattr(data, 'base', None) is not None and \


MAE: 16879.11071275685


In [15]:
# redefine get_score()
def get_score(X, y, model):
    
    # transformers 
    numerical_transformer = SimpleImputer(strategy='median')
    
    categorical_transformer = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # bundl transformers
    preprocessing = ColumnTransformer(transformers=[
        ('numerical', numerical_transformer, numerical_cols),
        ('categorical', categorical_transformer, low_cardinality_cols)
    ])
    
    # bundl preprocessing with model
    my_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('model', model)
    ])
    
    scores = -1 * cross_val_score(my_pipeline, X, y, cv=3, scoring='neg_mean_absolute_error')
    
    return scores.mean()

In [16]:
# cross-validaion on XGBoost
my_model = XGBRegressor(n_estimators=1000, learning_rate=.01, random_state=0, n_jobs=-1)
score = get_score(X_train, y_train, my_model)
print('MAE:', score)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


MAE: 16476.5398727939


In [18]:
# get testset predictions and submit 
sample_submission = pd.read_csv('sample_submission.csv')

test_preds = my_pipeline.predict(X_test)

output = pd.DataFrame({
    'Id': sample_submission.Id,
    'SalePrice': test_preds
})

output.to_csv('submission.csv', index=False)

In [19]:
# save 
joblib.dump(my_pipeline, 'Models/xgb_low_card.pkl')

['Models/xgb_low_card.pkl']

### Use it with less missing categorical columns 

In [20]:
# columns to keep 
keep_cols = [col for col in X_train_full.columns
            if X_train_full[col].isnull().sum() < 300]
print(keep_cols)

['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', '

In [22]:
# keep columns with the least missing values
X_train = X_train_full[keep_cols].copy()
X_valid = X_valid_full[keep_cols].copy()
X_test = X_test_full[keep_cols].copy()

In [23]:
X_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
618,619,20,RL,90.0,11694,Pave,Reg,Lvl,AllPub,Inside,...,108,0,0,260,0,0,7,2007,New,Partial
870,871,20,RL,60.0,6600,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,8,2009,WD,Normal
92,93,30,RL,80.0,13360,Pave,IR1,HLS,AllPub,Inside,...,0,44,0,0,0,0,8,2009,WD,Normal
817,818,20,RL,,13265,Pave,IR1,Lvl,AllPub,CulDSac,...,59,0,0,0,0,0,7,2008,WD,Normal
302,303,20,RL,118.0,13704,Pave,IR1,Lvl,AllPub,Corner,...,81,0,0,0,0,0,1,2006,WD,Normal


In [24]:
# seperate numerical and categorical features
numerical_cols = [col for col in X_train.columns
                 if X_train[col].dtype in ['int64', 'float64']]

categorical_cols = [col for col in X_train.columns
                   if X_train[col].dtype == 'object']

In [26]:
# the pipeline 

# preprocessing numerical columns 
numerical_transformer = SimpleImputer(strategy='median')

# preprocessing categorical columns 
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# bundl numerical and categorical preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

my_model = XGBRegressor(n_estimators=1000, learning_rate=.01, random_state=0, n_jobs=-1)

# bundl preprocessor and model
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', my_model)
])

# fit the training data
my_pipeline.fit(X_train, y_train)

# get predictions
preds = my_pipeline.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

  if getattr(data, 'base', None) is not None and \


MAE: 16491.63049818065


In [30]:
# get testset predictions and submit 
sample_submission = pd.read_csv('sample_submission.csv')

test_preds = my_pipeline.predict(X_test)

output = pd.DataFrame({
    'Id': sample_submission.Id,
    'SalePrice': test_preds
})

output.to_csv('submission.csv', index=False)

In [28]:
# save 
joblib.dump(my_pipeline, 'Models/xgb_less_missing.pkl')

['Models/xgb_less_missing.pkl']

In [29]:
# cross-validation score
score = -1 * cross_val_score(my_pipeline, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
print('MAE:', score.mean())

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


MAE: 16272.231832972175
