### To-Do
- [x] Use Pipelines with low cardinality 
- [x] Use Pipelines with all categorical features

In [1]:
# modules
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# get the data
X_full = pd.read_csv('train.csv')
X_test_full = pd.read_csv('test.csv')

# remove rows with missing targets
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# seperate target from predictors
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# break off validation set from training set
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, test_size=.2, random_state=0)

# select categorical columns with low cardinality < 10
categorical_cols = [col for col in X_train_full.columns
                   if X_train_full[col].nunique() < 10 
                   and X_train_full[col].dtype == 'object']

# select numerical columns
numerical_cols = [col for col in X_train_full.columns
                 if X_train_full[col].dtype in ['int64', 'float64']]

# keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [3]:
# contains categorical data and missing values
X_train.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,843,468,81,0,0,0,0,0,1,2006


In [5]:
# The Pipeline

# preprocessing for numerical columns
numerical_transformer = SimpleImputer(strategy='median')

# preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# define model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# bundler preprocessing and model in a pipeline
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# fit model
my_pipeline.fit(X_train, y_train)

# get predictions for validation data
preds = my_pipeline.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 17741.26297945205


###  use pipeline on all categorical data


In [7]:
new_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

new_pipeline.fit(X_train_full, y_train)

new_preds = new_pipeline.predict(X_valid_full)

print('MAE:', mean_absolute_error(y_valid, new_preds))

MAE: 17741.26297945205


In [8]:
print(X_train_full.shape)
print(X_train.shape)

(1168, 80)
(1168, 77)


In [15]:
from sklearn.externals import joblib 
joblib.dump(preds, 'Models/rf_pipeline.pkl')

['Models/rf_pipeline.pkl']