In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer


pd.set_option('max_columns', 500)

In [2]:
def make_test(train, test_size, random_state, strat_feat=None):
    if strat_feat:
        
        split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

        for train_index, test_index in split.split(train, train[strat_feat]):
            train_set = train.loc[train_index]
            test_set = train.loc[test_index]
            
    return train_set, test_set

In [3]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

train_set, test_set = make_test(df_train, 
                                test_size=0.2, random_state=654, 
                                strat_feat='Neighborhood')

In [4]:
class feat_sel(BaseEstimator, TransformerMixin):
    def __init__(self, dtype='numeric'):
        self._dtype = dtype
 
    def fit( self, X, y=None ):
        return self 
    
    def transform(self, X, y=None):
        if self._dtype == 'numeric':
            num_cols = X.columns[X.dtypes != object].tolist()
            return X[num_cols]
        elif self._dtype == 'category':
            cat_cols = X.columns[X.dtypes == object].tolist()
            return X[cat_cols]


class df_imputer(TransformerMixin):
    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        self.imp = SimpleImputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Ximp = self.imp.transform(X)
        Xfilled = pd.DataFrame(Ximp, index=X.index, columns=X.columns)
        return Xfilled

    
class df_scaler(TransformerMixin):
    def __init__(self, method='standard'):
        self.scl = None
        self.scale_ = None
        self.method = method
        if self.method == 'sdandard':
            self.mean_ = None
        elif method == 'robust':
            self.center_ = None

    def fit(self, X, y=None):
        if self.method == 'standard':
            self.scl = StandardScaler()
            self.scl.fit(X)
            self.mean_ = pd.Series(self.scl.mean_, index=X.columns)
        elif self.method == 'robust':
            self.scl = RobustScaler()
            self.scl.fit(X)
            self.center_ = pd.Series(self.scl.center_, index=X.columns)
        self.scale_ = pd.Series(self.scl.scale_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xscl = self.scl.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        return Xscaled

    
class make_ordinal(BaseEstimator, TransformerMixin):
    def __init__(self, cols, extra_cols=None, unsure_conversion=True):
        self._unsure_conversion = unsure_conversion
        self.cols = cols
        self.extra_cols = extra_cols
        self.mapping = {'Po':1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.extra_cols and self.unsure_conversion:
            self.cols += self.extra_cols
        for col in self.cols:
            X.loc[:, col] = X[col].map(self.mapping).fillna(0)
        return X

    
class general_cleaner(BaseEstimator, TransformerMixin):
    def __init__(self, train=True):
        self._train = train
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        #if self._train:
        #    # remove known outliers from train set
        #    X = X.loc[X.GrLivArea < 4500].reset_index(drop=True)
        #LotFrontage
        X.loc[X.LotFrontage.isnull(), 'LotFrontage'] = 0
        #Alley
        X.loc[X.Alley.isnull(), 'Alley'] = "NoAlley"
        #MSSubClass
        X['MSSubClass'] = X['MSSubClass'].astype(str)
        #MissingBasement
        fil = ((X.BsmtQual.isnull()) & (X.BsmtCond.isnull()) & (X.BsmtExposure.isnull()) &
              (X.BsmtFinType1.isnull()) & (X.BsmtFinType2.isnull()))
        fil1 = ((X.BsmtQual.notnull()) | (X.BsmtCond.notnull()) | (X.BsmtExposure.notnull()) |
              (X.BsmtFinType1.notnull()) | (X.BsmtFinType2.notnull()))
        X.loc[fil1, 'MisBsm'] = 0
        X.loc[fil, 'MisBsm'] = 1 # made explicit for safety
        #BsmtQual
        X.loc[fil, 'BsmtQual'] = "NoBsmt" #missing basement
        #BsmtCond
        X.loc[fil, 'BsmtCond'] = "NoBsmt" #missing basement
        #BsmtExposure
        X.loc[fil, 'BsmtExposure'] = "NoBsmt" #missing basement
        #BsmtFinType1
        X.loc[fil, 'BsmtFinType1'] = "NoBsmt" #missing basement
        #BsmtFinType2
        X.loc[fil, 'BsmtFinType2'] = "NoBsmt" #missing basement
        #BsmtFinSF1
        X.loc[fil, 'BsmtFinSF1'] = 0 # No bsmt
        #BsmtFinSF2
        X.loc[fil, 'BsmtFinSF2'] = 0 # No bsmt
        #BsmtUnfSF
        X.loc[fil, 'BsmtUnfSF'] = 0 # No bsmt
        #TotalBsmtSF
        X.loc[fil, 'TotalBsmtSF'] = 0 # No bsmt
        #BsmtFullBath
        X.loc[fil, 'BsmtFullBath'] = 0 # No bsmt
        #BsmtHalfBath
        X.loc[fil, 'BsmtHalfBath'] = 0 # No bsmt
        #FireplaceQu
        X.loc[(X.Fireplaces == 0) & (X.FireplaceQu.isnull()), 'FireplaceQu'] = "NoFire" #missing
        #MisGarage
        fil = ((X.GarageYrBlt.isnull()) & (X.GarageType.isnull()) & (X.GarageFinish.isnull()) &
              (X.GarageQual.isnull()) & (X.GarageCond.isnull()))
        fil1 = ((X.GarageYrBlt.notnull()) | (X.GarageType.notnull()) | (X.GarageFinish.notnull()) |
              (X.GarageQual.notnull()) | (X.GarageCond.notnull()))
        X.loc[fil1, 'MisGarage'] = 0
        X.loc[fil, 'MisGarage'] = 1
        #GarageYrBlt
        X.loc[X.GarageYrBlt > 2200, 'GarageYrBlt'] = 2007 #correct mistake
        X.loc[fil, 'GarageYrBlt'] = 0
        #GarageType
        X.loc[fil, 'GarageType'] = "NoGrg" #missing garage
        #GarageFinish
        X.loc[fil, 'GarageFinish'] = "NoGrg" #missing
        #GarageQual
        X.loc[fil, 'GarageQual'] = "NoGrg" #missing
        #GarageCond
        X.loc[fil, 'GarageCond'] = "NoGrg" #missing
        #Fence
        X.loc[X.Fence.isnull(), 'Fence'] = "NoFence" #missing fence
        #Pool
        fil = ((X.PoolArea == 0) & (X.PoolQC.isnull()))
        X.loc[fil, 'PoolQC'] = 'NoPool' 
        
        #the following transformations are only for the test set and checked by hand
        fil = X.GarageYrBlt.isna()
        X.loc[fil, 'GarageType'] = 'NoGrg'
        X.loc[fil, 'GarageYrBlt'] = 0
        X.loc[fil, 'GarageFinish'] = 'NoGrg'
        X.loc[fil, 'GarageCars'] = 0
        X.loc[fil, 'GarageArea'] = 0
        X.loc[fil, 'GarageQual'] = 'NoGrg'
        X.loc[fil, 'GarageCond'] = 'NoGrg'
        X.loc[fil, 'MisGarage'] = 1
        
        del X['Id']
        del X['MiscFeature']
        del X['MSSubClass']
        del X['Neighborhood']
        del X['Condition1']
        del X['Condition2']
        del X['ExterCond']
        del X['Exterior1st']
        del X['Exterior2nd']
        del X['Functional']
        del X['Heating']
        del X['PoolQC']
        del X['RoofMatl']
        del X['RoofStyle']
        del X['SaleCondition']
        del X['SaleType']
        del X['Utilities']
        
        return X

In [5]:
class tr_numeric(BaseEstimator, TransformerMixin):
    def __init__(self, columns=['GrLivArea', '1stFlrSF']):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def remove_skew(self, X, column):
        X[column] = np.log1p(X[column])
        return X
    
    def transform(self, X, y=None):
        for col in self.columns:
            X = self.remove_skew(X, col)
        return X

In [6]:
numeric_pipe = Pipeline([('fs', feat_sel('numeric')),
                         ('imputer', df_imputer(strategy='median')),
                         ('transf', tr_numeric()),
                         ('scl', df_scaler(method='standard'))])

full_pipe = Pipeline([('gen_cl', general_cleaner()), ('num_pipe', numeric_pipe)])

In [7]:
tmp = train_set.copy()
full_pipe.fit_transform(tmp)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,MisBsm,MisGarage
68,-0.293799,-0.640408,-1.536497,0.388314,-0.854633,-1.687632,-0.572721,-0.951802,-0.289111,0.394816,-0.702008,-1.229020,-0.809847,-0.11994,-1.969786,-0.792460,-0.235213,-1.038946,-0.781445,-1.088870,-0.217078,-1.558394,-0.955475,0.166254,-1.023099,-1.176467,-0.743920,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-0.090250,1.648787,-1.265526,-0.162364,-0.240772
1097,-1.633589,-0.740946,1.365085,-0.520605,0.486653,0.054226,-0.572721,-0.951802,-0.289111,1.135134,0.034332,-0.047277,-0.809847,-0.11994,-0.843589,-0.792460,-0.235213,-1.038946,1.194156,-1.088870,-0.217078,-0.945737,-0.955475,0.259402,0.312074,-0.050133,-0.743920,0.408792,1.862782,-0.114758,-0.275037,-0.070993,-0.08847,1.384042,-0.612407,-0.145310,-0.162364,-0.240772
219,-0.407824,-0.816569,0.639689,-0.520605,1.108225,1.021925,-0.482072,-0.917099,-0.289111,1.492841,0.426146,0.384011,-0.809847,-0.11994,-0.432573,-0.792460,-0.235213,0.791354,-0.781445,-1.088870,-0.217078,-0.945737,-0.955475,0.299323,0.312074,-0.157626,0.101091,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-1.195969,-1.366138,-0.179663,-0.162364,-0.240772
901,0.190806,-0.187989,-0.811101,1.297233,-0.462061,0.731615,-0.572721,0.913518,-0.289111,-0.997524,-0.150316,0.526820,-0.809847,-0.11994,-0.296476,1.147988,-0.235213,-1.038946,-0.781445,-1.088870,-0.217078,-0.945737,-0.955475,0.217264,-1.023099,1.328575,-0.743920,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-0.458823,0.895056,-0.356906,-0.162364,-0.240772
505,0.076781,-0.311016,-0.811101,-0.520605,-0.625633,-1.590862,1.466898,-0.951802,-0.289111,0.877042,-0.222373,-0.440678,1.464495,-0.11994,0.919993,-0.792460,-0.235213,0.791354,-0.781445,1.429547,4.154420,2.117548,-0.955475,0.181779,0.312074,-0.335222,-0.743920,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,0.278323,0.895056,-0.711641,-0.162364,-0.240772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,0.789435,0.375660,-0.811101,0.388314,-0.429347,0.102611,-0.572721,-0.951802,-0.289111,-0.089673,-1.183894,-0.247054,0.509271,-0.11994,0.313732,-0.792460,3.991376,-1.038946,-0.781445,0.170338,-0.217078,-0.333080,-0.955475,0.195086,-1.023099,-0.802580,1.446850,-0.191043,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,2.121188,0.141325,-0.518715,-0.162364,-0.240772
1308,1.217028,0.174475,-0.811101,1.297233,-0.756490,0.973540,-0.572721,-0.639470,3.425637,-0.906966,-0.303438,0.049448,-0.809847,-0.11994,-0.751410,1.147988,-0.235213,-1.038946,-0.781445,-1.088870,-0.217078,-0.333080,-0.955475,0.172908,0.312074,0.262997,-0.743920,-0.161052,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-0.458823,0.141325,-0.431587,-0.162364,-0.240772
563,0.247818,1.252607,-0.085706,1.297233,-1.737919,-1.687632,-0.572721,-0.951802,-0.289111,1.336627,0.234743,0.162266,0.352342,-0.11994,0.447337,-0.792460,-0.235213,0.791354,-0.781445,1.429547,-0.217078,0.892234,0.594518,0.188432,0.312074,-0.353916,-0.180579,-0.161052,-0.360845,-0.114758,2.230873,-0.070993,-0.08847,0.278323,0.141325,0.041393,-0.162364,-0.240772
411,1.217028,2.671376,-0.811101,-0.520605,-0.527490,-1.445708,-0.572721,1.338638,-0.289111,-1.296368,-0.006200,-0.141112,-0.809847,-0.11994,-0.933014,1.147988,-0.235213,-1.038946,-0.781445,0.170338,-0.217078,-0.945737,-0.955475,0.188432,0.312074,0.468635,1.321663,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-1.933115,-1.366138,-0.456481,-0.162364,-0.240772


In [27]:
tmp = test_set.copy()
full_pipe.transform(tmp)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,MisBsm,MisGarage
787,0.532880,-0.030348,0.639689,-0.520605,1.075511,0.925155,-0.572721,0.471047,-0.289111,-0.617178,-0.231381,-0.453800,1.755611,-0.119940,1.103831,1.147988,-0.235213,0.791354,1.194156,1.429547,-0.217078,0.892234,-0.955475,0.297105,0.312074,0.683620,-0.743920,-0.026089,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-1.933115,1.648787,0.638841,-0.162364,-0.240772
284,-0.208281,-0.265156,-0.085706,-0.520605,0.682939,0.344536,-0.572721,-0.019142,-0.289111,1.295876,1.162487,1.115645,-0.809847,-0.119940,0.264671,1.147988,-0.235213,0.791354,-0.781445,-1.088870,-0.217078,-0.945737,-0.955475,0.270491,0.312074,0.267671,-0.743920,-0.700903,0.483160,-0.114758,-0.275037,-0.070993,-0.08847,0.278323,-0.612407,-0.030799,-0.162364,-0.240772
184,0.988979,-0.328433,-0.811101,2.206152,-2.065062,0.296151,-0.572721,-0.951802,-0.289111,-0.155328,-1.249196,-0.520248,-0.091155,-0.119940,-0.422986,-0.792460,-0.235213,-1.038946,-0.781445,0.170338,-0.217078,-0.945737,-0.955475,0.257184,0.312074,0.487329,0.069795,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-0.090250,-1.366138,-0.680524,-0.162364,-0.240772
950,0.076781,-0.354670,-0.811101,2.206152,-0.691062,0.828385,-0.572721,-0.088549,0.621247,-0.578690,-0.438547,-0.771797,-0.809847,-0.119940,-1.534054,1.147988,-0.235213,-1.038946,-0.781445,0.170338,-0.217078,-0.945737,-0.955475,0.243877,0.312074,1.160326,0.773971,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,0.278323,-0.612407,-0.655630,-0.162364,-0.240772
279,0.732423,-0.045451,0.639689,-0.520605,0.192225,-0.381239,1.121296,-0.101563,-0.289111,0.442360,0.227988,0.143289,1.159733,-0.119940,1.013321,-0.792460,-0.235213,0.791354,1.194156,1.429547,-0.217078,0.892234,0.594518,0.237224,0.312074,0.155505,1.509443,1.053615,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-1.195969,0.141325,0.128521,-0.162364,-0.240772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,0.076781,-0.156241,-0.085706,-2.338444,-1.410776,-1.687632,-0.572721,-0.951802,-0.289111,0.478583,-0.618691,-1.077120,-0.809847,-0.119940,-1.825026,-0.792460,-0.235213,-1.038946,-0.781445,-1.088870,-0.217078,-0.945737,-0.955475,0.177343,0.312074,-0.522165,-0.743920,-0.700903,1.116163,-0.114758,-0.275037,-0.070993,-0.08847,1.384042,0.895056,-1.315313,-0.162364,-0.240772
1069,0.076781,-0.090098,-0.811101,1.297233,-0.723776,0.876770,-0.572721,-0.474627,-0.289111,0.118612,-0.481331,-0.668047,-0.809847,-0.119940,-1.435181,-0.792460,3.991376,-1.038946,-0.781445,-1.088870,-0.217078,-1.558394,-0.955475,0.254966,0.312074,0.487329,-0.743920,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-0.458823,-0.612407,-0.580949,-0.162364,-0.240772
1031,1.274040,0.600326,0.639689,-2.338444,-1.672490,-0.719933,-0.572721,0.182573,-0.289111,-0.614914,-0.528619,1.331644,1.459946,7.967268,2.276568,1.147988,-0.235213,0.791354,1.194156,2.688755,-0.217078,3.342863,2.144511,0.221699,0.312074,0.935994,0.320168,0.243837,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,0.646896,0.895056,0.190755,-0.162364,-0.240772
659,0.504374,-0.052947,-0.811101,1.297233,-0.233061,0.683230,-0.572721,0.429836,-0.289111,0.625741,0.962076,0.932760,-0.809847,-0.119940,0.090383,1.147988,-0.235213,-1.038946,-0.781445,0.170338,-0.217078,0.279577,-0.955475,0.217264,0.312074,0.038665,-0.743920,-0.700903,-0.360845,-0.114758,-0.275037,-0.070993,-0.08847,-1.195969,0.895056,-0.182650,-0.162364,-0.240772


In [8]:
class recode_cat(BaseEstimator, TransformerMixin):        
    
    def fit(self, X, y=None):
        return self
    
    
    def tr_GrgType(self, data):
        data['GrgType'] = data['GarageType'].map({'Basment': 'Attchd',
                                                  'CarPort': 'Detchd', 
                                                  '2Types': 'Attchd' }).fillna(data['GarageType'])
        return data
    
    
    def tr_LotShape(self, data):
        fil = (data.LotShape != 'Reg')
        data['LotShape'] = 1
        data.loc[fil, 'LotShape'] = 0
        return data
    
    
    def tr_LandCont(self, data):
        fil = (data.LandContour == 'HLS') | (data.LandContour == 'Low')
        data['LandContour'] = 0
        data.loc[fil, 'LandContour'] = 1
        return data
    
    
    def tr_LandSlope(self, data):
        fil = (data.LandSlope != 'Gtl')
        data['LandSlope'] = 0
        data.loc[fil, 'LandSlope'] = 1
        return data
    
    
    def tr_MSZoning(self, data):
        data['MSZoning'] = data['MSZoning'].map({'RH': 'RM', # medium and high density
                                                 'C (all)': 'RM', # commercial and medium density
                                                 'FV': 'RM'}).fillna(data['MSZoning'])
        return data
    
    
    def tr_Alley(self, data):
        fil = (data.Alley != 'NoAlley')
        data['Alley'] = 0
        data.loc[fil, 'Alley'] = 1
        return data
    
    
    def tr_LotConfig(self, data):
        data['LotConfig'] = data['LotConfig'].map({'FR3': 'Corner', # corners have 2 or 3 free sides
                                                   'FR2': 'Corner'}).fillna(data['LotConfig'])
        return data
    
    
    def tr_BldgType(self, data):
        data['BldgType'] = data['BldgType'].map({'Twnhs' : 'TwnhsE',
                                                 '2fmCon': 'Duplex'}).fillna(data['BldgType'])
        return data
    
    
    def tr_MasVnrType(self, data):
        data['MasVnrType'] = data['MasVnrType'].map({'BrkCmn': 'BrkFace'}).fillna(data['MasVnrType'])
        return data
    
    
    def transform(self, X, y=None):
        X = self.tr_GrgType(X)
        X = self.tr_LotShape(X)
        X = self.tr_LotConfig(X)
        X = self.tr_MSZoning(X)
        X = self.tr_Alley(X)
        X = self.tr_LandSlope(X)
        X = self.tr_LandCont(X)
        X = self.tr_BldgType(X)
        X = self.tr_MasVnrType(X)
        return X
    
    
class dummify(TransformerMixin):
    def __init__(self, drop_first=False):
        self.drop_first = drop_first

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = pd.get_dummies(X, drop_first=self.drop_first)
        return X

In [11]:
tmp = train_set.copy()

cat_pipe = Pipeline([('fs', feat_sel('category')),
                     ('imputer', df_imputer(strategy='most_frequent')), 
                     ('ord', make_ordinal(['BsmtQual', 'KitchenQual','GarageQual',
                                           'GarageCond', 'ExterQual', 'HeatingQC'])), 
                     ('recode', recode_cat()), 
                     ('dummies', dummify())])

full_pipe = Pipeline([('gen_cl', general_cleaner()), ('cat_pipe', cat_pipe)])

full_pipe.fit_transform(tmp)

Unnamed: 0,Alley,LotShape,LandContour,LandSlope,ExterQual,BsmtQual,HeatingQC,KitchenQual,GarageQual,GarageCond,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,LotConfig_Corner,LotConfig_CulDSac,LotConfig_Inside,BldgType_1Fam,BldgType_Duplex,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_NoBsmt,BsmtCond_TA,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_NoBsmt,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBsmt,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NoBsmt,BsmtFinType2_Rec,BsmtFinType2_Unf,CentralAir_N,CentralAir_Y,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFire,FireplaceQu_Po,FireplaceQu_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NoGrg,GarageFinish_Fin,GarageFinish_NoGrg,GarageFinish_RFn,GarageFinish_Unf,PavedDrive_N,PavedDrive_P,PavedDrive_Y,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_NoFence,GrgType_Attchd,GrgType_BuiltIn,GrgType_Detchd,GrgType_NoGrg
68,0,1,0,0,3,3.0,3,3,3.0,3.0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0
1097,0,1,0,0,4,4.0,5,4,3.0,3.0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0
219,0,1,0,0,4,4.0,5,4,3.0,3.0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
901,0,0,0,0,3,3.0,3,3,3.0,3.0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0
505,1,1,0,0,3,3.0,4,3,3.0,3.0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,0,1,0,0,3,3.0,3,3,3.0,3.0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0
1308,0,1,0,0,3,3.0,5,4,3.0,3.0,0,1,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0
563,0,1,0,0,3,4.0,5,3,3.0,3.0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
411,0,1,0,0,3,3.0,3,3,3.0,3.0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0


In [33]:
tmp = test_set.copy()

full_pipe.transform(tmp)

Unnamed: 0,Alley,LotShape,LandContour,LandSlope,ExterQual,BsmtQual,HeatingQC,KitchenQual,GarageQual,GarageCond,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,LotConfig_Corner,LotConfig_CulDSac,LotConfig_Inside,BldgType_1Fam,BldgType_Duplex,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_NoBsmt,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_NoBsmt,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBsmt,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NoBsmt,BsmtFinType2_Rec,BsmtFinType2_Unf,CentralAir_N,CentralAir_Y,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFire,FireplaceQu_Po,FireplaceQu_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NoGrg,GarageFinish_Fin,GarageFinish_NoGrg,GarageFinish_RFn,GarageFinish_Unf,PavedDrive_N,PavedDrive_P,PavedDrive_Y,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_NoFence,GrgType_Attchd,GrgType_BuiltIn,GrgType_Detchd,GrgType_NoGrg
787,0,0,0,0,4,4.0,5,4,3.0,3.0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0
284,0,1,0,0,4,4.0,4,4,3.0,3.0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0
184,0,0,0,0,3,2.0,4,3,3.0,3.0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0
950,0,1,0,0,3,3.0,4,4,3.0,3.0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0
279,0,1,0,0,3,4.0,5,3,3.0,3.0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,0,1,0,0,3,2.0,3,3,2.0,2.0,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0
1069,0,1,0,0,3,3.0,3,4,3.0,3.0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0
1031,0,1,0,0,3,3.0,5,3,3.0,3.0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
659,0,1,0,0,3,3.0,5,3,3.0,3.0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0


In [34]:
set(test_cols) - set(train_cols)

{'BsmtCond_Po', 'Electrical_Mix'}

In [76]:
numeric_pipe = Pipeline([('fs', feat_sel('numeric')),
                         ('imputer', df_imputer(strategy='median')),
                         ('transf', tr_numeric())])


cat_pipe = Pipeline([('fs', feat_sel('category')),
                     ('imputer', df_imputer(strategy='most_frequent')), 
                     ('ord', make_ordinal(['BsmtQual', 'KitchenQual','GarageQual',
                                           'GarageCond', 'ExterQual', 'HeatingQC'])), 
                     ('recode', recode_cat()), 
                     ('dummies', dummify())])


processing_pipe = FeatureUnion(transformer_list=[('cat_pipe', cat_pipe),
                                                 ('num_pipe', numeric_pipe)])


full_pipe = Pipeline([('gen_cl', general_cleaner()), 
                      ('processing', processing_pipe),
                      ('scl', StandardScaler())])

tmp = df_train.copy()

full_pipe.fit_transform(tmp)

array([[-0.25782141,  0.76051192, -0.25018188, ...,  0.34727322,
        -0.16124951, -0.24235968],
       [-0.25782141,  0.76051192, -0.25018188, ...,  0.00728832,
        -0.16124951, -0.24235968],
       [-0.25782141, -1.31490378, -0.25018188, ...,  0.53615372,
        -0.16124951, -0.24235968],
       ...,
       [-0.25782141,  0.76051192, -0.25018188, ...,  1.07761115,
        -0.16124951, -0.24235968],
       [-0.25782141,  0.76051192, -0.25018188, ..., -0.48852299,
        -0.16124951, -0.24235968],
       [-0.25782141,  0.76051192, -0.25018188, ..., -0.42084081,
        -0.16124951, -0.24235968]])

In [77]:
full_pipe.steps[1][1].get_params()

{'n_jobs': None, 'transformer_list': [('cat_pipe', Pipeline(memory=None,
            steps=[('fs', feat_sel(dtype=None)),
                   ('imputer', <__main__.df_imputer object at 0x7feb8ff55b38>),
                   ('ord',
                    make_ordinal(cols=['BsmtQual', 'KitchenQual', 'GarageQual',
                                       'GarageCond', 'ExterQual', 'HeatingQC'],
                                 extra_cols=None, unsure_conversion=None)),
                   ('recode', recode_cat()),
                   ('dummies', <__main__.dummify object at 0x7feb8ff55e80>)],
            verbose=False)), ('num_pipe', Pipeline(memory=None,
            steps=[('fs', feat_sel(dtype=None)),
                   ('imputer', <__main__.df_imputer object at 0x7feb8ff55588>),
                   ('transf', tr_numeric())],
            verbose=False))], 'transformer_weights': None, 'verbose': False, 'cat_pipe': Pipeline(memory=None,
          steps=[('fs', feat_sel(dtype=None)),
               