In [40]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer


# Use the fully qualified option key: on pandas >= 1.4 the bare 'max_columns'
# pattern also matches 'styler.render.max_columns' and raises OptionError.
pd.set_option('display.max_columns', 500)

In [15]:
# Read the train and test CSV files into DataFrames (paths are relative to
# the notebook's working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [34]:
class feat_sel(TransformerMixin, BaseEstimator):
    """Select the columns of a DataFrame by dtype family.

    Parameters
    ----------
    dtype : {'numeric', 'category'}, default='numeric'
        'numeric' keeps every column whose dtype is not ``object``;
        'category' keeps every column whose dtype is ``object``.
    """

    def __init__(self, dtype='numeric'):
        # Stored under the parameter's own name so that get_params(),
        # set_params() and sklearn.base.clone() can find it.
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return the subset of ``X`` whose columns match ``self.dtype``.

        Raises
        ------
        ValueError
            If ``self.dtype`` is neither 'numeric' nor 'category'.
        """
        if self.dtype == 'numeric':
            keep = X.dtypes != object
        elif self.dtype == 'category':
            keep = X.dtypes == object
        else:
            # Fail loudly instead of silently handing None to the next step.
            raise ValueError(
                "dtype must be 'numeric' or 'category', got {!r}".format(self.dtype)
            )
        return X[X.columns[keep].tolist()]


class df_imputer(TransformerMixin, BaseEstimator):
    """SimpleImputer wrapper that returns a DataFrame instead of an array.

    Inherits from BaseEstimator so the object exposes get_params() and
    set_params() like any other scikit-learn estimator (needed for cloning
    and for setting nested parameters such as ``imputer__strategy``).

    Parameters
    ----------
    strategy : str, default='mean'
        Passed unchanged to ``SimpleImputer(strategy=...)``.

    Attributes
    ----------
    imp : SimpleImputer or None
        The underlying imputer; None until ``fit`` is called.
    statistics_ : pandas.Series or None
        The fitted imputer's ``statistics_`` indexed by the fit-time column
        names; None until ``fit`` is called.
    """

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        """Fit a fresh SimpleImputer on the DataFrame ``X`` and return self."""
        self.imp = SimpleImputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        """Impute ``X`` and rebuild a DataFrame with its index and columns."""
        # assumes X is a DataFrame
        imputed_values = self.imp.transform(X)
        return pd.DataFrame(imputed_values, index=X.index, columns=X.columns)

    
class df_scaler(TransformerMixin, BaseEstimator):
    """StandardScaler / RobustScaler wrapper that returns a DataFrame.

    Inherits from BaseEstimator so the object exposes get_params() and
    set_params() like any other scikit-learn estimator.

    Parameters
    ----------
    method : {'standard', 'robust'}, default='standard'
        Which scaler to use.

    Attributes
    ----------
    scl : StandardScaler, RobustScaler or None
        The underlying scaler; None until ``fit`` is called.
    scale_ : pandas.Series or None
        The fitted scaler's ``scale_`` indexed by column name.
    mean_ : pandas.Series or None
        Only for ``method='standard'``: the fitted scaler's ``mean_``.
    center_ : pandas.Series or None
        Only for ``method='robust'``: the fitted scaler's ``center_``.
    """

    def __init__(self, method='standard'):
        self.scl = None
        self.scale_ = None
        self.method = method
        if self.method == 'standard':
            self.mean_ = None
        elif self.method == 'robust':
            self.center_ = None

    def fit(self, X, y=None):
        """Fit the selected scaler on the DataFrame ``X`` and return self.

        Raises
        ------
        ValueError
            If ``self.method`` is neither 'standard' nor 'robust'.
        """
        if self.method == 'standard':
            self.scl = StandardScaler()
        elif self.method == 'robust':
            self.scl = RobustScaler()
        else:
            # Without this guard the code below would fail on a None scaler
            # with an unhelpful AttributeError.
            raise ValueError(
                "method must be 'standard' or 'robust', got {!r}".format(self.method)
            )
        self.scl.fit(X)
        if self.method == 'standard':
            self.mean_ = pd.Series(self.scl.mean_, index=X.columns)
        else:
            self.center_ = pd.Series(self.scl.center_, index=X.columns)
        self.scale_ = pd.Series(self.scl.scale_, index=X.columns)
        return self

    def transform(self, X):
        """Scale ``X`` and rebuild a DataFrame with its index and columns."""
        # assumes X is a DataFrame
        scaled_values = self.scl.transform(X)
        return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

    
class make_ordinal(TransformerMixin, BaseEstimator):
    """Map quality-scale string columns to numeric ranks.

    Values are translated with ``self.mapping`` ('Po' -> 1 ... 'Ex' -> 5);
    anything not in the mapping (including missing values) becomes 0.

    Parameters
    ----------
    cols : list of str
        Columns that are always converted.
    extra_cols : list of str or None, default=None
        Additional columns converted only when ``unsure_conversion`` is true.
    unsure_conversion : bool, default=True
        Whether ``extra_cols`` are converted as well.
    """

    def __init__(self, cols, extra_cols=None, unsure_conversion=True):
        # Stored under the parameter's own name: transform() reads
        # self.unsure_conversion, and get_params()/clone() look it up by
        # that name too.
        self.unsure_conversion = unsure_conversion
        self.cols = cols
        self.extra_cols = extra_cols
        self.mapping = {'Po':1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns converted."""
        # Build the column list locally so repeated calls neither grow
        # self.cols nor mutate the list the caller passed to the constructor.
        target_cols = list(self.cols)
        if self.extra_cols and self.unsure_conversion:
            # Skip duplicates: mapping an already-converted column a second
            # time would turn every value into 0.
            target_cols += [c for c in self.extra_cols if c not in target_cols]
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        for col in target_cols:
            X.loc[:, col] = X[col].map(self.mapping).fillna(0)
        return X

    
class general_cleaner(TransformerMixin, BaseEstimator):
    """Apply the hand-written cleaning rules for the housing DataFrame.

    Fills "structurally missing" values (no alley, no basement, no garage,
    no fireplace, no fence, no pool) with explicit labels or zeros, adds the
    ``MisBsm`` / ``MisGarage`` indicator columns and casts ``MSSubClass`` to
    string.

    Parameters
    ----------
    train : bool, default=True
        When true, rows with ``GrLivArea >= 4500`` are dropped and the index
        is reset.
        NOTE(review): this changes the number of rows, so any target held
        separately from X must be filtered the same way by the caller.
    """

    def __init__(self, train=True):
        # Stored under the parameter's own name so that get_params(),
        # set_params() and sklearn.base.clone() can find it.
        self.train = train

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned DataFrame; the input frame is not modified."""
        if self.train:
            # remove known outliers from train set (this already yields a new frame)
            X = X.loc[X.GrLivArea < 4500].reset_index(drop=True)
        else:
            # Copy so the assignments below never write into the caller's frame.
            X = X.copy()

        # LotFrontage / Alley: missing means "none"
        X.loc[X.LotFrontage.isnull(), 'LotFrontage'] = 0
        X.loc[X.Alley.isnull(), 'Alley'] = "NoAlley"
        # MSSubClass is a code, not a quantity
        X['MSSubClass'] = X['MSSubClass'].astype(str)

        # Basement: a row has no basement when all five descriptive columns are null.
        bsmt_desc = ['BsmtQual', 'BsmtCond', 'BsmtExposure',
                     'BsmtFinType1', 'BsmtFinType2']
        bsmt_nums = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                     'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
        no_bsmt = X[bsmt_desc].isnull().all(axis=1)
        X.loc[~no_bsmt, 'MisBsm'] = 0
        X.loc[no_bsmt, 'MisBsm'] = 1  # made explicit for safety
        for col in bsmt_desc:
            X.loc[no_bsmt, col] = "NoBsmt"  # missing basement
        for col in bsmt_nums:
            X.loc[no_bsmt, col] = 0  # No bsmt

        # FireplaceQu: only labelled when the house really has no fireplace
        X.loc[(X.Fireplaces == 0) & (X.FireplaceQu.isnull()), 'FireplaceQu'] = "NoFire"

        # Garage: a row has no garage when all five garage columns are null.
        grg_desc = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
        no_grg = X[['GarageYrBlt'] + grg_desc].isnull().all(axis=1)
        X.loc[~no_grg, 'MisGarage'] = 0
        X.loc[no_grg, 'MisGarage'] = 1
        X.loc[X.GarageYrBlt > 2200, 'GarageYrBlt'] = 2007  # correct mistake
        X.loc[no_grg, 'GarageYrBlt'] = 0
        for col in grg_desc:
            X.loc[no_grg, col] = "NoGrg"  # missing garage

        # Fence
        X.loc[X.Fence.isnull(), 'Fence'] = "NoFence"  # missing fence
        # Pool: only labelled when the pool area is zero
        no_pool = (X.PoolArea == 0) & (X.PoolQC.isnull())
        X.loc[no_pool, 'PoolQC'] = 'NoPool'

        return X

In [35]:
# Scratch copy of the training frame; this copy, not df_train, is what gets
# passed to the pipeline below.
tmp = df_train.copy()

In [36]:
# Numeric branch: keep the non-object columns, fill gaps with the column
# median, then standardise.
numeric_pipe = Pipeline(steps=[
    ('fs', feat_sel(dtype='numeric')),
    ('imputer', df_imputer(strategy='median')),
    ('scl', df_scaler(method='standard')),
])

# Clean the raw frame first, then run the numeric branch on the result.
full_pipe = Pipeline(steps=[
    ('gen_cl', general_cleaner(train=True)),
    ('num_pipe', numeric_pipe),
])

In [37]:
# Fit and apply the pipeline in one pass; the returned frame is the cell output.
# NOTE(review): the displayed columns include Id and SalePrice, so the row
# identifier and the target are imputed and scaled along with the features —
# confirm this is intended.
full_pipe.fit_transform(tmp)

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,MisBsm,MisGarage
0,-1.730311,0.223761,-0.203934,0.658506,-0.517649,1.052959,0.880362,0.523937,0.617283,-0.288867,-0.943795,-0.473766,-0.814309,1.167626,-0.120325,0.393013,1.113886,-0.241236,0.793546,1.229699,0.163894,-0.211605,0.921812,-0.952231,0.296262,0.313159,0.357973,-0.750831,0.225982,-0.359603,-0.11642,-0.270407,-0.063709,-0.087748,-1.601578,0.138375,0.346896,-0.161363,-0.242536
1,-1.727939,0.665308,-0.087252,-0.068293,2.177825,0.158428,-0.428115,-0.570739,1.245719,-0.288867,-0.640584,0.504925,0.277348,-0.794354,-0.120325,-0.489391,-0.819502,3.945931,0.793546,-0.760202,0.163894,-0.211605,-0.316329,0.605965,0.236767,0.313159,-0.056795,1.627328,-0.708304,-0.359603,-0.11642,-0.270407,-0.063709,-0.087748,-0.490155,-0.614427,0.007136,-0.161363,-0.242536
2,-1.725566,0.312070,0.080162,0.658506,-0.517649,0.986698,0.831900,0.334044,0.108989,-0.288867,-0.301168,-0.319490,-0.642225,1.195195,-0.120325,0.542706,1.113886,-0.241236,0.793546,1.229699,0.163894,-0.211605,-0.316329,0.605965,0.291855,0.313159,0.640770,-0.750831,-0.065025,-0.359603,-0.11642,-0.270407,-0.063709,-0.087748,0.991743,0.138375,0.535652,-0.161363,-0.242536
3,-1.723193,0.076579,-0.092325,0.658506,-0.517649,-1.862551,-0.718888,-0.570739,-0.514826,-0.288867,-0.061314,-0.714823,-0.531984,0.942481,-0.120325,0.406800,1.113886,-0.241236,-1.025620,-0.760202,0.163894,-0.211605,0.302742,0.605965,0.285244,1.652119,0.801022,-0.750831,-0.172238,4.089589,-0.11642,-0.270407,-0.063709,-0.087748,-1.601578,-1.367230,-0.515088,-0.161363,-0.242536
4,-1.720820,0.783053,0.385566,1.385305,-0.517649,0.953567,0.734975,1.384039,0.499451,-0.288867,-0.174452,0.222888,-0.037243,1.624809,-0.120325,1.354202,1.113886,-0.241236,0.793546,1.229699,1.389320,-0.211605,1.540882,0.605965,0.289651,1.652119,1.715398,0.781406,0.578253,-0.359603,-0.11642,-0.270407,-0.063709,-0.087748,2.103167,0.138375,0.869120,-0.161363,-0.242536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1.721999,0.135452,-0.258014,-0.068293,-0.517649,0.920436,0.734975,-0.570739,-1.013878,-0.288867,0.873212,-0.239941,-0.553494,0.800042,-0.120325,0.268925,-0.819502,-0.241236,0.793546,1.229699,0.163894,-0.211605,0.302742,0.605965,0.287448,0.313159,-0.056795,-0.750831,-0.095658,-0.359603,-0.11642,-0.270407,-0.063709,-0.087748,0.621269,-0.614427,-0.074658,-0.161363,-0.242536
1454,1.724371,0.812490,0.275478,-0.068293,0.380842,0.224690,0.153430,0.093885,0.811359,0.721263,0.049562,1.179884,2.457972,-0.794354,-0.120325,1.107996,1.113886,-0.241236,0.793546,-0.760202,0.163894,-0.211605,0.302742,2.164162,0.241174,0.313159,0.131736,2.034328,-0.708304,-0.359603,-0.11642,-0.270407,-0.063709,-0.087748,-1.601578,1.643980,0.365772,-0.161363,-0.242536
1455,1.726744,0.253198,-0.143868,0.658506,3.076316,-1.001150,1.025748,-0.570739,-0.378511,-0.288867,0.701241,0.239762,0.078376,1.852252,-0.120325,1.633893,-0.819502,-0.241236,0.793546,-0.760202,1.389320,-0.211605,1.540882,2.164162,0.159645,-1.025802,-1.037158,-0.750831,0.210665,-0.359603,-0.11642,-0.270407,-0.063709,4.949624,-0.490155,1.643980,1.076752,-0.161363,-0.242536
1456,1.729117,0.312070,-0.075381,-0.795092,0.380842,-0.702973,0.541127,-0.570739,-0.900667,6.087966,-1.283211,0.061380,-0.217393,-0.794354,-0.120325,-0.851806,1.113886,-0.241236,-1.025620,-0.760202,-1.061532,-0.211605,-0.935399,-0.952231,0.179476,-1.025802,-1.093717,2.169995,-0.708304,1.472417,-0.11642,-0.270407,-0.063709,-0.087748,-0.860629,1.643980,-0.488348,-0.161363,-0.242536


In [39]:
# Categorical branch: keep the object columns, fill gaps with the most
# frequent value, then map the listed quality columns to numeric ranks.
cat_pipe = Pipeline(steps=[
    ('fs', feat_sel(dtype='category')),
    ('imputer', df_imputer(strategy='most_frequent')),
    ('ord', make_ordinal(cols=['BsmtQual', 'KitchenQual', 'GarageQual',
                               'GarageCond', 'ExterQual', 'HeatingQC'])),
])

# Clean the raw frame first, then run the categorical branch on the result.
full_pipe = Pipeline(steps=[
    ('gen_cl', general_cleaner(train=True)),
    ('cat_pipe', cat_pipe),
])

# Fresh scratch copy of the training frame for this run.
tmp = df_train.copy()

full_pipe.fit_transform(tmp)

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,60,RL,Pave,NoAlley,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,4,TA,PConc,4.0,TA,No,GLQ,Unf,GasA,5,Y,SBrkr,4,Typ,NoFire,Attchd,RFn,3.0,3.0,Y,NoPool,NoFence,Shed,WD,Normal
1,20,RL,Pave,NoAlley,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,Gable,CompShg,MetalSd,MetalSd,,3,TA,CBlock,4.0,TA,Gd,ALQ,Unf,GasA,5,Y,SBrkr,3,Typ,TA,Attchd,RFn,3.0,3.0,Y,NoPool,NoFence,Shed,WD,Normal
2,60,RL,Pave,NoAlley,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,4,TA,PConc,4.0,TA,Mn,GLQ,Unf,GasA,5,Y,SBrkr,4,Typ,TA,Attchd,RFn,3.0,3.0,Y,NoPool,NoFence,Shed,WD,Normal
3,70,RL,Pave,NoAlley,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,Gable,CompShg,Wd Sdng,Wd Shng,,3,TA,BrkTil,3.0,Gd,No,ALQ,Unf,GasA,4,Y,SBrkr,4,Typ,Gd,Detchd,Unf,3.0,3.0,Y,NoPool,NoFence,Shed,WD,Abnorml
4,60,RL,Pave,NoAlley,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,4,TA,PConc,4.0,TA,Av,GLQ,Unf,GasA,5,Y,SBrkr,4,Typ,TA,Attchd,RFn,3.0,3.0,Y,NoPool,NoFence,Shed,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,60,RL,Pave,NoAlley,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,3,TA,PConc,4.0,TA,No,Unf,Unf,GasA,5,Y,SBrkr,3,Typ,TA,Attchd,RFn,3.0,3.0,Y,NoPool,NoFence,Shed,WD,Normal
1454,20,RL,Pave,NoAlley,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,Plywood,Plywood,Stone,3,TA,CBlock,4.0,TA,No,ALQ,Rec,GasA,3,Y,SBrkr,3,Min1,TA,Attchd,Unf,3.0,3.0,Y,NoPool,MnPrv,Shed,WD,Normal
1455,70,RL,Pave,NoAlley,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,Gable,CompShg,CemntBd,CmentBd,,5,Gd,Stone,3.0,Gd,No,GLQ,Unf,GasA,5,Y,SBrkr,4,Typ,Gd,Attchd,RFn,3.0,3.0,Y,NoPool,GdPrv,Shed,WD,Normal
1456,20,RL,Pave,NoAlley,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,Hip,CompShg,MetalSd,MetalSd,,3,TA,CBlock,3.0,TA,Mn,GLQ,Rec,GasA,4,Y,FuseA,4,Typ,NoFire,Attchd,Unf,3.0,3.0,Y,NoPool,NoFence,Shed,WD,Normal


In [43]:
# Run the categorical and numeric branches side by side and concatenate
# their outputs column-wise.
processing_pipe = FeatureUnion([
    ('cat_pipe', cat_pipe),
    ('num_pipe', numeric_pipe),
])

# Clean the raw frame first, then feed it to both branches.
full_pipe = Pipeline(steps=[
    ('gen_cl', general_cleaner(train=True)),
    ('processing', processing_pipe),
])

# Fresh scratch copy of the training frame for this run.
tmp = df_train.copy()

# The result shown below is a NumPy array with dtype=object, not a DataFrame.
full_pipe.fit_transform(tmp)

array([['60', 'RL', 'Pave', ..., 0.34689628721279514,
        -0.16136294930206108, -0.24253562503633297],
       ['20', 'RL', 'Pave', ..., 0.007135984786050135,
        -0.16136294930206108, -0.24253562503633297],
       ['60', 'RL', 'Pave', ..., 0.535652010783209, -0.16136294930206108,
        -0.24253562503633297],
       ...,
       ['70', 'RL', 'Pave', ..., 1.0767517516850622,
        -0.16136294930206108, -0.24253562503633297],
       ['20', 'RL', 'Pave', ..., -0.48834778958628633,
        -0.16136294930206108, -0.24253562503633297],
       ['20', 'RL', 'Pave', ..., -0.4207103219735547,
        -0.16136294930206108, -0.24253562503633297]], dtype=object)