In [1]:
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Load the raw training and test tables from the working directory.
house_source_data = pd.read_csv(filepath_or_buffer="train.csv")
house_test = pd.read_csv(filepath_or_buffer="test.csv")

In [2]:
# Stage the ratings for the mapper function.
# Remember, these are ordinal features: a larger number is a better rating.
#
# data_staging() replaces missing categorical values with the string 'None'
# (pandas.read_csv parses a literal "NA" cell as missing by default), so every
# scale that has an explicit "feature absent" level lists 'None' next to
# 'NA'. Without the 'None' entry those rows match no rating and their encoding
# is left to the encoder's unknown-value handling.
qual_rating_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual',
                'GarageCond', 'PoolQC']
qual_ordinal_ratings = [
        ('Ex',5),
        ('Gd',4),
        ('TA',3),
        ('Fa',2),
        ('Po',1),
        ('NA',0),
        ('None',0)
    ]

# Exposure features need own mapping.
exposure_rating_feature = ['BsmtExposure']
exposure_ordinal_ratings = [
        ('Gd',4),
        ('Av',3),
        ('Mn',2),
        ('No',1),
        ('NA',0),
        ('None',0)
    ]

# This scale has no "absent" level, so no 'None' entry is added.
electrical_rating_feature = ['Electrical']
electrical_ordinal_ratings = [
        ('SBrkr',5),
        ('FuseA',4),
        ('FuseF',3),
        ('FuseP',2),
        ('Mix',1)
    ]

# This scale has no "absent" level, so no 'None' entry is added.
functional_rating_feature = ['Functional']
functional_ordinal_ratings = [
        ('Typ',7),
        ('Min1',6),
        ('Min2',5),
        ('Mod',4),
        ('Maj1',3),
        ('Maj2',2),
        ('Sev',1),
        ('Sal',0)
    ]

# Finish ratings
bsmt_finish_rating_features = ['BsmtFinType1', 'BsmtFinType2']
bsmtfin_ordinal_ratings = [
        ('GLQ',6),
        ('ALQ',5),
        ('BLQ',4),
        ('Rec',3),
        ('LwQ',2),
        ('Unf',1),
        ('NA',0),
        ('None',0)
    ]

grg_finish_rating_feature = ['GarageFinish']
grgfin_ordinal_ratings = [
        ('Fin',3),
        ('RFn',2),
        ('Unf',1),
        ('NA',0),
        ('None',0)
    ]

# This scale has no "absent" level, so no 'None' entry is added.
paved_rating_feature = ['PavedDrive']
paved_ordinal_ratings = [
        ('Y',2),
        ('P',1),
        ('N',0)
    ]

fence_rating_feature = ['Fence']
fence_ordinal_ratings = [
        ('GdPrv',4),
        ('MnPrv',3),
        ('GdWo',2),
        ('MnWw',1),
        ('NA',0),
        ('None',0)
    ]

alley_rating_feature = ['Alley']
alley_ordinal_ratings = [
        ('Pave',2),
        ('Grvl',1),
        ('NA',0),
        ('None',0)
    ]

# This scale has no "absent" level, so no 'None' entry is added.
utilities_rating_feature = ['Utilities']
utilities_ordinal_ratings = [
        ('AllPub',3),
        ('NoSewr',2),
        ('NoSeWa',1),
        ('ELO', 0)
    ]

In [3]:
# Nominal (unordered) columns that get one-hot encoded. MSSubClass is stored
# as a number but is listed here as well.
categorial_onehot_features = [
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'GarageType',
    'SaleType', 'SaleCondition',
    'MSSubClass',
]

In [4]:
# Hook for engineered features. The derivations are too varied to generalize,
# but routing them through one function means the training and test sets get
# exactly the same treatment.
def generator(dataframe):
    """Return a copy of ``dataframe`` with engineered features added.

    No engineered features are defined yet, so the result is currently an
    unmodified copy. The caller's frame is never mutated.
    """
    # Add new columns to the copy here, for example:
    # engineered['The new one'] = engineered['MSSubClass'] ** 2
    engineered = dataframe.copy()
    return engineered

In [5]:
def mapper(data_in, features, ratings):
    """Map ordinal ratings to numeric ranking.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Frame holding the columns to encode. It is copied, never modified.
    features : iterable of str
        Names of the columns that share the ``ratings`` scale.
    ratings : list
        Category-to-rank mapping, handed unchanged to
        ``category_encoders.OrdinalEncoder`` as the ``"mapping"`` entry.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_in`` with every column in ``features`` encoded. When
        ``features`` is empty the copy is returned as is.
    """
    # Start from a copy so an empty ``features`` still yields a result and
    # the caller's frame is left untouched.
    encoded = data_in.copy()
    for feature in features:
        encoder = ce.OrdinalEncoder(
            mapping=[{"col": feature, "mapping": ratings}],
            cols=[feature],
            return_df=True,
        )
        # Each pass encodes one column and feeds the result to the next pass.
        encoded = encoder.fit_transform(encoded)
    return encoded

In [6]:
def data_staging(data_in):
    """Build the cleaned staging frame from a raw housing table.

    Non-numeric columns have their missing values replaced by the string
    'None'; numeric columns other than 'LotFrontage' have theirs replaced by
    0. 'LotFrontage' is carried over untouched, so any missing values in it
    remain. 'SalePrice', 'MiscFeature' and 'Id' are dropped when present, and
    the result is passed through generator().
    """
    source = data_in.copy()

    # Non-numeric columns: a missing value becomes the literal string 'None'.
    categorical_part = source.select_dtypes(exclude='number').fillna(value='None')

    # Numeric columns except LotFrontage: a missing value becomes 0.
    not_lot_frontage = source.columns != 'LotFrontage'
    numeric_part = (source
                    .loc[:, not_lot_frontage]
                    .select_dtypes(include='number')
                    .fillna(value=0))

    # Stitch the two halves back together on the row index, then drop the
    # label and the unneeded features (absent columns are skipped silently).
    combined = categorical_part.merge(numeric_part, how='outer',
                                      left_index=True, right_index=True)
    combined = combined.drop(columns=['SalePrice','MiscFeature', 'Id'], errors='ignore')

    # LotFrontage is appended last with its missing values intact.
    combined = pd.concat([combined, data_in['LotFrontage']], axis=1)

    # Generate new features.
    return generator(combined)

In [7]:
# Build the staging frames. The training one is used for analysis and is
# where the numeric attribute names are pulled from.
house_train_staging = data_staging(data_in=house_source_data)
house_test_staging = data_staging(data_in=house_test)

In [8]:
# Store all the numeric type column names for DataFrameSelector().
# MSSubClass gets one-hot encoded, so it is dropped from the numeric list of
# features to process.
# NOTE(review): only columns that are already numeric in the staging frame
# are picked up here, so columns that Mapper() encodes later are not selected
# unless they were numeric to begin with. Confirm whether the ordinal rating
# columns were meant to be part of this list.
numeric_features = [
    column
    for column in house_train_staging.select_dtypes(include='number').columns
    if column != 'MSSubClass'
]

In [9]:
# Column-selection transformer for use in pipelines.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of columns from a DataFrame and return their values."""
    def __init__(self, attribute_names):
        # Column labels to keep, in the order they should appear.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``attribute_names`` columns of ``X`` via ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

In [10]:
# Ordinal-encoding transformer for use in pipelines.
class Mapper(BaseEstimator, TransformerMixin):
    """Replace the ordinal rating columns with their numeric ranks.

    The columns and their scales come from the module-level
    ``*_rating_feature(s)`` / ``*_ordinal_ratings`` pairs; each pair is
    applied in turn with ``mapper()``.
    """
    def __init__(self):
        # No parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Run ``mapper()`` once per rating scale and return the final frame."""
        rating_scales = (
            (qual_rating_features, qual_ordinal_ratings),
            (bsmt_finish_rating_features, bsmtfin_ordinal_ratings),
            (exposure_rating_feature, exposure_ordinal_ratings),
            (electrical_rating_feature, electrical_ordinal_ratings),
            (functional_rating_feature, functional_ordinal_ratings),
            (grg_finish_rating_feature, grgfin_ordinal_ratings),
            (paved_rating_feature, paved_ordinal_ratings),
            (fence_rating_feature, fence_ordinal_ratings),
            (alley_rating_feature, alley_ordinal_ratings),
            (utilities_rating_feature, utilities_ordinal_ratings),
        )
        # Map ratings to numbers, one scale at a time.
        for columns, scale in rating_scales:
            X = mapper(X, columns, scale)
        return X

In [11]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot-encode the categorical features using pandas.

    The set of categories for each column is learned in ``fit`` and reused
    in ``transform``, so every frame transformed by a fitted encoder yields
    the same columns in the same order. No module-level data is consulted.

    Parameters
    ----------
    attribute_names : list of str
        Columns to encode.

    Attributes
    ----------
    categories_ : dict
        Maps each column name to the categories seen for it during ``fit``.
    """
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Record the categories present in each configured column of ``X``."""
        self.categories_ = {
            name: pd.Categorical(X[name]).categories
            for name in self.attribute_names
        }
        return self

    def transform(self, X):
        """Return the 0/1 indicator matrix for ``X`` as a NumPy array.

        Columns follow the order of ``attribute_names`` and, within each
        attribute, the order of the categories learned in ``fit``. A value
        that was not seen during ``fit`` (or is missing) produces all zeros
        for that attribute.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        """
        if not hasattr(self, 'categories_'):
            raise NotFittedError(
                "This OneHotEncoder instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )

        indicator_blocks = [
            # Pinning the categories makes get_dummies emit one column per
            # learned category, whether or not it occurs in this frame.
            pd.get_dummies(
                pd.Categorical(X[name], categories=self.categories_[name]),
                dtype=np.uint8,
            ).values
            for name in self.attribute_names
        ]
        if not indicator_blocks:
            # Nothing to encode: keep the row count, contribute no columns.
            return np.empty((len(X), 0), dtype=np.uint8)
        return np.hstack(indicator_blocks)

In [12]:
# Numeric branch: encode the ordinal ratings, keep the numeric columns,
# median-impute whatever is still missing, expand to polynomial terms, then
# standardize. PolynomialFeatures is imported with the other imports in the
# first cell.
numeric_pipeline = Pipeline([
    ('mapping', Mapper()),
    ('selector', DataFrameSelector(numeric_features)),
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=4)),
    ('standard_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode the nominal columns.
categorical_pipeline = Pipeline([
    ('cat_encoder', OneHotEncoder(categorial_onehot_features))
])

In [13]:
# Run the numeric and categorical branches on the same input and join their
# outputs into the final feature matrix.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipe', numeric_pipeline),
    ('cat_pipe', categorical_pipeline),
])

In [14]:
# Fit the full pipeline on the training frame and build the training matrix.
house_training = full_pipeline.fit_transform(X=house_train_staging.copy())

In [15]:
# Push the test frame through the already-fitted pipeline.
house_test_data = full_pipeline.transform(X=house_test_staging.copy())

In [16]:
# Persist both feature matrices as pickle files in the working directory so
# they can be reloaded without re-running the pipeline.
pd.to_pickle(house_training,'house_training')
pd.to_pickle(house_test_data, 'house_test_data')