In [1]:
# taken from https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-for-house-prices
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet.validation.base import check_from_class
from ballet.validation.feature_api_checks import FeatureApiCheck
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

ballet.__version__

'0.5.1-dev'

In [2]:
def load_ames():
    '''Loads the Ames Housing dataset
    
    Source:
    
        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an 
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
    '''
    source = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
    df = pd.read_table(source)
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [3]:
X, y = load_ames()
X.columns.values.tolist()

['Order',
 'PID',
 'MS SubClass',
 'MS Zoning',
 'Lot Frontage',
 'Lot Area',
 'Street',
 'Alley',
 'Lot Shape',
 'Land Contour',
 'Utilities',
 'Lot Config',
 'Land Slope',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Mas Vnr Area',
 'Exter Qual',
 'Exter Cond',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Cond',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin SF 1',
 'BsmtFin Type 2',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 'Heating',
 'Heating QC',
 'Central Air',
 'Electrical',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'Kitchen Qual',
 'TotRms AbvGrd',
 'Functional',
 'Fireplaces',
 'Fireplace Qu',
 'Garage Type',
 'Garage Yr Blt',
 'Garage Finish',
 'Garage Cars',
 'Garage 

In [4]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [34]:
# reset the feature set
all_features = []

In [6]:
input = ['Lot Frontage', 'Neighborhood']
def impute_lot_frontage(df):
    frontage = df['Lot Frontage']
    return frontage.fillna(frontage.median())
transformer = [
    ballet.eng.GroupedFunctionTransformer(func=impute_lot_frontage, groupby_kwargs={'by': 'Neighborhood'}),
    ballet.eng.SimpleFunctionTransformer(lambda s: s.fillna(s.median()))  
]
frontage_feature = Feature(input=input, transformer=transformer,  name='Lot Frontage Fill')
all_features.append(frontage_feature)

In [7]:
# PoolQC : data description says NA means "No Pool". 
# That make sense, given the huge ratio of missing value (+99%) 
# and majority of houses have no Pool at all in general.
input = ['Pool QC']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [8]:
input = ['Misc Feature']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [9]:
input = ['Alley']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [10]:
input = ['Fence']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer, name='Fence Misc Fill')
all_features.append(misc_fill)

In [11]:
# PoolQC : data description says NA means "No Pool". 
# That make sense, given the huge ratio of missing value (+99%) 
# and majority of houses have no Pool at all in general.
input = ['Garage Type']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
garage_none_fill = Feature(input=input, transformer=transformer, name='Garage Missing Fill None')
all_features.append(garage_none_fill)

In [12]:
# PoolQC : data description says NA means "No Pool". 
# That make sense, given the huge ratio of missing value (+99%) 
# and majority of houses have no Pool at all in general.
input = ['Garage Finish']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
garage_none_fill = Feature(input=input, transformer=transformer, name='Garage Missing Fill None')
all_features.append(garage_none_fill)

In [13]:
# PoolQC : data description says NA means "No Pool". 
# That make sense, given the huge ratio of missing value (+99%) 
# and majority of houses have no Pool at all in general.
input = ['Garage Qual']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
garage_none_fill = Feature(input=input, transformer=transformer, name='Garage Missing Fill None')
all_features.append(garage_none_fill)

In [14]:
# PoolQC : data description says NA means "No Pool". 
# That make sense, given the huge ratio of missing value (+99%) 
# and majority of houses have no Pool at all in general.
input = ['Garage Cond']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
garage_none_fill = Feature(input=input, transformer=transformer, name='Garage Missing Fill None')
all_features.append(garage_none_fill)

In [15]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement

input = ['BsmtFin SF 1']
transformer = ballet.eng.missing.NullFiller()
basement_fill = Feature(input=input, transformer=transformer, name='Basement Fill None')
all_features.append(basement_fill)

In [16]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement

input = ['BsmtFin SF 2']
transformer = ballet.eng.missing.NullFiller()
basement_fill = Feature(input=input, transformer=transformer, name='Missing Fill None')
all_features.append(basement_fill)

In [17]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement

input = ['Bsmt Unf SF']
transformer = ballet.eng.missing.NullFiller()
basement_fill = Feature(input=input, transformer=transformer, name='Missing Fill None')
all_features.append(basement_fill)

In [18]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement

input = ['Total Bsmt SF']
transformer = ballet.eng.missing.NullFiller()
basement_fill = Feature(input=input, transformer=transformer, name='Missing Fill None')
all_features.append(basement_fill)

In [19]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement

input = ['Bsmt Full Bath']
transformer = ballet.eng.missing.NullFiller()
basement_fill = Feature(input=input, transformer=transformer, name='Missing Fill None')
all_features.append(basement_fill)

In [20]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement

input = ['Bsmt Half Bath']
transformer = ballet.eng.missing.NullFiller()
basement_fill = Feature(input=input, transformer=transformer, name='Missing Fill None')
all_features.append(basement_fill)

In [21]:
input = ['Mas Vnr Type']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
mason_fill = Feature(input=input, transformer=transformer, name='Mason Fill None')
all_features.append(mason_fill)

In [22]:
input = ['Mas Vnr Area']
transformer = ballet.eng.missing.NullFiller()
mason_fill = Feature(input=input, transformer=transformer, name='Mason Fill Zero')
all_features.append(mason_fill)

In [23]:
input = ['MS SubClass']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
ms_fill = Feature(input=input, transformer=transformer, name='MS Fill None')
all_features.append(ms_fill)

In [24]:
input = ['Fireplace Qu']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
fill = Feature(input=input, transformer=transformer, name='FireplaceQu Fill Zero')
all_features.append(fill)

In [35]:
input = ['Total Bsmt SF', '1st Flr SF', '2nd Flr SF']
def add_areas(df):
    total_sf = df['Total Bsmt SF'] + df['1st Flr SF'] + df['2nd Flr SF']
    total_sf.fillna(0)
    return total_sf
transformer = ballet.eng.SimpleFunctionTransformer(func=add_areas)
total_area = Feature(input=input, transformer=transformer, name='Total Area Calculation')
all_features.append(total_area)

In [26]:
input = ['Mo Sold']
month = Feature(input=input, transformer=sklearn.preprocessing.OneHotEncoder(), name='Month Categorical')
all_features.append(month)

In [27]:
input = ['Yr Sold']
year = Feature(input=input, transformer=sklearn.preprocessing.OneHotEncoder(), name='Year Categorical')
all_features.append(year)

In [28]:
mapper = ballet.make_mapper(all_features)
mapper.fit(X_tr, y_tr)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


DataFrameMapper(default=False, df_out=False,
        features=[(['Lot Frontage', 'Neighborhood'], TransformerPipeline(steps=[('groupedfunctiontransformer', DelegatingRobustTransformer(GroupedFunctionTransformer(func=<function impute_lot_frontage at 0x11440c6a8>,
              func_args=(), func_kwargs={},
              groupby_kwargs={'by': 'Neighborh...s 'numpy.float64'>, handle_unknown='error',
       n_values='auto', sparse=True)), {'alias': None})],
        input_df=True, sparse=False)

In [29]:
mapper.transform(X_tr)

array([[ 43.,   0.,   0., ...,   0.,   0.,   0.],
       [ 43.,   0.,   0., ...,   0.,   0.,   0.],
       [ 53.,   0.,   0., ...,   0.,   0.,   0.],
       ..., 
       [ 30.,   0.,   0., ...,   0.,   0.,   0.],
       [ 80.,   0.,   0., ...,   0.,   0.,   0.],
       [ 68.,   0.,   0., ...,   0.,   0.,   0.]])

In [36]:
for i in range(len(all_features)):
    success, failures = check_from_class(FeatureApiCheck, all_features[i], X, y)
    if not success:
        print('FAILURE AT INDEX ' + str(i))
        print(failures)

FAILURE AT INDEX 0
['NoMissingValuesCheck']
