In [1]:
# taken from https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-for-house-prices
# Adapted into the ballet framework
import ballet
import ballet.eng
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing
import sklearn_pandas
from ballet import Feature
from sklearn.model_selection import train_test_split

ballet.__version__

'0.4.1'

In [2]:
def load_ames(
    source='https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt',
):
    '''Load the Ames Housing dataset and split it into features and target.

    Source:

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>

    Args:
        source: URL, path, or file-like object holding the tab-separated
            data. Defaults to the hosted copy of ``AmesHousing.txt``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame holding every column
        except ``SalePrice`` and ``y`` is the ``SalePrice`` Series.
    '''
    # The file is tab-separated; read_csv with an explicit separator parses
    # it the same way read_table does.
    df = pd.read_csv(source, sep='\t')
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [15]:
# Load the full dataset, then display the name of every feature column.
X, y = load_ames()
X.columns.tolist()

['Order',
 'PID',
 'MS SubClass',
 'MS Zoning',
 'Lot Frontage',
 'Lot Area',
 'Street',
 'Alley',
 'Lot Shape',
 'Land Contour',
 'Utilities',
 'Lot Config',
 'Land Slope',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Mas Vnr Area',
 'Exter Qual',
 'Exter Cond',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Cond',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin SF 1',
 'BsmtFin Type 2',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 'Heating',
 'Heating QC',
 'Central Air',
 'Electrical',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'Kitchen Qual',
 'TotRms AbvGrd',
 'Functional',
 'Fireplaces',
 'Fireplace Qu',
 'Garage Type',
 'Garage Yr Blt',
 'Garage Finish',
 'Garage Cars',
 'Garage 

In [4]:
# Hold out a test split. test_size is left at scikit-learn's default and
# random_state is pinned so the same split is produced on every run.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    random_state=3,
)

In [None]:
# Start from an empty feature list so that re-running the cells below does
# not keep appending to a list left over from an earlier run.
all_features = list()

In [None]:
# Lot Frontage: a lot's street frontage is most likely similar to that of the
# other lots in its neighborhood, so missing values are filled with the
# neighborhood's median Lot Frontage.

input = ['Lot Frontage', 'Neighborhood']


def impute_lot_frontage(df):
    """Fill missing 'Lot Frontage' values with the median of the given rows.

    The median is computed over whatever rows ``df`` contains, so the caller
    decides the grouping (e.g. by passing one neighborhood at a time).

    Args:
        df: DataFrame with a 'Lot Frontage' column.

    Returns:
        A new Series of 'Lot Frontage' values with gaps filled; ``df`` itself
        is not modified. If every value is missing the median is NaN and the
        values are returned still missing.
    """
    median_frontage = df['Lot Frontage'].median()
    return df['Lot Frontage'].fillna(median_frontage)
# Impute Lot Frontage within each neighborhood. Lot Frontage is a continuous
# measurement, so the imputed values are kept numeric rather than one-hot
# encoded (encoding would create one column per distinct length and reject
# lengths not seen during fit).
transformer = [
    ballet.eng.GroupedFunctionTransformer(
        func=impute_lot_frontage,
        groupby_kwargs={'by': 'Neighborhood'},
    ),
    # A neighborhood with no recorded frontage at all has a NaN median, so
    # anything still missing after the grouped pass falls back to 0.0.
    ballet.eng.missing.NullFiller(replacement=0.0),
]
frontage_feature = Feature(input=input, transformer=transformer, name='Lot Frontage Fill')
all_features.append(frontage_feature)

In [None]:
# Pool QC: the data description says NA means "No Pool". That is consistent
# with the very large share of missing values (99%+), since most houses have
# no pool at all. Missing values become their own "None" category, which is
# then one-hot encoded.
misc_fill = Feature(
    input=['Pool QC'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='PoolQC Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# Misc Feature: missing values become their own "None" category, which is
# then one-hot encoded. The feature is named after its own column so it is
# not confused with the Pool QC feature.
misc_fill = Feature(
    input=['Misc Feature'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Misc Feature Fill',
)
all_features.append(misc_fill)

In [None]:
# Alley: missing values become their own "None" category, which is then
# one-hot encoded.
misc_fill = Feature(
    input=['Alley'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Alley Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# Fence: missing values become their own "None" category, which is then
# one-hot encoded.
misc_fill = Feature(
    input=['Fence'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Fence Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# Garage Type: missing values become their own "None" category (presumably
# meaning "no garage" -- confirm against the data description), which is
# then one-hot encoded.
garage_none_fill = Feature(
    input=['Garage Type'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Garage Type Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# Garage Finish: missing values become their own "None" category (presumably
# meaning "no garage" -- confirm against the data description), which is
# then one-hot encoded.
garage_none_fill = Feature(
    input=['Garage Finish'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Garage Finish Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# Garage Qual: missing values become their own "None" category (presumably
# meaning "no garage" -- confirm against the data description), which is
# then one-hot encoded.
garage_none_fill = Feature(
    input=['Garage Qual'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Garage Qual Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# Garage Cond: missing values become their own "None" category (presumably
# meaning "no garage" -- confirm against the data description), which is
# then one-hot encoded.
garage_none_fill = Feature(
    input=['Garage Cond'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Garage Cond Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# BsmtFin SF 1: a missing value most likely means the house has no basement,
# so it is filled with zero. The column name carries a space before the
# digit ('BsmtFin SF 1'), matching the dataset's column listing.
basement_fill = Feature(
    input=['BsmtFin SF 1'],
    transformer=ballet.eng.missing.NullFiller(replacement=0.0),
    name='BsmtFin SF 1 Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# BsmtFin SF 2: a missing value most likely means the house has no basement,
# so it is filled with zero. The column name carries a space before the
# digit ('BsmtFin SF 2'), matching the dataset's column listing.
basement_fill = Feature(
    input=['BsmtFin SF 2'],
    transformer=ballet.eng.missing.NullFiller(replacement=0.0),
    name='BsmtFin SF 2 Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# Bsmt Unf SF: a missing value most likely means the house has no basement,
# so it is filled with zero.
basement_fill = Feature(
    input=['Bsmt Unf SF'],
    transformer=ballet.eng.missing.NullFiller(replacement=0.0),
    name='Bsmt Unf SF Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# Total Bsmt SF: a missing value most likely means the house has no
# basement, so it is filled with zero.
basement_fill = Feature(
    input=['Total Bsmt SF'],
    transformer=ballet.eng.missing.NullFiller(replacement=0.0),
    name='Total Bsmt SF Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# Bsmt Full Bath: a missing value most likely means the house has no
# basement, so it is filled with zero.
basement_fill = Feature(
    input=['Bsmt Full Bath'],
    transformer=ballet.eng.missing.NullFiller(replacement=0.0),
    name='Bsmt Full Bath Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# Bsmt Half Bath: a missing value most likely means the house has no
# basement, so it is filled with zero.
basement_fill = Feature(
    input=['Bsmt Half Bath'],
    transformer=ballet.eng.missing.NullFiller(replacement=0.0),
    name='Bsmt Half Bath Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# Mas Vnr Type: missing values become their own "None" category, which is
# then one-hot encoded.
mason_fill = Feature(
    input=['Mas Vnr Type'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Mason Fill None',
)
all_features.append(mason_fill)

In [None]:
# Mas Vnr Area: missing values are replaced using NullFiller's default
# replacement; the column stays numeric and is not encoded.
mason_fill = Feature(
    input=['Mas Vnr Area'],
    transformer=ballet.eng.missing.NullFiller(),
    name='Mason Fill Zero',
)
all_features.append(mason_fill)

In [None]:
# MS SubClass: missing values become a "None" category and the result is
# one-hot encoded, so each class code gets its own indicator column.
# NOTE(review): the replacement is the string "None" -- confirm it is
# compatible with the values this column actually holds.
ms_fill = Feature(
    input=['MS SubClass'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='MS Fill None',
)
all_features.append(ms_fill)

In [None]:
# Fireplace Qu: missing values become their own "None" category (presumably
# meaning "no fireplace" -- confirm against the data description). The
# filled column is still text, so it is one-hot encoded like the other
# categorical fills, and the name reflects the "None" (not zero) fill.
fill = Feature(
    input=['Fireplace Qu'],
    transformer=[
        ballet.eng.missing.NullFiller(replacement="None"),
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='FireplaceQu Fill None',
)
all_features.append(fill)

In [None]:
input = ['Total Bsmt SF', '1st Flr SF', '2nd Flr SF']


def add_areas(df):
    """Return total square footage: basement plus first and second floors.

    A missing 'Total Bsmt SF' is counted as 0 (treated as "no basement") so
    that one missing basement value does not turn the whole total into NaN.

    Args:
        df: DataFrame with 'Total Bsmt SF', '1st Flr SF' and '2nd Flr SF'
            columns.

    Returns:
        A new Series with one total per row; ``df`` is not modified.
    """
    basement_area = df['Total Bsmt SF'].fillna(0)
    return basement_area + df['1st Flr SF'] + df['2nd Flr SF']
# Wrap add_areas in a transformer and register the combined-area feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=add_areas)
total_area = Feature(
    name='Total Area Calculation',
    input=input,
    transformer=transformer,
)
all_features.append(total_area)

In [None]:
# Mo Sold: one-hot encode the month of sale so it is treated as a category
# rather than a number.
month = Feature(
    input=['Mo Sold'],
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Month Categorical',
)
all_features.append(month)

In [None]:
# Yr Sold: one-hot encode the year of sale so it is treated as a category
# rather than a number.
year = Feature(
    input=['Yr Sold'],
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Year Categorical',
)
all_features.append(year)

In [None]:
# Combine every feature collected in all_features into a single mapper and
# fit it on the training split only (X_te / y_te are not touched here).
mapper = ballet.make_mapper(all_features)
mapper.fit(X_tr, y_tr)