In [None]:
# taken from https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-for-house-prices
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

# Evaluate the ballet version so the notebook cell displays it.
ballet.__version__

In [None]:
def load_ames(source=None):
    """Load the Ames Housing dataset and split off the target column.

    Source:

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>

    Args:
        source: Path, URL, or file-like object that is forwarded to
            ``pandas.read_table``. When omitted, the hosted copy of
            ``AmesHousing.txt`` is downloaded.

    Returns:
        A tuple ``(X, y)``: ``X`` is the DataFrame without the ``SalePrice``
        column and ``y`` is the ``SalePrice`` Series.

    Raises:
        KeyError: If the loaded data has no ``SalePrice`` column.
    """
    if source is None:
        source = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
    df = pd.read_table(source)
    y = df['SalePrice']
    X = df.drop(columns='SalePrice')
    return X, y

In [None]:
# Load the dataset: X holds every column except SalePrice, y is SalePrice.
X, y = load_ames()

In [None]:
# Hold out a test split; random_state is fixed so the split is reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [None]:
# Reset the feature set. Each feature cell below appends to this list, and the
# final cell passes it to ballet.make_mapper.
all_features = []

In [None]:
# Lot Frontage: the street frontage of a property is most likely similar to
# that of other houses in its neighborhood, so missing values are filled with
# the median Lot Frontage of the neighborhood.

# NOTE(review): `input` shadows the Python builtin of the same name.
input = ['Lot Frontage', 'Neighborhood']
def impute_lot_frontage(df):
    """Fill missing 'Lot Frontage' values with the median of the given frame.

    The median is computed over the rows of ``df`` only, so when this is
    applied per group each group is filled with its own median.

    Args:
        df: DataFrame containing a 'Lot Frontage' column.

    Returns:
        The 'Lot Frontage' Series with nulls replaced by the column median.
    """
    median_frontage = df['Lot Frontage'].median()
    return df['Lot Frontage'].fillna(median_frontage)
# Run the imputation through a transformer grouped by 'Neighborhood'.
transformer = ballet.eng.GroupedFunctionTransformer(
    func=impute_lot_frontage,
    groupby_kwargs={'by': 'Neighborhood'},
)
# NOTE(review): the imputed frontage is one-hot encoded like the categorical
# features in this file; confirm that is intended for this column.
frontage_feature = Feature(
    input=input,
    transformer=[transformer, sklearn.preprocessing.OneHotEncoder()],
    name='Lot Frontage Fill',
)
all_features.append(frontage_feature)

In [None]:
# PoolQC: the data description says NA means "No Pool". That makes sense given
# the huge share of missing values (over 99%) and that the majority of houses
# have no pool at all.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'PoolQC' matches the loaded column header exactly.
input = ['PoolQC']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='PoolQC Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# MiscFeature: missing values are replaced by the string "None" and the result
# is one-hot encoded.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'MiscFeature' matches the loaded column header exactly.
input = ['MiscFeature']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# The name used to be 'PoolQC Misc Fill', copied from the PoolQC feature.
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='MiscFeature Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# Alley: missing values are replaced by the string "None" and the result is
# one-hot encoded.
input = ['Alley']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='Alley Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# Fence: missing values are replaced by the string "None" and the result is
# one-hot encoded.
input = ['Fence']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='Fence Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# GarageType: missing values are replaced by the string "None" and the result
# is one-hot encoded.
# NOTE(review): presumably NA means the house has no garage; confirm against
# the data description.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'GarageType' matches the loaded column header exactly.
input = ['GarageType']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# Named after its column so the four garage features can be told apart.
garage_none_fill = Feature(
    input=input,
    transformer=transformer,
    name='GarageType Missing Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# GarageFinish: missing values are replaced by the string "None" and the
# result is one-hot encoded.
# NOTE(review): presumably NA means the house has no garage; confirm against
# the data description.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'GarageFinish' matches the loaded column header exactly.
input = ['GarageFinish']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# Named after its column so the four garage features can be told apart.
garage_none_fill = Feature(
    input=input,
    transformer=transformer,
    name='GarageFinish Missing Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# GarageQual: missing values are replaced by the string "None" and the result
# is one-hot encoded.
# NOTE(review): presumably NA means the house has no garage; confirm against
# the data description.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'GarageQual' matches the loaded column header exactly.
input = ['GarageQual']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# Named after its column so the four garage features can be told apart.
garage_none_fill = Feature(
    input=input,
    transformer=transformer,
    name='GarageQual Missing Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# GarageCond: missing values are replaced by the string "None" and the result
# is one-hot encoded.
# NOTE(review): presumably NA means the house has no garage; confirm against
# the data description.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'GarageCond' matches the loaded column header exactly.
input = ['GarageCond']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# Named after its column so the four garage features can be told apart.
garage_none_fill = Feature(
    input=input,
    transformer=transformer,
    name='GarageCond Missing Fill None',
)
all_features.append(garage_none_fill)

In [None]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath:
# missing values are likely zero because the house has no basement.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'BsmtFinSF1' matches the loaded column header exactly.

input = ['BsmtFinSF1']
# NullFiller() relies on its default replacement, presumably zero as the
# 'Mason Fill Zero' feature in this file also assumes; confirm.
transformer = ballet.eng.missing.NullFiller()
# Named for the zero fill described above; the old name said "Fill None".
basement_fill = Feature(
    input=input,
    transformer=transformer,
    name='BsmtFinSF1 Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath:
# missing values are likely zero because the house has no basement.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'BsmtFinSF2' matches the loaded column header exactly.

input = ['BsmtFinSF2']
# NullFiller() relies on its default replacement, presumably zero as the
# 'Mason Fill Zero' feature in this file also assumes; confirm.
transformer = ballet.eng.missing.NullFiller()
# Named after its column; the old 'Missing Fill None' was shared by five
# features and did not match the zero fill described above.
basement_fill = Feature(
    input=input,
    transformer=transformer,
    name='BsmtFinSF2 Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath:
# missing values are likely zero because the house has no basement.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'BsmtUnfSF' matches the loaded column header exactly.

input = ['BsmtUnfSF']
# NullFiller() relies on its default replacement, presumably zero as the
# 'Mason Fill Zero' feature in this file also assumes; confirm.
transformer = ballet.eng.missing.NullFiller()
# Named after its column; the old 'Missing Fill None' was shared by five
# features and did not match the zero fill described above.
basement_fill = Feature(
    input=input,
    transformer=transformer,
    name='BsmtUnfSF Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath:
# missing values are likely zero because the house has no basement.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'TotalBsmtSF' matches the loaded column header exactly.

input = ['TotalBsmtSF']
# NullFiller() relies on its default replacement, presumably zero as the
# 'Mason Fill Zero' feature in this file also assumes; confirm.
transformer = ballet.eng.missing.NullFiller()
# Named after its column; the old 'Missing Fill None' was shared by five
# features and did not match the zero fill described above.
basement_fill = Feature(
    input=input,
    transformer=transformer,
    name='TotalBsmtSF Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath:
# missing values are likely zero because the house has no basement.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'BsmtFullBath' matches the loaded column header exactly.

input = ['BsmtFullBath']
# NullFiller() relies on its default replacement, presumably zero as the
# 'Mason Fill Zero' feature in this file also assumes; confirm.
transformer = ballet.eng.missing.NullFiller()
# Named after its column; the old 'Missing Fill None' was shared by five
# features and did not match the zero fill described above.
basement_fill = Feature(
    input=input,
    transformer=transformer,
    name='BsmtFullBath Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath:
# missing values are likely zero because the house has no basement.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'BsmtHalfBath' matches the loaded column header exactly.

input = ['BsmtHalfBath']
# NullFiller() relies on its default replacement, presumably zero as the
# 'Mason Fill Zero' feature in this file also assumes; confirm.
transformer = ballet.eng.missing.NullFiller()
# Named after its column; the old 'Missing Fill None' was shared by five
# features and did not match the zero fill described above.
basement_fill = Feature(
    input=input,
    transformer=transformer,
    name='BsmtHalfBath Fill Zero',
)
all_features.append(basement_fill)

In [None]:
# MasVnrType: missing values are replaced by the string "None" and the result
# is one-hot encoded.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'MasVnrType' matches the loaded column header exactly.
input = ['MasVnrType']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
mason_fill = Feature(
    input=input,
    transformer=transformer,
    name='Mason Fill None',
)
all_features.append(mason_fill)

In [None]:
# MasVnrArea: missing values are filled with NullFiller's default replacement
# (presumably zero, as the feature name states; confirm).
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'MasVnrArea' matches the loaded column header exactly.
input = ['MasVnrArea']
transformer = ballet.eng.missing.NullFiller()
mason_fill = Feature(
    input=input,
    transformer=transformer,
    name='Mason Fill Zero',
)
all_features.append(mason_fill)

In [None]:
# MSSubClass: missing values are replaced by the string "None" and the result
# is one-hot encoded, so the class codes are treated as categories.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'MSSubClass' matches the loaded column header exactly.
input = ['MSSubClass']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
ms_fill = Feature(
    input=input,
    transformer=transformer,
    name='MS Fill None',
)
all_features.append(ms_fill)

In [None]:
# FireplaceQu: missing values are replaced by the string "None" and the result
# is one-hot encoded, matching the other "None"-filled categorical features in
# this file. Without the encoder this feature would emit raw strings.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'FireplaceQu' matches the loaded column header exactly.
input = ['FireplaceQu']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# The old name said "Fill Zero" although the replacement is the string "None".
fill = Feature(
    input=input,
    transformer=transformer,
    name='FireplaceQu Fill None',
)
all_features.append(fill)

In [None]:
# Columns that are summed into the total-area feature.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that these names match the loaded column headers exactly.
input = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']
def add_areas(df):
    """Return the total floor area: basement plus first and second floors.

    A missing 'TotalBsmtSF' is counted as zero, in line with this file's
    assumption that missing basement values mean "no basement". Otherwise a
    single missing basement value would turn the whole total into NaN.

    Args:
        df: DataFrame with 'TotalBsmtSF', '1stFlrSF' and '2ndFlrSF' columns.

    Returns:
        Series holding the row-wise sum of the three columns.
    """
    basement_area = df['TotalBsmtSF'].fillna(0)
    return basement_area + df['1stFlrSF'] + df['2ndFlrSF']
# Wrap add_areas in a transformer and register the total-area feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=add_areas)
total_area = Feature(
    input=input,
    transformer=transformer,
    name='Total Area Calculation',
)
all_features.append(total_area)

In [None]:
# MoSold: one-hot encode the month of sale so it is treated as a category.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'MoSold' matches the loaded column header exactly.
input = ['MoSold']
month = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Month Categorical',
)
all_features.append(month)

In [None]:
# YrSold: one-hot encode the year of sale so it is treated as a category.
# NOTE(review): the frontage feature in this file selects 'Lot Frontage' (with
# a space); confirm that 'YrSold' matches the loaded column header exactly.
input = ['YrSold']
year = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Year Categorical',
)
all_features.append(year)

In [None]:
# Build one mapper from every collected feature and fit it on the training
# split only.
mapper = ballet.make_mapper(all_features)
mapper.fit(X_tr, y_tr)