In [30]:
# taken from https://www.kaggle.com/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

ballet.__version__

'0.4.1'

In [63]:
# Accumulates every Feature defined in the cells below; later handed to ballet.make_mapper.
all_features = []

In [64]:
input = ['Alley']
# Nulls are replaced with the "NOACCESS" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="NOACCESS", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='Alley Misc Fill',
)
all_features.append(misc_fill)

In [65]:
input = ['MS Zoning']
def mode_filler(df):
    '''Fill missing values with the most frequent value of each column.

    The fill value is recomputed from whatever ``df`` is passed in, so
    different frames (e.g. a train and a test split) may be filled with
    different values.

    Args:
        df: pandas DataFrame to impute; it is not modified.

    Returns:
        A new DataFrame with nulls replaced by the first mode of the
        corresponding column. If there is no non-null value to take a mode
        from, an unchanged copy is returned.
    '''
    modes = df.mode()
    if modes.empty:
        # Only nulls present: there is no mode to fill with.
        return df.copy()
    # Row 0 of mode() holds the first mode of every column; fillna matches it
    # to columns by label, so no column name needs to be hard-coded here.
    return df.fillna(modes.iloc[0])
# Impute with mode_filler first, then one-hot encode the result.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [66]:
input = ['MS SubClass']
# Nulls are replaced with the "None" label, then the column is one-hot encoded.
# NOTE(review): the replacement is a string; if this column holds numbers, a
# filled row would mix types before encoding -- confirm against the data.
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
ms_fill = Feature(
    input=input,
    transformer=transformer,
    name='MS Fill None',
)
all_features.append(ms_fill)

In [67]:
input = ['Lot Frontage']
def mean_filler(df):
    '''Fill missing values with the mean of each column.

    The mean is recomputed from whatever ``df`` is passed in, so different
    frames (e.g. a train and a test split) may be filled with different
    values.

    Args:
        df: pandas DataFrame of numeric columns to impute; it is not modified.

    Returns:
        A new DataFrame with nulls replaced by the mean of the corresponding
        column.
    '''
    # df.mean() yields one value per column and fillna matches them by label,
    # so no column name needs to be hard-coded here.
    return df.fillna(df.mean())
# Impute only; no encoding step follows for this feature.
transformer = ballet.eng.SimpleFunctionTransformer(
    func=mean_filler,
)
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [68]:
input = ['Overall Cond']
# Encoded directly; no null-filling step is applied to this column.
transformer = sklearn.preprocessing.OneHotEncoder()
feature = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(feature)

In [69]:
input = ['Mas Vnr Type']
def mode_filler(df):
    '''Fill missing values with the most frequent value of each column.

    The fill value is recomputed from whatever ``df`` is passed in, so
    different frames (e.g. a train and a test split) may be filled with
    different values.

    Args:
        df: pandas DataFrame to impute; it is not modified.

    Returns:
        A new DataFrame with nulls replaced by the first mode of the
        corresponding column. If there is no non-null value to take a mode
        from, an unchanged copy is returned.
    '''
    modes = df.mode()
    if modes.empty:
        # Only nulls present: there is no mode to fill with.
        return df.copy()
    # Row 0 of mode() holds the first mode of every column; fillna matches it
    # to columns by label, so no column name needs to be hard-coded here.
    return df.fillna(modes.iloc[0])
# Impute with mode_filler first, then one-hot encode the result.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [70]:
input = ['BsmtFin Type 1']
# Nulls are replaced with the "NoBsmt" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="NoBsmt", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [71]:
input = ['BsmtFin Type 2']
# Nulls are replaced with the "NoBsmt" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="NoBsmt", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [72]:
input = ['Bsmt Qual']
# Nulls are replaced with the "NoBsmt" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="NoBsmt", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [73]:
input = ['Bsmt Exposure']
# Nulls are replaced with the "NoBsmt" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="NoBsmt", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [74]:
input = ['Bsmt Cond']
# Nulls are replaced with the "NoBsmt" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="NoBsmt", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [75]:
input = ['Total Bsmt SF']
# NullFiller is used with its default arguments; no encoding step follows.
transformer = ballet.eng.missing.NullFiller()
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [76]:
input = ['Electrical']
def mode_filler(df):
    '''Fill missing values with the most frequent value of each column.

    The fill value is recomputed from whatever ``df`` is passed in, so
    different frames (e.g. a train and a test split) may be filled with
    different values.

    Args:
        df: pandas DataFrame to impute; it is not modified.

    Returns:
        A new DataFrame with nulls replaced by the first mode of the
        corresponding column. If there is no non-null value to take a mode
        from, an unchanged copy is returned.
    '''
    modes = df.mode()
    if modes.empty:
        # Only nulls present: there is no mode to fill with.
        return df.copy()
    # Row 0 of mode() holds the first mode of every column; fillna matches it
    # to columns by label, so no column name needs to be hard-coded here.
    return df.fillna(modes.iloc[0])
# Impute with mode_filler first, then one-hot encode the result.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [77]:
input = ['Kitchen Qual']
def mode_filler(df):
    '''Fill missing values with the most frequent value of each column.

    The fill value is recomputed from whatever ``df`` is passed in, so
    different frames (e.g. a train and a test split) may be filled with
    different values.

    Args:
        df: pandas DataFrame to impute; it is not modified.

    Returns:
        A new DataFrame with nulls replaced by the first mode of the
        corresponding column. If there is no non-null value to take a mode
        from, an unchanged copy is returned.
    '''
    modes = df.mode()
    if modes.empty:
        # Only nulls present: there is no mode to fill with.
        return df.copy()
    # Row 0 of mode() holds the first mode of every column; fillna matches it
    # to columns by label, so no column name needs to be hard-coded here.
    return df.fillna(modes.iloc[0])
# Impute with mode_filler first, then one-hot encode the result.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [78]:
input = ['Kitchen AbvGr']
# Nulls are replaced with the "missing" label, then the column is one-hot encoded.
# NOTE(review): the replacement is a string; if this column holds numbers, a
# filled row would mix types before encoding -- confirm against the data.
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [79]:
input = ['Fireplace Qu']
# Nulls are replaced with the "NoFP" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="NoFP", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [80]:
input = ['Garage Type']
# Nulls are replaced with the "missing" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [81]:
input = ['Garage Finish']
# Nulls are replaced with the "missing" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [82]:
input = ['Garage Qual']
# Nulls are replaced with the "missing" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [83]:
input = ['Sale Type']
def mode_filler(df):
    '''Fill missing values with the most frequent value of each column.

    The fill value is recomputed from whatever ``df`` is passed in, so
    different frames (e.g. a train and a test split) may be filled with
    different values.

    Args:
        df: pandas DataFrame to impute; it is not modified.

    Returns:
        A new DataFrame with nulls replaced by the first mode of the
        corresponding column. If there is no non-null value to take a mode
        from, an unchanged copy is returned.
    '''
    modes = df.mode()
    if modes.empty:
        # Only nulls present: there is no mode to fill with.
        return df.copy()
    # Row 0 of mode() holds the first mode of every column; fillna matches it
    # to columns by label, so no column name needs to be hard-coded here.
    return df.fillna(modes.iloc[0])
# Impute with mode_filler first, then one-hot encode the result.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [84]:
input = ['Yr Sold']
# The year is one-hot encoded, i.e. treated as a category rather than a number.
year = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Year Categorical',
)
all_features.append(year)

In [85]:
input = ['Mo Sold']
# The month is one-hot encoded, i.e. treated as a category rather than a number.
month = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Month Categorical',
)
all_features.append(month)

In [86]:
input = ['Condition 1']
# Nulls are replaced with the "None" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(factor)

In [87]:
input = ['Condition 2']
# Nulls are replaced with the "None" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(factor)

In [88]:
input = ['Exterior 1st']
# Nulls are replaced with the "None" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(factor)

In [89]:
input = ['Exterior 2nd']
# Nulls are replaced with the "None" label, then the column is one-hot encoded.
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(factor)

In [90]:
input = ['Total Bsmt SF', '1st Flr SF', '2nd Flr SF']
def add_areas(df):
    '''Sum the basement, first-floor and second-floor areas row by row.

    A missing area counts as 0 instead of turning the whole total into NaN,
    so one null input column does not leak a NaN into the feature matrix.

    Args:
        df: pandas DataFrame containing the columns 'Total Bsmt SF',
            '1st Flr SF' and '2nd Flr SF'; other columns are ignored.

    Returns:
        A pandas Series with one total per row, indexed like ``df``.
    '''
    area_columns = ['Total Bsmt SF', '1st Flr SF', '2nd Flr SF']
    # sum(axis=1) skips NaN by default, unlike chained `+`, which propagates it.
    return df[area_columns].sum(axis=1)
# Single derived column computed by add_areas; no encoding step follows.
transformer = ballet.eng.SimpleFunctionTransformer(
    func=add_areas,
)
total_area = Feature(
    input=input,
    transformer=transformer,
    name='Total Area Calculation',
)
all_features.append(total_area)

In [91]:
def load_ames(source='https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'):
    '''Load the Ames Housing dataset and split off the target column.

    Source:

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>

    Args:
        source: anything ``pandas.read_table`` accepts (URL, local path or
            file-like object) holding the tab-separated data. Defaults to the
            hosted copy, so calling ``load_ames()`` behaves as before; pass a
            local path to work from a cached copy.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the DataFrame of all columns except
        'SalePrice' and ``y`` is the 'SalePrice' Series.

    Raises:
        KeyError: if the loaded table has no 'SalePrice' column.
    '''
    df = pd.read_table(source)
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [92]:
# Load the dataset: X holds every column except 'SalePrice', y is 'SalePrice'.
X, y = load_ames()
# Debugging aid -- uncomment to list the available column names:
# X.columns.values.tolist()

In [93]:
# Fixed random_state keeps the train/test split reproducible across runs.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    random_state=3,
)

In [94]:
# Build a single mapper from every collected feature and fit it on the
# training split only. fit() is the last expression, so the fitted mapper's
# repr is displayed as the cell output.
mapper = ballet.make_mapper(all_features)
mapper.fit(X_tr, y_tr)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


DataFrameMapper(default=False, df_out=False,
        features=[(['Alley'], RobustTransformerPipeline(steps=[('nullfiller', NullFiller(isnull=<function isna at 0x1033d7378>, replacement='NOACCESS')), ('onehotencoder', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=...onTransformer(func=<function add_areas at 0x107f3e048>,
             func_args=(), func_kwargs={}))],
        input_df=True, sparse=False)

In [95]:
# Apply the fitted mapper to the training split; the resulting array is
# displayed as the cell output and is not stored in a variable.
mapper.transform(X_tr)

array([[  0.00000000e+00,   0.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   2.92500000e+03],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.47000000e+03],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.75800000e+03],
       ..., 
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.90000000e+03],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   3.46800000e+03],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   1.22400000e+03]])