In [1]:
# taken from https://www.kaggle.com/tannercarbonati/detailed-data-analysis-ensemble-modeling
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

ballet.__version__

'0.4.1'

In [51]:
all_features = []

In [52]:
input = ['Pool QC']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer, name='PoolQC Misc Fill')
all_features.append(misc_fill)

In [53]:
input = ['Year Built', 'Garage Yr Blt']
def calc_age(df):
    mask = pd.isnull(df['Year Built'])
    df['Year Built'][mask] = df['Garage Yr Blt'][mask]
    return df['Year Built']
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer)
all_features.append(age)

In [54]:
input = ['Garage Type']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [55]:
input = ['Garage Finish']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [56]:
input = ['Garage Qual']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [57]:
input = ['Garage Cond']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [58]:
input = ['Garage Cars']
transformer = [ballet.eng.missing.NullFiller(), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [59]:
input = ['Garage Area']
transformer = [ballet.eng.missing.NullFiller(), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [60]:
input = ['Garage Type']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [61]:
input = ['Garage Finish']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [62]:
input = ['Garage Qual']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [63]:
input = ['Exterior 1st']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [64]:
input = ['Exterior 2nd']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [65]:
input = ['Electrical']
def mode_filler(df):
    df = df.copy()
    return df.fillna(df.mode()['Electrical'][0])
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [66]:
input = ['Utilities']
def mode_filler(df):
    df = df.copy()
    return df.fillna(df.mode()['Utilities'][0])
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [67]:
input = ['Sale Type']
def mode_filler(df):
    df = df.copy()
    return df.fillna(df.mode()['Sale Type'][0])
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [68]:
input = ['Kitchen Qual']
def mode_filler(df):
    df = df.copy()
    return df.fillna(df.mode()['Kitchen Qual'][0])
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [69]:
input = ['Mas Vnr Type']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [70]:
input = ['Mas Vnr Area']
transformer = [ballet.eng.missing.NullFiller(), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [71]:
input = ['Lot Frontage', 'Neighborhood']
def impute_lot_frontage(df):
    frontage = df['Lot Frontage']
    return frontage.fillna(frontage.median())
transformer = [
    ballet.eng.GroupedFunctionTransformer(func=impute_lot_frontage, groupby_kwargs={'by': 'Neighborhood'}),
    ballet.eng.SimpleFunctionTransformer(lambda s: s.fillna(s.median()))  
]
frontage_feature = Feature(input=input, transformer=transformer,  name='Lot Frontage Fill')
all_features.append(frontage_feature)

In [72]:
input = ['Fence']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [73]:
input = ['Misc Feature']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [74]:
input = ['Fireplace Qu']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [75]:
input = ['Alley']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer, name='Alley Misc Fill')
all_features.append(misc_fill)

In [76]:
def has_reg(df):
    return (df['Lot Shape'] == 'Reg') * 1
input = ['Lot Shape']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_reg)
reg = Feature(input=input, transformer=transformer)
all_features.append(reg)

In [77]:
def has_contour(df):
    return (df['Land Contour'] == 'Lvl') * 1
input = ['Land Contour']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_contour)
contour = Feature(input=input, transformer=transformer)
all_features.append(contour)

In [78]:
def has_qual(df):
    return (df['Paved Drive'] == 'Y') * 1
input = ['Paved Drive']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [79]:
def has_qual(df):
    return (df['Land Slope'] == 'Gtl') * 1
input = ['Land Slope']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [80]:
def has_qual(df):
    return (df['Electrical'] == 'SBrkr') * 1
input = ['Electrical']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [81]:
def has_qual(df):
    return (df['Sale Condition'] == 'Partial') * 1
input = ['Sale Condition']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [82]:
def has_qual(df):
    return (df['Wood Deck SF'] > 0) * 1
input = ['Wood Deck SF']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [83]:
def has_qual(df):
    return (df['Mas Vnr Area'] > 0) * 1
input = ['Mas Vnr Area']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [84]:
def has_qual(df):
    return (df['Neighborhood'].isin(['Crawfor', 'Somerst, Timber', 'StoneBr', 'NoRidge', 'NridgeHt'])) * 1
input = ['Neighborhood']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [85]:
input = ['Year Built', 'Year Remod/Add']
def is_remod(df):
    return df['Year Built'] != df['Year Remod/Add']
transformer = ballet.eng.SimpleFunctionTransformer(func=is_remod)
remod = Feature(input=input, transformer=transformer, name='Remodeled')
all_features.append(remod)

In [86]:
input = ['Yr Sold', 'Year Remod/Add']
def calc_age(df):
    return df['Yr Sold'] - df['Year Remod/Add']
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer, name='Age')
all_features.append(age)

In [87]:
input = ['Year Built']
def calc_age(df):
    return 2018 - df['Year Built']
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer, name='Age')
all_features.append(age)

In [88]:
input = ['Yr Sold']
def calc_age(df):
    return 2018 - df['Yr Sold']
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer, name='Age')
all_features.append(age)

In [89]:
def load_ames():
    '''Loads the Ames Housing dataset
    
    Source:
    
        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an 
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
    '''
    source = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
    df = pd.read_table(source)
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [90]:
X, y = load_ames()
# X.columns.values.tolist()

In [91]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [92]:
mapper = ballet.make_mapper(all_features)
mapper.fit(X_tr, y_tr)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


DataFrameMapper(default=False, df_out=False,
        features=[(['Pool QC'], RobustTransformerPipeline(steps=[('nullfiller', NullFiller(isnull=<function isna at 0x105a9d378>, replacement='None')), ('onehotencoder', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=No...ionTransformer(func=<function calc_age at 0x10b4c2488>,
             func_args=(), func_kwargs={}))],
        input_df=True, sparse=False)

In [93]:
mapper.transform(X_tr)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return lambda *a: func(*(a + args))


array([[  0.,   0.,   0., ...,   9.,  21.,  11.],
       [  0.,   0.,   0., ...,  33.,  45.,  12.],
       [  0.,   0.,   0., ...,  33.,  44.,  11.],
       ..., 
       [  0.,   0.,   0., ...,  36.,  47.,  11.],
       [  0.,   0.,   0., ...,   0.,  11.,  11.],
       [  0.,   0.,   0., ...,  45.,  56.,  11.]])