In [1]:
# taken from https://www.kaggle.com/poonaml/house-prices-data-exploration-and-visualisation
# Adapted into the ballet framework
# some features are not in her kernel but implied from her analyses
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import scipy
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split
ballet.__version__

'0.4.1'

In [62]:
all_features = []

In [63]:
input = ['Lot Area']
def calc_sqrt(df):
    return np.sqrt(df['Lot Area'])
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_sqrt)
sqrt = Feature(input=input, transformer=transformer)
all_features.append(sqrt)

In [64]:
input = ['Lot Area', 'Lot Frontage']
def fill_area(df):
    mask = pd.isnull(df['Lot Frontage'])
    df['Lot Frontage'][mask] = np.sqrt(df['Lot Area'])[mask]
    return df['Lot Frontage']
transformer = ballet.eng.SimpleFunctionTransformer(func=fill_area)
area = Feature(input=input, transformer=transformer)
all_features.append(area)

In [65]:
input = ['Mas Vnr Type']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [66]:
input = ['Mas Vnr Area']
transformer = ballet.eng.missing.NullFiller(isnull=pd.isnull)
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [67]:
input = ['Electrical']
def mode_filler(df):
    df = df.copy()
    return df.fillna(df.mode()['Electrical'][0])
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [68]:
input = ['Alley']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer, name='Alley Misc Fill')
all_features.append(misc_fill)

In [69]:
input = ['BsmtFin Type 1']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [70]:
input = ['BsmtFin Type 2']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [71]:
input = ['Bsmt Qual']
def mode_filler(df):
    df = df.copy()
    return df.fillna(df.mode()['Bsmt Qual'][0])
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [72]:
input = ['Bsmt Cond']
def mode_filler(df):
    df = df.copy()
    return df.fillna(df.mode()['Bsmt Cond'][0])
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [73]:
input = ['Fireplace Qu']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [74]:
input = ['Garage Cond']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [75]:
input = ['Garage Qual']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [76]:
input = ['Garage Type']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [77]:
input = ['Garage Finish']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [78]:
input = ['Garage Cars']
transformer = ballet.eng.missing.NullFiller(isnull=pd.isnull)
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [79]:
input = ['Garage Area']
transformer = ballet.eng.missing.NullFiller(isnull=pd.isnull)
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [80]:
input = ['Pool QC']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [81]:
input = ['Fence']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [82]:
input = ['Misc Feature']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [83]:
def has_qual(df):
    return (df['Fireplaces'] > 1) * 1
input = ['Fireplaces']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [84]:
def has_qual(df):
    return (df['Fireplace Qu'] == "Ex") * 1
input = ['Fireplace Qu']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [85]:
def has_qual(df):
    return (df['Misc Feature'] == "TenC") * 1
input = ['Misc Feature']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [86]:
def has_qual(df):
    return (~pd.isnull(df['Pool QC'])) * 1
input = ['Pool QC']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [87]:
def load_ames():
    '''Loads the Ames Housing dataset
    
    Source:
    
        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an 
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
    '''
    source = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
    df = pd.read_table(source)
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [88]:
X, y = load_ames()
# X.columns.values.tolist()

In [89]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [90]:
mapper = ballet.make_mapper(all_features)
mapper.fit(X_tr, y_tr)

DataFrameMapper(default=False, df_out=False,
        features=[(['Lot Area'], SimpleFunctionTransformer(func=<function calc_sqrt at 0x1133dad90>,
             func_args=(), func_kwargs={})), (['Lot Area', 'Lot Frontage'], SimpleFunctionTransformer(func=<function fill_area at 0x113d79158>,
             func_args=(), func_kwargs={})), (['Mas Vnr Type'],...ionTransformer(func=<function has_qual at 0x1146b6378>,
             func_args=(), func_kwargs={}))],
        input_df=True, sparse=False)

In [91]:
mapper.transform(X_tr)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return lambda *a: func(*(a + args))


array([[  86.01744009,   66.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [  44.19275959,   21.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [ 108.62780491,   97.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       ..., 
       [  98.78258956,   61.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [ 109.93179704,   93.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [  99.2068546 ,   70.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ]])