In [1]:
# taken from https://www.kaggle.com/mjbahmani/a-comprehensive-ml-workflow-for-house-prices
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet.validation.base import check_from_class
from ballet.validation.feature_api_checks import FeatureApiCheck
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

ballet.__version__

'0.5.1-dev'

In [35]:
for i in range(len(all_features)):
    success, failures = check_from_class(FeatureApiCheck, all_features[i], X, y)
    if not success:
        print('FAILURE AT INDEX ' + str(i))
        print(failures)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [12]:
all_features = []

In [13]:
input = ['MS SubClass']
def add_missing_0_to_mssubclass(df):
    """Zeros in 020-090 get cut off. This function prepends them back."""

    return df['MS SubClass'].apply(
        lambda x: '00' + str(x) if len(str(x)) == 2 else '0' + str(x))
transformer = [ballet.eng.SimpleFunctionTransformer(func=add_missing_0_to_mssubclass), sklearn.preprocessing.OneHotEncoder()]
feature = Feature(input=input, transformer=transformer)
all_features.append(feature)

In [14]:
input = ['Enclosed Porch', '3Ssn Porch', 'Open Porch SF']
def calc_porch_type(df):    
    # Porch features
    total_porch_area = df.apply(np.sum, axis=1)
    porch_type = pd.Series('Missing', index=df.index)
    porch_type[(total_porch_area == df['Enclosed Porch']) & (df['Enclosed Porch'] > 0) ] = 'Enclosed'
    porch_type[(total_porch_area == df['3Ssn Porch']) & (df['3Ssn Porch'] > 0) ] = '3Ssn'
    porch_type[(total_porch_area == df['Open Porch SF']) & (df['Open Porch SF'] > 0) ] = 'Open'
    return porch_type
transformer = [ballet.eng.SimpleFunctionTransformer(func=calc_porch_type), sklearn.preprocessing.OneHotEncoder()]
porch = Feature(input=input, transformer=transformer, name='Porch Type Calculation')
all_features.append(porch)

In [15]:
input = ['Enclosed Porch', '3Ssn Porch', 'Open Porch SF']
def calc_porch_area(df):
    total_area = df['Enclosed Porch'] + df['3Ssn Porch'] + df['Open Porch SF']
    return total_area.fillna(0)
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_porch_area)
total_area = Feature(input=input, transformer=transformer, name='Porch Area Calculation')
all_features.append(total_area)

In [16]:
input = ['Total Bsmt SF', '1st Flr SF', '2nd Flr SF']
def add_areas(df):
    total_area = df['Total Bsmt SF'] + df['1st Flr SF'] + df['2nd Flr SF']
    return total_area.fillna(0)
transformer = ballet.eng.SimpleFunctionTransformer(func=add_areas)
total_area = Feature(input=input, transformer=transformer, name='Total Area Calculation')
all_features.append(total_area)

In [17]:
input = ['Full Bath', 'Half Bath', 'Bsmt Full Bath', 'Bsmt Half Bath']
def calc_bath(df):
    total_area = df['Full Bath'] + df['Half Bath'] + df['Bsmt Full Bath'] + df['Bsmt Half Bath']
    return total_area.fillna(0)
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_bath)
baths = Feature(input=input, transformer=transformer, name='Bathroom Count')
all_features.append(baths)

In [18]:
input = ['Garage Area', 'Garage Cars']
def calc_garage_per_car(df):
    df['Garage Area Per Car'] = df['Garage Area'] / df['Garage Cars']
    df.loc[~np.isfinite(df['Garage Area Per Car']), 'Garage Area Per Car'] = 0
    return df['Garage Area Per Car']
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_garage_per_car)
cars = Feature(input=input, transformer=transformer, name='Garage Area Per Car')
all_features.append(cars)

In [19]:
input = ['Alley']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [20]:
input = ['Bsmt Cond']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [21]:
input = ['Bsmt Qual']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [22]:
input = ['Bsmt Exposure']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [23]:
input = ['BsmtFin Type 1']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [24]:
input = ['BsmtFin Type 2']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [25]:
input = ['Fireplace Qu']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [26]:
input = ['Garage Type']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [27]:
input = ['Garage Finish']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [28]:
input = ['Garage Qual']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [29]:
input = ['Garage Cond']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [30]:
input = ['Pool QC']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [31]:
input = ['Fence']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [32]:
input = ['Misc Feature']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [33]:
def load_ames():
    '''Loads the Ames Housing dataset
    
    Source:
    
        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an 
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
    '''
    source = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
    df = pd.read_table(source)
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [34]:
X, y = load_ames()
# X.columns.values.tolist()

In [None]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [None]:
mapper = ballet.make_mapper(all_features)
mapper.fit(X_tr, y_tr)

In [None]:
mapper.transform(X_tr)

In [None]:
X[]