In [1]:
# taken from https://www.kaggle.com/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

ballet.__version__

'0.4.1'

In [2]:
# Registry of every Feature defined in this notebook; later cells append to it.
all_features = list()

In [3]:
# Treat the sale year as a categorical value and one-hot encode it.
input = ['YrSold']
year = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Year Categorical',
)
all_features.append(year)

In [4]:
# Treat the sale month as a categorical value and one-hot encode it.
input = ['MoSold']
month = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Month Categorical',
)
all_features.append(month)

In [6]:
input = ['GarageYrBlt', 'YearBuilt']
def fill_garage(df):
    """Return GarageYrBlt with missing entries replaced by YearBuilt.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'GarageYrBlt' and 'YearBuilt'.

    Returns
    -------
    pandas.Series
        A new Series (the input frame is not modified) holding the garage
        construction year, falling back to the house's construction year on
        rows where the garage year is null.
    """
    # pandas Series expose isna()/isnull()/fillna(), not isnan(); fillna with
    # an index-aligned Series performs the row-wise fallback in one step.
    return df['GarageYrBlt'].fillna(df['YearBuilt'])
# Wrap fill_garage in a ballet transformer and register the resulting feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=fill_garage)
garage = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(garage)

In [7]:
# Heating: replace nulls with the literal "None", then one-hot encode.
input = ['Heating']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# Foundation: replace nulls with the literal "None", then one-hot encode.
input = ['Foundation']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# RoofStyle: replace nulls with the literal "None", then one-hot encode.
input = ['RoofStyle']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# LandContour: replace nulls with the literal "None", then one-hot encode.
input = ['LandContour']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# BldgType: replace nulls with the literal "None", then one-hot encode.
input = ['BldgType']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# HouseStyle: replace nulls with the literal "None", then one-hot encode.
input = ['HouseStyle']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# Neighborhood: replace nulls with the literal "None", then one-hot encode.
input = ['Neighborhood']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# Condition1: replace nulls with the literal "None", then one-hot encode.
input = ['Condition1']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# Condition2: replace nulls with the literal "None", then one-hot encode.
input = ['Condition2']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# KitchenQual: unlike the other categoricals, nulls are filled with "TA"
# (presumably the typical/average grade -- confirm against the data
# description) before one-hot encoding.
input = ['KitchenQual']
transformer = [
    ballet.eng.missing.NullFiller(replacement="TA"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(input=input, transformer=transformer)
all_features.append(factor)

In [None]:
# PoolQC: the data description says NA means "No Pool". That makes sense
# given the very high share of missing values (99%+) and the fact that most
# houses have no pool at all. Nulls therefore become the literal "None"
# category before one-hot encoding.
input = ['PoolQC']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='PoolQC Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# MiscFeature: replace nulls with the literal "None", then one-hot encode.
input = ['MiscFeature']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# The name must describe this column; it previously duplicated the PoolQC
# feature's name ('PoolQC Misc Fill'), a copy-paste slip that made the two
# features indistinguishable by name.
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='MiscFeature Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# Fence: replace nulls with the literal "None", then one-hot encode.
input = ['Fence']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='Fence Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# Alley: replace nulls with the literal "None", then one-hot encode.
input = ['Alley']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='Alley Misc Fill',
)
all_features.append(misc_fill)

In [None]:
# LotFrontage: the length of street connected to a property is most likely
# similar to that of other houses in the same neighborhood, so missing values
# are filled with the median LotFrontage of the neighborhood.

# The column is spelled 'LotFrontage' (no space), matching the naming used by
# every other column in this notebook (e.g. 'YrSold', 'GarageYrBlt').
input = ['LotFrontage', 'Neighborhood']
def impute_lot_frontage(df):
    """Fill missing LotFrontage values with the median of the given frame.

    Intended to be applied per neighborhood group, so the median used is the
    group's own median.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group; must contain a 'LotFrontage' column.

    Returns
    -------
    pandas.Series
        LotFrontage with nulls replaced by the median of the non-null values.
        If every value in the frame is null the median is NaN and the nulls
        are left in place.
    """
    frontage = df['LotFrontage']
    return frontage.fillna(frontage.median())
# Apply the imputation separately within each Neighborhood group.
transformer = ballet.eng.GroupedFunctionTransformer(
    func=impute_lot_frontage,
    groupby_kwargs={'by': 'Neighborhood'},
)
frontage_feature = Feature(input=input, transformer=transformer)
all_features.append(frontage_feature)

In [None]:
# FireplaceQu: replace nulls with the literal "None", then one-hot encode.
# The encoder step was missing here, so this feature emitted raw strings while
# every other categorical column in the notebook is filled and then encoded.
# NOTE(review): confirm that one-hot (rather than ordinal) encoding is wanted.
input = ['FireplaceQu']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
fill = Feature(input=input, transformer=transformer)
all_features.append(fill)

In [None]:
# GarageType: replace nulls with the literal "None", then one-hot encode.
input = ['GarageType']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
garage_none_fill = Feature(input=input, transformer=transformer)
all_features.append(garage_none_fill)

In [None]:
# GarageFinish: replace nulls with the literal "None", then one-hot encode.
input = ['GarageFinish']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
garage_none_fill = Feature(input=input, transformer=transformer)
all_features.append(garage_none_fill)

In [None]:
# GarageQual: replace nulls with the literal "None", then one-hot encode.
input = ['GarageQual']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
garage_none_fill = Feature(input=input, transformer=transformer)
all_features.append(garage_none_fill)

In [None]:
# GarageCond: replace nulls with the literal "None", then one-hot encode.
input = ['GarageCond']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
garage_none_fill = Feature(input=input, transformer=transformer)
all_features.append(garage_none_fill)

In [None]:
# MasVnrType: replace nulls with the literal "None", then one-hot encode.
input = ['MasVnrType']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
mason_fill = Feature(
    input=input,
    transformer=transformer,
    name='Mason Fill None',
)
all_features.append(mason_fill)
# A copy of the MSSubClass cell had been fused onto the append line above
# (a SyntaxError). It is dropped here: MSSubClass is registered by its own
# cell immediately below, and keeping both would add that feature twice.

In [None]:
# MSSubClass: replace nulls with the literal "None", then one-hot encode.
input = ['MSSubClass']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
ms_fill = Feature(
    input=input,
    transformer=transformer,
    name='MS Fill None',
)
all_features.append(ms_fill)

In [None]:
# MasVnrArea: fill nulls using NullFiller's default replacement value
# (presumably zero, as the feature name suggests -- confirm against ballet).
input = ['MasVnrArea']
transformer = ballet.eng.missing.NullFiller()
mason_fill = Feature(
    input=input,
    transformer=transformer,
    name='Mason Fill Zero',
)
all_features.append(mason_fill)

In [None]:
# Basement condition: replace nulls with the literal "None", then one-hot
# encode. The column is spelled 'BsmtCond', following the 'Bsmt' prefix used
# by the other basement columns in this notebook ('BsmtFullBath',
# 'TotalBsmtSF'); 'BasementCond' does not match that naming.
input = ['BsmtCond']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
# NOTE: the variable name is a leftover from the garage cells; it is kept so
# the names bound by this cell stay unchanged.
garage_none_fill = Feature(input=input, transformer=transformer)
all_features.append(garage_none_fill)

In [8]:
input = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
def calc_bath(df):
    """Compute the total bathroom count, weighting half baths as 0.5.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'FullBath', 'HalfBath', 'BsmtFullBath' and
        'BsmtHalfBath'.

    Returns
    -------
    pandas.Series
        FullBath + BsmtFullBath + 0.5 * (HalfBath + BsmtHalfBath), row-wise.
    """
    # 'FullBath' must be a quoted column label; the bare name FullBath is
    # undefined and raised NameError on every call.
    return (
        df['FullBath']
        + 0.5 * df['HalfBath']
        + df['BsmtFullBath']
        + 0.5 * df['BsmtHalfBath']
    )
# Wrap calc_bath in a ballet transformer and register the resulting feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_bath)
baths = Feature(
    input=input,
    transformer=transformer,
    name='Bathroom Count',
)
all_features.append(baths)

In [9]:
input = ['YrSold', 'YearRemodAdd']
def calc_age(df):
    """Return the number of years between the last remodel and the sale.

    Computed row-wise as df['YrSold'] - df['YearRemodAdd'].
    """
    year_sold = df['YrSold']
    year_remodeled = df['YearRemodAdd']
    return year_sold - year_remodeled
# Wrap calc_age in a ballet transformer and register the resulting feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(
    input=input,
    transformer=transformer,
    name='Age',
)
all_features.append(age)

In [10]:
input = ['YearBuilt', 'YearRemodAdd']
def is_remod(df):
    """Flag houses that have been remodeled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'YearBuilt' and 'YearRemodAdd'.

    Returns
    -------
    pandas.Series
        Boolean Series, True where the remodel year differs from the build
        year and False where the two are equal.
    """
    # The comparison used to be `==`, which marked the *un*-remodeled houses
    # as True -- the opposite of what the 'Remodeled' feature name promises.
    return df['YearBuilt'] != df['YearRemodAdd']
# Wrap is_remod in a ballet transformer and register the resulting feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=is_remod)
remod = Feature(
    input=input,
    transformer=transformer,
    name='Remodeled',
)
all_features.append(remod)

In [None]:
input = ['GrLivArea', 'TotalBsmtSF']
def total_area(df):
    """Return the row-wise sum of df['GrLivArea'] and df['TotalBsmtSF']."""
    living_area = df['GrLivArea']
    basement_area = df['TotalBsmtSF']
    return living_area + basement_area
# Wrap total_area in a ballet transformer and register the resulting feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=total_area)
area = Feature(
    input=input,
    transformer=transformer,
    name='Total Area',
)
all_features.append(area)