In [1]:
# taken from https://www.kaggle.com/tannercarbonati/detailed-data-analysis-ensemble-modeling
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

ballet.__version__

'0.4.1'

In [2]:
# Accumulates every Feature defined in the cells below, in definition order.
all_features = []

In [3]:
# Pool quality as a categorical feature: NullFiller substitutes the string
# "None" for missing entries, then the column is one-hot encoded.
input = ['Pool QC']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='PoolQC Misc Fill',
)
all_features.append(misc_fill)

In [4]:
input = ['Year Built', 'Garage Yr Blt']
def calc_age(df):
    """Return 'Year Built' with missing entries taken from 'Garage Yr Blt'.

    Despite its name, this function does not compute an age; it only
    back-fills the construction year.

    The previous implementation used chained assignment
    (``df['Year Built'][mask] = ...``), which mutated the caller's frame and
    silently does nothing under pandas copy-on-write. ``fillna`` yields the
    same values without touching the input.

    Args:
        df: frame containing 'Year Built' and 'Garage Yr Blt' columns.

    Returns:
        A new Series named 'Year Built'; rows where both columns are missing
        stay missing.
    """
    return df['Year Built'].fillna(df['Garage Yr Blt'])
# Expose calc_age to ballet as a function transformer and register the feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(age)

In [5]:
# Garage type as a categorical feature: missing entries become the literal
# "missing" category before one-hot encoding.
input = ['Garage Type']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [6]:
# Garage finish as a categorical feature: missing entries become the literal
# "missing" category before one-hot encoding.
input = ['Garage Finish']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [7]:
# Garage quality as a categorical feature: missing entries become the literal
# "missing" category before one-hot encoding.
input = ['Garage Qual']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [8]:
# Garage condition as a categorical feature: missing entries become the
# literal "missing" category before one-hot encoding.
input = ['Garage Cond']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [9]:
# Garage car capacity: missing entries are replaced with 0, then each
# distinct count is one-hot encoded.
input = ['Garage Cars']
transformer = [
    ballet.eng.missing.NullFiller(replacement=0),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [10]:
# Garage area is a continuous measurement, so it is only null-filled
# (missing -> 0) and kept numeric. One-hot encoding it, as this cell used to,
# creates one column per distinct area and, with OneHotEncoder's default
# handle_unknown='error', fails on any area not seen during fit.
input = ['Garage Area']
transformer = ballet.eng.missing.NullFiller(replacement=0)
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [11]:
# NOTE: 'Garage Type' is already registered earlier in this notebook with the
# same NullFiller("missing") -> OneHotEncoder pipeline. This cell used to
# append an identical feature a second time, producing redundant one-hot
# columns, so the duplicate registration has been removed.
# TODO(review): confirm no other column was intended here.

In [12]:
# NOTE: 'Garage Finish' is already registered earlier in this notebook with
# the same NullFiller("missing") -> OneHotEncoder pipeline. This cell used to
# append an identical feature a second time, producing redundant one-hot
# columns, so the duplicate registration has been removed.
# TODO(review): confirm no other column was intended here.

In [13]:
# NOTE: 'Garage Qual' is already registered earlier in this notebook with the
# same NullFiller("missing") -> OneHotEncoder pipeline. This cell used to
# append an identical feature a second time, producing redundant one-hot
# columns, so the duplicate registration has been removed.
# TODO(review): confirm no other column was intended here.

In [14]:
# Primary exterior covering as a categorical feature: missing entries become
# the string "None" before one-hot encoding.
input = ['Exterior 1st']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(factor)

In [15]:
# Secondary exterior covering as a categorical feature: missing entries become
# the string "None" before one-hot encoding.
input = ['Exterior 2nd']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
factor = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(factor)

In [16]:
input = ['Electrical']
def mode_filler(df):
    """Return 'Electrical' with missing entries replaced by its most frequent value.

    The mode is computed from the frame passed in, and the input frame is
    left unmodified.
    """
    column = df['Electrical']
    most_frequent = column.mode()[0]
    return column.fillna(most_frequent)
# Mode-fill the column, one-hot encode it, and register the feature.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [17]:
input = ['Utilities']
def mode_filler(df):
    """Return 'Utilities' with missing entries replaced by its most frequent value.

    The mode is computed from the frame passed in, and the input frame is
    left unmodified.
    """
    column = df['Utilities']
    most_frequent = column.mode()[0]
    return column.fillna(most_frequent)
# Mode-fill the column, one-hot encode it, and register the feature.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [18]:
input = ['Sale Type']
def mode_filler(df):
    """Return 'Sale Type' with missing entries replaced by its most frequent value.

    The mode is computed from the frame passed in, and the input frame is
    left unmodified.
    """
    column = df['Sale Type']
    most_frequent = column.mode()[0]
    return column.fillna(most_frequent)
# Mode-fill the column, one-hot encode it, and register the feature.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [19]:
input = ['Kitchen Qual']
def mode_filler(df):
    """Return 'Kitchen Qual' with missing entries replaced by its most frequent value.

    The mode is computed from the frame passed in, and the input frame is
    left unmodified.
    """
    column = df['Kitchen Qual']
    most_frequent = column.mode()[0]
    return column.fillna(most_frequent)
# Mode-fill the column, one-hot encode it, and register the feature.
transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [20]:
# Masonry veneer type as a categorical feature: missing entries become the
# string "None" before one-hot encoding.
input = ['Mas Vnr Type']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [21]:
# Masonry veneer area is a continuous measurement, so it is only null-filled
# (missing -> 0) and kept numeric. One-hot encoding it, as this cell used to,
# creates one column per distinct area and, with OneHotEncoder's default
# handle_unknown='error', fails on any area not seen during fit.
input = ['Mas Vnr Area']
transformer = ballet.eng.missing.NullFiller(replacement=0)
misc_fill = Feature(input=input, transformer=transformer)
all_features.append(misc_fill)

In [22]:
# Lot Frontage: the street frontage of a property is most likely similar to
# that of other houses in its neighborhood, so we fill in missing values with
# the median Lot Frontage of the neighborhood.

input = ['Lot Frontage', 'Neighborhood']
def impute_lot_frontage(df):
    """Fill missing 'Lot Frontage' values with the median of the frame passed in.

    The function itself does no grouping; it uses the median of whatever
    rows it receives.
    """
    median_frontage = df['Lot Frontage'].median()
    return df['Lot Frontage'].fillna(median_frontage)
# Hand impute_lot_frontage to a grouped transformer keyed on 'Neighborhood'
# (presumably applied once per neighborhood group) and register the feature.
transformer = ballet.eng.GroupedFunctionTransformer(
    func=impute_lot_frontage,
    groupby_kwargs={'by': 'Neighborhood'},
)
frontage_feature = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(frontage_feature)

In [23]:
# Fence as a categorical feature: missing entries become the literal
# "missing" category before one-hot encoding.
input = ['Fence']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [24]:
# Miscellaneous feature as a categorical: missing entries become the literal
# "missing" category before one-hot encoding.
input = ['Misc Feature']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [25]:
# Fireplace quality as a categorical feature: missing entries become the
# literal "missing" category before one-hot encoding.
input = ['Fireplace Qu']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(misc_fill)

In [26]:
# Alley as a categorical feature: missing entries become the literal
# "missing" category before one-hot encoding.
input = ['Alley']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing"),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='Alley Misc Fill',
)
all_features.append(misc_fill)

In [27]:
def has_reg(df):
    """Return 1 for rows whose 'Lot Shape' equals 'Reg' and 0 for all others."""
    is_regular = df['Lot Shape'] == 'Reg'
    return is_regular * 1
# Register the regular-lot-shape indicator computed by has_reg.
input = ['Lot Shape']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_reg)
reg = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(reg)

In [28]:
def has_contour(df):
    """Return 1 for rows whose 'Land Contour' equals 'Lvl' and 0 for all others."""
    is_level = df['Land Contour'] == 'Lvl'
    return is_level * 1
# Register the level-contour indicator computed by has_contour.
input = ['Land Contour']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_contour)
contour = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(contour)

In [29]:
def has_qual(df):
    """Return 1 for rows whose 'Paved Drive' equals 'Y' and 0 for all others."""
    is_paved = df['Paved Drive'] == 'Y'
    return is_paved * 1
# Register the paved-driveway indicator computed by has_qual.
input = ['Paved Drive']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(qual)

In [30]:
def has_qual(df):
    """Return 1 for rows whose 'Land Slope' equals 'Gtl' and 0 for all others."""
    is_gentle = df['Land Slope'] == 'Gtl'
    return is_gentle * 1
# Register the gentle-slope indicator computed by has_qual.
input = ['Land Slope']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(qual)

In [31]:
def has_qual(df):
    """Return 1 for rows whose 'Electrical' equals 'SBrkr' and 0 for all others."""
    is_sbrkr = df['Electrical'] == 'SBrkr'
    return is_sbrkr * 1
# Register the 'SBrkr' electrical indicator computed by has_qual.
input = ['Electrical']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(qual)

In [32]:
def has_qual(df):
    """Return 1 for rows whose 'Sale Condition' equals 'Partial' and 0 for all others."""
    is_partial = df['Sale Condition'] == 'Partial'
    return is_partial * 1
# Register the partial-sale indicator computed by has_qual.
input = ['Sale Condition']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(qual)

In [33]:
def has_qual(df):
    """Return 1 for rows whose 'Wood Deck SF' is greater than zero, else 0."""
    has_deck = df['Wood Deck SF'] > 0
    return has_deck * 1
# Register the has-wood-deck indicator computed by has_qual.
input = ['Wood Deck SF']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(qual)

In [34]:
def has_qual(df):
    """Return 1 for rows whose 'Mas Vnr Area' is greater than zero, else 0.

    Missing areas compare as not greater than zero and therefore yield 0.
    """
    has_veneer = df['Mas Vnr Area'] > 0
    return has_veneer * 1
# Register the has-masonry-veneer indicator computed by has_qual.
input = ['Mas Vnr Area']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(
    input=input,
    transformer=transformer,
)
all_features.append(qual)

In [35]:
def has_qual(df):
    """Return 1 for rows whose 'Neighborhood' is in the selected set, else 0.

    The list previously contained the single string 'Somerst, Timber', which
    matches no row; 'Somerst' and 'Timber' are now separate entries so both
    neighborhoods are flagged.

    Args:
        df: frame containing a 'Neighborhood' column.

    Returns:
        Integer Series of 0/1 flags aligned with ``df``.
    """
    selected = ['Crawfor', 'Somerst', 'Timber', 'StoneBr', 'NoRidge', 'NridgeHt']
    return df['Neighborhood'].isin(selected) * 1
# has_qual reads df['Neighborhood'], so that is the column the feature must
# request. The previous input, ['Mas Vnr Area'], did not include it, so the
# lookup would fail with a KeyError.
input = ['Neighborhood']
transformer = ballet.eng.SimpleFunctionTransformer(func=has_qual)
qual = Feature(input=input, transformer=transformer)
all_features.append(qual)

In [36]:
input = ['Year Built', 'Year Remod/Add']
def is_remod(df):
    """Return a boolean Series: True where 'Year Built' differs from 'Year Remod/Add'."""
    built = df['Year Built']
    remodeled = df['Year Remod/Add']
    return built != remodeled
# Register the remodeled flag computed by is_remod.
transformer = ballet.eng.SimpleFunctionTransformer(func=is_remod)
remod = Feature(
    input=input,
    transformer=transformer,
    name='Remodeled',
)
all_features.append(remod)

In [37]:
input = ['Yr Sold', 'Year Remod/Add']
def calc_age(df):
    """Return 'Yr Sold' minus 'Year Remod/Add' for every row."""
    sold = df['Yr Sold']
    remodeled = df['Year Remod/Add']
    return sold - remodeled
# Register the sold-minus-remodel-year difference computed by calc_age.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(
    input=input,
    transformer=transformer,
    name='Age',
)
all_features.append(age)

In [None]:
input = ['Year Built']
def calc_age(df, reference_year=2018):
    """Return how many years before ``reference_year`` each house was built.

    Args:
        df: frame containing a 'Year Built' column.
        reference_year: year to measure from. Defaults to 2018, the value
            that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        Series equal to ``reference_year - df['Year Built']``.
    """
    return reference_year - df['Year Built']
# Register the years-since-construction feature. It used to be named 'Age',
# colliding with the 'Yr Sold' - 'Year Remod/Add' feature registered under
# that name earlier in this notebook; it now has a distinct name.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer, name='House Age')
all_features.append(age)

In [None]:
input = ['Yr Sold']
def calc_age(df, reference_year=2018):
    """Return how many years before ``reference_year`` each house was sold.

    Args:
        df: frame containing a 'Yr Sold' column.
        reference_year: year to measure from. Defaults to 2018, the value
            that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        Series equal to ``reference_year - df['Yr Sold']``.
    """
    return reference_year - df['Yr Sold']
# Register the years-since-sale feature. It used to be named 'Age',
# colliding with the 'Yr Sold' - 'Year Remod/Add' feature registered under
# that name earlier in this notebook; it now has a distinct name.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer, name='Years Since Sale')
all_features.append(age)