In [1]:
# taken from https://www.kaggle.com/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

ballet.__version__

'0.4.1'

In [2]:
# Running collection of the Feature objects defined in the cells below;
# each feature cell appends its Feature to this list.
all_features = []

In [3]:
input = ['YrSold']


def cat_year(df):
    """Return the ``YrSold`` column converted to strings.

    Args:
        df: Frame-like object that can be indexed with the ``'YrSold'`` label.

    Returns:
        The ``YrSold`` column after ``astype(str)``.
    """
    sale_year = df['YrSold']
    return sale_year.astype(str)
# Two-step transformer list: cat_year (wrapped in a SimpleFunctionTransformer)
# followed by a OneHotEncoder.
transformer = ballet.eng.SimpleFunctionTransformer(func=cat_year)
year = Feature(
    input=input,
    transformer=[
        transformer,
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Year Categorical',
)
all_features.append(year)

In [4]:
input = ['MoSold']


def cat_month(df):
    """Return the ``MoSold`` column converted to strings.

    Args:
        df: Frame-like object that can be indexed with the ``'MoSold'`` label.

    Returns:
        The ``MoSold`` column after ``astype(str)``.
    """
    sale_month = df['MoSold']
    return sale_month.astype(str)
# Two-step transformer list: cat_month (wrapped in a SimpleFunctionTransformer)
# followed by a OneHotEncoder.
transformer = ballet.eng.SimpleFunctionTransformer(func=cat_month)
month = Feature(
    input=input,
    transformer=[
        transformer,
        sklearn.preprocessing.OneHotEncoder(),
    ],
    name='Month Categorical',
)
all_features.append(month)

In [6]:
input = ['GarageYrBlt', 'YearBuilt']


def fill_garage(df):
    """Fill missing ``GarageYrBlt`` values with the row's ``YearBuilt``.

    Args:
        df: DataFrame containing the ``'GarageYrBlt'`` and ``'YearBuilt'``
            columns.

    Returns:
        A new Series named ``GarageYrBlt`` in which every missing entry has
        been replaced by the ``YearBuilt`` value from the same row. ``df`` is
        not modified.
    """
    # pandas Series have no ``isnan`` method (the previous call raised
    # AttributeError); ``fillna`` with a Series aligns on the index and
    # returns a new object, which matches the intended mask-and-assign.
    return df['GarageYrBlt'].fillna(df['YearBuilt'])
# NOTE(review): in the visible cells this transformer is never wrapped in a
# Feature or appended to all_features, and a later cell rebinds `transformer`
# -- confirm whether a registration step is missing here.
transformer = ballet.eng.SimpleFunctionTransformer(
    func=fill_garage,
)

In [7]:
# Columns handed directly to a OneHotEncoder, with no preprocessing step.
input = [
    'Heating',
    'Foundation',
    'RoofStyle',
    'LandContour',
    'BldgType',
    'HouseStyle',
    'Neighborhood',
    'Condition1',
    'Condition2',
]
factor = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Factors',
)
all_features.append(factor)

In [8]:
input = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']


def calc_bath(df):
    """Compute a weighted bathroom count per row.

    Full bathrooms (above ground and basement) count as 1 each; half
    bathrooms count as 0.5 each.

    Args:
        df: DataFrame containing the ``'FullBath'``, ``'HalfBath'``,
            ``'BsmtFullBath'`` and ``'BsmtHalfBath'`` columns.

    Returns:
        Series with ``FullBath + 0.5 * HalfBath + BsmtFullBath
        + 0.5 * BsmtHalfBath`` for each row.
    """
    # 'FullBath' must be a string key; the bare name FullBath is undefined and
    # raised NameError as soon as the function was called.
    full_baths = df['FullBath'] + df['BsmtFullBath']
    half_baths = df['HalfBath'] + df['BsmtHalfBath']
    return full_baths + 0.5 * half_baths
# Single-step feature: calc_bath wrapped in a SimpleFunctionTransformer.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_bath)
baths = Feature(
    input=input,
    transformer=transformer,
    name='Bathroom Count',
)
all_features.append(baths)

In [9]:
input = ['YrSold', 'YearRemodAdd']


def calc_age(df):
    """Return ``YrSold`` minus ``YearRemodAdd`` for each row.

    Args:
        df: Frame-like object that can be indexed with the ``'YrSold'`` and
            ``'YearRemodAdd'`` labels.

    Returns:
        The element-wise difference ``df['YrSold'] - df['YearRemodAdd']``.
    """
    sold_year = df['YrSold']
    remodel_year = df['YearRemodAdd']
    return sold_year - remodel_year
# Single-step feature: calc_age wrapped in a SimpleFunctionTransformer.
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(
    input=input,
    transformer=transformer,
    name='Age',
)
all_features.append(age)

In [10]:
input = ['YearBuilt', 'YearRemodAdd']


def is_remod(df):
    """Flag rows whose remodel year differs from the construction year.

    Args:
        df: DataFrame containing the ``'YearBuilt'`` and ``'YearRemodAdd'``
            columns.

    Returns:
        Boolean Series that is True where ``YearBuilt != YearRemodAdd`` and
        False where the two years are equal.
    """
    # The function name and the feature name built from it say "remodeled", so
    # True has to mark rows where the two years differ. The previous `==`
    # comparison flagged exactly the opposite set of rows.
    # NOTE(review): presumably a remodel year equal to the build year means
    # "never remodeled" -- confirm against the dataset's data dictionary.
    return df['YearBuilt'] != df['YearRemodAdd']
# Single-step feature: is_remod wrapped in a SimpleFunctionTransformer.
transformer = ballet.eng.SimpleFunctionTransformer(func=is_remod)
remod = Feature(
    input=input,
    transformer=transformer,
    name='Remodeled',
)
all_features.append(remod)

In [None]:
input = ['GrLivArea', 'TotalBsmtSF']


def total_area(df):
    """Return the row-wise sum of ``GrLivArea`` and ``TotalBsmtSF``.

    Args:
        df: Frame-like object that can be indexed with the ``'GrLivArea'`` and
            ``'TotalBsmtSF'`` labels.

    Returns:
        The element-wise sum ``df['GrLivArea'] + df['TotalBsmtSF']``.
    """
    living_area = df['GrLivArea']
    basement_area = df['TotalBsmtSF']
    return living_area + basement_area
# Single-step feature: total_area wrapped in a SimpleFunctionTransformer.
transformer = ballet.eng.SimpleFunctionTransformer(func=total_area)
area = Feature(
    input=input,
    transformer=transformer,
    name='Total Area',
)
all_features.append(area)