In [1]:
import platform
platform.python_version()

'3.6.5'

In [2]:
import ballet
ballet.__version__

'0.4.1'

import ballet.util.log
ballet.util.log.enable()

In [3]:
import ballet.eng
from ballet import Feature

In [4]:
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

# Prepare the Ames Housing dataset

The Ames housing dataset is a more "elaborate" version of the toy Boston housing prices dataset. The goal is to predict the sale price of houses in Ames, Iowa. Because the data is more "real-world", feature engineering is an important part of achieving a good solution.

In [5]:
AMES_URL = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'


def load_ames(source=AMES_URL):
    '''Load the Ames Housing dataset and split off the prediction target.

    Source:

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>

    Args:
        source: URL, filesystem path, or open file-like object holding the
            tab-separated dataset. Defaults to the hosted copy, so calling
            ``load_ames()`` with no arguments behaves as before; pass a local
            path to work offline or from a cached download.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a DataFrame with every column except
        ``SalePrice`` and ``y`` is the ``SalePrice`` Series.

    Raises:
        KeyError: if the data has no ``SalePrice`` column.
    '''
    # The file is tab-delimited; read_csv with an explicit separator is the
    # equivalent of read_table.
    df = pd.read_csv(source, sep='\t')
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [6]:
# Load the dataset: X holds every column except SalePrice, y holds SalePrice.
X, y = load_ames()

In [7]:
# Hold out a test split; the fixed random_state keeps the split reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [9]:
# Check how MS SubClass is stored before treating it as a categorical feature.
X['MS SubClass'].dtype

dtype('int64')

In [None]:
# Preview the first rows of the training features.
X_tr.head()

In [None]:
# Preview the first rows of the training target (SalePrice).
y_tr.head()

# Feature engineering in ballet

Now that we have our data loaded, we can begin feature engineering. In each cell below, we will create a new `Feature` object, which produces a single semantic feature.

These features are adapted from https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard.

In [None]:
# Collects every Feature defined in the cells below; later passed to make_mapper.
all_features = []

In [None]:
# LotFrontage: houses in the same neighborhood most likely have a similar
# amount of street frontage, so missing values are filled in with the median
# Lot Frontage of the house's neighborhood.

input = ['Lot Frontage', 'Neighborhood']
def impute_lot_frontage(df):
    '''Fill missing Lot Frontage values in `df` with the median over `df`.

    The transformer below is configured to group by Neighborhood, so `df` is
    expected to hold the rows of a single neighborhood.

    Returns the Lot Frontage Series with missing entries replaced.
    '''
    frontage = df['Lot Frontage']
    return frontage.fillna(frontage.median())
transformer = ballet.eng.GroupedFunctionTransformer(
    func=impute_lot_frontage, groupby_kwargs={'by': 'Neighborhood'})
feature = Feature(input=input, transformer=transformer)
# Register the feature exactly once; appending it a second time would hand the
# same feature to the mapper twice.
all_features.append(feature)

In [None]:
# MSSubClass: a numerical variable that is really categorical. A missing value
# most likely means "no building class", so missing values are replaced with a
# sentinel code (-9999999) and the codes are then one-hot encoded.

input = 'MS SubClass'
transformer = [
    ballet.eng.NullFiller(replacement=-9999999),
    # The encoder only learns the class codes present in the data it is fit
    # on. Ignore (encode as all zeros) any code that first appears at
    # transform time, e.g. in the held-out split, instead of raising.
    sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'),
]
feature = Feature(input=input, transformer=transformer)
all_features.append(feature)

In [None]:
# Combine all of the collected features into a single mapper object.
mapper = ballet.make_mapper(all_features)


In [None]:
# Fit the mapper on the training split only.
mapper.fit(X_tr, y_tr)

In [None]:
# Show the engineered feature values for the training split.
mapper.transform(X_tr)

In [None]:
# Apply the mapper (fit on the training split) to the held-out test split.
b = mapper.transform(X_te)

In [None]:
# NOTE(review): presumably the names of the columns produced by transform — confirm.
mapper.transformed_names_