In [1]:
import ballet
import ballet.util.log
import logging
# Enable ballet's logging at INFO level so its messages appear in the notebook output.
ballet.util.log.enable(level=logging.INFO)

[2018-09-26 18:11:40,688] {ballet: log.py:19} INFO - Logging enabled.


In [2]:
import ballet.eng
from ballet import Feature

In [3]:
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

# Prepare the Ames Housing dataset

The Ames housing dataset is a more "elaborate" version of the toy Boston housing prices dataset. The goal is to predict the sale price of houses in Ames, Iowa. Because the data is closer to what we would encounter in the "real world", feature engineering is an important part of achieving a good solution.

In [4]:
def load_ames(source=None):
    '''Loads the Ames Housing dataset

    Args:
        source: path, URL, or file-like object holding the tab-separated
            Ames Housing table. Defaults to the copy published alongside
            the paper cited below, which requires network access.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a DataFrame with every column
        except ``SalePrice`` and ``y`` is the ``SalePrice`` Series.

    Raises:
        KeyError: if the table has no ``SalePrice`` column.

    Source:

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
    '''
    if source is None:
        source = 'https://ww2.amstat.org/publications/jse/v19n3/decock/AmesHousing.txt'
    # read_csv with an explicit tab separator is equivalent to read_table's
    # default and is not affected by read_table's deprecation in pandas 0.24.
    df = pd.read_csv(source, sep='\t')
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    return X, y

In [5]:
# Fetch the Ames data: X holds every column except SalePrice, y is the SalePrice target.
X, y = load_ames()

In [6]:
# Hold out a test split; random_state is pinned so the same rows are selected on every run.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [7]:
# Preview the first rows of the training features.
X_tr.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
1849,1850,533241030,60,FV,66.0,7399,Pave,Pave,IR1,Lvl,...,198,0,,,,0,6,2007,WD,Normal
2366,2367,527450210,160,RM,21.0,1953,Pave,,Reg,Lvl,...,0,0,,,,0,6,2006,WD,Normal
2134,2135,907200130,20,RL,97.0,11800,Pave,,IR1,Bnk,...,0,0,,,,0,8,2007,WD,Family
2305,2306,526302030,20,RL,,11027,Pave,,IR1,Lvl,...,0,0,,,,0,5,2006,WD,Normal
2339,2340,527252090,120,RL,60.0,8147,Pave,,Reg,HLS,...,0,0,,,,0,8,2006,WD,Normal


In [8]:
# Preview the first rows of the training target (SalePrice).
y_tr.head()

1849    239000
2366     83000
2134    131000
2305    149900
2339    318000
Name: SalePrice, dtype: int64

# Feature engineering in ballet

Now that we have our data loaded, we can begin feature engineering. In each cell below, we will create a new `Feature` object, which produces a single semantic feature.

These features are adapted from https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard.

In [9]:
# Collects every Feature defined in the following cells; later passed to ballet.make_mapper.
all_features = []

In [10]:
# Lot Frontage: houses in the same neighborhood most likely have a similar amount
# of street frontage, so missing values are filled in with the median
# Lot Frontage of the house's neighborhood.
# NOTE(review): `input` shadows the Python builtin of the same name in this notebook.

input = ['Lot Frontage', 'Neighborhood']
def impute_lot_frontage(df):
    '''Fill missing 'Lot Frontage' values with the median of the given frame.

    The median is taken over the non-missing 'Lot Frontage' values of ``df``
    itself. If every value in ``df`` is missing, the median is NaN and the
    values stay missing.

    Returns a new Series; ``df`` is not modified.
    '''
    median_frontage = df['Lot Frontage'].median()
    return df['Lot Frontage'].fillna(median_frontage)
# NOTE(review): presumably groups the input by Neighborhood and calls
# impute_lot_frontage once per group — confirm against GroupedFunctionTransformer.
transformer = ballet.eng.GroupedFunctionTransformer(func=impute_lot_frontage, groupby_kwargs={'by': 'Neighborhood'})
feature = Feature(input=input, transformer=transformer)
# Register the feature so it is picked up when the mapper is built.
all_features.append(feature)

In [11]:
# MS SubClass: a numeric code that is really categorical. A missing value most
# likely means "no building class", so missing values are replaced with a
# sentinel code (-9999999) and the codes are then one-hot encoded.
# NOTE(review): sklearn.preprocessing is not imported explicitly in the visible
# cells; this presumably works because another import loads it — confirm.

input = 'MS SubClass'
# The two steps are applied in order: fill nulls first, then encode.
transformer = [
    ballet.eng.NullFiller(replacement=-9999999),
    sklearn.preprocessing.OneHotEncoder(),
]
feature = Feature(input=input, transformer=transformer)
all_features.append(feature)

In [12]:
# Build a single mapper from all of the features collected so far.
mapper = ballet.make_mapper(all_features)

In [13]:
# Fit the feature transformers on the training split only.
mapper.fit(X_tr, y_tr)

DataFrameMapper(default=False, df_out=False,
        features=[(['Lot Frontage', 'Neighborhood'], GroupedFunctionTransformer(func=<function impute_lot_frontage at 0x10ea4b400>,
              func_args=(), func_kwargs={},
              groupby_kwargs={'by': 'Neighborhood'})), ('MS SubClass', RobustTransformerPipeline(steps=[('nullfiller', NullFiller(is...l', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True))]))],
        input_df=True, sparse=False)

In [14]:
# Produce the engineered feature matrix for the training split.
# NOTE(review): the np.nanmean warning in the output presumably comes from a
# neighborhood whose Lot Frontage values are all missing — confirm.
mapper.transform(X_tr)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


array([[43.,  0.,  0., ...,  0.,  0.,  0.],
       [43.,  0.,  0., ...,  1.,  0.,  0.],
       [53.,  1.,  0., ...,  0.,  0.,  0.],
       ...,
       [30.,  1.,  0., ...,  0.,  0.,  0.],
       [80.,  1.,  0., ...,  0.,  0.,  0.],
       [68.,  0.,  0., ...,  0.,  0.,  0.]])

In [15]:
# Produce the engineered feature matrix for the held-out split with the same mapper.
# NOTE(review): impute_lot_frontage takes its median from the frame it is given, so
# the fill values here are presumably derived from X_te rather than X_tr — confirm.
mapper.transform(X_te)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


array([[51. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [43. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [43. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ...,
       [72.5,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [50. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [72.5,  0. ,  1. , ...,  0. ,  0. ,  0. ]])