In [1]:
# Show the Python interpreter version the notebook is running under.
import platform
platform.python_version()

'3.6.5'

In [2]:
# Show the installed ballet version.
import ballet
ballet.__version__

'0.5.3-dev'

# Call ballet's logging helper.
# NOTE(review): presumably this turns on ballet's log output for the rest of
# the session -- confirm against the ballet documentation.
import ballet.util.log
ballet.util.log.enable()

In [3]:
import ballet.eng
from ballet import Feature
from ballet.validation.feature_acceptance.validator import (
    GFSSFAcceptanceEvaluator)
from ballet.validation.feature_pruning.validator import GFSSFPruningEvaluator
from ballet.eng import GroupwiseTransformer, SimpleFunctionTransformer
import ballet.validation.entropy as entropy
from ballet.util import asarray2d
from sklearn.impute import SimpleImputer

In [4]:
import numpy as np
import pandas as pd
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split


# Prepare the Ames Housing dataset

The Ames Housing dataset is a more "elaborate" version of the toy Boston housing prices dataset. The goal is to predict the sale price of houses in Ames, Iowa. Because we are faced with more "real-world" data, feature engineering is an important part of achieving a good solution.

In [5]:
def load_ames():
    '''Read the Ames Housing data and split it into features and target.

    The file is fetched over the network from the URL below every time the
    function is called.

    Source:

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>

    Returns:
        A pair ``(X, y)``: ``X`` is the loaded table without its
        'SalePrice' column and ``y`` is the 'SalePrice' column on its own.
    '''
    target = 'SalePrice'
    housing = pd.read_table(
        'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt')
    features = housing.drop(target, axis=1)
    return features, housing[target]

In [6]:
# Load the dataset: X holds every column except 'SalePrice', y the target.
X, y = load_ames()
# Pass the target through ballet's asarray2d helper.
# NOTE(review): presumably this reshapes y into a 2-D array -- confirm.
y = asarray2d(y)
# Display the available column names.
X.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [7]:
# Split into train and test sets; random_state is pinned so the split is
# the same on every run.
# NOTE(review): the cells visible below fit on the full X, y rather than on
# X_tr / y_tr -- confirm whether that is intended.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=3)

In [8]:
# Feature f_1: 'Lot Frontage' run through a three-step imputation pipeline.
input = ['Lot Frontage', 'Neighborhood']
transformer = [
    # Step 1: append 'Neighborhood' to the index as an extra level.
    SimpleFunctionTransformer(
        lambda df: df.set_index('Neighborhood', append=True)
    ),
    # Step 2: median imputation applied group-wise on the 'Neighborhood'
    # index level. NOTE(review): handle_error='ignore' presumably skips
    # groups the imputer cannot process -- confirm against ballet.eng.
    GroupwiseTransformer(
        SimpleImputer(strategy='median'),
        groupby_kwargs={'level': 'Neighborhood'},
        handle_error='ignore',
    ),
    # Step 3: a second median imputation over the whole column.
    SimpleImputer(strategy='median'),
]
f_1 = Feature(
    input=input,
    transformer=transformer,
    name='Imputed Lot Frontage',
)

In [9]:
# Feature f_2: element-wise square root of 'Lot Area'.
name = 'Sqrt Lot Area'
input = 'Lot Area'
transformer = SimpleFunctionTransformer(np.sqrt)
f_2 = Feature(
    input=input,
    transformer=transformer,
    name=name,
)

In [10]:
# Feature f_3: 'Lot Frontage' with missing values replaced by the column
# mean. SimpleImputer is already provided by the notebook's import cell, so
# it is not imported again here.
input = ["Lot Frontage"]
transformer = SimpleImputer(strategy="mean")
name = "Lot Frontage Fill"
f_3 = Feature(input=input, transformer=transformer, name=name)

In [15]:
# Input columns consumed by the feature built from calc_qual.
input = ["Overall Qual", "Overall Cond"]


def calc_qual(df):
    '''Return the "Overall Qual" column minus the "Overall Cond" column.'''
    quality = df["Overall Qual"]
    condition = df["Overall Cond"]
    return quality - condition


# Feature f_4: wrap calc_qual so it can be used as a transformer.
name = "Qual"
transformer = SimpleFunctionTransformer(calc_qual)
f_4 = Feature(
    input=input,
    transformer=transformer,
    name=name,
)

In [11]:
def calc_mi(x, y, n_1, n_2):
    '''Estimate the mutual information between ``x`` and ``y`` and print it.

    ``n_1`` and ``n_2`` are the display names used for ``x`` and ``y`` in the
    printed message. Nothing is returned.
    '''
    template = 'Found Mutual Information between {} and {} to be: {}'
    estimate = entropy.estimate_mutual_information(x, y)
    print(template.format(n_1, n_2, estimate))
    
def calc_cmi(x, y, z, n_1, n_2, n_3):
    '''Estimate the conditional information of ``x`` and ``y`` given ``z``
    and print it.

    ``n_1``, ``n_2`` and ``n_3`` are the display names used for ``x``, ``y``
    and ``z`` in the printed message. Nothing is returned.
    '''
    template = 'Found Cond Information between {} and {} given {} to be: {}'
    estimate = entropy.estimate_conditional_information(x, y, z)
    print(template.format(n_1, n_2, n_3, estimate))

In [17]:
# Fit each feature on the full dataset and keep its transformed values.
df_1, df_2, df_4 = (
    asarray2d(feature.as_dataframe_mapper().fit_transform(X, y))
    for feature in (f_1, f_2, f_4)
)

# Print the estimated mutual information of each feature with the target.
for values, label in (
    (df_1, 'Lot Frontage'),
    (df_2, 'Sqrt Area'),
    (df_4, 'Cond'),
):
    calc_mi(values, y, label, 'Target')

# calc_mi(df_1, df_2, 'Lot Frontage', 'Sqrt Area')
# calc_cmi(df_1, y, df_2, 'Lot Frontage', 'Target', 'Sqrt Area')
# calc_cmi(df_2, y, df_1, 'Sqrt Area', 'Target', 'Lot Frontage')

Found Mutual Information between Lot Frontage and Target to be: 2.57903767669501
Found Mutual Information between Sqrt Area and Target to be: 4.848567742968713
Found Mutual Information between Cond and Target to be: 1.108826810459087
