In [1]:
import warnings
warnings.filterwarnings('ignore') # for pandas <> scikit-learn warnings about ints -> floats

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.preprocessing import scale
from sklearn.metrics import r2_score, explained_variance_score, median_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from statsmodels.api import OLS

from donatello.components import core, data, transformers, estimator, measure

## Initial dataset
Housing price data from single family homes in Denver CO

In [2]:
# Read the home-value records and parse the sale-date strings into datetimes in one chain.
prices = (
    pd.read_csv('./data/single_family_home_values.csv')
    .assign(last_sale_date=lambda df: pd.to_datetime(df['last_sale_date']))
)
# Show (rows, columns) first, then preview the leading records.
display(prices.shape)
prices.head()

(3417, 14)

Unnamed: 0,address,city,state,zipcode,property_type,square_footage,lot_size,num_rooms,num_bedrooms,num_baths,year_built,last_sale_amount,last_sale_date,estimated_value
0,2495 S Milwaukee St,Denver,CO,80210,Single Family Residence,3885,7500.0,13,4,5.0,2010,350000,2010-06-07,1684147
1,2690 Niagara St,Denver,CO,80207,Single Family Residence,1343,7500.0,6,3,1.0,1948,31500,1978-02-01,343104
2,3425 Grove St,Denver,CO,80211,Single Family Residence,992,6250.0,5,2,2.0,1921,258000,2002-02-19,510436
3,4794 Claude Ct,Denver,CO,80216,Single Family Residence,491,2990.0,3,1,1.0,1891,0,2011-04-27,184105
4,3640 N Gaylord St,Denver,CO,80205,Single Family Residence,1365,4690.0,6,3,1.0,1912,52600,1980-09-01,390055


### Augmentation
With a larger dataset a model could learn to relate the sale date and sale amount, but an existing measure of that relationship would be helpful. Fortunately, consumer price index data is readily available as CSVs! If this weren't an independent measure
and we needed to control for leakage, we could embed the merging in donatello's ModelDAG (a lightweight execution graph), but since it's not pernicious we can do it up front for simplicity.

In [3]:
# Load the CPI series and parse its DATE column into datetimes.
cpi = (
    pd.read_csv('./data/denver_cpi.csv')
    .assign(DATE=lambda df: pd.to_datetime(df['DATE']))
)

# Give both frames a 'YYYY-MM' string key so they can be joined on it later.
# NOTE(review): the CPI rows previewed below are one per year (January), so a
# monthly key may leave most sale months without a CPI match - confirm the
# intended join granularity.
cpi['cpi_key'] = cpi['DATE'].dt.strftime('%Y-%m')
prices['price_key'] = prices['last_sale_date'].dt.strftime('%Y-%m')

cpi.head()

Unnamed: 0,DATE,CUUSA433SAH,cpi_key
0,1984-01-01,104.1,1984-01
1,1985-01-01,106.8,1985-01
2,1986-01-01,108.2,1986-01
3,1987-01-01,109.6,1987-01
4,1988-01-01,110.0,1988-01


In [4]:
# Development aid: re-import donatello's `data` module so local edits to it take effect.
# NOTE(review): `reload` is not imported anywhere in this notebook, so this relies on
# the Python 2 builtin - confirm before running under Python 3.
reload(data)

<module 'donatello.components.data' from 'donatello/components/data.pyc'>

In [5]:
def build_mask(dataset, columns, threshold):
    """
    Build a boolean keep-mask that drops rows whose OLS residual is extreme.

    The selected design columns are standardized with ``scale`` and the target
    is regressed on them with statsmodels ``OLS``.  A row is kept when the
    absolute value of its studentized residual (first column of
    ``outlier_test()``) is strictly below ``threshold``, so rows far above
    AND far below the fit are both removed.

    Args:
        dataset: object exposing ``designData`` (indexable by a list of
            column names) and ``targetData`` (exposing ``.values``)
        columns (list): design columns used as regressors
        threshold (float): cutoff on the absolute studentized residual

    Returns:
        numpy boolean array: True for rows to keep, one entry per row
    """
    X = scale(dataset.designData[columns])
    y = dataset.targetData.values
    # NOTE(review): no intercept column is added to X before fitting - confirm
    # the target is already centered when this runs.
    results = OLS(y, X).fit()
    studentized = results.outlier_test()[:, 0]
    # abs() must wrap the residuals, not the comparison: the previous form
    # took abs() of the boolean result, so large negative residuals were kept.
    mask = np.abs(studentized) < threshold
    return mask

def transform(dataset, columns=('square_footage', 'num_bedrooms', 'last_sale_amount'), threshold=2):
    """
    Drop outlier rows from a dataset's design and target data.

    Args:
        dataset: object exposing ``designData`` and ``targetData``
        columns (iterable): design columns handed to ``build_mask``; when
            empty or None, every column of ``designData`` is used
        threshold (float): cutoff forwarded to ``build_mask``

    Returns:
        tuple: (design data, target data), both restricted to the kept rows
    """
    if columns:
        selected = list(columns)
    else:
        selected = list(dataset.designData)

    keep = build_mask(dataset, selected, threshold)
    return dataset.designData.loc[keep], dataset.targetData.loc[keep]

In [6]:
# Empty graph plus three stand-alone nodes (scaling, outlier removal, regression).
# This cell only creates the nodes; it does not add any edges between them.
model = transformers.ModelDAG(set(), {})
n1 = transformers.Node('scale', transformers.StandardScaler(), enforceTarget=True)
# wraps the `transform` function defined above
n2 = transformers.Node('rm_outliers', transformers.Apply(func=transform, fitOnly=True))
n3 = transformers.Node('ml', LinearRegression())

## Branching
Scikit-learn enables branching feature engineering by nesting FeatureUnions and Pipelines. While serviceable, this interface can be hard to manage and makes it more difficult to share transformers and components between problems.

To support this feature, the edges of donatello's ModelDAG are actually transformers themselves, with the sole task of passing (conducting) data through them. These conduction transformers all follow the same pattern: by default, all columns/keys found during fitting are passed along, but the ModelDAG's `add_edge_flow` method (used below) makes it easy to modify this by selecting or inverting (selecting all but) the columns seen during fit by name, regex match, or dtype.

In [7]:
# Start again from an empty graph; the nodes for the branched layout follow.
model = transformers.ModelDAG(set(), {})

# root: initiate branching by selecting all
n0 = transformers.Node('select', transformers.DatasetFlow(invert=True))

# branch 1: one-hot encoding
n11 = transformers.Node('ohe', transformers.OneHotEncoder(dropOne=True))

# branch 2: scaling followed by outlier removal (wraps `transform` defined above)
n21 = transformers.Node('scale', transformers.StandardScaler(), enforceTarget=True)
n22 = transformers.Node('rm_outliers', transformers.Apply(func=transform, fitOnly=True))

# terminal node: the regression model
n3 = transformers.Node('ml', LinearRegression())

In [8]:
# send zipcode data only to OHE and don't pass target through first branch
# Branch 1: route only the 'zipcode' column from the root to the one-hot
# encoder, without passing the target along this edge.
model.add_edge_flow(n0, n11, passTarget=False, selectValue=['zipcode'], invert=False)
# Feed the encoder's output into the regression node.
model.add_edge_flow(n11, n3)

# Branch 2: route everything except 'zipcode' (selection inverted) to the
# scaler, then on to outlier removal, then into the regression node.
model.add_edge_flow(n0, n21, selectValue=['zipcode'], invert=True)
model.add_edge_flow(n21, n22)
model.add_edge_flow(n22, n3)

In [9]:
# Development aid: re-import donatello's `data` module so local edits to it take effect.
# NOTE(review): `reload` is not imported anywhere in this notebook, so this relies on
# the Python 2 builtin - confirm before running under Python 3.
reload(data)

<module 'donatello.components.data' from 'donatello/components/data.pyc'>

In [10]:
def load_sculpture(df, parent=True,
                   model=None,
                   paramGrid=None,
                   searchKwargs=None):

    """
    Helper to load sculpture

    Args:
        df: raw data handed to ``data.Dataset`` as ``raw``
        parent (bool): when True, build the multi-table dataset (keyed on
            'prices', grouped on 'price_key', with the 'cpi' table mapped on
            'cpi_key') and attach the sklearn metrics plus
            ``measure.FeatureWeights``; when False, build a plain dataset
            with no metrics
        model: model handed to ``estimator.Estimator``; a fresh
            ``LinearRegression`` is created on each call when omitted
        paramGrid: forwarded to ``estimator.Estimator``
        searchKwargs: forwarded to ``estimator.Estimator``

    Returns:
        core.Sculpture: built with validation='search' and holdout=True
    """
    # A default of LinearRegression() in the signature is evaluated once at
    # definition time, so every call relying on it would share one estimator.
    if model is None:
        model = LinearRegression()

    e = estimator.Estimator(model=model, paramGrid=paramGrid, searchKwargs=searchKwargs)

    if parent:
        d = data.Dataset(raw=df, primaryKey='prices', target='estimated_value',
                         groupDap={'attrPath': ['price_key']}, dataMap={'cpi': 'cpi_key'})

        # sklearn metrics
        m = [measure.Metric(metric) for metric in
             (r2_score, explained_variance_score, median_absolute_error)]
        # donatello custom metric
        m.append(measure.FeatureWeights())
    else:
        d = data.Dataset(raw=df, target='estimated_value')
        m = None
    sculpture = core.Sculpture(dataset=d, estimator=e, metrics=m,
                               validation='search', holdout=True)

    return sculpture


In [11]:
# Nested sculpture wrapping the branched `model` graph.  It is given no data
# of its own (df=None), so it is built with parent=False: the plain dataset
# and no metrics, rather than the multi-table parent dataset.
# The grid previously searched `C`, which the LinearRegression in the 'ml'
# node does not have; `fit_intercept` is a parameter it does expose.
sub_sculpture = load_sculpture(None, parent=False, model=model,
                               paramGrid={'model__ml__fit_intercept': [True, False]},
                               searchKwargs={'scoring': 'r2', 'cv': 3})

In [12]:
def merger(dataset):
    """
    Left-join the CPI table onto the prices table.

    Args:
        dataset: object exposing ``data`` (mapping with 'prices' and 'cpi'
            DataFrames) and ``targetData``

    Returns:
        tuple: (merged design data carrying the prices index, ``dataset.targetData``)

    Raises:
        ValueError: if the join changes the number of rows, i.e. some
            'cpi_key' value appears more than once in the CPI table
    """
    prices_df = dataset.data['prices']
    cpi_df = dataset.data['cpi']

    X = pd.merge(prices_df, cpi_df,
                 left_on='price_key', right_on='cpi_key', how='left')

    # Duplicate CPI keys would silently add rows, leaving X longer than the target.
    if len(X) != len(prices_df):
        raise ValueError(
            'merging cpi changed the row count from {} to {}; '
            'cpi_key values must be unique'.format(len(prices_df), len(X)))

    # A column-on-column merge discards the left index; restore it so the
    # design rows stay aligned with dataset.targetData.
    X.index = prices_df.index
    return X, dataset.targetData


In [13]:
# Outer graph whose first step applies `merger` (the prices/CPI join).
# NOTE(review): nothing in this cell attaches p1 to `pipe`, and the only edge
# call that would is commented out in the next cell - confirm the graph is
# not empty when it is fit.
pipe = transformers.ModelDAG(set(), {})
p1 = transformers.Node('merge', transformers.Apply(func=merger))
#p2 = transformers.Node('ml', sub_sculpture)

In [14]:
#pipe.add_edge_flow(p1, p2)

In [15]:
# Bundle both frames under the same keys ('prices', 'cpi') that `merger`
# reads from dataset.data, then build the parent sculpture around `pipe`.
dfs = dict(prices=prices, cpi=cpi)
parent_sculpture = load_sculpture(dfs, model=pipe)

In [16]:
# Fit the parent sculpture.
# NOTE(review): the output recorded for this cell ends in `KeyError: None`,
# so this call currently fails partway through cross validation.
parent_sculpture.fit()

Cross Validation


KeyError: None

In [None]:
# Debugging: list the keys held in the parent dataset's `data` mapping.
parent_sculpture.dataset.data.keys()

In [None]:
# Debugging: short handle on the parent sculpture's dataset.
ds = parent_sculpture.dataset

In [None]:
# Debugging: pull the first item yielded by the dataset's fold iterator.
a = next(ds.fold.fold(ds))

In [None]:
# Debugging: display the item pulled from the fold iterator.
a

In [None]:
# Debugging: show the 'prices' entry of the dataset's raw data.
# NOTE(review): presumably this is the full prices frame - consider .head() to keep the output small.
ds.raw['prices']

In [None]:
# `sculpture` is only ever a local name inside load_sculpture(); bind it to
# the sculpture built above so this cell and the reporting cell that follows
# do not raise NameError on a fresh kernel.
sculpture = parent_sculpture
sculpture.fit()

In [None]:
# Report the cross-validation measurements of the sculpture built in this
# notebook (`parent_sculpture`); no object named `sculpture` is defined at
# notebook scope, so the previous references failed on a fresh kernel.
# NOTE(review): `view_sk_metric` is not defined or imported in any cell of
# this notebook - confirm where it should come from before running.
print('{} fold validation'.format(parent_sculpture.dataset.fold.folder.get_n_splits()))
for name, scores in parent_sculpture.measurements.crossValidation.items():
    if name != 'feature_weights':
        # sklearn metrics: label the generic 'score' column with the metric name
        display(view_sk_metric(scores).rename(columns={'score': name}))
    else:
        # feature weights: put per-column means and stds side by side,
        # ordered by the mean coefficient
        means = scores.mean.rename(columns={i: '{}_mean'.format(i) for i in scores.mean})
        stds = scores.std.rename(columns={i: '{}_std'.format(i) for i in scores.std})
        display(pd.concat([means, stds], axis=1).sort_values('coefficients_mean'))
