## Boston Housing Dataset

In [1]:
import os

# Switch to the directory the rest of the notebook is run from (presumably
# where the `megatron` package is importable — confirm for your setup).
# The location can be overridden with the MEGATRON_WORKDIR environment
# variable so the notebook is not tied to one machine's layout; when the
# variable is unset the original hardcoded path is used.
os.chdir(os.environ.get('MEGATRON_WORKDIR', '/home/megatron/work'))

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston


# Load the Boston housing dataset that ships with scikit-learn.
boston = load_boston()
# Print the dataset's bundled description (size, attribute list, target).
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [3]:
# Keep the target array separate from the labelled feature table.
y = boston.target
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)

# Preview the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## Build feature pipeline

In [4]:
from megatron import Input, Graph, Lambda
from megatron.transforms import SklearnTransformation

from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, Imputer

pipeline = Graph()

# Define input nodes: one single-column input per dataset feature used below.
# Each name passed here must match a key of the data dict given to
# `pipeline.run`.
crime = Input(pipeline, 'CRIM', input_shape=(1,))
zone = Input(pipeline, 'ZN', input_shape=(1,))
industrial = Input(pipeline, 'INDUS', input_shape=(1,))
river = Input(pipeline, 'CHAS', input_shape=(1,))
nox = Input(pipeline, 'NOX', input_shape=(1,))
rooms = Input(pipeline, 'RM', input_shape=(1,))
age = Input(pipeline, 'AGE', input_shape=(1,))
distance = Input(pipeline, 'DIS', input_shape=(1,))
highways = Input(pipeline, 'RAD', input_shape=(1,))
tax = Input(pipeline, 'TAX', input_shape=(1,))
school = Input(pipeline, 'PTRATIO', input_shape=(1,))
lower = Input(pipeline, 'LSTAT', input_shape=(1,))


def concat(*group):
    """Horizontally stack the given arrays with ``np.hstack``."""
    return np.hstack(group)


# Group features arbitrarily; each group gets its own preprocessing chain.
# Every transformation is applied to its upstream node directly, in the same
# way for all four groups.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22 in favour of `sklearn.impute.SimpleImputer` — revisit when upgrading.
group_a = Lambda(concat)([crime, distance, tax, lower])
group_a = SklearnTransformation(StandardScaler())(group_a)
group_a = SklearnTransformation(Imputer())(group_a)


group_b = Lambda(concat)([zone, industrial])
group_b = SklearnTransformation(QuantileTransformer(n_quantiles=10))(group_b)
group_b = SklearnTransformation(Imputer())(group_b)


group_c = Lambda(concat)([river, nox])
group_c = SklearnTransformation(MinMaxScaler())(group_c)
group_c = SklearnTransformation(Imputer())(group_c)


group_d = Lambda(concat)([rooms, age, highways, school])
group_d = SklearnTransformation(StandardScaler())(group_d)

In [5]:
# Build the pipeline's input mapping: column name -> column values with a
# trailing axis of length 1 appended.
column_lists = X.to_dict(orient='list')
data = {
    name: np.asarray(values)[:, np.newaxis]
    for name, values in column_lists.items()
}

In [9]:
# Evaluate the four feature groups against the prepared input data; the
# result is displayed as the cell output.
output_nodes = [group_a, group_b, group_c, group_d]
pipeline.run(output_nodes, data)

[array([[-0.41771335,  0.1402136 , -0.66660821, -1.0755623 ],
        [-0.41526932,  0.55715988, -0.98732948, -0.49243937],
        [-0.41527165,  0.55715988, -0.98732948, -1.2087274 ],
        ...,
        [-0.41137448, -0.77368357, -0.80321172, -0.98304761],
        [-0.40568883, -0.66843684, -0.80321172, -0.86530163],
        [-0.41292893, -0.61324648, -0.80321172, -0.66905833]]),
 array([[7.66666667e-01, 7.39408473e-02],
        [9.99999998e-08, 3.83161512e-01],
        [9.99999998e-08, 3.83161512e-01],
        ...,
        [9.99999998e-08, 5.75380974e-01],
        [9.99999998e-08, 5.75380974e-01],
        [9.99999998e-08, 5.75380974e-01]]),
 array([[0.        , 0.31481481],
        [0.        , 0.17283951],
        [0.        , 0.17283951],
        ...,
        [0.        , 0.38683128],
        [0.        , 0.38683128],
        [0.        , 0.38683128]]),
 array([[ 0.41367189, -0.12001342, -0.98284286, -1.45900038],
        [ 0.19427445,  0.36716642, -0.8678825 , -0.30309415],
   

In [10]:
from IPython.display import SVG
from megatron.visuals import pipeline_to_dot

# Convert the pipeline (restricted to the four output nodes) to a dot graph,
# then render it to SVG with the `dot` program for inline display.
# NOTE(review): this path needs `pydot` installed in the environment.
dot_graph = pipeline_to_dot(pipeline, [group_a, group_b, group_c, group_d])
SVG(dot_graph.create(prog='dot', format='svg'))

ImportError: Failed to import `pydot`. Please install `pydot` in your current environment.

In [5]:
from megatron.visuals import plot_pipeline

# Plot the pipeline graph for the four feature-group output nodes.
plotted_nodes = [group_a, group_b, group_c, group_d]
plot_pipeline(pipeline, plotted_nodes)