Transformation Pipeline

In [1]:
import numpy  as np
import pandas as pd

In [4]:
housing = pd.read_csv('datasets/housing/housing_predictors.csv')
housing_num = pd.read_csv('datasets/housing/housing_predictors_without_categorial_data.csv')

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
            bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housuing_num_tr = num_pipline.fit_transform(housing_num)

housuing_num_tr

array([[ 0.38862581, -0.94135046,  1.34743822, ..., -0.22688044,
        -0.17964609, -0.07316449],
       [ 0.86480228,  1.17178212, -1.19243966, ..., -0.30315533,
         0.06801106,  1.4572123 ],
       [-1.24161376,  0.26758118, -0.1259716 , ...,  0.19913442,
         0.21525077, -0.30958996],
       ...,
       [ 1.4938503 , -1.5707942 ,  1.31001828, ...,  0.51128366,
        -0.15720638, -0.3634411 ],
       [ 1.47327787, -1.56080303,  1.2492109 , ..., -0.23792712,
         0.27117942,  0.15111082],
       [ 1.57915062, -1.28105026,  2.02567448, ...,  0.01614391,
        -0.18358   , -0.3017964 ]])

In [25]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']
full_pipeLine = ColumnTransformer([
    ('num' , num_pipline , num_attribs),
    ('cat' , OneHotEncoder() , cat_attribs),
])
housing_prepared = full_pipeLine.fit_transform(housing)
housing_prepared = pd.DataFrame(housing_prepared)
housing_prepared.to_csv('datasets/housing/housing_prepared')
housing_prepared

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.388626,-0.941350,1.347438,0.027564,0.584777,0.640371,0.732602,0.556286,-0.893647,-0.226880,-0.179646,-0.073164,0.0,1.0,0.0,0.0,0.0
1,0.864802,1.171782,-1.192440,-1.722018,1.261467,0.781561,0.533612,0.721318,1.292168,-0.303155,0.068011,1.457212,0.0,0.0,0.0,0.0,1.0
2,-1.241614,0.267581,-0.125972,1.220460,-0.469773,-0.545138,-0.674675,-0.524407,-0.525434,0.199134,0.215251,-0.309590,0.0,1.0,0.0,0.0,0.0
3,0.622449,1.221738,-1.351474,-0.370069,-0.348652,-0.036367,-0.467617,-0.037297,-0.865929,-0.116266,0.627243,-0.208365,0.0,0.0,0.0,0.0,1.0
4,1.700076,0.437431,-0.635818,-0.131489,0.427179,0.272790,0.374060,0.220898,0.325752,-0.212943,-0.196465,-0.079489,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.809942,1.251711,-1.220505,-1.165333,1.890456,1.696862,0.543471,1.341519,0.637374,-0.275537,0.735171,0.763591,1.0,0.0,0.0,0.0,0.0
16508,0.389629,-0.921368,1.342761,-1.085806,2.468471,2.161816,3.002174,2.451492,-0.557509,-0.305797,-0.391151,0.887355,0.0,1.0,0.0,0.0,0.0
16509,1.493850,-1.570794,1.310018,1.538566,-0.895802,-0.895679,-0.862013,-0.865118,-0.365475,0.511284,-0.157206,-0.363441,1.0,0.0,0.0,0.0,0.0
16510,1.473278,-1.560803,1.249211,-1.165333,0.249005,0.112126,-0.189747,0.010616,0.168261,-0.237927,0.271179,0.151111,1.0,0.0,0.0,0.0,0.0


In [30]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        self.encoder.fit(X)
        return self

    def transform(self, X):
        return self.encoder.transform(X)

class HousingPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                            'total_bedrooms', 'population', 'households', 'median_income']
        self.cat_attribs = ['ocean_proximity']
        self.num_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.num_attribs)),
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler())
        ])
        self.cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.cat_attribs)),
            ('encoder', CategoricalEncoder())
        ])
        self.full_pipeline = ColumnTransformer([
            ('num', self.num_pipeline, self.num_attribs),
            ('cat', self.cat_pipeline, self.cat_attribs)
        ])

    def fit_transform(self, X, y=None):
        return pd.DataFrame(self.full_pipeline.fit_transform(X))

# Load the housing data
housing = pd.read_csv('datasets/housing/housing_predictors')

# Preprocess the housing data
housing_prepared = HousingPreprocessor().fit_transform(housing)
housing_prepared.to_csv('datasets/housing/housing_prepared')
housing_prepared

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.941350,1.347438,0.027564,0.584777,0.640371,0.732602,0.556286,-0.893647,0.0,1.0,0.0,0.0,0.0
1,1.171782,-1.192440,-1.722018,1.261467,0.781561,0.533612,0.721318,1.292168,0.0,0.0,0.0,0.0,1.0
2,0.267581,-0.125972,1.220460,-0.469773,-0.545138,-0.674675,-0.524407,-0.525434,0.0,1.0,0.0,0.0,0.0
3,1.221738,-1.351474,-0.370069,-0.348652,-0.036367,-0.467617,-0.037297,-0.865929,0.0,0.0,0.0,0.0,1.0
4,0.437431,-0.635818,-0.131489,0.427179,0.272790,0.374060,0.220898,0.325752,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,1.251711,-1.220505,-1.165333,1.890456,1.696862,0.543471,1.341519,0.637374,1.0,0.0,0.0,0.0,0.0
16508,-0.921368,1.342761,-1.085806,2.468471,2.161816,3.002174,2.451492,-0.557509,0.0,1.0,0.0,0.0,0.0
16509,-1.570794,1.310018,1.538566,-0.895802,-0.895679,-0.862013,-0.865118,-0.365475,1.0,0.0,0.0,0.0,0.0
16510,-1.560803,1.249211,-1.165333,0.249005,0.112126,-0.189747,0.010616,0.168261,1.0,0.0,0.0,0.0,0.0
