# Getting Started

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the preprocessed training split, relative to this notebook.
filepath = "../data/preprocessed_data/train.csv"

In [3]:
# Load the training data into a DataFrame.
housing = pd.read_csv(filepath)

# Custom Transformer
- Let's start by building our custom transformer, which appends combined ratio attributes (e.g. rooms per household) to the data.

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns that CombinedAttributesAdder divides.
# NOTE(review): these assume a specific column order in the input array —
# confirm against the columns of train.csv.
rooms_ix = 4
bedrooms_ix = 5
population_ix = 6
households_ix = 7

In [5]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    The source columns are located positionally through the module-level
    indices ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms-per-room column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, so return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Input data. It is coerced with ``np.asarray`` so that a
            DataFrame (or nested list) can be sliced positionally just like
            an ndarray; an ndarray is used as-is.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms-per-household, population-per-household
            and, if ``add_bedrooms_per_room`` is set, bedrooms-per-room.
        """
        # ``X[:, i]`` is ndarray-style indexing; coerce first so DataFrame
        # input does not raise.
        X = np.asarray(X)

        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / rooms
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [6]:
# Try the transformer on its own, without the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(
    add_bedrooms_per_room=False,
)
# The transformer indexes positionally, so hand it the raw array of the frame.
housing_extra_attribs = attr_adder.transform(
    housing.values
)

# Feature Scaling 
- Scaling is another important transformation step: we need to bring the attribute values onto a common scale.
- There are two common ways to get all attributes to have the same scale: **min-max scaling and standardization**

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Transformation Pipelines

## Numerical Column Pipeline

In [8]:
# Numerical view of the data: everything except the categorical column.
housing_num = housing.drop(columns='ocean_proximity')

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
# Numerical preprocessing, applied in order: median imputation, the custom
# combined-attribute step, then standard scaling.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attrib_adder", CombinedAttributesAdder()),
    ("scaler", StandardScaler()),
])

In [11]:
# fit() returns the fitted Pipeline itself, not data; fit_transform() fits the
# steps and returns the transformed numerical features this variable is named for.
housing_num_tr = num_pipeline.fit_transform(housing_num)

## Column Transformer

In [12]:
from sklearn.compose import ColumnTransformer
# Column names routed to each branch of the ColumnTransformer.
cat_attribs = ["ocean_proximity"]
num_attribs = list(housing_num.columns)

In [13]:
# Route numerical columns through num_pipeline and one-hot encode the
# categorical column.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [14]:
# Fit the full preprocessing pipeline on the data and keep the transformed result.
housing_prepared = full_pipeline.fit_transform(housing)

In [16]:
import joblib
# Persist the fitted preprocessing pipeline to disk.
# NOTE(review): the pipeline contains the notebook-defined
# CombinedAttributesAdder; pickle stores classes by reference, so presumably
# the loading code must be able to resolve that class — confirm.
joblib.dump(
    full_pipeline,
    '../models/preprocessing_pipeline.pkl',
)

['../models/preprocessing_pipeline.pkl']