<a href="https://colab.research.google.com/github/muhammadibrohimov-ai/Machine_Learning_Intro_California_housing/blob/main/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Source data: housing.csv from the ageron/handson-ml2 GitHub repository.
# The `?raw=true` query requests the raw CSV rather than GitHub's HTML page.
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
# Read the CSV straight from the URL (needs network access on a fresh run).
df = pd.read_csv(URL)
# Preview the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Target column, copied so it is independent of the training frame.
housing_label = train_set['median_house_value'].copy()

# Predictors: every training column except the target.
housing = train_set.drop(columns='median_house_value')

# Numeric-only predictors: additionally leave out the categorical column.
housing_numeric = housing.drop(columns='ocean_proximity')

In [4]:
# Positional column indices into the numeric feature matrix
# (column order of `housing_numeric`).
room_ix = 3        # total_rooms
bedroom_ix = 4     # total_bedrooms
population_ix = 5  # population
household_ix = 6   # households

from sklearn.base import BaseEstimator, TransformerMixin

class MultipleAttributeAdder(BaseEstimator, TransformerMixin):
    """Append ratio features derived from existing numeric columns.

    The source columns are located by the module-level positional indices
    ``room_ix``, ``bedroom_ix``, ``population_ix`` and ``household_ix``.

    Parameters
    ----------
    add_bedroom_per_room : bool, default=True
        When true, a fourth column (bedrooms / rooms) is appended after the
        three per-household ratios.
    """

    def __init__(self, add_bedroom_per_room = True):
        self.add_bedroom_per_room = add_bedroom_per_room

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : 2-D array-like
            Must contain columns at positions ``room_ix``, ``bedroom_ix``,
            ``population_ix`` and ``household_ix``. A pandas DataFrame is
            accepted as well as a NumPy array.

        Returns
        -------
        numpy.ndarray
            The original columns followed by rooms/household,
            bedrooms/household, population/household and, if
            ``add_bedroom_per_room`` is true, bedrooms/rooms. Zero
            denominators are not special-cased.
        """
        # Positional slicing (X[:, i]) is not supported by DataFrames, so
        # convert once up front; this is a no-op for ndarray input.
        X = np.asarray(X)

        households = X[:, household_ix]
        extra_columns = [
            X[:, room_ix] / households,        # rooms per household
            X[:, bedroom_ix] / households,     # bedrooms per household
            X[:, population_ix] / households,  # population per household
        ]

        if self.add_bedroom_per_room:
            extra_columns.append(X[:, bedroom_ix] / X[:, room_ix])

        # Single stacking call for both configurations; 1-D ratios become columns.
        return np.column_stack([X, *extra_columns])

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch, applied in order: median imputation of missing values,
# appending the ratio features, then scaling with StandardScaler.
numeric_pipline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ("attribute_adder", MultipleAttributeAdder(add_bedroom_per_room=True)),
        ("std_scaler", StandardScaler()),
    ],
)

In [11]:
# Fit the numeric pipeline on the numeric training columns and display the
# transformed array (NumPy abbreviates the printout with "...").
numeric_pipline.fit_transform(housing_numeric)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.20836543,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.12853018,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ..., -0.25753771,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.03921583,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ..., -0.06626528,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.08750798,
        -0.08535429,  1.69520292]])

In [12]:
from sklearn.compose import ColumnTransformer

# Column groups routed to each branch of the preprocessor.
cat_attributes = ['ocean_proximity']
numeric_attributes = housing_numeric.columns.tolist()

# Numeric columns go through the numeric pipeline; the categorical column is
# one-hot encoded. Outputs are concatenated in the order listed here.
full_pipline = ColumnTransformer(
    transformers=[
        ("num_pipline", numeric_pipline, numeric_attributes),
        ("cat_pipline", OneHotEncoder(), cat_attributes),
    ],
)

In [16]:
# Fit the full preprocessor on the training features and display only the
# first output column; it comes from the numeric branch, so the values should
# match column 0 of the numeric pipeline's output shown earlier.
full_pipline.fit_transform(housing)[:, 0]

array([ 1.27258656,  0.70916212, -0.44760309, ...,  0.59946887,
       -1.18553953, -1.41489815])