In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.utils import estimator_html_repr

# One seed drives every stochastic step in this cell, so Restart & Run All
# reproduces the same missing-value pattern and the same train/test split.
SEED = 0
rng = np.random.default_rng(SEED)

# Housing dataset fetched from the handson-ml GitHub repository.
df = pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")

# Bin population into terciles to obtain an ordered categorical feature.
df['popbin'] = pd.qcut(df['population'], q=3, labels = ['small', 'medium', 'large'])
target = df['median_house_value']
features = df.loc[:, ['median_income', 'popbin', 'ocean_proximity']]

# Blank out roughly 10% of the feature cells (each value is masked
# independently, not whole rows) so the imputers below have work to do.
features = features.mask(rng.random(features.shape) < .1)

# Sanity check: the masking step must actually have introduced missing values.
assert features.isnull().values.any(), "masking produced no missing values"

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=SEED)

In [2]:
# Preview the loaded frame, including the derived `popbin` column.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,popbin
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,small
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,large
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,small
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,small
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,small
...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,small
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,small
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,medium
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,small


In [3]:
# Inspect column dtypes to decide which preprocessing each feature needs.
df.dtypes

longitude              float64
latitude               float64
housing_median_age     float64
total_rooms            float64
total_bedrooms         float64
population             float64
households             float64
median_income          float64
median_house_value     float64
ocean_proximity         object
popbin                category
dtype: object

In [8]:
# List the distinct population bins and their category ordering.
df['popbin'].unique()

['small', 'large', 'medium']
Categories (3, object): ['small' < 'medium' < 'large']

In [None]:
# List the distinct values of the `ocean_proximity` column.
df['ocean_proximity'].unique()

Numeric Data

In [4]:
# Numeric preprocessing: fill gaps with the column median, then rescale
# with StandardScaler.
median_imputer = SimpleImputer(strategy='median')
standard_scaler = StandardScaler()

num_steps = [('imputer', median_imputer), ('standardize', standard_scaler)]
num_transformer = Pipeline(steps=num_steps)

Ordinal Data

In [9]:
# Category order, lowest to highest, for the single ordinal feature (`popbin`).
# Without an explicit list OrdinalEncoder sorts the labels alphabetically
# ('large' < 'medium' < 'small'), so the integer codes would not follow the
# intended small < medium < large ranking.
popbin_order = ['small', 'medium', 'large']

# Ordinal preprocessing: fill gaps with the most frequent label, then map
# each label to its rank in `popbin_order`.
ord_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoding', OrdinalEncoder(categories=[popbin_order]))
]

ord_transformer = Pipeline(steps=ord_steps)

Categorical Data

In [10]:
# Nominal preprocessing: replace gaps with the literal label 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform.
missing_label_imputer = SimpleImputer(strategy='constant', fill_value='missing')
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

ohe_steps = [('imputer', missing_label_imputer), ('encoding', one_hot_encoder)]
ohe_transformer = Pipeline(steps=ohe_steps)

In [11]:
# Columns grouped by the kind of preprocessing they receive.
num_cols, ord_cols, ohe_cols = ['median_income'], ['popbin'], ['ocean_proximity']

# (name, transformer, columns) triples in the form ColumnTransformer expects.
transformer_steps = list(zip(
    ('num', 'ord', 'ohe'),
    (num_transformer, ord_transformer, ohe_transformer),
    (num_cols, ord_cols, ohe_cols),
))

In [12]:
# Full model: per-column preprocessing followed by ordinary least squares.
preprocessor = ColumnTransformer(transformers=transformer_steps)
regressor = LinearRegression()

steps = [('transformation', preprocessor), ('linreg', regressor)]
pipe = Pipeline(steps=steps)

# Fit preprocessing and regression together on the training split only.
lm = pipe.fit(X_train, y_train)

In [15]:
# Display the fitted pipeline object.
lm