# Pipelines
## Set Data

In [2]:
# importing libraries
import pandas as pd

# scikit libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [3]:
# reading data with house pricing
data = pd.read_csv('Data/melb_data.csv')

In [4]:
# examining data
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [5]:
# setting target and features
y = data.Price
X = data.drop(['Price'], axis=1)

# splitting data for training and validation purposes
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [6]:
# select categorical columns with cardinality below 10
categorical_cols = [col for col in X_train_full.columns if
                    X_train_full[col].dtype == 'object' and 
                    X_train_full[col].nunique() < 10]

In [7]:
# select numerical columns
numerical_cols = [col for col in X_train_full.columns if
                  X_train_full[col].dtype in ['int64', 'float64']]

In [8]:
# concatenattion of categorical and numerical columns
actual_features = categorical_cols + numerical_cols

# set train and valid subsets
X_train = X_train_full[actual_features].copy()
X_valid = X_valid_full[actual_features].copy()

## Define preprocessing steps

In [9]:
# preprocessing numerical data
numerical_transformer = SimpleImputer(strategy='median')

# preprocessing categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

## Define model

In [10]:
# model
model = RandomForestRegressor(n_estimators=50, random_state=0)

## Create and evaluate the pipeline

In [11]:
# create pipeline
house_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])