In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Melbourne housing data; the path is relative to the notebook's
# working directory.
data = pd.read_csv("../data/melb_data.csv")

# The sale price is the prediction target; every other column is a candidate feature.
y = data["Price"]
X = data.drop(columns="Price")

# Keep 80% of the rows for training and 20% for validation. The fixed seed
# makes the split reproducible across runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)


def _is_text_dtype(dtype):
    """Return True for NumPy ``object`` columns and for pandas string dtypes.

    Comparing ``dtype == "object"`` alone misses text columns when pandas
    stores them with a dedicated string dtype (the default in pandas >= 3, or
    with ``future.infer_string`` enabled), which would silently drop every
    categorical feature. Accepting both keeps the selection stable across
    pandas versions.
    """
    return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)


# Low-cardinality text columns (fewer than 10 distinct values in the training
# split) are treated as categorical and one-hot encoded later; higher-cardinality
# text columns are left out.
categorical_col = [
    col
    for col in X_train_full.columns
    if _is_text_dtype(X_train_full[col].dtype) and X_train_full[col].nunique() < 10
]

# Numerical features are the plain int64 / float64 columns.
numerical_col = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype in ["int64", "float64"]
]

# Column order (categorical first, then numerical) is what the preview cell shows.
my_cols = categorical_col + numerical_col

# Copy so later edits to X_train / X_valid never touch the full frames.
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [28]:
# Preview the first five rows of the selected training features.
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
1041,h,S,Southern Metropolitan,3,11.2,3186.0,3.0,1.0,2.0,366.0,156.0,1920.0,-37.9038,145.0001,10579.0
1989,h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,0.0,238.0,131.0,1900.0,-37.7539,144.9612,11204.0
10157,h,S,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,1.0,439.0,,,-37.77047,144.97005,11918.0
1711,u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,2.0,0.0,100.0,1973.0,-37.8863,145.066,7822.0
11565,h,S,Western Metropolitan,4,11.0,3018.0,4.0,2.0,4.0,615.0,,,-37.87057,144.83623,5301.0


### Step 1: Define Preprocessing Steps

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values with a constant. No fill_value is
# given, so scikit-learn's documented default for the "constant" strategy applies.
numerical_transform = SimpleImputer(strategy="constant")

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown="ignore" lets categories that appear only in the
# validation data pass through without raising.
categorical_transform = Pipeline(
    [
        ('impute', SimpleImputer(strategy="most_frequent")),
        ('one_hot', OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transform, numerical_col),
        ('cat', categorical_transform, categorical_col),
    ]
)

### Step 2: Define Model

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed seed keeps results repeatable between runs.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

### Step 3: Create and Evaluate the Pipeline

In [36]:
from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the model so one fit/predict call runs both stages.
my_pipeline = Pipeline(
    [
        ('preprocess', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split, then predict prices for the held-out validation rows.
# fit() returns the pipeline itself, so the two calls can be chained.
predicts = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Score the predictions against the true validation prices.
MAE = mean_absolute_error(y_true=y_valid, y_pred=predicts)

print(f"This is mean absolute error : {MAE}")

This is mean absolute error : 156312.91707447925
