In [61]:
import pandas as pd
import numpy as np

melbourne_file_path = 'data/melb_data.csv'

In [62]:
data = pd.read_csv(melbourne_file_path)
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

# Beginner's Approach to Implementing Decision Trees
- Drop all null values (no imputation)
- No fine-tuning of hyperparameters
- No model research, just a simple RandomForestRegressor

In [64]:
y = data.Price
melbourne_features = [
    "Rooms",
    "Bathroom",
    "Landsize",
    "BuildingArea",
    "YearBuilt",
    "Lattitude",
    "Longtitude",
]
X = data[melbourne_features]

from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

In [65]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Instantiate the model class
forest_model = RandomForestRegressor(random_state=1)

# Fit the model
forest_model.fit(train_X, train_y)

#
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))
print(mean_squared_error(val_y, melb_preds))

373157.83042832674
256339760549.31235


In [66]:
# Compare predictions with actual values
compare = pd.DataFrame({'Actual': val_y, 'Predicted': melb_preds})
print(compare.head(10))

          Actual     Predicted
1061   2600000.0  1.144371e+06
6482    620000.0  8.973414e+05
8395   1000000.0  1.037368e+06
4659    430000.0  1.003625e+06
7386    392250.0  8.973414e+05
6607    700000.0  1.019357e+06
8094    439000.0  8.960220e+05
4590   2236000.0  1.092050e+06
13475   671000.0  9.783383e+05
10318  1870000.0  1.035489e+06


# Introduction to Pipelines, Transformers, and Imputers

In [67]:
# Preparing the data 
y = data.Price

# Lets try using all the features
X = data.drop(['Price'], axis=1)

In [68]:
# Convert categorical data to numerical data

from sklearn.compose import make_column_selector as selector

numerical_selector = selector(dtype_exclude=object)
category_selector = selector(dtype_include=object)

numerical_features = numerical_selector(X)
category_features = category_selector(X)

In [69]:
# Create a transformer for numerical data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Create a transformer for categorical data
from sklearn.preprocessing import OneHotEncoder

category_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [70]:
# Apply the transformers to the data
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', category_pipeline, category_features)
    ]
)


In [71]:
# Apply the classifier to the data
from sklearn.ensemble import RandomForestRegressor

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(random_state=42))
])


In [72]:
model

In [73]:
# Checking the predictions
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(mean_absolute_error(y_val, y_pred))
print(mean_squared_error(y_val, y_pred))

166534.76155817378
77358563314.20914


In [74]:
# Compare predictions with actual values
compare = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred})
print(compare.head(10))

          Actual   Predicted
1061   2600000.0  2170470.00
6482    620000.0   673652.50
8395   1000000.0   836475.00
4659    430000.0   563730.00
7386    392250.0   904420.00
6607    700000.0   806051.78
8094    439000.0   485676.88
4590   2236000.0  1520320.00
13475   671000.0   590833.00
10318  1870000.0  1565760.00
