# Melbourne Housing Prices Prediction

The data comes from Kaggle: [Melbourne Housing Market](https://www.kaggle.com/anthonypino/melbourne-housing-market).

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as pyplot
pyplot.rcParams['figure.dpi'] = 300
pyplot.rcParams['savefig.dpi'] = 300

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

import pandas_profiling as pdpr

# Data

In [2]:
# Load the raw listings from the local data directory (path is relative to
# the notebook's working directory).
full_data = pd.read_csv(
    filepath_or_buffer="./data/Melbourne_housing_FULL.csv",
)

In [3]:
# Separate the target ('Price') from the feature columns.
y = full_data['Price']
X = full_data.drop(columns='Price')


In [4]:
# Fraction of rows kept for training; the remainder becomes the test split.
train_size = 0.8

# Fixed random_state keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=42,
)

In [5]:
# Re-attach the target to the training features so both can be inspected and
# cleaned together; the trailing expression displays the column names.
full_training_data = pd.concat([X_train, y_train], axis='columns')
full_training_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Method', 'SellerG', 'Date',
       'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
       'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude',
       'Regionname', 'Propertycount', 'Price'],
      dtype='object')

# Cleaning

In [6]:
def make_dummies(frame: pd.DataFrame, column: str) -> pd.DataFrame:
    """Replace a categorical column with one-hot indicator columns.

    The indicator columns are named ``<column>_<value>`` and appended after
    the remaining columns of ``frame``. The first category (in sorted order)
    is dropped (``drop_first=True``) so that it acts as the reference level.

    Args:
        frame: Data containing ``column``. It is not modified.
        column: Name of the column to encode.

    Returns:
        A new DataFrame without ``column`` and with the indicator columns
        appended.
    """
    dummies = pd.get_dummies(
        frame.loc[:, column], prefix=column, drop_first=True)
    # Build the result from the frame that was passed in, not from a
    # notebook-level global, so the helper works for any DataFrame
    # (e.g. the test split) and not only for X_train.
    return pd.concat([frame.drop(columns=[column]), dummies], axis=1)


# Price (target)
# Rows without a sale price cannot be used for supervised training, so keep
# only the rows where 'Price' is present.
full_train_wprices = full_training_data.dropna(axis='index', subset=['Price'])

# Split the remaining rows into the target vector and the feature matrix.
y_train = full_train_wprices['Price']
X_train = full_train_wprices.drop(columns='Price')

# Address - drop, too many unique values to encode
X_train = X_train.drop(columns=['Address'])

# Rooms - bucket 5 or more rooms into a single '5+' category, then one-hot
# encode. The column is cast to object before the string label is written so
# the result does not rely on pandas silently upcasting a numeric column.
msk_many_rooms = X_train.loc[:, 'Rooms'] > 4
rooms_bucketed = X_train.loc[:, 'Rooms'].astype(object)
rooms_bucketed[msk_many_rooms] = '5+'
X_train['Rooms'] = rooms_bucketed
X_train = make_dummies(X_train, 'Rooms')

# Type - convert to dummies
X_train = make_dummies(X_train, 'Type')

# Method - convert to dummies
X_train = make_dummies(X_train, 'Method')

# SellerG - drop for now, too many unique values - possibly extract
# categories later?
X_train = X_train.drop(columns=['SellerG'])

# Date - parse and keep only the sale year as a feature. The month should be
# used as well, but its cyclical nature needs to be handled first.
# The parsed dates are held in their own Series: assigning them back with
# `.loc[:, 'Date'] = ...` can leave the column as object dtype, on which the
# `.dt` accessor is unavailable.
# NOTE(review): dayfirst=True assumes the CSV stores dates as day/month/year -
# confirm against the raw file. Only the year is used, so the extracted
# feature does not depend on the day/month order.
sale_dates = pd.to_datetime(X_train.loc[:, 'Date'], dayfirst=True)
X_train['Year'] = sale_dates.dt.year
X_train = X_train.drop(columns=['Date'])

# Distance - keep, and fill the missing values with the training mean.
# The filled Series is assigned back explicitly: calling
# fillna(..., inplace=True) on the object returned by `.loc[:, 'Distance']`
# operates on a temporary and is not guaranteed to update X_train.
mean_dist = X_train.loc[:, 'Distance'].mean()
X_train['Distance'] = X_train['Distance'].fillna(mean_dist)

def _bucket_counts(counts: pd.Series, cap: float, cap_label: str,
                   missing_label: str = 'unk') -> pd.Series:
    """Turn a numeric count column into categories with a top bucket.

    Values greater than or equal to ``cap`` become ``cap_label``, missing
    values become ``missing_label``, and all other values are kept as they
    are. The result has object dtype so numbers and labels can coexist.
    """
    bucketed = counts.astype(object)
    bucketed[counts >= cap] = cap_label
    bucketed[counts.isnull()] = missing_label
    return bucketed


# Bedroom2, Bathroom, Car - same treatment for each count column: collapse
# the upper tail into one bucket, flag missing values as 'unk', then one-hot
# encode.
#   Bedroom2: 5 or more -> '5+'
#   Bathroom: 4 or more -> '4+'
#   Car:      2 or more -> '2+' (so: none, one, several, or unknown)
for count_column, cap, cap_label in (
        ('Bedroom2', 5, '5+'),
        ('Bathroom', 4, '4+'),
        ('Car', 2, '2+'),
):
    X_train[count_column] = _bucket_counts(
        X_train[count_column], cap, cap_label)
    X_train = make_dummies(X_train, count_column)

# Landsize - kept as a numeric feature: extreme values are capped at the 99th
# percentile of the observed values and missing values are filled with the
# mean of the (capped) observed values.
# (It could alternatively be used as a categorical, e.g. "has a garden".)
msk_miss_landsize = X_train['Landsize'].isnull()

# 99th percentile computed over the non-missing values only.
landsize_99p = np.percentile(
    X_train.loc[~msk_miss_landsize, 'Landsize'].values, 99.0)
msk_over99perc_landsize = X_train['Landsize'] > landsize_99p
X_train.loc[msk_over99perc_landsize, 'Landsize'] = landsize_99p

# The mean is taken after capping, so outliers do not inflate the fill value.
landsize_mean = X_train.loc[~msk_miss_landsize, 'Landsize'].mean()
X_train.loc[msk_miss_landsize, 'Landsize'] = landsize_mean

# BuildingArea - drop, too many missing values
X_train = X_train.drop(columns=['BuildingArea'])

# YearBuilt - convert to a categorical "era" feature and one-hot encode:
#   '1800s'   1800 <= year < 1900
#   'prewar'  1900 <= year < 1946
#   'postwar' 1946 <= year < 2000
#   'new'     2000 <= year <= 2019
#   'unk'     missing, or outside 1800-2019 (treated as a data-entry error)
year_built = X_train.loc[:, 'YearBuilt']

# A year is implausible if it is too late OR too early. Combining the two
# comparisons with `&` can never be true, which lets outliers through as
# their own category.
msk_out_yearbuilt = (year_built > 2019) | (year_built < 1800)
msk_miss_yearbuilt = year_built.isnull() | msk_out_yearbuilt
msk_1800s_yearbuilt = (year_built >= 1800) & (year_built < 1900)
msk_prewar_yearbuilt = (year_built >= 1900) & (year_built < 1946)
msk_postwar_yearbuilt = (year_built >= 1946) & (year_built < 2000)
msk_new_yearbuilt = (year_built >= 2000) & ~msk_out_yearbuilt

# Build the labels in a separate object-dtype Series, starting from 'unk'
# (which covers every row in msk_miss_yearbuilt), rather than writing
# strings into the numeric column.
year_built_era = pd.Series('unk', index=X_train.index, dtype=object)
year_built_era[msk_1800s_yearbuilt] = '1800s'
year_built_era[msk_prewar_yearbuilt] = 'prewar'
year_built_era[msk_postwar_yearbuilt] = 'postwar'
year_built_era[msk_new_yearbuilt] = 'new'
X_train['YearBuilt'] = year_built_era

X_train = make_dummies(X_train, 'YearBuilt')

# Drop the coarse geographic identifiers for now - these could be used to
# impute latitude/longitude instead:
# CouncilArea
# Postcode
# Suburb
X_train = X_train.drop(columns=['CouncilArea', 'Postcode', 'Suburb'])

# Lattitude (column name as spelled in the dataset) - keep and fill missing
# values with the mean. It would be better to impute from the other
# geographic columns dropped above.
# The filled Series is assigned back explicitly because
# fillna(..., inplace=True) on a `.loc` selection acts on a temporary object
# and is not guaranteed to update X_train.
mean_lat = X_train.loc[:, 'Lattitude'].mean()
X_train['Lattitude'] = X_train['Lattitude'].fillna(mean_lat)

# Longtitude (column name as spelled in the dataset) - same treatment. A
# separate name is used so the latitude mean stays available afterwards.
mean_lon = X_train.loc[:, 'Longtitude'].mean()
X_train['Longtitude'] = X_train['Longtitude'].fillna(mean_lon)

# Regionname - keep as categorical: the four metropolitan regions keep their
# names, everything else (including missing values) is grouped as 'Other',
# then the column is one-hot encoded.
main_regions = [
    'Southern Metropolitan',
    'Northern Metropolitan',
    'Eastern Metropolitan',
    'Western Metropolitan',
]
msk_other_regionname = ~X_train.loc[:, 'Regionname'].isin(main_regions)
X_train.loc[msk_other_regionname, 'Regionname'] = 'Other'
X_train = make_dummies(X_train, 'Regionname')

# Propertycount - keep, already numerical; fill the few missing values with
# the mean. Assigned back explicitly because fillna(..., inplace=True) on a
# `.loc` selection acts on a temporary object and is not guaranteed to update
# X_train.
mean_pcount = X_train.loc[:, 'Propertycount'].mean()
X_train['Propertycount'] = X_train['Propertycount'].fillna(mean_pcount)

print(X_train.columns)


Index(['Distance', 'Landsize', 'Lattitude', 'Longtitude', 'Propertycount',
       'Rooms_2', 'Rooms_3', 'Rooms_4', 'Rooms_5+', 'Type_t', 'Type_u',
       'Method_S', 'Method_SA', 'Method_SP', 'Method_VB', 'Year',
       'Bedroom2_1.0', 'Bedroom2_2.0', 'Bedroom2_3.0', 'Bedroom2_4.0',
       'Bedroom2_5+', 'Bedroom2_unk', 'Bathroom_1.0', 'Bathroom_2.0',
       'Bathroom_3.0', 'Bathroom_4+', 'Bathroom_unk', 'Car_1.0', 'Car_2+',
       'Car_unk', 'YearBuilt_1800s', 'YearBuilt_new', 'YearBuilt_postwar',
       'YearBuilt_prewar', 'YearBuilt_unk', 'Regionname_Northern Metropolitan',
       'Regionname_Other', 'Regionname_Southern Metropolitan',
       'Regionname_Western Metropolitan'],
      dtype='object')


# Exploratory

In [7]:
# Profile the training split as loaded (features plus 'Price', before any
# cleaning) and write the report to 'profile.html'.
profile = pdpr.ProfileReport(full_training_data)
profile.to_file(output_file='profile.html')


Summarize dataset: 100%|██████████| 233/233 [00:34<00:00,  6.70it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.90s/it]
Render HTML: 100%|██████████| 1/1 [00:04<00:00,  4.34s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 46.25it/s]


In [8]:
# Profile only the training rows that have a non-missing 'Price' and write
# the report to 'profile_wprices.html'. Note that `profile` is rebound here.
profile = pdpr.ProfileReport(full_train_wprices)
profile.to_file(output_file='profile_wprices.html')


Summarize dataset: 100%|██████████| 233/233 [00:31<00:00,  7.36it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:07<00:00,  7.02s/it]
Render HTML: 100%|██████████| 1/1 [00:04<00:00,  4.48s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 54.61it/s]


In [9]:
# Profile the cleaned feature matrix (X_train after the Cleaning cell) and
# write the report to 'profile_clean.html'. Note that `profile` is rebound
# here.
profile = pdpr.ProfileReport(X_train)
profile.to_file(output_file='profile_clean.html')


Summarize dataset: 100%|██████████| 89/89 [00:20<00:00,  4.37it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:10<00:00, 10.64s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 157.54it/s]


# Models

In [10]:
from sklearn.model_selection import cross_validate

# Baseline 1: ordinary least squares on the cleaned training features.
linreg_model = LinearRegression()

linreg_model.fit(X_train, y_train)
y_pred = linreg_model.predict(X_train)
# RMSE on the same rows the model was fitted on (in-sample), so this is an
# optimistic figure and is not comparable to the cross-validated scores.
print('LinearReg: ', np.sqrt(mean_squared_error(y_train, y_pred)))

# Baseline 2: random forest. The seed is fixed so that the fitted forest,
# and therefore every score below, is reproducible across re-runs.
clas = RandomForestRegressor(random_state=42)
clas.fit(X_train, y_train)
y_pred_rf = clas.predict(X_train)

# In-sample RMSE again; a forest can fit its own training rows very closely,
# so rely on the cross-validated scores for model comparison.
print('RF: ', np.sqrt(mean_squared_error(y_train, y_pred_rf)))


# Cross-validation refits a fresh clone of each estimator per fold. Scores
# are negated MSE (higher is better), so they must be negated before taking
# the square root to obtain RMSE.
lr_cv = cross_validate(linreg_model, X_train, y_train,
                       scoring='neg_mean_squared_error')
rf_cv = cross_validate(clas, X_train, y_train,
                       scoring='neg_mean_squared_error')


LinearReg:  411686.8497953522
RF:  130069.78993143204


In [11]:
# cross_validate was scored with 'neg_mean_squared_error', so flip the sign
# and take the square root to get one RMSE per fold.
lr_fold_rmse = np.sqrt(-lr_cv['test_score'])
rf_fold_rmse = np.sqrt(-rf_cv['test_score'])

# Per-fold RMSE and the mean of the per-fold RMSEs, rounded for display.
print("LinReg scores:", np.round(lr_fold_rmse))
print("LinReg mean score:", np.round(np.mean(lr_fold_rmse)))

print("RF scores:", np.round(rf_fold_rmse))
print("RF mean score:", np.round(np.mean(rf_fold_rmse)))


LinReg scores: [443016. 383452. 388125. 408050. 438491.]
LinReg mean score: 412227.0
RF scores: [327277. 300739. 292662. 307381. 339404.]
RF mean score: 313493.0
