In [603]:
import pandas as pd
import calendar
import numpy as np
import plotly.express as px
pd.options.plotting.backend = "plotly"

# Main data frame pre-processing

In [604]:
# Import main data frame
df = pd.read_csv('data/Melbourne_housing_FULL_OG.csv')

In [605]:
numToMonth = {
    "1": "January",
    "2": "February",
    "3": "March",
    "4": "April",
    "5": "May",
    "6": "June",
    "7": "July",
    "8": "August",
    "9": "September",
    "10": "October",
    "11": "November",
    "12": "December",
}

In [606]:
# df date splitting
df['Date'] = pd.to_datetime(df['Date'], format = '%d/%m/%Y')
df['Year'] = pd.DatetimeIndex(df['Date']).year
df['Month'] = pd.DatetimeIndex(df['Date']).month
df['Day'] = pd.DatetimeIndex(df['Date']).day
df['Count'] = 1
df['MonthStr'] = df['Month'].apply(lambda x: numToMonth[str(x)])
df = df.drop(columns=['Date', 'Bedroom2', 'YearBuilt'])

In [607]:
# df['Postcode'] = df['Postcode'].astype(str)
df['Postcode'] = df['Postcode'].astype('Int64')
df['Bathroom'] = df['Bathroom'].astype('Int64')
df['Propertycount'] = df['Propertycount'].astype('Int64')
df['Landsize'] = df['Landsize'].astype('Int64')
df['Car'] = df['Car'].astype('Int64')

In [608]:
df.sample(10)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bathroom,...,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Year,Month,Day,Count,MonthStr
24505,Kew,37 Derby St,5,h,,VB,Knight,5.4,3101,3.0,...,Boroondara City Council,-37.80248,145.03711,Southern Metropolitan,10331,2017,10,14,1,October
31844,Ivanhoe,2/110 Hawker St,3,t,,SN,Miles,7.8,3079,,...,Banyule City Council,,,Eastern Metropolitan,5549,2018,3,3,1,March
9995,Sunshine West,18 Norton St,3,h,586000.0,S,Barry,13.5,3020,1.0,...,Brimbank City Council,-37.7843,144.8052,Western Metropolitan,6763,2016,6,27,1,June
14435,Heidelberg West,1 Ebony Pde,3,h,710000.0,S,William,8.8,3081,1.0,...,Banyule City Council,-37.74307,145.0399,Eastern Metropolitan,2674,2017,7,1,1,July
33682,Reservoir,32 Pershing St,3,h,800000.0,VB,Nelson,12.0,3073,,...,Darebin City Council,-37.72508,144.9929,Northern Metropolitan,21650,2018,3,17,1,March
20881,Wantirna South,127 Fraser Cr,6,h,1155000.0,S,Biggin,14.7,3152,3.0,...,Knox City Council,-37.88153,145.23003,Eastern Metropolitan,7082,2017,9,9,1,September
2799,Camberwell,11 Webster St,3,h,,S,Woodards,7.8,3124,1.0,...,Boroondara City Council,-37.8366,145.0948,Southern Metropolitan,8920,2016,5,28,1,May
24839,Ascot Vale,29 Queens Av,4,h,1700000.0,S,Nelson,4.3,3032,2.0,...,Moonee Valley City Council,-37.77872,144.9241,Western Metropolitan,6567,2017,10,21,1,October
6200,Maidstone,38 Suffolk St,3,h,930000.0,SP,Jas,9.2,3012,1.0,...,Maribyrnong City Council,-37.7909,144.8777,Western Metropolitan,3873,2016,11,19,1,November
26434,South Yarra,90 River St,3,h,2300000.0,SP,Marshall,2.7,3141,2.0,...,Melbourne City Council,-37.83629,144.99798,Southern Metropolitan,14887,2017,10,28,1,October


In [609]:
df.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Distance         float64
Postcode           Int64
Bathroom           Int64
Car                Int64
Landsize           Int64
BuildingArea     float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount      Int64
Year               int64
Month              int64
Day                int64
Count              int64
MonthStr          object
dtype: object

In [610]:
missing_val_count_by_column = (df.isnull().sum())
missing_val_count_by_column

Suburb               0
Address              0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Postcode             1
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Lattitude         7976
Longtitude        7976
Regionname           3
Propertycount        3
Year                 0
Month                0
Day                  0
Count                0
MonthStr             0
dtype: int64

In [611]:
df = df.drop(22632)
df = df.drop(29483)

# Creating seasonality line chart

In [612]:
sea = df.groupby(by=["Year","MonthStr"]).size().reset_index()
sea['Count'] = sea[0]
sea = sea.drop(columns=[0])
# sea[(sea['Year'] == year) & (sea['MonthStr'].isin(['September','October','November' ]))]['Count'].sum()
sea

Unnamed: 0,Year,MonthStr,Count
0,2016,April,502
1,2016,August,1173
2,2016,December,1000
3,2016,February,44
4,2016,January,3
5,2016,July,689
6,2016,June,1242
7,2016,May,1531
8,2016,November,1787
9,2016,October,852


# Creating count by suburbs

In [613]:
# Create count by suburb df
cbs = df.groupby(by=["Suburb", "Year"]).size().reset_index()
cbs['Count'] = cbs[0]
cbs = cbs.drop(columns=[0])

In [614]:
# Find total count for the year
yearTotal = cbs.groupby(by=["Year"]).size().reset_index()
yearTotal['Total'] = yearTotal[0]
yearTotal = yearTotal.drop(columns=[0])
yearTotal

Unnamed: 0,Year,Total
0,2016,142
1,2017,346
2,2018,311


In [615]:
# Find out ranks of each suburb
cbs = cbs.merge(yearTotal, on = 'Year', how = 'left')
cbs['Rank'] = cbs.groupby('Year')['Count'].rank(ascending=False,method='first').astype(int)
cbsTop = cbs.sort_values(by=['Rank']).head(30)

In [616]:
# Add corrections for cbsTop df
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Craigieburn') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Richmond') &  (cbs['Year'] == 2018)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Glen Iris') &  (cbs['Year'] == 2018)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Brighton') &  (cbs['Year'] == 2018)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Kew') &  (cbs['Year'] == 2018)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Balwyn North') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Preston') &  (cbs['Year'] == 2016)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Northcote') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Brunswick') &  (cbs['Year'] == 2016)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Camberwell') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Mill Park') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Hawthorn') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Glenroy') &  (cbs['Year'] == 2016)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Glenroy') &  (cbs['Year'] == 2018)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Brighton East') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'Mount Waverley') &  (cbs['Year'] == 2017)])
cbsTop = cbsTop.append(cbs.loc[(cbs['Suburb'] == 'South Yarra') &  (cbs['Year'] == 2018)])
cbsTop['Rank'] = cbsTop['Rank'].apply(lambda x: 11 if x > 10 else x)
cbsTop['Rank'] = cbsTop['Rank'].astype(str)
cbsTop['Rank'] = cbsTop['Rank'].apply(lambda x: "10+" if x == "11" else x)

In [617]:
cbsTop

Unnamed: 0,Suburb,Year,Count,Total,Rank
609,Reservoir,2018,90,311,1
607,Reservoir,2016,349,142,1
608,Reservoir,2017,405,346,1
77,Bentleigh East,2016,236,142,2
78,Bentleigh East,2017,288,346,2
216,Craigieburn,2018,75,311,2
611,Richmond,2017,265,346,3
79,Bentleigh East,2018,59,311,3
610,Richmond,2016,236,142,3
344,Glen Iris,2016,205,142,4


In [618]:
df[df['Method'] == 'VB']

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bathroom,...,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Year,Month,Day,Count,MonthStr
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,2.5,3067,2,...,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019,2016,2,4,1,February
6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2.5,3067,1,...,Yarra City Council,-37.80720,144.99410,Northern Metropolitan,4019,2016,6,4,1,June
23,Abbotsford,411/8 Grosvenor St,2,u,700000.0,VB,Jellis,2.5,3067,2,...,Yarra City Council,-37.81100,145.00670,Northern Metropolitan,4019,2016,11,12,1,November
24,Abbotsford,40 Nicholson St,3,h,1350000.0,VB,Nelson,2.5,3067,2,...,Yarra City Council,-37.80850,144.99640,Northern Metropolitan,4019,2016,11,12,1,November
67,Airport West,50 Bedford St,3,h,730000.0,VB,Nelson,13.5,3042,2,...,Moonee Valley City Council,-37.72030,144.87550,Western Metropolitan,3464,2016,12,3,1,December
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34789,Thornbury,98 Hutton St,3,h,1200000.0,VB,Nelson,7.0,3071,1,...,Darebin City Council,-37.75468,144.98959,Northern Metropolitan,8870,2018,2,24,1,February
34802,Vermont,19 Caldwell Rd,4,h,900000.0,VB,Jellis,17.2,3133,1,...,Maroondah City Council,-37.83473,145.21116,Eastern Metropolitan,4181,2018,2,24,1,February
34817,Watsonia,32 Kenmare St,3,h,840000.0,VB,Stockdale,14.5,3087,1,...,Banyule City Council,-37.71152,145.07794,Northern Metropolitan,2329,2018,2,24,1,February
34840,Williamstown,3/2 Thompson St,2,u,520000.0,VB,Raine,6.8,3016,,...,Hobsons Bay City Council,,,Western Metropolitan,6380,2018,2,24,1,February


In [619]:
np.unique(df['Method'].values)

array(['PI', 'PN', 'S', 'SA', 'SN', 'SP', 'SS', 'VB', 'W'], dtype=object)

# Creating average values per suburb data frame

In [620]:
msd = df.groupby(by=['Suburb']).mean()

In [621]:
msd = msd[['Price', 'Rooms', 'Distance', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Propertycount', 'Postcode']]

In [622]:
msd['Postcode'] = msd['Postcode'].astype(int).astype(str)
decimals = pd.Series([1, 1, 1, 1,1,1,1,0], index = ['Price', 'Rooms', 'Distance', 'Bathroom', 'Car', 'Landsize','BuildingArea', 'Propertycount'])
msd = msd.round(decimals)
msd

Unnamed: 0_level_0,Price,Rooms,Distance,Bathroom,Car,Landsize,BuildingArea,Propertycount,Postcode
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Abbotsford,1033549.0,2.5,2.7,1.4,1.0,354.3,103.6,4019.0,3067
Aberfeldie,1307192.9,3.2,8.3,1.9,1.9,536.9,185.2,1543.0,3040
Airport West,751364.2,3.0,11.8,1.5,1.6,453.2,137.2,3464.0,3042
Albanvale,536055.6,3.1,14.0,1.5,2.2,527.8,127.4,1899.0,3021
Albert Park,1927650.5,2.9,3.2,1.6,0.8,198.2,136.9,3280.0,3206
...,...,...,...,...,...,...,...,...,...
Yallambie,820861.1,3.4,14.3,1.8,1.7,591.5,161.3,1369.0,3085
Yarra Glen,620000.0,3.0,31.4,2.0,1.0,863.0,180.6,1160.0,3775
Yarraville,991245.0,2.9,6.7,1.4,1.5,325.6,127.9,6543.0,3013
croydon,730000.0,3.0,23.0,,,,,11925.0,3136


# Dot map data preparation

In [623]:
dm = df[["Suburb", "Address", "Rooms", "Type", "Price", "Distance", "Postcode", "Bathroom", "Car", "Landsize", "CouncilArea", "Lattitude","Longtitude","Regionname"]]

In [624]:
dm = dm.dropna(subset=["Price","Bathroom", "Car", "Lattitude", "Longtitude", "Landsize"])

In [625]:
dm.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,Landsize,CouncilArea,Lattitude,Longtitude,Regionname
1,Abbotsford,85 Turner St,2,h,1480000.0,2.5,3067,1,1,202,Yarra City Council,-37.7996,144.9984,Northern Metropolitan
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,2.5,3067,1,0,156,Yarra City Council,-37.8079,144.9934,Northern Metropolitan
4,Abbotsford,5 Charles St,3,h,1465000.0,2.5,3067,2,0,134,Yarra City Council,-37.8093,144.9944,Northern Metropolitan
5,Abbotsford,40 Federation La,3,h,850000.0,2.5,3067,2,1,94,Yarra City Council,-37.7969,144.9969,Northern Metropolitan
6,Abbotsford,55a Park St,4,h,1600000.0,2.5,3067,1,2,120,Yarra City Council,-37.8072,144.9941,Northern Metropolitan


In [626]:
# Exclude for now
dm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17678 entries, 1 to 34856
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Suburb       17678 non-null  object 
 1   Address      17678 non-null  object 
 2   Rooms        17678 non-null  int64  
 3   Type         17678 non-null  object 
 4   Price        17678 non-null  float64
 5   Distance     17678 non-null  float64
 6   Postcode     17678 non-null  Int64  
 7   Bathroom     17678 non-null  Int64  
 8   Car          17678 non-null  Int64  
 9   Landsize     17678 non-null  Int64  
 10  CouncilArea  17678 non-null  object 
 11  Lattitude    17678 non-null  float64
 12  Longtitude   17678 non-null  float64
 13  Regionname   17678 non-null  object 
dtypes: Int64(4), float64(4), int64(1), object(5)
memory usage: 2.1+ MB


In [627]:
dm.isnull().sum()

Suburb         0
Address        0
Rooms          0
Type           0
Price          0
Distance       0
Postcode       0
Bathroom       0
Car            0
Landsize       0
CouncilArea    0
Lattitude      0
Longtitude     0
Regionname     0
dtype: int64

In [628]:
factor = "Bathroom"
print(dm[factor].min(), dm[factor].max())

0 9


In [629]:
dm['Type'].unique()

array(['h', 'u', 't'], dtype=object)

# Distribution of selling method and house types

In [630]:
fig = px.pie(df, values='Count', names='Method')
# fig.show()

In [631]:
fig = px.pie(df, values='Count', names='Type')
# fig.show()

# Export & Clean-up

In [632]:
# Export to CSV after preprocessing
df.to_csv('data/Melbourne_housing_FULL.csv')
cbsTop.to_csv('data/countsBySuburb.csv')
msd.to_csv('data/meanSuburbValues.csv')
dm.to_csv('data/dotmapHousing.csv')