# Data Pre-Processing

## Location Data

**Needed Fields:**
- Postcode : int in the source file (converted to a zero-padded 4-character string during cleaning)
- Locality : str (suburb)
- State : str
- Long : float
- Lat : float

In [None]:
# Importing the necessary libraries
import pandas as pd

# Loading the data
postcode_source = "Datasets/australian_postcodes.xlsx"
needed_fields = ['Postcode', 'Locality', 'State', 'Long', 'Lat']

# Read the workbook and keep only the needed fields, in the order listed above
postcode_df = pd.read_excel(postcode_source).loc[:, needed_fields]

postcode_df

Unnamed: 0,Postcode,Locality,State,Long,Lat
0,200,ANU,ACT,149.119000,-35.277700
1,200,Australian National University,ACT,149.118900,-35.277700
2,800,DARWIN,NT,130.836680,-12.458684
3,800,DARWIN CITY,NT,130.836680,-12.458684
4,801,DARWIN,NT,130.836680,-12.458684
...,...,...,...,...,...
18521,9013,BRISBANE,QLD,152.823141,-27.603479
18522,9015,BRISBANE,QLD,152.823141,-27.603479
18523,9464,NORTHGATE MC,QLD,153.074982,-27.397055
18524,9726,GOLD COAST MC,QLD,153.412197,-28.008783


In [None]:
# Dropping duplicate locations: keep the first row of every
# (Postcode, Long, Lat) combination, then standardise the data types.
# The dtype change uses a single astype() call rather than assigning through
# .loc[:, column]: .loc assignment tries to write into the existing column in
# place, and writing strings into the integer Postcode column that way is
# deprecated in recent pandas releases.
postcode_filter = (
    postcode_df[['Postcode', 'Locality', 'State', 'Long', 'Lat']]
    .drop_duplicates(subset=['Postcode', 'Long', 'Lat'], keep="first")
    .astype({'Postcode': str, 'Locality': str, 'State': str})
)

# Standardising data format
# Postcodes are left-padded with zeros to 4 characters (e.g. "200" -> "0200")
postcode_filter['Postcode'] = postcode_filter['Postcode'].str.zfill(4)
# Locality and state are upper-cased so later de-duplication and joins on
# these columns are not affected by letter case
postcode_filter['Locality'] = postcode_filter['Locality'].str.upper()
postcode_filter['State'] = postcode_filter['State'].str.upper()

postcode_filter.head(5)

Unnamed: 0,Postcode,Locality,State,Long,Lat
0,200,ANU,ACT,149.119,-35.2777
1,200,AUSTRALIAN NATIONAL UNIVERSITY,ACT,149.1189,-35.2777
2,800,DARWIN,NT,130.83668,-12.458684
4,801,DARWIN,NT,130.83668,-12.458684
5,803,WAGAIT BEACH,NT,130.745908,-12.433991


In [None]:
# Creating the primary keys for the location data
# One row per distinct locality name, in order of first appearance. Rows of
# postcode_filter that share a locality name will therefore share an id.
suburbs = (
    postcode_filter['Locality']
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)

# Sequential ids "SUB01", "SUB02", ... (at least two digits, wider as needed),
# placed as the first column
suburbs.insert(0, 'locality_id',
               [f'SUB{position:02d}' for position in range(1, len(suburbs) + 1)])

suburbs

Unnamed: 0,locality_id,Locality
0,SUB01,ANU
1,SUB02,AUSTRALIAN NATIONAL UNIVERSITY
2,SUB03,DARWIN
3,SUB04,WAGAIT BEACH
4,SUB05,PARAP
...,...,...
4564,SUB4565,LITTLE LONSDALE STREET
4565,SUB4566,DANDENONG
4566,SUB4567,NORTHGATE MC
4567,SUB4568,GOLD COAST MC


In [None]:
# Standardising data type of the join column on both sides
suburbs = suburbs.astype({'Locality': str})

# Final lower-case column names for the location data
renamed_columns = {
    'Postcode': 'postcode',
    'Locality': 'locality',
    'State': 'state_code',
    'Long': 'long',
    'Lat': 'lat',
}

# Merging the location data to add primary keys (look-up on the locality
# name; the row index of postcode_filter is kept), then renaming the columns
postcode_filter = (
    postcode_filter
    .astype({'Locality': str})
    .join(suburbs.set_index('Locality'), on='Locality')
    .rename(columns=renamed_columns)
)

postcode_filter

Unnamed: 0,postcode,locality,state_code,long,lat,locality_id
0,0200,ANU,ACT,149.119000,-35.277700,SUB01
1,0200,AUSTRALIAN NATIONAL UNIVERSITY,ACT,149.118900,-35.277700,SUB02
2,0800,DARWIN,NT,130.836680,-12.458684,SUB03
4,0801,DARWIN,NT,130.836680,-12.458684,SUB03
5,0803,WAGAIT BEACH,NT,130.745908,-12.433991,SUB04
...,...,...,...,...,...,...
18521,9013,BRISBANE,QLD,152.823141,-27.603479,SUB2386
18522,9015,BRISBANE,QLD,152.823141,-27.603479,SUB2386
18523,9464,NORTHGATE MC,QLD,153.074982,-27.397055,SUB4567
18524,9726,GOLD COAST MC,QLD,153.412197,-28.008783,SUB4568


## State List Master Table

In [None]:
# Obtaining the unique states, in order of first appearance in the location data
state_list = (
    postcode_filter['state_code']
    .drop_duplicates()
    .reset_index(drop=True)   # discard the original row labels instead of keeping an unused column
    .to_frame()
)

# Assigning unique ID to each state ("STATE01", "STATE02", ...), as the first column.
# The numbering follows the order above, so it depends on the row order of the data.
state_list.insert(0, 'state_id',
                  ['STATE{:02d}'.format(i + 1) for i in range(len(state_list))])

# Mapping full state names
full_state = {"ACT": "AUSTRALIAN CAPITAL TERRITORY",
              "NT": "NORTHERN TERRITORY",
              "SA": "SOUTH AUSTRALIA",
              "WA": "WESTERN AUSTRALIA",
              "NSW": "NEW SOUTH WALES",
              "QLD": "QUEENSLAND",
              "VIC": "VICTORIA",
              "TAS": "TASMANIA"}

# A state code that is not a key of full_state gets a missing state_name
state_list['state_name'] = state_list['state_code'].map(full_state)

state_list

Unnamed: 0,state_id,state_code,state_name
0,STATE01,ACT,AUSTRALIAN CAPITAL TERRITORY
1,STATE02,NT,NORTHERN TERRITORY
2,STATE03,SA,SOUTH AUSTRALIA
3,STATE04,WA,WESTERN AUSTRALIA
4,STATE05,NSW,NEW SOUTH WALES
5,STATE06,QLD,QUEENSLAND
6,STATE07,VIC,VICTORIA
7,STATE08,TAS,TASMANIA


In [None]:
# Exporting to csv
# Written to the current working directory; with to_csv's defaults the row
# index is written as the first column.
state_csv_path = "state_list.csv"
state_list.to_csv(state_csv_path)

Assigning state_id to the location data and filtering it to Victorian suburbs

In [None]:
# Assigning state_id to location data (look-up on the state code)
postcode_filter = postcode_filter.join(state_list.set_index('state_code'), on='state_code')

# Filtering the data to Victoria suburbs.
# The filter uses the state code rather than a literal state_id: state_id
# values are numbered by the order in which states first appear in the data,
# so a hard-coded id such as 'STATE07' would select a different state if that
# order ever changed.
postcode_filter = postcode_filter[postcode_filter['state_code'] == 'VIC']

# Keeping only the fields needed in the exported location table
postcode_filter = postcode_filter[['locality_id', 'state_id', 'postcode', 'locality',
                                   'long', 'lat']]
postcode_filter

Unnamed: 0,locality_id,state_id,postcode,locality,long,lat
6194,LOC1709,STATE07,3000,MELBOURNE,144.982585,-37.814437
6195,LOC1710,STATE07,3001,MELBOURNE,144.982585,-37.814437
6196,LOC1711,STATE07,3002,EAST MELBOURNE,144.982585,-37.814437
6197,LOC1712,STATE07,3003,WEST MELBOURNE,144.949592,-37.810871
6198,LOC1713,STATE07,3004,MELBOURNE,144.982585,-37.814437
...,...,...,...,...,...,...
18509,LOC5113,STATE07,8383,MELBOURNE,144.982585,-37.814437
18510,LOC5114,STATE07,8438,SUNSHINE WEST,144.811079,-37.798099
18511,LOC5115,STATE07,8511,SUNSHINE WEST,144.811079,-37.798099
18512,LOC5116,STATE07,8785,DANDENONG,145.208504,-38.016114


In [None]:
# Exporting to csv
# Written to the current working directory; with to_csv's defaults the row
# index is written as the first column.
location_csv_path = 'location_data.csv'
postcode_filter.to_csv(location_csv_path)