## Purpose: scrape the WA high school ATAR rankings for 2016-2020 and build the tables that will later be loaded into the SQL database

In [44]:
# import the required library
import pandas as pd

In [2]:
# Better Education results page holding the 2020 WA school ATAR rankings
url_secondary_wa_2020 = "https://bettereducation.com.au/Results/WA/wace.aspx?yr=2020"

# `read_html` returns a list of the tables found on the page; the first one is used.
# Its "Unnamed: 0" column is relabelled "Year" and every row is set to 2020.
secondary_wa_2020 = (
    pd.read_html(url_secondary_wa_2020, header=0)[0]
    .rename(columns={"Unnamed: 0": "Year"})
    .assign(Year=2020)
)
secondary_wa_2020

Unnamed: 0,Year,Better Education Rank,School,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,% students with an ATAR,Trend / Compare
0,2020,1,Perth Modern School,97.55,242,242,100.00,Trend / Compare
1,2020,2,St Hilda's Anglican School for Girls,92.70,147,134,91.16,Trend / Compare
2,2020,3,Christ Church Grammar School,92.50,190,175,92.11,Trend / Compare
3,2020,4,Penrhos College,90.65,135,116,85.93,Trend / Compare
4,2020,5,Methodist Ladies' College,90.55,126,112,88.89,Trend / Compare
...,...,...,...,...,...,...,...,...
134,2020,135,Balcatta Senior High School,60.40,76,20,26.32,Trend / Compare
135,2020,136,Safety Bay Senior High School,58.75,115,23,20.00,Trend / Compare
136,2020,137,Darling Range Sports College,52.40,129,44,34.11,Trend / Compare
137,2020,138,Gilmore College,52.25,141,23,16.31,Trend / Compare


In [3]:
# Rename headers ahead of the PostgreSQL upload (per the original note, "%" in a
# column name causes an error there), then select the columns in their final
# order; "Trend / Compare" is not required and is left out.
secondary_wa_2020 = secondary_wa_2020.rename(
    columns={
        "Better Education Rank": "Rank",
        "% students with an ATAR": "Percentage of students with an ATAR",
    }
)[
    [
        "School",
        "Year",
        "Rank",
        "Median ATAR",
        "No. eligible Yr 12 students",
        "No. Students with an ATAR",
        "Percentage of students with an ATAR",
    ]
]
secondary_wa_2020

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR
0,Perth Modern School,2020,1,97.55,242,242,100.00
1,St Hilda's Anglican School for Girls,2020,2,92.70,147,134,91.16
2,Christ Church Grammar School,2020,3,92.50,190,175,92.11
3,Penrhos College,2020,4,90.65,135,116,85.93
4,Methodist Ladies' College,2020,5,90.55,126,112,88.89
...,...,...,...,...,...,...,...
134,Balcatta Senior High School,2020,135,60.40,76,20,26.32
135,Safety Bay Senior High School,2020,136,58.75,115,23,20.00
136,Darling Range Sports College,2020,137,52.40,129,44,34.11
137,Gilmore College,2020,138,52.25,141,23,16.31


In [4]:
# Normalise school names so they match the spelling used by the DataFrames this
# table is merged with later on; unmatched names would yield NaN rows in those
# left merges.
#
# The result is assigned back through `assign` instead of calling
# `replace(..., inplace=True)` on the selected column: the in-place call acts on
# an intermediate Series and does not update the DataFrame under pandas
# Copy-on-Write.
#
# The mapping used to list "John Curtin College Of The Arts" twice; only the last
# entry ever took effect, so that single effective entry is kept.
secondary_wa_2020 = secondary_wa_2020.assign(
    School=secondary_wa_2020["School"].replace({
        "Australian Islamic College": "Australian Islamic College - Dianella",
        "Christian Brothers College,Fremantle,WA,6160": "Christian Brothers' College",
        "Australian Islamic College (Kewdale)": "Australian Islamic College - Kewdale",
        "Serpentine-Jarrahdale Grammar School": "Court Grammar School",
        "Western Australian College of Agriculture (Cunderdin)": "Western Australian College Of Agriculture - Cunderdin",
        "Carey Baptist College,Harrisdale,WA,6112": "Carey Baptist College",
        "St Mary MacKillop College": "St Mary Mackillop College",
        "John Curtin College Of The Arts": "John Curtin College of the Arts",
        "St Stephen's School (Carramar)": "St Stephen's School - Carramar",
        "St Stephen's School (Duncraig)": "St Stephen's School - Duncraig",
        "St Stephen's School": "St Stephen's School - Carramar",
        "Irene McCormack Catholic Colle": "Irene McCormack Catholic College",
        "Swan Valley Anglican Community": "Swan Valley Anglican Community School",
        "Peter Moyes Anglican Community": "Peter Moyes Anglican Community School",
        "Great Southern Grammar School": "Great Southern Grammar",
    })
)

In [5]:
# Load the school coordinates lookup table (School, Suburb, Longitude, Latitude
# in the displayed output) from the Resources folder
school_coordinates = pd.read_csv(filepath_or_buffer="Resources/school_coordinates.csv")
school_coordinates

Unnamed: 0,School,Suburb,Longitude,Latitude
0,Adam Road Primary School,South Bunbury,115.635371,-33.357951
1,Al-Hidayah Islamic School,Bentley,115.911564,-32.010358
2,Albany Community Kindergarten,Albany,117.880987,-35.021460
3,Albany Primary School,Albany,117.891611,-35.018212
4,Albany Secondary Education Support Centre,Albany,117.866867,-34.990749
...,...,...,...,...
1135,Yuluma Primary School,Innaloo,115.789990,-31.892268
1136,Yuna Primary School,Yuna,115.002648,-28.326566
1137,Woodthorpe School,Willetton,115.901720,-32.056540
1138,St Stephen's School - Carramar,Carramar,115.788280,-31.717040


In [6]:
# Left-join the coordinates onto the 2020 rankings by school name so that every
# ranked school is kept; schools without a match get NaN coordinates.
wa_hs_atar_2020_coord = secondary_wa_2020.merge(school_coordinates, how="left", on="School")
wa_hs_atar_2020_coord

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude
0,Perth Modern School,2020,1,97.55,242,242,100.00,Subiaco,115.836396,-31.945066
1,St Hilda's Anglican School for Girls,2020,2,92.70,147,134,91.16,Mosman Park,115.767812,-32.005047
2,Christ Church Grammar School,2020,3,92.50,190,175,92.11,Claremont,115.776987,-31.985461
3,Penrhos College,2020,4,90.65,135,116,85.93,Como,115.872539,-31.997431
4,Methodist Ladies' College,2020,5,90.55,126,112,88.89,Claremont,115.775540,-31.987012
...,...,...,...,...,...,...,...,...,...,...
134,Balcatta Senior High School,2020,135,60.40,76,20,26.32,Balcatta,115.812243,-31.875517
135,Safety Bay Senior High School,2020,136,58.75,115,23,20.00,Safety Bay,115.740964,-32.303286
136,Darling Range Sports College,2020,137,52.40,129,44,34.11,Forrestfield,116.010860,-31.975661
137,Gilmore College,2020,138,52.25,141,23,16.31,Orelia,115.814803,-32.238266


In [7]:
# Merge sanity check: the mask is True where no coordinates were matched,
# i.e. where "Longitude" is NaN
bool_series = wa_hs_atar_2020_coord["Longitude"].isna()

# show only the unmatched rows; an empty table means every school was matched
wa_hs_atar_2020_coord.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude


In [8]:
# Load the sector lookup table (School, Sector, Gender, Religion in the
# displayed output) from the Resources folder
sectors = pd.read_csv(filepath_or_buffer="Resources/sectors.csv")
sectors

Unnamed: 0,School,Sector,Gender,Religion
0,Mercedes College,Non-government,Girls,Catholic
1,Wesley College,Non-government,Boys,Christian
2,Penrhos College,Non-government,Girls,Uniting
3,Newman College,Non-government,Co-ed,Catholic
4,St Norbert College,Non-government,Co-ed,Catholic
...,...,...,...,...
188,Bruce Rock District High School,Government,Co-ed,Secular
189,Shark Bay School,Government,Co-ed,Secular
190,Pemberton District High School,Government,Co-ed,Secular
191,Bunbury John Calvin School,Non-government,Co-ed,Christian


In [9]:
# Left-join the sector details onto the 2020 rankings + coordinates by school
# name; schools without a match get NaN in the sector columns.
wa_hs_atar_2020 = wa_hs_atar_2020_coord.merge(sectors, how="left", on="School")
wa_hs_atar_2020

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion
0,Perth Modern School,2020,1,97.55,242,242,100.00,Subiaco,115.836396,-31.945066,Government,Co-ed,Secular
1,St Hilda's Anglican School for Girls,2020,2,92.70,147,134,91.16,Mosman Park,115.767812,-32.005047,Non-government,Girls,Anglican
2,Christ Church Grammar School,2020,3,92.50,190,175,92.11,Claremont,115.776987,-31.985461,Non-government,Boys,Anglican
3,Penrhos College,2020,4,90.65,135,116,85.93,Como,115.872539,-31.997431,Non-government,Girls,Uniting
4,Methodist Ladies' College,2020,5,90.55,126,112,88.89,Claremont,115.775540,-31.987012,Non-government,Girls,Methodist
...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,Balcatta Senior High School,2020,135,60.40,76,20,26.32,Balcatta,115.812243,-31.875517,Government,Co-ed,Secular
135,Safety Bay Senior High School,2020,136,58.75,115,23,20.00,Safety Bay,115.740964,-32.303286,Government,Co-ed,Secular
136,Darling Range Sports College,2020,137,52.40,129,44,34.11,Forrestfield,116.010860,-31.975661,Government,Co-ed,Secular
137,Gilmore College,2020,138,52.25,141,23,16.31,Orelia,115.814803,-32.238266,Government,Co-ed,Secular


In [10]:
# Merge sanity check: the mask is True where no sector details were matched,
# i.e. where "Sector" is NaN
bool_series = wa_hs_atar_2020["Sector"].isna()

# show only the unmatched rows; an empty table means every school was matched
wa_hs_atar_2020.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion


In [11]:
# Write the merged 2020 table to CSV, leaving out the row index
wa_hs_atar_2020.to_csv(path_or_buf="Resources/wa_hs_atar_2020.csv", index=False)

In [12]:
# Better Education results page holding the 2019 WA school ATAR rankings
url_secondary_wa_2019 = "https://bettereducation.com.au/Results/WA/wace.aspx?yr=2019"

# `read_html` returns a list of the tables found on the page; the first one is used.
# Its "Unnamed: 0" column is relabelled "Year" and every row is set to 2019.
secondary_wa_2019 = (
    pd.read_html(url_secondary_wa_2019, header=0)[0]
    .rename(columns={"Unnamed: 0": "Year"})
    .assign(Year=2019)
)
secondary_wa_2019

Unnamed: 0,Year,Better Education Rank,School,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,% students with an ATAR,Trend / Compare
0,2019,1,Perth Modern School,96.75,241,238,98.76,Trend / Compare
1,2019,2,Hale School,92.85,201,177,88.06,Trend / Compare
2,2019,3,Christ Church Grammar School,92.50,199,178,89.45,Trend / Compare
3,2019,4,St Mary's Anglican Girls' School,91.15,161,137,85.09,Trend / Compare
4,2019,5,Presbyterian Ladies' College,90.75,123,116,94.31,Trend / Compare
...,...,...,...,...,...,...,...,...
137,2019,138,Warwick Senior High School,60.80,77,24,31.17,Trend / Compare
138,2019,139,Byford Secondary College,60.00,134,45,33.58,Trend / Compare
139,2019,140,Hedland Senior High School,55.95,80,21,26.25,Trend / Compare
140,2019,141,Merredin College,55.20,54,23,42.59,Trend / Compare


In [13]:
# Rename headers ahead of the PostgreSQL upload (per the original note, "%" in a
# column name causes an error there), then select the columns in their final
# order; "Trend / Compare" is not required and is left out.
secondary_wa_2019 = secondary_wa_2019.rename(
    columns={
        "Better Education Rank": "Rank",
        "% students with an ATAR": "Percentage of students with an ATAR",
    }
)[
    [
        "School",
        "Year",
        "Rank",
        "Median ATAR",
        "No. eligible Yr 12 students",
        "No. Students with an ATAR",
        "Percentage of students with an ATAR",
    ]
]

secondary_wa_2019

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR
0,Perth Modern School,2019,1,96.75,241,238,98.76
1,Hale School,2019,2,92.85,201,177,88.06
2,Christ Church Grammar School,2019,3,92.50,199,178,89.45
3,St Mary's Anglican Girls' School,2019,4,91.15,161,137,85.09
4,Presbyterian Ladies' College,2019,5,90.75,123,116,94.31
...,...,...,...,...,...,...,...
137,Warwick Senior High School,2019,138,60.80,77,24,31.17
138,Byford Secondary College,2019,139,60.00,134,45,33.58
139,Hedland Senior High School,2019,140,55.95,80,21,26.25
140,Merredin College,2019,141,55.20,54,23,42.59


In [14]:
# Normalise school names so they match the spelling used by the DataFrames this
# table is merged with later on; unmatched names would yield NaN rows in those
# left merges.
#
# The result is assigned back through `assign` instead of calling
# `replace(..., inplace=True)` on the selected column: the in-place call acts on
# an intermediate Series and does not update the DataFrame under pandas
# Copy-on-Write.
#
# The mapping used to list "John Curtin College Of The Arts" twice; only the last
# entry ever took effect, so that single effective entry is kept.
secondary_wa_2019 = secondary_wa_2019.assign(
    School=secondary_wa_2019["School"].replace({
        "Australian Islamic College": "Australian Islamic College - Dianella",
        "Christian Brothers College,Fremantle,WA,6160": "Christian Brothers' College",
        "Australian Islamic College (Kewdale)": "Australian Islamic College - Kewdale",
        "Serpentine-Jarrahdale Grammar School": "Court Grammar School",
        "Western Australian College of Agriculture (Cunderdin)": "Western Australian College Of Agriculture - Cunderdin",
        "Carey Baptist College,Harrisdale,WA,6112": "Carey Baptist College",
        "St Mary MacKillop College": "St Mary Mackillop College",
        "John Curtin College Of The Arts": "John Curtin College of the Arts",
        "St Stephen's School (Carramar)": "St Stephen's School - Carramar",
        "St Stephen's School (Duncraig)": "St Stephen's School - Duncraig",
        "St Stephen's School": "St Stephen's School - Carramar",
        "Irene McCormack Catholic Colle": "Irene McCormack Catholic College",
        "Swan Valley Anglican Community": "Swan Valley Anglican Community School",
        "Peter Moyes Anglican Community": "Peter Moyes Anglican Community School",
        "Great Southern Grammar School": "Great Southern Grammar",
    })
)

In [15]:
# Left-join the coordinates onto the 2019 rankings by school name so that every
# ranked school is kept; schools without a match get NaN coordinates.
wa_hs_atar_2019_coord = secondary_wa_2019.merge(school_coordinates, how="left", on="School")
wa_hs_atar_2019_coord

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude
0,Perth Modern School,2019,1,96.75,241,238,98.76,Subiaco,115.836396,-31.945066
1,Hale School,2019,2,92.85,201,177,88.06,Wembley Downs,115.783783,-31.912322
2,Christ Church Grammar School,2019,3,92.50,199,178,89.45,Claremont,115.776987,-31.985461
3,St Mary's Anglican Girls' School,2019,4,91.15,161,137,85.09,Karrinyup,115.766244,-31.876971
4,Presbyterian Ladies' College,2019,5,90.75,123,116,94.31,Peppermint Grove,115.764906,-31.993592
...,...,...,...,...,...,...,...,...,...,...
137,Warwick Senior High School,2019,138,60.80,77,24,31.17,Warwick,115.815429,-31.838675
138,Byford Secondary College,2019,139,60.00,134,45,33.58,Byford,116.001879,-32.223432
139,Hedland Senior High School,2019,140,55.95,80,21,26.25,South Hedland,118.598616,-20.398789
140,Merredin College,2019,141,55.20,54,23,42.59,Merredin,118.284982,-31.488090


In [16]:
# Merge sanity check: the mask is True where no coordinates were matched,
# i.e. where "Longitude" is NaN
bool_series = wa_hs_atar_2019_coord["Longitude"].isna()

# show only the unmatched rows; an empty table means every school was matched
wa_hs_atar_2019_coord.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude


In [17]:
# Left-join the sector details onto the 2019 rankings + coordinates by school
# name; schools without a match get NaN in the sector columns.
wa_hs_atar_2019 = wa_hs_atar_2019_coord.merge(sectors, how="left", on="School")
wa_hs_atar_2019

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion
0,Perth Modern School,2019,1,96.75,241,238,98.76,Subiaco,115.836396,-31.945066,Government,Co-ed,Secular
1,Hale School,2019,2,92.85,201,177,88.06,Wembley Downs,115.783783,-31.912322,Non-government,Boys,Anglican
2,Christ Church Grammar School,2019,3,92.50,199,178,89.45,Claremont,115.776987,-31.985461,Non-government,Boys,Anglican
3,St Mary's Anglican Girls' School,2019,4,91.15,161,137,85.09,Karrinyup,115.766244,-31.876971,Non-government,Girls,Anglican
4,Presbyterian Ladies' College,2019,5,90.75,123,116,94.31,Peppermint Grove,115.764906,-31.993592,Non-government,Girls,Uniting
...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,Warwick Senior High School,2019,138,60.80,77,24,31.17,Warwick,115.815429,-31.838675,Government,Co-ed,Secular
138,Byford Secondary College,2019,139,60.00,134,45,33.58,Byford,116.001879,-32.223432,Government,Co-ed,Secular
139,Hedland Senior High School,2019,140,55.95,80,21,26.25,South Hedland,118.598616,-20.398789,Government,Co-ed,Secular
140,Merredin College,2019,141,55.20,54,23,42.59,Merredin,118.284982,-31.488090,Government,Co-ed,Secular


In [18]:
# Merge sanity check: the mask is True where no sector details were matched,
# i.e. where "Sector" is NaN
bool_series = wa_hs_atar_2019["Sector"].isna()

# show only the unmatched rows; an empty table means every school was matched
wa_hs_atar_2019.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion


In [19]:
# Write the merged 2019 table to CSV, leaving out the row index
wa_hs_atar_2019.to_csv(path_or_buf="Resources/wa_hs_atar_2019.csv", index=False)

In [20]:
# Better Education results page holding the 2018 WA school ATAR rankings
url_secondary_wa_2018 = "https://bettereducation.com.au/Results/WA/wace.aspx?yr=2018"

# `read_html` returns a list of the tables found on the page; the first one is used.
# Its "Unnamed: 0" column is relabelled "Year" and every row is set to 2018.
secondary_wa_2018 = (
    pd.read_html(url_secondary_wa_2018, header=0)[0]
    .rename(columns={"Unnamed: 0": "Year"})
    .assign(Year=2018)
)
secondary_wa_2018

Unnamed: 0,Year,Better Education Rank,School,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,% students with an ATAR,Trend / Compare
0,2018,1,Perth Modern School,97.00,240,236,98.33,Trend / Compare
1,2018,2,Carmel School,92.60,33,31,93.94,Trend / Compare
2,2018,3,St Mary's Anglican Girls' School,92.10,175,150,85.71,Trend / Compare
3,2018,4,Methodist Ladies' College,92.00,139,133,95.68,Trend / Compare
4,2018,5,St Hilda's Anglican School for Girls,91.65,141,135,95.74,Trend / Compare
...,...,...,...,...,...,...,...,...
143,2018,144,Newton Moore Senior High School,55.55,81,29,35.80,Trend / Compare
144,2018,145,Northam Senior High School,53.70,88,27,30.68,Trend / Compare
145,2018,146,Dalyellup College,51.90,70,29,41.43,Trend / Compare
146,2018,147,Rockingham Senior High School,47.90,98,22,22.45,Trend / Compare


In [21]:
# Rename headers ahead of the PostgreSQL upload (per the original note, "%" in a
# column name causes an error there), then select the columns in their final
# order; "Trend / Compare" is not required and is left out.
secondary_wa_2018 = secondary_wa_2018.rename(
    columns={
        "Better Education Rank": "Rank",
        "% students with an ATAR": "Percentage of students with an ATAR",
    }
)[
    [
        "School",
        "Year",
        "Rank",
        "Median ATAR",
        "No. eligible Yr 12 students",
        "No. Students with an ATAR",
        "Percentage of students with an ATAR",
    ]
]

secondary_wa_2018

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR
0,Perth Modern School,2018,1,97.00,240,236,98.33
1,Carmel School,2018,2,92.60,33,31,93.94
2,St Mary's Anglican Girls' School,2018,3,92.10,175,150,85.71
3,Methodist Ladies' College,2018,4,92.00,139,133,95.68
4,St Hilda's Anglican School for Girls,2018,5,91.65,141,135,95.74
...,...,...,...,...,...,...,...
143,Newton Moore Senior High School,2018,144,55.55,81,29,35.80
144,Northam Senior High School,2018,145,53.70,88,27,30.68
145,Dalyellup College,2018,146,51.90,70,29,41.43
146,Rockingham Senior High School,2018,147,47.90,98,22,22.45


In [22]:
# Normalise school names so they match the spelling used by the DataFrames this
# table is merged with later on; unmatched names would yield NaN rows in those
# left merges.
#
# The result is assigned back through `assign` instead of calling
# `replace(..., inplace=True)` on the selected column: the in-place call acts on
# an intermediate Series and does not update the DataFrame under pandas
# Copy-on-Write.
#
# The mapping used to list "John Curtin College Of The Arts" twice; only the last
# entry ever took effect, so that single effective entry is kept.
secondary_wa_2018 = secondary_wa_2018.assign(
    School=secondary_wa_2018["School"].replace({
        "Australian Islamic College": "Australian Islamic College - Dianella",
        "Christian Brothers College,Fremantle,WA,6160": "Christian Brothers' College",
        "Australian Islamic College (Kewdale)": "Australian Islamic College - Kewdale",
        "Serpentine-Jarrahdale Grammar School": "Court Grammar School",
        "Western Australian College of Agriculture (Cunderdin)": "Western Australian College Of Agriculture - Cunderdin",
        "Carey Baptist College,Harrisdale,WA,6112": "Carey Baptist College",
        "St Mary MacKillop College": "St Mary Mackillop College",
        "John Curtin College Of The Arts": "John Curtin College of the Arts",
        "St Stephen's School (Carramar)": "St Stephen's School - Carramar",
        "St Stephen's School (Duncraig)": "St Stephen's School - Duncraig",
        "St Stephen's School": "St Stephen's School - Carramar",
        "Irene McCormack Catholic Colle": "Irene McCormack Catholic College",
        "Swan Valley Anglican Community": "Swan Valley Anglican Community School",
        "Peter Moyes Anglican Community": "Peter Moyes Anglican Community School",
        "Great Southern Grammar School": "Great Southern Grammar",
    })
)

In [23]:
# Left-join the coordinates onto the 2018 rankings by school name so that every
# ranked school is kept; schools without a match get NaN coordinates.
wa_hs_atar_2018_coord = secondary_wa_2018.merge(school_coordinates, how="left", on="School")
wa_hs_atar_2018_coord

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude
0,Perth Modern School,2018,1,97.00,240,236,98.33,Subiaco,115.836396,-31.945066
1,Carmel School,2018,2,92.60,33,31,93.94,Dianella,115.862075,-31.899107
2,St Mary's Anglican Girls' School,2018,3,92.10,175,150,85.71,Karrinyup,115.766244,-31.876971
3,Methodist Ladies' College,2018,4,92.00,139,133,95.68,Claremont,115.775540,-31.987012
4,St Hilda's Anglican School for Girls,2018,5,91.65,141,135,95.74,Mosman Park,115.767812,-32.005047
...,...,...,...,...,...,...,...,...,...,...
143,Newton Moore Senior High School,2018,144,55.55,81,29,35.80,South Bunbury,115.635421,-33.359211
144,Northam Senior High School,2018,145,53.70,88,27,30.68,Northam,116.667649,-31.647256
145,Dalyellup College,2018,146,51.90,70,29,41.43,Dalyellup,115.625767,-33.403855
146,Rockingham Senior High School,2018,147,47.90,98,22,22.45,Rockingham,115.735520,-32.284952


In [24]:
# Merge sanity check: the mask is True where no coordinates were matched,
# i.e. where "Longitude" is NaN
bool_series = wa_hs_atar_2018_coord["Longitude"].isna()

# show only the unmatched rows; an empty table means every school was matched
wa_hs_atar_2018_coord.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude


In [25]:
# Left-join the sector details onto the 2018 rankings + coordinates by school
# name; schools without a match get NaN in the sector columns.
wa_hs_atar_2018 = wa_hs_atar_2018_coord.merge(sectors, how="left", on="School")
wa_hs_atar_2018

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion
0,Perth Modern School,2018,1,97.00,240,236,98.33,Subiaco,115.836396,-31.945066,Government,Co-ed,Secular
1,Carmel School,2018,2,92.60,33,31,93.94,Dianella,115.862075,-31.899107,Non-government,Co-ed,Jewish
2,St Mary's Anglican Girls' School,2018,3,92.10,175,150,85.71,Karrinyup,115.766244,-31.876971,Non-government,Girls,Anglican
3,Methodist Ladies' College,2018,4,92.00,139,133,95.68,Claremont,115.775540,-31.987012,Non-government,Girls,Methodist
4,St Hilda's Anglican School for Girls,2018,5,91.65,141,135,95.74,Mosman Park,115.767812,-32.005047,Non-government,Girls,Anglican
...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,Newton Moore Senior High School,2018,144,55.55,81,29,35.80,South Bunbury,115.635421,-33.359211,Government,Co-ed,Secular
144,Northam Senior High School,2018,145,53.70,88,27,30.68,Northam,116.667649,-31.647256,Government,Co-ed,Secular
145,Dalyellup College,2018,146,51.90,70,29,41.43,Dalyellup,115.625767,-33.403855,Government,Co-ed,Secular
146,Rockingham Senior High School,2018,147,47.90,98,22,22.45,Rockingham,115.735520,-32.284952,Government,Co-ed,Secular


In [26]:
# Merge sanity check: the mask is True where no sector details were matched,
# i.e. where "Sector" is NaN
bool_series = wa_hs_atar_2018["Sector"].isna()

# show only the unmatched rows; an empty table means every school was matched
wa_hs_atar_2018.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion


In [27]:
# Write the merged 2018 table to CSV, leaving out the row index
wa_hs_atar_2018.to_csv(path_or_buf="Resources/wa_hs_atar_2018.csv", index=False)

In [28]:
# Better Education results page holding the 2017 WA school ATAR rankings
url_secondary_wa_2017 = "https://bettereducation.com.au/Results/WA/wace.aspx?yr=2017"

# `read_html` returns a list of the tables found on the page; the first one is used.
# Its "Unnamed: 0" column is relabelled "Year" and every row is set to 2017.
secondary_wa_2017 = (
    pd.read_html(url_secondary_wa_2017, header=0)[0]
    .rename(columns={"Unnamed: 0": "Year"})
    .assign(Year=2017)
)
secondary_wa_2017

Unnamed: 0,Year,Better Education Rank,School,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,% students with an ATAR,Trend / Compare
0,2017,1,Perth Modern School,95.90,223,222,99.55,Trend / Compare
1,2017,2,St Hilda's Anglican School for Girls,93.60,142,133,93.66,Trend / Compare
2,2017,3,Christ Church Grammar School,92.45,182,159,87.36,Trend / Compare
3,2017,4,Penrhos College,91.20,139,119,85.61,Trend / Compare
4,2017,5,St Mary's Anglican Girls' School,90.95,174,157,90.23,Trend / Compare
...,...,...,...,...,...,...,...,...
141,2017,142,Butler College,57.25,214,71,33.18,Trend / Compare
142,2017,143,Warnbro Community High School,56.35,99,21,21.21,Trend / Compare
143,2017,144,Safety Bay Senior High School,56.05,130,31,23.85,Trend / Compare
144,2017,145,Southern River College,55.40,96,24,25.00,Trend / Compare


In [29]:
# Rename headers ahead of the PostgreSQL upload (per the original note, "%" in a
# column name causes an error there), then select the columns in their final
# order; "Trend / Compare" is not required and is left out.
secondary_wa_2017 = secondary_wa_2017.rename(
    columns={
        "Better Education Rank": "Rank",
        "% students with an ATAR": "Percentage of students with an ATAR",
    }
)[
    [
        "School",
        "Year",
        "Rank",
        "Median ATAR",
        "No. eligible Yr 12 students",
        "No. Students with an ATAR",
        "Percentage of students with an ATAR",
    ]
]

secondary_wa_2017

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR
0,Perth Modern School,2017,1,95.90,223,222,99.55
1,St Hilda's Anglican School for Girls,2017,2,93.60,142,133,93.66
2,Christ Church Grammar School,2017,3,92.45,182,159,87.36
3,Penrhos College,2017,4,91.20,139,119,85.61
4,St Mary's Anglican Girls' School,2017,5,90.95,174,157,90.23
...,...,...,...,...,...,...,...
141,Butler College,2017,142,57.25,214,71,33.18
142,Warnbro Community High School,2017,143,56.35,99,21,21.21
143,Safety Bay Senior High School,2017,144,56.05,130,31,23.85
144,Southern River College,2017,145,55.40,96,24,25.00


In [30]:
# Normalise school names so they match the spelling used by the DataFrames this
# table is merged with later on; unmatched names would yield NaN rows in those
# left merges.
#
# The result is assigned back through `assign` instead of calling
# `replace(..., inplace=True)` on the selected column: the in-place call acts on
# an intermediate Series and does not update the DataFrame under pandas
# Copy-on-Write.
#
# The mapping used to list "John Curtin College Of The Arts" twice; only the last
# entry ever took effect, so that single effective entry is kept.
secondary_wa_2017 = secondary_wa_2017.assign(
    School=secondary_wa_2017["School"].replace({
        "Australian Islamic College": "Australian Islamic College - Dianella",
        "Christian Brothers College,Fremantle,WA,6160": "Christian Brothers' College",
        "Australian Islamic College (Kewdale)": "Australian Islamic College - Kewdale",
        "Serpentine-Jarrahdale Grammar School": "Court Grammar School",
        "Western Australian College of Agriculture (Cunderdin)": "Western Australian College Of Agriculture - Cunderdin",
        "Carey Baptist College,Harrisdale,WA,6112": "Carey Baptist College",
        "St Mary MacKillop College": "St Mary Mackillop College",
        "John Curtin College Of The Arts": "John Curtin College of the Arts",
        "St Stephen's School (Carramar)": "St Stephen's School - Carramar",
        "St Stephen's School (Duncraig)": "St Stephen's School - Duncraig",
        "St Stephen's School": "St Stephen's School - Carramar",
        "Irene McCormack Catholic Colle": "Irene McCormack Catholic College",
        "Swan Valley Anglican Community": "Swan Valley Anglican Community School",
        "Peter Moyes Anglican Community": "Peter Moyes Anglican Community School",
        "Great Southern Grammar School": "Great Southern Grammar",
    })
)

In [31]:
# Left-join the coordinates onto the 2017 rankings by school name so that every
# ranked school is kept; schools without a match get NaN coordinates.
wa_hs_atar_2017_coord = secondary_wa_2017.merge(school_coordinates, how="left", on="School")
wa_hs_atar_2017_coord

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude
0,Perth Modern School,2017,1,95.90,223,222,99.55,Subiaco,115.836396,-31.945066
1,St Hilda's Anglican School for Girls,2017,2,93.60,142,133,93.66,Mosman Park,115.767812,-32.005047
2,Christ Church Grammar School,2017,3,92.45,182,159,87.36,Claremont,115.776987,-31.985461
3,Penrhos College,2017,4,91.20,139,119,85.61,Como,115.872539,-31.997431
4,St Mary's Anglican Girls' School,2017,5,90.95,174,157,90.23,Karrinyup,115.766244,-31.876971
...,...,...,...,...,...,...,...,...,...,...
141,Butler College,2017,142,57.25,214,71,33.18,Butler,115.708701,-31.651664
142,Warnbro Community High School,2017,143,56.35,99,21,21.21,Warnbro,115.759096,-32.345569
143,Safety Bay Senior High School,2017,144,56.05,130,31,23.85,Safety Bay,115.740964,-32.303286
144,Southern River College,2017,145,55.40,96,24,25.00,Gosnells,115.979918,-32.085598


In [32]:
# Merge sanity check: the mask is True where no coordinates were matched,
# i.e. where "Longitude" is NaN
bool_series = wa_hs_atar_2017_coord["Longitude"].isna()

# show only the unmatched rows; an empty table means every school was matched
wa_hs_atar_2017_coord.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude


In [33]:
# Left-join the sector details onto the 2017 rankings + coordinates by school
# name; schools without a match get NaN in the sector columns.
wa_hs_atar_2017 = wa_hs_atar_2017_coord.merge(sectors, how="left", on="School")
wa_hs_atar_2017

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion
0,Perth Modern School,2017,1,95.90,223,222,99.55,Subiaco,115.836396,-31.945066,Government,Co-ed,Secular
1,St Hilda's Anglican School for Girls,2017,2,93.60,142,133,93.66,Mosman Park,115.767812,-32.005047,Non-government,Girls,Anglican
2,Christ Church Grammar School,2017,3,92.45,182,159,87.36,Claremont,115.776987,-31.985461,Non-government,Boys,Anglican
3,Penrhos College,2017,4,91.20,139,119,85.61,Como,115.872539,-31.997431,Non-government,Girls,Uniting
4,St Mary's Anglican Girls' School,2017,5,90.95,174,157,90.23,Karrinyup,115.766244,-31.876971,Non-government,Girls,Anglican
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Butler College,2017,142,57.25,214,71,33.18,Butler,115.708701,-31.651664,Government,Co-ed,Secular
142,Warnbro Community High School,2017,143,56.35,99,21,21.21,Warnbro,115.759096,-32.345569,Government,Co-ed,Secular
143,Safety Bay Senior High School,2017,144,56.05,130,31,23.85,Safety Bay,115.740964,-32.303286,Government,Co-ed,Secular
144,Southern River College,2017,145,55.40,96,24,25.00,Gosnells,115.979918,-32.085598,Government,Co-ed,Secular


In [34]:
# Look for rows of "wa_hs_atar_2017" whose "Sector" is missing
# Boolean mask: True where "Sector" is NaN
bool_series = wa_hs_atar_2017["Sector"].isna()

# Display only the rows flagged by the mask (an empty table means no "Sector" is NaN)
wa_hs_atar_2017.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion


In [35]:
# Write the "wa_hs_atar_2017" Dataframe to a CSV file, leaving out the index column
wa_hs_atar_2017.to_csv(path_or_buf="Resources/wa_hs_atar_2017.csv", index=False)

In [36]:
# URL of the 2016 ATAR results page for WA schools
url_secondary_wa_2016 = "https://bettereducation.com.au/Results/WA/wace.aspx?yr=2016"

# Parse the page with pandas' `read_html` (first row as header) and keep the first table found.
# The "Unnamed: 0" column is renamed to "Year" and every row of it is set to 2016.
secondary_wa_2016 = (
    pd.read_html(url_secondary_wa_2016, header=0)[0]
    .rename(columns={"Unnamed: 0": "Year"})
    .assign(Year=2016)
)
secondary_wa_2016

Unnamed: 0,Year,Better Education Rank,School,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,% students with an ATAR,Trend / Compare
0,2016,1,Perth Modern School,95.55,222,220,99.10,Trend / Compare
1,2016,2,Presbyterian Ladies' College,92.90,122,110,90.16,Trend / Compare
2,2016,3,Penrhos College,92.65,145,123,84.83,Trend / Compare
3,2016,4,Christ Church Grammar School,92.50,188,171,90.96,Trend / Compare
4,2016,5,Santa Maria College,91.85,189,151,79.89,Trend / Compare
...,...,...,...,...,...,...,...,...
138,2016,139,Thornlie Senior High School,58.45,125,35,28.00,Trend / Compare
139,2016,140,Lakeland Senior High School,57.70,89,27,30.34,Trend / Compare
140,2016,141,Newton Moore Senior High School,56.95,90,35,38.89,Trend / Compare
141,2016,142,Foundation Christian College,50.65,35,26,74.29,Trend / Compare


In [37]:
# Two steps in one expression:
#  1. rename columns whose names would cause issues when uploading to PostgreSQL later on (eg. % will cause an error)
#  2. keep only the required columns, in the desired order ("Trend / Compare" is dropped since it's not required)
secondary_wa_2016 = secondary_wa_2016.rename(
    columns={
        "Better Education Rank": "Rank",
        "% students with an ATAR": "Percentage of students with an ATAR",
    }
)[
    [
        "School",
        "Year",
        "Rank",
        "Median ATAR",
        "No. eligible Yr 12 students",
        "No. Students with an ATAR",
        "Percentage of students with an ATAR",
    ]
]

secondary_wa_2016

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR
0,Perth Modern School,2016,1,95.55,222,220,99.10
1,Presbyterian Ladies' College,2016,2,92.90,122,110,90.16
2,Penrhos College,2016,3,92.65,145,123,84.83
3,Christ Church Grammar School,2016,4,92.50,188,171,90.96
4,Santa Maria College,2016,5,91.85,189,151,79.89
...,...,...,...,...,...,...,...
138,Thornlie Senior High School,2016,139,58.45,125,35,28.00
139,Lakeland Senior High School,2016,140,57.70,89,27,30.34
140,Newton Moore Senior High School,2016,141,56.95,90,35,38.89
141,Foundation Christian College,2016,142,50.65,35,26,74.29


In [38]:
# Correct the spelling of some school names so they match the names used in the other Dataframes;
# this prevents Null values when performing the left merges on "School" later on.
# Keys are the names as scraped, values are the corrected names.
school_name_fixes_2016 = {
    "Australian Islamic College": "Australian Islamic College - Dianella",
    "Christian Brothers College,Fremantle,WA,6160": "Christian Brothers' College",
    "Australian Islamic College (Kewdale)": "Australian Islamic College - Kewdale",
    "Serpentine-Jarrahdale Grammar School": "Court Grammar School",
    "Western Australian College of Agriculture (Cunderdin)": "Western Australian College Of Agriculture - Cunderdin",
    "Carey Baptist College,Harrisdale,WA,6112": "Carey Baptist College",
    "St Mary MacKillop College": "St Mary Mackillop College",
    "St Stephen's School (Carramar)": "St Stephen's School - Carramar",
    "St Stephen's School (Duncraig)": "St Stephen's School - Duncraig",
    "St Stephen's School": "St Stephen's School - Carramar",
    "Irene McCormack Catholic Colle": "Irene McCormack Catholic College",
    "Swan Valley Anglican Community": "Swan Valley Anglican Community School",
    # This key used to be listed twice with two different spellings; a dict literal keeps only
    # the last value, so the effective mapping (lower-case "the") is the one retained here.
    "John Curtin College Of The Arts": "John Curtin College of the Arts",
    "Peter Moyes Anglican Community": "Peter Moyes Anglican Community School",
    "Great Southern Grammar School": "Great Southern Grammar",
}

# Build the corrected column and assign it back to the Dataframe instead of calling
# replace(..., inplace=True) on the selected column: the in-place form acts on an intermediate
# Series, which raises a chained-assignment warning in recent pandas and leaves the Dataframe
# unchanged once Copy-on-Write is enabled.
secondary_wa_2016 = secondary_wa_2016.assign(
    School=secondary_wa_2016["School"].replace(school_name_fixes_2016)
)

In [39]:
# Left-join the "school_coordinates" Dataframe onto "secondary_wa_2016", matching rows on "School"
wa_hs_atar_2016_coord = pd.merge(
    left=secondary_wa_2016,
    right=school_coordinates,
    how="left",
    on="School",
)
wa_hs_atar_2016_coord

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude
0,Perth Modern School,2016,1,95.55,222,220,99.10,Subiaco,115.836396,-31.945066
1,Presbyterian Ladies' College,2016,2,92.90,122,110,90.16,Peppermint Grove,115.764906,-31.993592
2,Penrhos College,2016,3,92.65,145,123,84.83,Como,115.872539,-31.997431
3,Christ Church Grammar School,2016,4,92.50,188,171,90.96,Claremont,115.776987,-31.985461
4,Santa Maria College,2016,5,91.85,189,151,79.89,Attadale,115.795823,-32.018614
...,...,...,...,...,...,...,...,...,...,...
138,Thornlie Senior High School,2016,139,58.45,125,35,28.00,Thornlie,115.959641,-32.066276
139,Lakeland Senior High School,2016,140,57.70,89,27,30.34,South Lake,115.845914,-32.109055
140,Newton Moore Senior High School,2016,141,56.95,90,35,38.89,South Bunbury,115.635421,-33.359211
141,Foundation Christian College,2016,142,50.65,35,26,74.29,Greenfields,115.762289,-32.529316


In [40]:
# Look for rows of "wa_hs_atar_2016_coord" whose "Longitude" is missing
# Boolean mask: True where "Longitude" is NaN
bool_series = wa_hs_atar_2016_coord["Longitude"].isna()

# Display only the rows flagged by the mask (an empty table means no "Longitude" is NaN)
wa_hs_atar_2016_coord.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude


In [41]:
# Left-join the "sectors" Dataframe onto "wa_hs_atar_2016_coord", matching rows on "School"
wa_hs_atar_2016 = pd.merge(
    left=wa_hs_atar_2016_coord,
    right=sectors,
    how="left",
    on="School",
)
wa_hs_atar_2016

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion
0,Perth Modern School,2016,1,95.55,222,220,99.10,Subiaco,115.836396,-31.945066,Government,Co-ed,Secular
1,Presbyterian Ladies' College,2016,2,92.90,122,110,90.16,Peppermint Grove,115.764906,-31.993592,Non-government,Girls,Uniting
2,Penrhos College,2016,3,92.65,145,123,84.83,Como,115.872539,-31.997431,Non-government,Girls,Uniting
3,Christ Church Grammar School,2016,4,92.50,188,171,90.96,Claremont,115.776987,-31.985461,Non-government,Boys,Anglican
4,Santa Maria College,2016,5,91.85,189,151,79.89,Attadale,115.795823,-32.018614,Non-government,Girls,Catholic
...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Thornlie Senior High School,2016,139,58.45,125,35,28.00,Thornlie,115.959641,-32.066276,Government,Co-ed,Secular
139,Lakeland Senior High School,2016,140,57.70,89,27,30.34,South Lake,115.845914,-32.109055,Government,Co-ed,Secular
140,Newton Moore Senior High School,2016,141,56.95,90,35,38.89,South Bunbury,115.635421,-33.359211,Government,Co-ed,Secular
141,Foundation Christian College,2016,142,50.65,35,26,74.29,Greenfields,115.762289,-32.529316,Non-government,Co-ed,Christian


In [42]:
# Look for rows of "wa_hs_atar_2016" whose "Sector" is missing
# Boolean mask: True where "Sector" is NaN
bool_series = wa_hs_atar_2016["Sector"].isna()

# Display only the rows flagged by the mask (an empty table means no "Sector" is NaN)
wa_hs_atar_2016.loc[bool_series]

Unnamed: 0,School,Year,Rank,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,Percentage of students with an ATAR,Suburb,Longitude,Latitude,Sector,Gender,Religion


In [43]:
# Write the "wa_hs_atar_2016" Dataframe to a CSV file, leaving out the index column
wa_hs_atar_2016.to_csv(path_or_buf="Resources/wa_hs_atar_2016.csv", index=False)