## The purpose of this Jupyter Notebook is to create a complete list of ICSEA scores which will be used for all the schools in our datasets from 2016-2019

In [24]:
# import the required library
import pandas as pd

In [25]:
# get the 2019 url WA school ranking
rank_2019 = "https://bettereducation.com.au/school/secondary/wa/wa_top_secondary_schools.aspx?yr=2019"

# Use panda's `read_html` to parse the 2019 ranking url where the ICSEA scores are listed (years 2016 to 2018 have no ICSEA scores)
hs_wa_2019 = pd.read_html(rank_2019, header=0)[0]
hs_wa_2019

Unnamed: 0,School,Postcode,State Overall Score,English,Maths,Total Enrolments,Trend / Compare,Sector,ICSEA
0,Perth Modern School,6008,100,,,1418,Trend / Compare,Government,1239.0
1,"St Hilda's Anglican School for Girls (Inc),Mos...",6012,100,,,1087,Trend / Compare,Non-government,1197.0
2,Christ Church Grammar School,6010,100,,,1646,Trend / Compare,Non-government,1180.0
3,Rossmoyne Senior High School,6148,100,,,2188,Trend / Compare,Government,1121.0
4,"St Mary's Anglican Girls' School (Inc),Karriny...",6018,100,,,1414,Trend / Compare,Non-government,1158.0
...,...,...,...,...,...,...,...,...,...
89,"Swan Valley Anglican Community School,Aveley,W...",6069,90,,,1031,Trend / Compare,Non-government,1045.0
90,Boyup Brook District High School,6244,90,,,162,Trend / Compare,Government,1002.0
91,"Mater Dei College,Edgewater,WA,6027",6027,90,,,804,Trend / Compare,Non-government,1055.0
92,"Emmanuel Catholic College,Beeliar,WA,6164",6164,90,,,1061,Trend / Compare,Non-government,1036.0


In [26]:
# replace the values of "Australian Islamic College" to correct the spelling to prevent Null values when performing the merge with other Dataframes later on
hs_wa_2019["School"].replace({"Australian Islamic College (Kewdale),Kewdale,WA,6105": "Australian Islamic College - Kewdale",
                              "Australian Islamic College (Thornlie),Thornlie,WA,6108": "Australian Islamic College - Thornlie"
                                    }, inplace=True)

In [27]:
# Convert the values in the "ICSEA" column to a String
hs_wa_2019["ICSEA"] = hs_wa_2019["ICSEA"].astype(str)

# Remove other strings before "." to show only the ICSEA values
hs_wa_2019["ICSEA"] = hs_wa_2019["ICSEA"].str.split('.', 1).str.get(0)
hs_wa_2019

Unnamed: 0,School,Postcode,State Overall Score,English,Maths,Total Enrolments,Trend / Compare,Sector,ICSEA
0,Perth Modern School,6008,100,,,1418,Trend / Compare,Government,1239
1,"St Hilda's Anglican School for Girls (Inc),Mos...",6012,100,,,1087,Trend / Compare,Non-government,1197
2,Christ Church Grammar School,6010,100,,,1646,Trend / Compare,Non-government,1180
3,Rossmoyne Senior High School,6148,100,,,2188,Trend / Compare,Government,1121
4,"St Mary's Anglican Girls' School (Inc),Karriny...",6018,100,,,1414,Trend / Compare,Non-government,1158
...,...,...,...,...,...,...,...,...,...
89,"Swan Valley Anglican Community School,Aveley,W...",6069,90,,,1031,Trend / Compare,Non-government,1045
90,Boyup Brook District High School,6244,90,,,162,Trend / Compare,Government,1002
91,"Mater Dei College,Edgewater,WA,6027",6027,90,,,804,Trend / Compare,Non-government,1055
92,"Emmanuel Catholic College,Beeliar,WA,6164",6164,90,,,1061,Trend / Compare,Non-government,1036


In [28]:
# Remove the strings after the "," to show only the school name
hs_wa_2019["School"] = hs_wa_2019["School"].str.split(',', 1).str.get(0)
hs_wa_2019

Unnamed: 0,School,Postcode,State Overall Score,English,Maths,Total Enrolments,Trend / Compare,Sector,ICSEA
0,Perth Modern School,6008,100,,,1418,Trend / Compare,Government,1239
1,St Hilda's Anglican School for Girls (Inc),6012,100,,,1087,Trend / Compare,Non-government,1197
2,Christ Church Grammar School,6010,100,,,1646,Trend / Compare,Non-government,1180
3,Rossmoyne Senior High School,6148,100,,,2188,Trend / Compare,Government,1121
4,St Mary's Anglican Girls' School (Inc),6018,100,,,1414,Trend / Compare,Non-government,1158
...,...,...,...,...,...,...,...,...,...
89,Swan Valley Anglican Community School,6069,90,,,1031,Trend / Compare,Non-government,1045
90,Boyup Brook District High School,6244,90,,,162,Trend / Compare,Government,1002
91,Mater Dei College,6027,90,,,804,Trend / Compare,Non-government,1055
92,Emmanuel Catholic College,6164,90,,,1061,Trend / Compare,Non-government,1036


In [29]:
# Remove the "space" and "string inside parenthesis" to show only the school name
hs_wa_2019["School"] = hs_wa_2019["School"].str.split('\s+\(', 1).str.get(0)
hs_wa_2019

Unnamed: 0,School,Postcode,State Overall Score,English,Maths,Total Enrolments,Trend / Compare,Sector,ICSEA
0,Perth Modern School,6008,100,,,1418,Trend / Compare,Government,1239
1,St Hilda's Anglican School for Girls,6012,100,,,1087,Trend / Compare,Non-government,1197
2,Christ Church Grammar School,6010,100,,,1646,Trend / Compare,Non-government,1180
3,Rossmoyne Senior High School,6148,100,,,2188,Trend / Compare,Government,1121
4,St Mary's Anglican Girls' School,6018,100,,,1414,Trend / Compare,Non-government,1158
...,...,...,...,...,...,...,...,...,...
89,Swan Valley Anglican Community School,6069,90,,,1031,Trend / Compare,Non-government,1045
90,Boyup Brook District High School,6244,90,,,162,Trend / Compare,Government,1002
91,Mater Dei College,6027,90,,,804,Trend / Compare,Non-government,1055
92,Emmanuel Catholic College,6164,90,,,1061,Trend / Compare,Non-government,1036


In [30]:
# re-arrange the columns & remove other columns
icsea = hs_wa_2019[['School', 'ICSEA']]
icsea

Unnamed: 0,School,ICSEA
0,Perth Modern School,1239
1,St Hilda's Anglican School for Girls,1197
2,Christ Church Grammar School,1180
3,Rossmoyne Senior High School,1121
4,St Mary's Anglican Girls' School,1158
...,...,...
89,Swan Valley Anglican Community School,1045
90,Boyup Brook District High School,1002
91,Mater Dei College,1055
92,Emmanuel Catholic College,1036


In [31]:
# read "school_coordinates" csv file
icsea_additional = pd.read_csv("Resources/icsea_additional.csv")
icsea_additional

Unnamed: 0,School,ICSEA
0,Heritage College Perth,1028
1,Alkimos Baptist College,1035
2,Margaret River Senior High School,1037
3,Bunbury Catholic College,1035
4,Kalbarri District High School,988
...,...,...
94,Manea Senior College,1022
95,Mindarie Senior College,1034
96,Our Lady of Mercy College,1035
97,Sevenoaks Senior College,976


In [32]:
# Merge by concat the "icsea" Dataframe with "icsea_update" Dataframe
icsea_full = pd.concat([icsea, icsea_additional])

In [33]:
# Display the merged Dataframe
icsea_full

Unnamed: 0,School,ICSEA
0,Perth Modern School,1239
1,St Hilda's Anglican School for Girls,1197
2,Christ Church Grammar School,1180
3,Rossmoyne Senior High School,1121
4,St Mary's Anglican Girls' School,1158
...,...,...
94,Manea Senior College,1022
95,Mindarie Senior College,1034
96,Our Lady of Mercy College,1035
97,Sevenoaks Senior College,976


In [34]:
# Check for duplicates
icsea_full[icsea_full.duplicated(keep=False)]

Unnamed: 0,School,ICSEA


In [35]:
# check for "nan" values which is a known value in the 2019 rankings dataset for Woodthorpe School
# "nan" values are not considered Null (as opposed to "Nan") so it has to be searched manually
icsea_full.loc[icsea_full["ICSEA"] == "nan", :]

Unnamed: 0,School,ICSEA
34,Woodthorpe School,


In [36]:
# replaced "nan" with 983 which is the ICSEA score for Woodthorpe School
icsea_full = icsea_full.replace('nan', 983)

In [37]:
# check for "nan" values after replacing the value to double check
icsea_full.loc[icsea_full["ICSEA"] == "nan", :]

Unnamed: 0,School,ICSEA


In [38]:
# replace some values to correct the spelling# replace some values to correct the spelling to prevent Null values when performing the merge with other Dataframes later on
icsea_full["School"].replace({"John Curtin College Of The Arts": "John Curtin College of the Arts", 
                       "Australian Christian College": "Australian Christian College - Southlands",
                       "St Stephen's School": "St Stephen's School - Duncraig",
                       "Schools Of Isolated & Distance Education": "School of Isolated and Distance Education",
                       "St Mary MacKillop College": "St Mary Mackillop College"
                        }, inplace=True)

In [39]:
# # save the "icsea_full" Dataframe to a CSV file
icsea_full.to_csv("Resources/icsea_full.csv", index = False)