In [1]:
# Import key libraries

import os

import pandas as pd
import numpy as np

import plotly.express as px

import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [2]:
# Qualification level to process; the same string is used as the worksheet
# name when reading each workbook below
level = 'Advanced_Higher'
level_label = ' '.join(level.split('_'))

# Read the three workbooks: the 2022 file skips 2 leading rows,
# the 2016 and 2017 files skip 3
df = pd.read_excel("grade-boundaries-2022.xlsx", sheet_name=level, skiprows=2)

df16 = pd.read_excel("Grade_Boundaries_2016.xls", sheet_name=level, skiprows=3)

df17 = pd.read_excel("Grade_Boundaries_2017.xls", sheet_name=level, skiprows=3)

In [3]:
def tidy_up_1(df):
    """Return a cleaned copy of a raw grade-boundary frame.

    Steps applied, in order:
      1. spaces in column names are replaced with '_';
      2. cells equal to the string '[z]' are replaced with NaN;
      3. every column whose name contains '2020' or '2021' is dropped.

    The input frame is left untouched (the original version renamed the
    caller's columns in place), so re-running the calling cell is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as read from the workbook; column names must be strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the clean-up applied.
    """
    # Work on a copy so the column rename does not leak back to the caller
    df = df.copy()

    # Remove spaces and replace with '_' (literal match, not a regex)
    df.columns = df.columns.str.replace(' ', '_', regex=False)

    # Replace [z] values with NaN
    df = df.replace('[z]', np.nan)

    # Remove columns for 2020 and 2021 in a single pass
    unwanted = list(df.filter(regex='2020|2021'))

    return df.drop(columns=unwanted)
    
# First clean-up pass on each of the three workbooks
df = df.pipe(tidy_up_1)
df16 = df16.pipe(tidy_up_1)
df17 = df17.pipe(tidy_up_1)

In [4]:
def tidy_up_2(df):
    """Shorten the per-year column names of the 2022-layout frame.

    For each of the years 2022, 2019 and 2018 the long headers are mapped
    to short ones, e.g. 'Maximum_Mark_2022' -> 'max_2022' and
    'Upper_A_Boundary_2022' -> 'upA_2022'. Columns that are not in the
    mapping are left as they are. A new frame is returned.
    """
    # Long header prefix -> short prefix
    prefixes = {
        'Maximum_Mark_': 'max_',
        'Upper_A_Boundary_': 'upA_',
        'A_Boundary_': 'A_',
        'B_Boundary_': 'B_',
        'C_Boundary_': 'C_',
        'D_Boundary_': 'D_',
    }

    # One mapping covering every (year, prefix) pair, applied in one rename
    mapping = {
        long_name + yr: short_name + yr
        for yr in ('2022', '2019', '2018')
        for long_name, short_name in prefixes.items()
    }

    return df.rename(columns=mapping)
        
df = df.pipe(tidy_up_2)

In [5]:
def tidy_up_2(df, year=None):
    """Shorten boundary column names to '<grade>_<year>'.

    This definition replaces the same-named function from the previous
    cell. To keep that cell's one-argument call working after this cell has
    run, ``year`` is optional:

    * ``year`` given (e.g. '2016'): the single-year headers 'Max_Mark',
      'Upper_A', 'A_Mark', 'B_Mark', 'C_Mark' and 'D_Mark' are renamed to
      'max_<year>', 'upA_<year>', 'A_<year>', ... (original behaviour).
    * ``year`` omitted: the multi-year headers ('Maximum_Mark_<yr>',
      'Upper_A_Boundary_<yr>', '<X>_Boundary_<yr>') are renamed for the
      years 2022, 2019 and 2018, as the previous cell's function does.

    Columns not covered by the mapping are left unchanged, and a new frame
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
    year : str, optional
        Year suffix for the single-year layout.

    Returns
    -------
    pandas.DataFrame
    """
    if year is None:
        # Multi-year layout: the year is already part of each header
        prefixes = {
            'Maximum_Mark_': 'max_',
            'Upper_A_Boundary_': 'upA_',
            'A_Boundary_': 'A_',
            'B_Boundary_': 'B_',
            'C_Boundary_': 'C_',
            'D_Boundary_': 'D_',
        }
        mapping = {
            long_name + yr: short_name + yr
            for yr in ('2022', '2019', '2018')
            for long_name, short_name in prefixes.items()
        }
    else:
        # Single-year layout: append the supplied year to each short name
        mapping = {
            'Max_Mark': 'max_' + year,
            'Upper_A': 'upA_' + year,
            'A_Mark': 'A_' + year,
            'B_Mark': 'B_' + year,
            'C_Mark': 'C_' + year,
            'D_Mark': 'D_' + year,
        }

    # Sort out column titles in a single rename
    return df.rename(columns=mapping)
        
df16 = df16.pipe(tidy_up_2, '2016')
df17 = df17.pipe(tidy_up_2, '2017')

In [6]:
# Sort out subject names in the 2016 and 2017 datasets

def tidy_up_2a(df):
    """Normalise values in the 'Subject' column and drop listed subjects.

    Three subject names are mapped to replacement names, then every row
    whose 'Subject' is in the exclusion list is removed. The surviving rows
    keep their original index labels. A new frame is returned.
    """
    # Old subject name -> replacement name
    renames = {
        'Hospitality: Practical Cake Craft': 'Practical Cake Craft',
        'Hospitality: Practical Cookery': 'Practical Cookery',
        'Cantonese': 'Chinese Languages',
    }

    # Subjects to be removed as duplicate names
    duplicates = [
        'Cruinn-eolas (Geography)',
        'Nuadh-Eolas (Modern Studies)',
        'Eachdraidh (History)',
        'Matamataig (Mathematics)',
        'Matamataig Fad-bheatha (Lifeskills Mathematics)',
        'Mandarin (Simplified)',
        'Mandarin (Traditional)',
    ]

    renamed = df.replace({"Subject": renames})
    is_duplicate = renamed['Subject'].isin(duplicates)

    return renamed.loc[~is_duplicate]

df16 = df16.pipe(tidy_up_2a)
df17 = df17.pipe(tidy_up_2a)

In [7]:
# Join the three frames side by side, aligned on Subject, then turn the
# Subject index back into an ordinary column

dfc = pd.concat(
    [frame.set_index('Subject') for frame in (df, df17, df16)],
    axis=1,
).reset_index()

In [378]:
def per(df, years=None):
    """Express each grade boundary as a fraction of that year's maximum mark.

    For every year, the columns 'upA_<year>', 'A_<year>', 'B_<year>',
    'C_<year>' and 'D_<year>' are divided by 'max_<year>'. The 'max_*'
    columns themselves are kept unchanged.

    The result is a new frame; the input is not modified. The original
    version overwrote the caller's columns, so calling it twice on the same
    frame divided the values twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six columns listed above for every requested year.
    years : iterable of str, optional
        Year suffixes to process. Defaults to 2022, 2019, 2018, 2017, 2016.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    if years is None:
        years = ['2022', '2019', '2018', '2017', '2016']

    grades = ('upA', 'A', 'B', 'C', 'D')

    # Copy first so the caller's frame keeps its raw marks
    result = df.copy()

    for year in years:
        max_mark = df['max_' + year]
        for grade in grades:
            column = grade + '_' + year
            result[column] = df[column] / max_mark

    return result

dfc = dfc.pipe(per)

In [379]:
def tidy_up_3(df):
    """Drop the 'max' columns and round the remaining values.

    Every column whose name matches the regex 'max' is removed, then the
    frame is rounded to 2 decimal places. A new frame is returned.
    """
    # Columns with 'max' in their name are no longer needed
    max_columns = df.filter(regex='max').columns
    remaining = df.columns.drop(max_columns)
    trimmed = df[remaining]

    # Float rounding caveat: a value such as 0.425 comes out as 0.42
    return trimmed.round(2)
    
dfc = dfc.pipe(tidy_up_3)

In [380]:
dfc.tail(5)

Unnamed: 0,Subject,upA_2022,A_2022,B_2022,C_2022,D_2022,upA_2019,A_2019,B_2019,C_2019,...,upA_2017,A_2017,B_2017,C_2017,D_2017,upA_2016,A_2016,B_2016,C_2016,D_2016
30,Physical Education,0.85,0.7,0.59,0.48,0.37,0.85,0.7,0.6,0.5,...,0.9,0.75,0.65,0.55,0.5,0.85,0.7,0.6,0.5,0.45
31,Physics,0.85,0.66,0.53,0.4,0.26,0.85,0.69,0.59,0.49,...,0.82,0.65,0.54,0.44,0.38,0.8,0.65,0.55,0.47,0.42
32,"Religious, Moral and Philosophical Studies",0.83,0.67,0.56,0.46,0.36,0.91,0.76,0.65,0.54,...,0.85,0.7,0.59,0.49,0.44,0.85,0.7,0.6,0.5,0.45
33,Spanish,0.82,0.67,0.57,0.47,0.37,0.82,0.68,0.58,0.48,...,0.82,0.67,0.57,0.48,0.42,0.82,0.67,0.57,0.48,0.43
34,Statistics,0.81,0.66,0.57,0.48,0.39,0.79,0.64,0.54,0.45,...,0.84,0.69,0.59,0.49,0.44,0.85,0.7,0.6,0.5,0.45


In [381]:
# Convert into long format

def tolong(df):
    """Reshape the wide boundary frame into long format.

    Every column other than 'Subject' becomes one row per subject. The
    former column name (e.g. 'upA_2022') is split at its last underscore:
    the part before it goes to 'Grades' and the part after it to 'Year'.
    The cell value goes to 'Count'.

    Returns a frame with columns Subject, Count, Grades, Year.
    """
    long_df = df.melt(id_vars=['Subject'], var_name='abc', value_name='Count')

    # Take the combined '<grade>_<year>' label out of the frame and split it
    labels = long_df.pop('abc')
    long_df['Grades'] = labels.map(lambda label: label.rsplit('_', 1)[0])
    long_df['Year'] = labels.map(lambda label: label.split('_')[-1])

    return long_df
    
ldf = dfc.pipe(tolong)

ldf.head(5)

Unnamed: 0,Subject,Count,Grades,Year
0,Accounting,0.85,upA,2022
1,Art and Design (Design),0.85,upA,2022
2,Art and Design (Expressive),0.85,upA,2022
3,Biology,0.75,upA,2022
4,Business Management,0.78,upA,2022


In [382]:
# Check
#ldf.query('Subject == ["Physics"]')

In [383]:
# Plot graph

# Sorted subject and year lists, passed to plotly as the category orders
subjects = sorted(ldf['Subject'].unique())
years = sorted(ldf['Year'].unique())

# One line per subject, one facet row per grade
fig = px.line(
    ldf,
    x="Year",
    y="Count",
    color="Subject",
    facet_row='Grades',
    width=800,
    height=1150,
    category_orders={'Subject': subjects, 'Year': years},
    labels={"Count": "Percentage"},
    title='Grade boundaries per subject from 2016 - 2022 at ' + level_label,
)

fig.update_traces(mode="markers+lines")

filename = 'boundaries_' + level + '.html'

# Make sure the output folder exists before writing; on a fresh checkout
# './graphs/' may be missing and the write would otherwise fail
output_dir = './graphs/'
os.makedirs(output_dir, exist_ok=True)

fig.write_html(output_dir + filename)

fig.show()

In [385]:
# Setup Chart Studio
# Credentials are read from the environment instead of being typed into the
# notebook, so they are never saved with it. Set CHART_STUDIO_USERNAME and
# CHART_STUDIO_API_KEY before starting the kernel
# (API key: profile > settings > regenerate key).
username = os.environ.get('CHART_STUDIO_USERNAME', '')
api_key = os.environ.get('CHART_STUDIO_API_KEY', '')

# Only save credentials when both values are present, so a run without them
# does not write placeholder values to the credentials file
if username and api_key:
    chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

# Upload to Chart Studio
#py.plot(fig, filename = filename, auto_open=True)