In [None]:
# Import key libraries

import os

import numpy as np
import pandas as pd

import plotly.express as px

import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [None]:
def tidy_up_1(df):
    """Normalise column names, blank out '[z]' cells and drop 2020/2021 columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with spaces in column names replaced by '_', every cell
        equal to the string '[z]' replaced by NaN, and every column whose name
        contains '2021' or '2020' removed.
    """
    # Work on a copy: assigning to ``df.columns`` directly would rename the
    # caller's frame in place as a hidden side effect.
    out = df.copy()

    # Remove spaces and replace with '_'
    out.columns = out.columns.str.replace(' ', '_', regex=False)

    # Replace [z] values with NaN (literal match, not a regex)
    out = out.replace('[z]', np.nan)

    # Remove columns from 2021 and 2020
    out = out.drop(columns=out.filter(regex='2021|2020').columns)

    return out

def tidy_up_2(df):
    """Shorten the year-suffixed boundary column names.

    For each of the years 2023, 2022, 2019 and 2018, a column named
    ``<long prefix><year>`` is renamed to ``<short prefix><year>``
    (for example ``Upper_A_Boundary_2023`` becomes ``A1_2023``).
    Columns that are not present are simply skipped.

    Returns a new DataFrame; ``df`` is left unchanged.
    """
    # Long column prefix -> short prefix (the year is appended to both)
    prefix_map = {
        'Maximum_Mark_': 'max_',
        'Upper_A_Boundary_': 'A1_',
        'A_Boundary_': 'A2_',
        'B_Boundary_': 'B_',
        'C_Boundary_': 'C_',
        'D_Boundary_': 'D_',
    }

    # Expand the prefixes over every year into one old-name -> new-name table
    column_map = {
        long_prefix + year: short_prefix + year
        for year in ('2023', '2022', '2019', '2018')
        for long_prefix, short_prefix in prefix_map.items()
    }

    return df.rename(columns=column_map)

def tidy_up_3(df, year):
    """Rename the un-suffixed mark columns to the short, year-suffixed scheme.

    ``year`` is a string (e.g. ``'2016'``) appended to each new name, so
    ``Max_Mark`` becomes ``max_<year>``, ``Upper_A`` becomes ``A1_<year>``
    and so on. Columns that are not present are skipped.

    Returns a new DataFrame; ``df`` is left unchanged.
    """
    # Original column name -> short prefix that receives the year suffix
    short_prefixes = {
        'Max_Mark': 'max_',
        'Upper_A': 'A1_',
        'A_Mark': 'A2_',
        'B_Mark': 'B_',
        'C_Mark': 'C_',
        'D_Mark': 'D_',
    }

    column_map = {old: prefix + year for old, prefix in short_prefixes.items()}

    return df.rename(columns=column_map)

def tidy_up_3a(df):
    """Harmonise subject names and drop a fixed list of subjects.

    Three values in the ``Subject`` column are replaced by another name, then
    every row whose ``Subject`` is in the removal list is dropped.
    (Presumably this aligns older files with the naming used in the newer
    ones - confirm against the data.)

    Returns a new DataFrame; the surviving rows keep their original index.
    """
    # Old subject name -> name to use instead
    subject_renames = {
        'Hospitality: Practical Cake Craft': 'Practical Cake Craft',
        'Hospitality: Practical Cookery': 'Practical Cookery',
        'Cantonese': 'Chinese Languages',
    }

    # Subjects whose rows are removed entirely
    subjects_to_drop = [
        'Cruinn-eolas (Geography)',
        'Nuadh-Eolas (Modern Studies)',
        'Eachdraidh (History)',
        'Matamataig (Mathematics)',
        'Matamataig Fad-bheatha (Lifeskills Mathematics)',
        'Mandarin (Simplified)',
        'Mandarin (Traditional)',
    ]

    renamed = df.replace({"Subject": subject_renames})
    is_dropped = renamed['Subject'].isin(subjects_to_drop)

    return renamed.loc[~is_dropped]

def per(df, years=None):
    """Express each grade boundary as a fraction of that year's maximum mark.

    For every year, the columns ``A1_<year>``, ``A2_<year>``, ``B_<year>``,
    ``C_<year>`` and ``D_<year>`` are divided by ``max_<year>``. The result is
    a fraction (e.g. 0.7), not a value out of 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the boundary and maximum-mark columns. It is not modified.
    years : iterable of str, optional
        Year suffixes to process. Defaults to
        ``['2023', '2022', '2019', '2018', '2017', '2016']``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the boundary columns replaced by fractions.

    Raises
    ------
    KeyError
        If any expected column for a requested year is missing.
    """
    if years is None:
        years = ['2023', '2022', '2019', '2018', '2017', '2016']

    # Copy first: the caller's frame keeps its raw marks, and running per()
    # again on the same input cannot divide already-divided values twice.
    out = df.copy()

    # Calculate the fraction for Upper As, As, Bs, Cs and Ds
    for year in years:
        max_mark = out['max_' + year]
        for grade in ('A1', 'A2', 'B', 'C', 'D'):
            column = grade + '_' + year
            out[column] = out[column] / max_mark

    return out

def tidy_up_4(df):
    """Drop the maximum-mark columns and round the rest to 2 decimal places.

    Any column whose name contains 'max' is removed; the remaining numeric
    values are rounded with ``DataFrame.round(2)``.

    Returns a new DataFrame.
    """
    # Columns matching 'max' are the ones to leave out
    max_columns = set(df.filter(regex='max').columns)
    kept_columns = [name for name in df.columns if name not in max_columns]

    # Round to 2 decimal places (the original author notes that 0.425 rounds
    # to 0.42 because of floating-point representation)
    return df[kept_columns].round(2)

def tolong(df):
    """Reshape a wide boundaries table into long format.

    ``Subject`` and ``Level`` are kept as identifier columns; every other
    column becomes one row per subject, with its value in ``Count``. The
    original column name (e.g. ``A1_2023``) is split at its last underscore
    into ``Grades`` (the part before) and ``Year`` (the part after).

    Returns a new DataFrame with columns Subject, Level, Count, Grades, Year.
    """
    # Melt into long format; 'abc' temporarily holds the old column names
    long_df = df.melt(id_vars=['Subject', 'Level'], var_name='abc', value_name='Count')
    labels = long_df['abc']

    # Grade is everything before the final '_', year is the last '_' piece
    long_df['Grades'] = labels.map(lambda label: label.rsplit('_', 1)[0])
    long_df['Year'] = labels.map(lambda label: label.split('_')[-1])

    # The combined label is no longer needed once it has been split
    return long_df.drop(columns='abc')

In [None]:
# Define levels (each one is also used as the sheet name to read)
levels = ['National_5', 'Higher', 'Advanced_Higher']

# Create empty list for the per-level long-format dfs
frames = []

# Loop through levels
for level in levels:

    # Create level label (human-readable form, e.g. 'National 5')
    level_label = level.replace('_', ' ')

    # Import data: one sheet per level from each yearly workbook
    df23 = pd.read_excel("./data_grade_boundaries/grade-boundaries-2023.xlsx",
                         sheet_name=level, skiprows=2)

    df22 = pd.read_excel("./data_grade_boundaries/grade-boundaries-2022.xlsx",
                         sheet_name=level, skiprows=2)

    df16 = pd.read_excel("./data_grade_boundaries/Grade_Boundaries_2016.xls",
                         sheet_name=level, skiprows=3)

    df17 = pd.read_excel("./data_grade_boundaries/Grade_Boundaries_2017.xls",
                         sheet_name=level, skiprows=3)

    # Tidy up data 1: column names, '[z]' cells, 2020/2021 columns
    df23 = tidy_up_1(df23)
    df22 = tidy_up_1(df22)
    df16 = tidy_up_1(df16)
    df17 = tidy_up_1(df17)

    # Tidy up data 2: short column names for the 2023/2022 workbooks
    df23 = tidy_up_2(df23)
    df22 = tidy_up_2(df22)

    # Tidy up data 3: short, year-suffixed column names for the 2016/2017 workbooks
    df16 = tidy_up_3(df16, '2016')
    df17 = tidy_up_3(df17, '2017')

    # Tidy up data 3a: harmonise subject names in the 2016/2017 workbooks.
    # The helper is tidy_up_3a; calling the non-existent tidy_up_2a raises NameError.
    df16 = tidy_up_3a(df16)
    df17 = tidy_up_3a(df17)

    # Concat together the four dfs side by side (aligned on Subject as index)
    dfc = pd.concat([df23.set_index('Subject'), df22.set_index('Subject'),
                     df17.set_index('Subject'), df16.set_index('Subject')],
                    axis=1).reset_index()

    # Drop duplicate columns (keeps the first occurrence of each column name)
    dfc = dfc.loc[:, ~dfc.columns.duplicated()].copy()

    # Convert each grade boundary to a fraction of the maximum mark
    dfc = per(dfc)

    # Remove max columns and round
    dfc = tidy_up_4(dfc)

    # Add level
    dfc['Level'] = level_label

    # Convert to long format
    ldf = tolong(dfc)

    # Append to list
    frames.append(ldf)

In [None]:
# Stack the per-level long frames and discard every row that contains a NaN
dft = pd.concat(frames).dropna()

# Export to csv (the row index is written as the first column)
dft.to_csv("boundaries.csv")

In [None]:
# Plot graph
#
# NOTE: `ldf`, `level` and `level_label` are the values left behind by the
# data-preparation loop, i.e. those of the LAST entry in `levels`, so this
# cell plots that single level only.

# Subjects in alphabetical order, years in ascending order
subjects = sorted(ldf['Subject'].unique())
years = sorted(ldf['Year'].unique())

# Take the year range for the title from the data itself; a hard-coded
# '2016 - 2022' goes stale as soon as another year (e.g. 2023) is loaded.
year_range = (years[0] + ' - ' + years[-1]) if years else 'n/a'

fig = px.line(ldf, x = "Year", y = "Count",
                color = "Subject",
                facet_row = 'Grades',
                width = 800,
                height = 1150,
                category_orders = {'Subject' : subjects, 'Year' : years},
                labels = {"Count": "Percentage"},
                title = 'Grade boundaries per subject from ' + year_range + ' at ' + level_label
                )

# Show a marker at every data point as well as the connecting line
fig.update_traces(mode = "markers+lines")

# One HTML file per level, named after the level key (e.g. 'Advanced_Higher')
filename = 'boundaries_' + level + '.html'

fig.write_html('./output_graphs_boundaries/' + filename)

fig.show()

In [None]:
# Setup Chart Studio
#
# Credentials are read from the environment so that a real key is never saved
# inside the notebook. Set CHART_STUDIO_USERNAME and CHART_STUDIO_API_KEY
# before starting Jupyter (API key: go to profile > settings > regenerate key).
username = os.environ.get('CHART_STUDIO_USERNAME')
api_key = os.environ.get('CHART_STUDIO_API_KEY')

if username and api_key:
    chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
else:
    # Skip the call instead of saving placeholder/blank values as credentials
    print('Chart Studio credentials not found in the environment; '
          'skipping set_credentials_file().')

# Upload to Chart Studio
#py.plot(fig, filename = filename, auto_open=True)