In [None]:
# Import key libraries

import pandas as pd
import numpy as np

In [None]:
def tidy(df):
    
    #### Fix usual issues with all strings
    
    # Capitalise headers
    df.columns = df.columns.astype(str).str.upper()
    
    # Capitalise cells
    df = df.map(lambda x: x.upper() if type(x) is str else x)

    # Strip whitespace
    df = df.map(lambda x: x.strip() if type(x) is str else x)

    # Remove parenthesis
    df = df.map(lambda x: x.replace('(', '') if type(x) is str else x)
    df = df.map(lambda x: x.replace(')', '') if type(x) is str else x)
    
    # Remove linebreaks
    df = df.map(lambda x: x.replace('\n', '') if type(x) is str else x)

    # Replace annoying substrings
    df = df.map(lambda x: x.replace(' AND ', ' & ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' – ', ' - ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' / ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace('/ ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' /', '/') if type(x) is str else x)
    
    df = df.map(lambda x: x.replace(' - ', ': ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' : ', ': ') if type(x) is str else x)
    
    df = df.map(lambda x: x.replace('  ', ' ') if type(x) is str else x)

    # Drop columns with minimus number of 2 non-null values
    df = df.dropna(axis='columns', thresh=2)
    
    return df

def per(df, y):
    """Add grade boundaries expressed as a fraction of the maximum mark.

    Five columns are written onto ``df`` in place, in this order: 'A1'
    (upper A), 'A2' (A), 'B', 'C' and 'D'. Each one is the matching boundary
    column divided by the maximum-mark column.

    Parameters
    ----------
    df : pandas.DataFrame
        Tidied sheet containing the boundary and maximum-mark columns.
    y : str
        Year of the sheet. For '2016' to '2019' the fixed column names
        'UPPER A', 'A MARK', ... and 'MAX MARK' are read; for any other value
        the names carry the year as a suffix, e.g. 'A BOUNDARY 2024' and
        'MAXIMUM MARK 2024'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the five extra columns.
    """

    uses_short_headers = y in ('2019', '2018', '2017', '2016')

    if uses_short_headers:
        maximum_col = 'MAX MARK'
        boundary_cols = {
            'A1': 'UPPER A',
            'A2': 'A MARK',
            'B': 'B MARK',
            'C': 'C MARK',
            'D': 'D MARK',
        }
    else:
        maximum_col = 'MAXIMUM MARK ' + y
        boundary_cols = {
            'A1': 'UPPER A BOUNDARY ' + y,
            'A2': 'A BOUNDARY ' + y,
            'B': 'B BOUNDARY ' + y,
            'C': 'C BOUNDARY ' + y,
            'D': 'D BOUNDARY ' + y,
        }

    # One output column per grade, added in dictionary order
    for grade, source_col in boundary_cols.items():
        df[grade] = df[source_col] / df[maximum_col]

    return df

In [None]:
# Accumulator for the long-format frame built from each (level, year) sheet.
# The loading cells below append to it, so re-run this cell before re-running
# them; otherwise the same sheets are added a second time.
frames = list()

In [None]:
# Years handled by this cell
years = ['2025', '2024', '2023', '2022', '2017', '2016']

# Sheet names, one per qualification level
levels = ['National_5', 'Higher', 'Advanced_Higher']

# Appends one long-format frame per (level, year) to `frames`
for level in levels:

    # Level label, e.g. 'National_5' -> 'NATIONAL 5'
    level_label = level.replace('_', ' ').upper()

    for y in years:

        # The file name pattern and the number of rows to skip above the
        # header differ between releases
        if y in ('2023', '2022'):
            workbook = f'./data_grade_boundaries/grade-boundaries-{y}.xlsx'
            rows_to_skip = 2
        elif y in ('2017', '2016'):
            workbook = f'./data_grade_boundaries/Grade_Boundaries_{y}.xls'
            rows_to_skip = 3
        else:
            workbook = f'./data_grade_boundaries/grade-boundaries-{y}.xlsx'
            rows_to_skip = 3

        # Cells containing '[z]' are read as missing values
        df = pd.read_excel(workbook, sheet_name=level,
                           skiprows=rows_to_skip, na_values='[z]')

        # Tidy up df
        df = tidy(df)

        # Calculate boundaries as a fraction of the maximum mark
        df = per(df, y)

        # Change into long format. `var_name` has to be a scalar because the
        # columns are single-level; a list is rejected by recent pandas.
        # The value column is called 'COUNT' but holds the fraction from per().
        dfl = pd.melt(df, id_vars=['SUBJECT'],
                      value_vars=['A1', 'A2', 'B', 'C', 'D'],
                      var_name='GRADE', value_name='COUNT')

        # Add level and year
        dfl['LEVEL'] = level_label
        dfl['YEAR'] = y

        # Append to df list
        frames.append(dfl)

In [None]:
# Years handled by this cell
years = ['2019', '2018']

# Sheet names, one per qualification level
levels = ['National 5', 'Higher', 'Adv Higher']

# Appends one long-format frame per (level, year) to `frames`
for level in levels:

    # Level label, e.g. 'Adv Higher' -> 'ADV HIGHER'
    level_label = level.upper()

    for y in years:

        # Three rows are skipped above the header; cells containing '[z]'
        # are read as missing values
        df = pd.read_excel(f'./data_grade_boundaries/Grade_Boundaries_{y}.xls',
                           sheet_name=level, skiprows=3, na_values='[z]')

        # Tidy up df
        df = tidy(df)

        # Relabel the first two columns by position (presumably their headers
        # in these workbooks differ from what per() expects; verify against
        # the files). A new label list is assigned instead of writing into
        # `df.columns.values`, because an Index is meant to be immutable and
        # editing its backing array can leave label lookups stale.
        labels = df.columns.tolist()
        labels[0:2] = ['SUBJECT', 'MAX MARK']
        df.columns = labels

        # Calculate boundaries as a fraction of the maximum mark
        df = per(df, y)

        # Change into long format. `var_name` has to be a scalar because the
        # columns are single-level; a list is rejected by recent pandas.
        # The value column is called 'COUNT' but holds the fraction from per().
        dfl = pd.melt(df, id_vars=['SUBJECT'],
                      value_vars=['A1', 'A2', 'B', 'C', 'D'],
                      var_name='GRADE', value_name='COUNT')

        # Add level and year
        dfl['LEVEL'] = level_label
        dfl['YEAR'] = y

        # Append to df list
        frames.append(dfl)

In [None]:
# Preview the wide frame left over from the last sheet processed by the loop above
df.head()

In [None]:
# Preview the long-format frame built from the last sheet processed by the loop above
dfl.head()

In [None]:
# Level labels produced by the loading cells -> short codes
lmap = {
    'ADVANCED HIGHER': 'AH',
    'ADV HIGHER': 'AH',
    'HIGHER': 'H',
    'NATIONAL 5': 'N5',
}

# Subject names that are replaced with a single preferred spelling
smap = {
    'HOSPITALITY: PRACTICAL CAKE CRAFT': 'PRACTICAL CAKE CRAFT',
    'HOSPITALITY: PRACTICAL COOKERY': 'PRACTICAL COOKERY',
    'RELIGIOUS MORAL & PHILOSOPHICAL STUDIES': 'RELIGIOUS, MORAL & PHILOSOPHICAL STUDIES',
    'GAIDHLIG': 'GÀIDHLIG',
}

dft = (
    pd.concat(frames)               # stack every per-sheet frame
    .dropna()                       # drop rows containing any NaN
    .replace({'LEVEL': lmap})       # recode whole-cell matches in LEVEL
    .replace({'SUBJECT': smap})     # recode whole-cell matches in SUBJECT
)

# Export to csv without the index column
dft.to_csv("./output_csvs//boundaries.csv", index=False)

In [None]:
# Preview the combined, recoded data that was written to the CSV
dft.head()