# SQA arrangements

In [None]:
# Import libraries

import pandas as pd
import numpy as np

# Functions

In [None]:
# Define functions to read sheets

def read_grades(level):
    """Load one sheet of the attainment-statistics workbook.

    Parameters
    ----------
    level : passed straight through to ``read_excel`` as ``sheet_name``.

    Returns
    -------
    The DataFrame produced by ``pd.read_excel``. The first three rows of
    the sheet are skipped, the listed marker strings are read as NaN and
    ',' is treated as a thousands separator.
    """
    workbook = './arrangement_data/attainment-statistics-2024.xlsx'

    # Marker strings that the workbook uses in place of a number
    missing_markers = ['***', '-', ' -', '[c]', '[z]']

    return pd.read_excel(
        workbook,
        sheet_name=level,
        skiprows=3,
        na_values=missing_markers,
        thousands=',',
    )

def read_arrangements(level):
    """Load one sheet of the assessment-arrangements workbook.

    Parameters
    ----------
    level : passed straight through to ``read_excel`` as ``sheet_name``.

    Returns
    -------
    The DataFrame produced by ``pd.read_excel``. The first three rows of
    the sheet are skipped, the listed marker strings are read as NaN and
    ',' is treated as a thousands separator.
    """
    workbook = './arrangement_data/assessment-arrangements-tables-2024.xlsx'

    # Marker strings that the workbook uses in place of a number
    missing_markers = ['***', '-', ' -', '[c]', '[z]']

    return pd.read_excel(
        workbook,
        sheet_name=level,
        skiprows=3,
        na_values=missing_markers,
        thousands=',',
    )

In [None]:
## Define functions to tidy data

# Ordered (old, new) substring replacements applied to every string cell.
# Order matters: e.g. ' – ' is first turned into ' - ', which is later
# turned into ': '.
_CELL_REPLACEMENTS = (
    ('(', ''),
    (')', ''),
    ('\n', ''),
    (' AND ', ' & '),
    (' – ', ' - '),
    (' / ', '/'),
    ('/ ', '/'),
    (' /', '/'),
    (' - ', ': '),
    (' : ', ': '),
)


def _clean_cell(value):
    """Normalise a single cell; non-string values are returned unchanged."""
    if not isinstance(value, str):
        return value

    text = value.upper().strip()

    for old, new in _CELL_REPLACEMENTS:
        text = text.replace(old, new)

    # A single replace('  ', ' ') only halves a run of spaces, so repeat
    # until every run has collapsed to exactly one space
    while '  ' in text:
        text = text.replace('  ', ' ')

    return text


def tidy_df(df):
    """Return a tidied copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. Every string cell is upper-cased, stripped, has parentheses and
         line breaks removed, and has the substitutions listed in
         ``_CELL_REPLACEMENTS`` applied; runs of spaces are collapsed.
      2. Column labels are converted to upper-case strings and a column
         called 'TITLE' is renamed to 'SUBJECT'.
      3. Rows with fewer than 2 non-null values are dropped.
      4. Columns with fewer than 2 non-null values (counted after step 3)
         are dropped.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself (including its column labels) is not
        modified.
    """
    # DataFrame.map only exists in newer pandas; older releases spell the
    # same element-wise operation applymap
    apply_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap

    # One pass over the cells; this also yields a new frame, so relabelling
    # the columns below cannot leak back into the caller's frame
    tidied = apply_elementwise(_clean_cell)

    # Capitalise headers and rename the subject column
    tidied.columns = df.columns.astype(str).str.upper()
    tidied = tidied.rename(columns={'TITLE': 'SUBJECT'})

    # Keep only rows that have at least 2 non-null values
    tidied = tidied.dropna(thresh=2)

    # Keep only columns that have at least 2 non-null values
    tidied = tidied.dropna(axis='columns', thresh=2)

    return tidied

In [None]:
# Define functions to count grades

def grade_count(df, year):
    """Derive per-grade counts from the cumulative grade-count columns.

    The frame is expected to hold the cumulative columns
    'GRADE A COUNT <year>', 'GRADE A-B COUNT <year>',
    'GRADE A-C COUNT <year>' and 'GRADE A-D COUNT <year>'. Each band is
    obtained by subtracting the previous cumulative column from the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four cumulative columns for ``year``.
    year : str or int
        Year suffix used in the column labels, e.g. '2024' or 2024.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns 'A', 'B', 'C' and 'D' added (or replaced).
        The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the four cumulative columns is missing.
    """
    # Accept int years as well; string concatenation would raise TypeError
    year = str(year)

    up_to_a = df['GRADE A COUNT ' + year]
    up_to_b = df['GRADE A-B COUNT ' + year]
    up_to_c = df['GRADE A-C COUNT ' + year]
    up_to_d = df['GRADE A-D COUNT ' + year]

    # assign() returns a new frame, so the caller's frame is not written to
    return df.assign(
        A=up_to_a,
        B=up_to_b - up_to_a,
        C=up_to_c - up_to_b,
        D=up_to_d - up_to_c,
    )

def pass_fail(df):
    """Add 'NA' and 'PASS' count columns to a NaN-filled copy of ``df``.

    Every NaN in the frame is first replaced with 0. Then:
      * 'NA'   = ENTRIES - A - B - C - D
      * 'PASS' = ENTRIES - NA

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ENTRIES', 'A', 'B', 'C' and 'D'.

    Returns
    -------
    pandas.DataFrame
        The zero-filled frame with the two extra columns.
    """
    zero_filled = df.fillna(0)

    # Subtract the grade bands one at a time, in A..D order
    remainder = zero_filled['ENTRIES']
    for band in ('A', 'B', 'C', 'D'):
        remainder = remainder - zero_filled[band]

    return zero_filled.assign(
        NA=remainder,
        PASS=zero_filled['ENTRIES'] - remainder,
    )

# Read in and process excel files

In [None]:
# Accumulator for the long-format frames built by the cells below, which
# append to it. Re-run this cell before re-running those cells, otherwise
# the same frames are appended a second time.
frames = []

In [None]:
## Read in attainment data

# Define level and years
levels = ['National_5', 'Higher', 'Advanced_Higher']
years = ['2019', '2022', '2023', '2024']

# Define header to search for (not used in this cell)
header = 'subject'

# Create empty dataframe
dfl = pd.DataFrame()

for level in levels:

    # Read and tidy the sheet once per level: the same sheet carries the
    # columns for every year, so re-reading it for each year is wasted I/O
    level_df = tidy_df(read_grades(level))

    for year in years:

        # Work on a copy so columns derived for one year never carry over
        # into the next year's calculation
        df = level_df.copy()

        # Calculate grade counts
        df = grade_count(df, year)

        # Simplify column headings for year in loop
        df['ENTRIES'] = df['ENTRIES ' + year]

        # Calculate pass and fails
        df = pass_fail(df)

        # Change into long format: one row per subject and grade
        dfl = pd.melt(
            df,
            id_vars=['SUBJECT'],
            value_vars=['ENTRIES', 'A', 'B', 'C', 'D', 'PASS', 'NA'],
            var_name='GRADE',
            value_name='COUNT',
        )

        # Add in year and level columns
        dfl['YEAR'] = int(year)
        dfl['LEVEL'] = level

        # Append to df list
        frames.append(dfl)

In [None]:
## Read in additional arrangements data

# Map each level name to the workbook sheet that is read for it
levels = {'National_5': 'Table_2.1', 'Higher': 'Table_2.2', 'Advanced_Higher': 'Table_2.3'}
years = ['2019', '2022', '2023', '2024']

# Define header to search for (not used in this cell)
header = 'subject'

# Create empty dataframe
dfl = pd.DataFrame()

for level, sheet in levels.items():

    # Read and tidy data
    df = read_arrangements(sheet)
    df = tidy_df(df)

    # Change into long format: one row per subject and year.
    # The year columns are taken from `years` (newest first) rather than
    # from a second hard-coded list.
    dfl = pd.melt(
        df,
        id_vars=['SUBJECT'],
        value_vars=years[::-1],
        var_name='YEAR',
        value_name='COUNT',
    )

    # melt() fills YEAR with the column labels, which are strings, while
    # the attainment frames store YEAR as int. Convert here so the combined
    # YEAR column has one type and sorts chronologically.
    dfl['YEAR'] = dfl['YEAR'].astype(int)

    # Add in grade and level columns
    dfl['GRADE'] = 'AA'
    dfl['LEVEL'] = level

    # Append to df list
    frames.append(dfl)

In [None]:
# Quick visual check of the most recently built long-format frame.
# Alternative inspections of the last wide frame (uncomment as needed):
# df.columns
# df.head()
dfl.head()

# Tidy up

In [None]:
# Concat list of dfs together
dft = pd.concat(frames)

# Drop rows with any nans
dft = dft.dropna()

# Tidy rows
dft = tidy_df(dft)

# YEAR is int in the attainment frames but comes out of melt() as a string
# in the arrangement frames. Coerce it to int so the column has a single
# type and the sort below orders all rows by year together.
dft = dft.assign(YEAR=dft['YEAR'].astype(int))

# Sort out row order and reset index
dft = dft.sort_values(['YEAR', 'LEVEL', 'SUBJECT', 'GRADE'])
dft = dft.reset_index(drop=True)

In [None]:
# Quick visual check of the combined table.
# Alternative inspections (uncomment as needed):
#   rows for a single subject
#dft.loc[(dft['SUBJECT'] == 'HEALTH & SOCIAL CARE')]
#   distinct level names present
#sorted(dft['LEVEL'].unique())
dft.head()

# Export to .csv

In [None]:
# Write the combined table to disk without the index column.
# NOTE(review): nothing in this notebook creates ./output_csvs — confirm the
# directory exists before running this cell.
dft.to_csv('./output_csvs/arrangements.csv', index=False)