## Import libraries and define functions

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Function to tidy columns and cells
def tidy(df):
    
    #### Fix usual issues with all strings
    
    # Capitalise headers
    df.columns = df.columns.astype(str).str.upper()
    
    # Capitalise columns
    df = df.map(lambda x: x.upper() if type(x) is str else x)

    # Strip whitespace
    df = df.map(lambda x: x.strip() if type(x) is str else x)

    # Remove parenthesis
    df = df.map(lambda x: x.replace('(', '') if type(x) is str else x)
    df = df.map(lambda x: x.replace(')', '') if type(x) is str else x)
    
    # Remove dashes
    df = df.map(lambda x: x.replace('-', '') if type(x) is str else x)
    
    # Remove full stops
    df = df.map(lambda x: x.replace('.', '') if type(x) is str else x)
    
    # Remove commas
    df = df.map(lambda x: x.replace(',', '') if type(x) is str else x)
    
    # Remove linebreaks
    df = df.map(lambda x: x.replace('\n', '') if type(x) is str else x)
    
    # Remove double spaces
    df = df.map(lambda x: x.replace('  ', ' ') if type(x) is str else x)

    # Replace annoying substrings
    df = df.map(lambda x: x.replace(' AND ', ' & ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' – ', ' - ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' / ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace('/ ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' /', '/') if type(x) is str else x)
    
    # Strip whitespace again
    df = df.map(lambda x: x.strip() if type(x) is str else x)
    
    # Drop rows with minimum number of 2 non-null values
    df = df.dropna(thresh=2)
    
    return df

# Function to strip suffixes
def tidy_headers(df):
    """Strip '(%)' and trailing '[NOTE n]' markers (n = 1..9) from headers.

    Operates on the frame that is passed in (not on any global): the
    columns of ``df`` are replaced in place and the same frame is returned.
    """

    # Remove (%) from column headings; regex=False so the parentheses are
    # matched literally regardless of the pandas default
    headers = df.columns.str.replace('(%)', '', regex=False)

    # Work from [NOTE 9] down to [NOTE 1], stripping white space after each
    # removal so that a following suffix ends up at the very end of the header
    for note in range(9, 0, -1):
        headers = headers.str.removesuffix(f'[NOTE {note}]').str.strip()

    df.columns = headers

    return df

# Function to strip numbered suffixes such as (1) from older-style headers
def tidy_older_headers(df):
    """Strip '(%)' and trailing '(n)' markers (n = 1..9) from headers.

    Operates on the frame that is passed in (not on any global): the
    columns of ``df`` are replaced in place and the same frame is returned.
    """

    # Remove (%) from column headings; regex=False so the parentheses are
    # matched literally regardless of the pandas default
    headers = df.columns.str.replace('(%)', '', regex=False)

    # Work from (9) down to (1), stripping white space after each removal
    # so that a following suffix ends up at the very end of the header
    for note in range(9, 0, -1):
        headers = headers.str.removesuffix(f'({note})').str.strip()

    df.columns = headers

    return df

## Read in school stats data

In [None]:
# Accumulator for the per-year frames; each yearly cell appends one frame
frames: list = []

In [None]:
# Load the 2023 workbook, treating the listed marker strings as missing
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2023+v2.xlsx',
    sheet_name='2023 School Level Statistics',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cell text and upper-case the headers
dfs = tidy(dfs)

# Strip (%) and [NOTE n] decorations from the headers
dfs = tidy_headers(dfs)

# Map this file's header wording onto the short names used downstream
dfs = dfs.rename(columns={
    'TEACHERS (FULL TIME EQUIVALENT)': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'P6-P7/S1-S6/SP PUPILS REGISTERED FOR FREE SCHOOL MEALS': 'FSM PUPILS',
})

# Keep only the columns that are carried into the combined table
dfs = dfs[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
           'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']]

# Label every row with this file's year code
dfs['YEAR'] = '2223'

# Queue the frame for concatenation
frames.append(dfs)

In [None]:
# Load the 2022 workbook, treating the listed marker strings as missing
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2022+V4.xlsx',
    sheet_name='2022 School Level Statistics',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cell text and upper-case the headers
dfs = tidy(dfs)

# Strip (%) and [NOTE n] decorations from the headers
dfs = tidy_headers(dfs)

# Map this file's header wording onto the short names used downstream
dfs = dfs.rename(columns={
    'TEACHERS (FULL TIME EQUIVALENT)': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'P6-P7/S1-S6/SP PUPILS REGISTERED FOR FREE SCHOOL MEALS': 'FSM PUPILS',
})

# Keep only the columns that are carried into the combined table
dfs = dfs[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
           'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']]

# Label every row with this file's year code
dfs['YEAR'] = '2122'

# Queue the frame for concatenation
frames.append(dfs)

In [None]:
# Load the 2021 workbook, treating the listed marker strings as missing
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2021.xlsx',
    sheet_name='2021 School Level Statistics',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# This year's Local Authority values carry an extra suffix; drop it before
# the general clean-up (the header is still in its original casing here)
dfs['Local Authority'] = dfs['Local Authority'].str.removesuffix(' [ NOTE ]')

# Clean the cell text and upper-case the headers
dfs = tidy(dfs)

# Strip (%) and [NOTE n] decorations from the headers
dfs = tidy_headers(dfs)

# Map this file's header wording onto the short names used downstream
dfs = dfs.rename(columns={
    'TEACHERS (FULL TIME EQUIVALENT)': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'P5-P7/S1-S6/SP PUPILS REGISTERED FOR FREE SCHOOL MEALS': 'FSM PUPILS',
})

# Keep only the columns that are carried into the combined table
dfs = dfs[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
           'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']]

# Label every row with this file's year code
dfs['YEAR'] = '2021'

# Queue the frame for concatenation
frames.append(dfs)

In [None]:
# Load the 2020 workbook, treating the listed marker strings as missing
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2020.xlsx',
    sheet_name='2020',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cell text and upper-case the headers
dfs = tidy(dfs)

# Strip (%) and (n) decorations from the older-style headers
dfs = tidy_older_headers(dfs)

# Map this file's header wording onto the short names used downstream
dfs = dfs.rename(columns={
    'FTE TEACHERS': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'NUMBER OF P4-P7/S1-S6/SP PUPILS REGISTERED FOR FSM': 'FSM PUPILS',
})

# Keep only the columns that are carried into the combined table
dfs = dfs[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
           'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']]

# Label every row with this file's year code
dfs['YEAR'] = '1920'

# Queue the frame for concatenation
frames.append(dfs)

In [None]:
# Load the 2019 workbook, treating the listed marker strings as missing
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2019.xlsx',
    sheet_name='2019',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cell text and upper-case the headers
dfs = tidy(dfs)

# Strip (%) and (n) decorations from the older-style headers
dfs = tidy_older_headers(dfs)

# Map this file's header wording onto the short names used downstream
dfs = dfs.rename(columns={
    'FTE TEACHERS': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'NUMBER OF P4-P7/S1-S6/SP PUPILS REGISTERED FOR FSM': 'FSM PUPILS',
})

# Keep only the columns that are carried into the combined table
dfs = dfs[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
           'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']]

# Label every row with this file's year code
dfs['YEAR'] = '1819'

# Queue the frame for concatenation
frames.append(dfs)

In [None]:
# Preview the first five rows of the most recently loaded frame
dfs.head(n=5)

## Combine frames and export

In [None]:
# Stack the yearly frames into one long table with a fresh 0..n-1 index
adfs = pd.concat(frames, axis=0, ignore_index=True)

# Treat zeros as missing values throughout the table
adfs = adfs.replace(0, np.nan)

# Cast YEAR to text and the code/count columns to nullable integers
adfs = adfs.astype({
    'YEAR': str,
    'SEEDCODE': 'Int64',
    'PUPIL ROLL': 'Int64',
    'ASN PUPILS': 'Int64',
    'FSM PUPILS': 'Int64',
})

In [None]:
## Update school names with contact list document

# Read in contact list data
cdf = pd.read_excel('./data/school+contact+list+31+October+2024.xlsx',
                    sheet_name='Open Schools', skiprows=5)

# Tidy data
cdf = tidy(cdf)

# Build SEED code -> value lookups (a repeated code keeps its last row)
codeName = dict(zip(cdf['SEED CODE'], cdf['SCHOOL NAME']))
codeLA = dict(zip(cdf['SEED CODE'], cdf['LA NAME']))
codeFund = dict(zip(cdf['SEED CODE'], cdf['CENTRE TYPE']))

# Funding comes from the contact list where the school is listed; everything
# else defaults to LOCAL AUTHORITY as only a few schools are Grant Aided.
# (Previously the default was assigned and then overwritten by the mapped
# values, leaving unlisted schools with NaN instead of the default.)
adfs['SCHOOL FUNDING'] = adfs['SEEDCODE'].map(codeFund).fillna('LOCAL AUTHORITY')

# Prefer the contact-list School Name and LA; keep the existing value when
# the contact list has nothing for that SEED code
adfs['SCHOOL'] = adfs['SEEDCODE'].map(codeName).combine_first(adfs['SCHOOL'])
adfs['LOCAL AUTHORITY'] = adfs['SEEDCODE'].map(codeLA).combine_first(adfs['LOCAL AUTHORITY'])

In [None]:
## Calc new stats

# Pupils per teacher
adfs['PUPIL TEACHER RATIO'] = adfs['PUPIL ROLL'].div(adfs['TEACHERS'])

# Share of the roll with an additional support need, as a percentage
adfs['ASN %'] = adfs['ASN PUPILS'].div(adfs['PUPIL ROLL']).mul(100)

# Share of the roll in the FSM PUPILS count, as a percentage
adfs['FSM %'] = adfs['FSM PUPILS'].div(adfs['PUPIL ROLL']).mul(100)

In [None]:
# Use a single label for Edinburgh (plain-text substitution, no regex)
adfs['LOCAL AUTHORITY'] = adfs['LOCAL AUTHORITY'].str.replace(
    'CITY OF EDINBURGH', 'EDINBURGH CITY', regex=False)

# Write the combined table to CSV without the index column
adfs.to_csv('./csvs/schoolStats.csv', index=False)

# Preview the first five rows of the exported table
adfs.head(n=5)