## Import libraries and define functions

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Function to tidy columns and cells
def tidy(df):
    
    #### Fix usual issues with all strings
    
    # Capitalise headers
    df.columns = df.columns.astype(str).str.upper()
    
    # Capitalise columns
    df = df.map(lambda x: x.upper() if type(x) is str else x)

    # Strip whitespace
    df = df.map(lambda x: x.strip() if type(x) is str else x)

    # Remove parenthesis
    df = df.map(lambda x: x.replace('(', '') if type(x) is str else x)
    df = df.map(lambda x: x.replace(')', '') if type(x) is str else x)
    
    # Remove dashes
    df = df.map(lambda x: x.replace('-', '') if type(x) is str else x)
    
    # Remove full stops
    df = df.map(lambda x: x.replace('.', '') if type(x) is str else x)
    
    # Remove commas
    df = df.map(lambda x: x.replace(',', '') if type(x) is str else x)
    
    # Remove linebreaks
    df = df.map(lambda x: x.replace('\n', '') if type(x) is str else x)
    
    # Remove double spaces
    df = df.map(lambda x: x.replace('  ', ' ') if type(x) is str else x)

    # Replace annoying substrings
    df = df.map(lambda x: x.replace(' AND ', ' & ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' – ', ' - ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' / ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace('/ ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' /', '/') if type(x) is str else x)
    
    # Strip whitespace again
    df = df.map(lambda x: x.strip() if type(x) is str else x)
    
    # Drop rows with minimum number of 2 non-null values
    df = df.dropna(thresh=2)
    
    return df

# Function to strip suffixes
def tidy_headers(df):
    """Strip '(%)' markers and trailing '[NOTE n]' suffixes from column headers.

    The headers of ``df`` are rewritten in place and the same frame is
    returned, so the call can be used as ``df = tidy_headers(df)``.
    Headers are expected to be upper case already (the suffixes are matched
    case-sensitively).
    """

    # Remove (%) from column headings; regex=False so the parentheses are
    # treated as literal characters on every pandas version
    df.columns = df.columns.str.replace('(%)', '', regex=False)

    # Create list of suffixes to remove
    ls = ['[NOTE 9]', '[NOTE 8]', '[NOTE 7]', '[NOTE 6]', '[NOTE 5]', '[NOTE 4]',
          '[NOTE 3]', '[NOTE 2]', '[NOTE 1]']

    # Loop through suffix list
    for i in ls:

        # Remove suffix
        df.columns = df.columns.str.removesuffix(i)

        # Strip white space so the next suffix sits at the very end
        df.columns = df.columns.str.strip()

    return df

# Function to strip suffixes
def tidy_older_headers(df):
    """Strip '(%)' markers and trailing '(n)' footnote suffixes from headers.

    Used for the older workbooks, which mark footnotes as '(1)'..'(9)'.
    The headers of ``df`` are rewritten in place and the same frame is
    returned.
    """

    # Remove (%) from column headings; regex=False so the parentheses are
    # treated as literal characters on every pandas version
    df.columns = df.columns.str.replace('(%)', '', regex=False)

    # Create list of suffixes to remove
    ls = ['(9)', '(8)', '(7)', '(6)', '(5)', '(4)', '(3)', '(2)', '(1)']

    # Loop through suffix list
    for i in ls:

        # Remove suffix
        df.columns = df.columns.str.removesuffix(i)

        # Strip white space so the next suffix sits at the very end
        df.columns = df.columns.str.strip()

    return df

## Read in school stats data

In [None]:
# Create empty list to store all df
# Each import cell below appends its per-year frame(s) here; the list is
# later passed to pd.concat to build the combined table.
frames = []

In [None]:
# Load the 2023 school-level statistics sheet, skipping the first row and
# treating the listed marker strings as missing values
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2023+v2.xlsx',
    sheet_name='2023 School Level Statistics',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cells first, then the headers
dfs = tidy(dfs)
dfs = tidy_headers(dfs)

# Map this workbook's long headers onto the shared short names
dfs = dfs.rename(columns={
    'TEACHERS (FULL TIME EQUIVALENT)': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'P6-P7/S1-S6/SP PUPILS REGISTERED FOR FREE SCHOOL MEALS': 'FSM PUPILS',
})

# Keep only the columns used in the combined table
stats_cols = ['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
              'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']
dfs = dfs[stats_cols]

# Tag every row with the year label
dfs['YEAR'] = '2223'

# Queue the frame for the final concat
frames.append(dfs)

In [None]:
# Load the 2022 school-level statistics sheet, skipping the first row and
# treating the listed marker strings as missing values
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2022+V4.xlsx',
    sheet_name='2022 School Level Statistics',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cells first, then the headers
dfs = tidy(dfs)
dfs = tidy_headers(dfs)

# Map this workbook's long headers onto the shared short names
dfs = dfs.rename(columns={
    'TEACHERS (FULL TIME EQUIVALENT)': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'P6-P7/S1-S6/SP PUPILS REGISTERED FOR FREE SCHOOL MEALS': 'FSM PUPILS',
})

# Keep only the columns used in the combined table
stats_cols = ['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
              'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']
dfs = dfs[stats_cols]

# Tag every row with the year label
dfs['YEAR'] = '2122'

# Queue the frame for the final concat
frames.append(dfs)

In [None]:
# Load the 2021 school-level statistics sheet, skipping the first row and
# treating the listed marker strings as missing values
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2021.xlsx',
    sheet_name='2021 School Level Statistics',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# This year's Local Authority values carry an extra ' [ NOTE ]' suffix;
# drop it before the general clean-up runs
raw_la = dfs['Local Authority']
dfs['Local Authority'] = raw_la.str.removesuffix(' [ NOTE ]')

# Clean the cells first, then the headers
dfs = tidy(dfs)
dfs = tidy_headers(dfs)

# Map this workbook's long headers onto the shared short names
# (the free-school-meals header starts at P5 in this file)
dfs = dfs.rename(columns={
    'TEACHERS (FULL TIME EQUIVALENT)': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'P5-P7/S1-S6/SP PUPILS REGISTERED FOR FREE SCHOOL MEALS': 'FSM PUPILS',
})

# Keep only the columns used in the combined table
stats_cols = ['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
              'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']
dfs = dfs[stats_cols]

# Tag every row with the year label
dfs['YEAR'] = '2021'

# Queue the frame for the final concat
frames.append(dfs)

In [None]:
# Load the 2020 statistics sheet, skipping the first row and treating the
# listed marker strings as missing values
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2020.xlsx',
    sheet_name='2020',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cells first, then the headers (older '(n)' footnote style)
dfs = tidy(dfs)
dfs = tidy_older_headers(dfs)

# Map this workbook's headers onto the shared short names
dfs = dfs.rename(columns={
    'FTE TEACHERS': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'NUMBER OF P4-P7/S1-S6/SP PUPILS REGISTERED FOR FSM': 'FSM PUPILS',
})

# Keep only the columns used in the combined table
stats_cols = ['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
              'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']
dfs = dfs[stats_cols]

# Tag every row with the year label
dfs['YEAR'] = '1920'

# Queue the frame for the final concat
frames.append(dfs)

In [None]:
# Load the 2019 statistics sheet, skipping the first row and treating the
# listed marker strings as missing values
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2019.xlsx',
    sheet_name='2019',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean the cells first, then the headers (older '(n)' footnote style)
dfs = tidy(dfs)
dfs = tidy_older_headers(dfs)

# Map this workbook's headers onto the shared short names
dfs = dfs.rename(columns={
    'FTE TEACHERS': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
    'NUMBER OF P4-P7/S1-S6/SP PUPILS REGISTERED FOR FSM': 'FSM PUPILS',
})

# Keep only the columns used in the combined table
stats_cols = ['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
              'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS', 'ATTENDANCE RATE']
dfs = dfs[stats_cols]

# Tag every row with the year label
dfs['YEAR'] = '1819'

# Queue the frame for the final concat
frames.append(dfs)

In [None]:
# Preview the first rows of whichever frame was loaded last (the 2019
# statistics when the cells are run in order) as a quick sanity check
dfs.head()

## Import older data

In [None]:
# Re-read the 2019 statistics sheet, this time treating only 'c' as missing.
# NOTE: the resulting frame is not appended to `frames` in this cell.
dfs = pd.read_excel(
    './data/School+level+summary+statistics+2019.xlsx',
    sheet_name='2019',
    skiprows=1,
    na_values='c',
)

# Clean the cells first, then the headers (older '(n)' footnote style)
dfs = tidy(dfs)
dfs = tidy_older_headers(dfs)

# Map this workbook's headers onto the shared short names
dfs = dfs.rename(columns={
    'FTE TEACHERS': 'TEACHERS',
    'PUPILS WITH AN ADDITIONAL SUPPORT NEED RECORDED': 'ASN PUPILS',
    'SCHOOL NAME': 'SCHOOL',
})

# Keep only the shared columns (no FSM column in this variant)
older_cols = ['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
              'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE']
dfs = dfs[older_cols]

# Tag every row with the year label
dfs['YEAR'] = '1819'

In [None]:
# Sheet year -> year label written to the YEAR column
dt = {'2018':'1718', '2017':'1617', '2016':'1516', '2015':'1415'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k, year_label in dt.items():

    # Read the contact-details sheet for this year
    dfs = pd.read_excel(
        './data/School+Contact+Details+2018.xlsx',
        sheet_name='Open at September ' + k,
        skiprows=5,
        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'),
    )

    # Clean the cells first, then the headers
    dfs = tidy(dfs)
    dfs = tidy_older_headers(dfs)

    # Overwrite the headers positionally to get unique column names
    dfs.columns = [
        'SEEDCODE', 'LA NAME', 'CENTRE TYPE', 'SCHOOL NAME', 'ADDRESS 1',
        'ADDRESS 2', 'ADDRESS 3', 'POST CODE', 'E-MAIL', 'PHONE', 'PRIMARY',
        'SECONDARY', 'SPECIAL', 'PRIMARY ROLL', 'SECONDARY ROLL',
        'SPECIAL ROLL', 'PRIMARY FTE', 'SECONDARY FTE', 'SPECIAL FTE',
        '6-FOLD URBAN/RURAL MEASURE', '8-FOLD URBAN/RURAL MEASURE',
        'DENOMINATION', 'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS',
        'PROPORTION OF PUPILS WHO LIVE IN 20% MOST DEPRIVED DATAZONES IN SCOTLAND',
    ]

    # Align header names with the statistics workbooks
    dfs = dfs.rename(columns={'LA NAME': 'LOCAL AUTHORITY', 'SCHOOL NAME': 'SCHOOL'})

    # Drop independent schools
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT']

    # These measures are not in the contact sheets
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Tag every row with the year label
    dfs['YEAR'] = year_label

    # Emit one frame per school type
    for t in sls:

        # Rows where this type's flag column is filled in
        dfst = dfs[dfs[t].notna()]

        # Derive the shared columns from the type-specific ones
        dfst = dfst.assign(**{
            'SCHOOL TYPE': dfst[t],
            'PUPIL ROLL': dfst[t + ' ROLL'].fillna(0),
            'TEACHERS': dfst[t + ' FTE'],
        })

        # Keep only the columns used in the combined table
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Queue the frame for the final concat
        frames.append(dfst)

In [None]:
# Sheet year -> year label written to the YEAR column
dt = {'2014':'1314'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k, year_label in dt.items():

    # Read the contact-details sheet for this year (7 rows skipped here)
    dfs = pd.read_excel(
        './data/School+Contact+Details+2018.xlsx',
        sheet_name='Open at September ' + k,
        skiprows=7,
        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'),
    )

    # Clean the cells first, then the headers
    dfs = tidy(dfs)
    dfs = tidy_older_headers(dfs)

    # Overwrite the headers positionally to get unique column names
    dfs.columns = [
        'SEEDCODE', 'LA NAME', 'CENTRE TYPE', 'SCHOOL NAME', 'ADDRESS 1',
        'ADDRESS 2', 'ADDRESS 3', 'ADDRESS 4', 'POST CODE 1', 'POST CODE 2',
        'E-MAIL', 'WEBSITE', 'PHONE', 'FAX', 'PRIMARY', 'SECONDARY', 'SPECIAL',
        'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL', 'PRIMARY FTE',
        'SECONDARY FTE', 'SPECIAL FTE', '6-FOLD URBAN/RURAL MEASURE',
        '8-FOLD URBAN/RURAL MEASURE', 'DENOMINATION',
        'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS',
        'PROPORTION OF PUPILS WHO LIVE IN 20% MOST DEPRIVED DATAZONES IN SCOTLAND',
    ]

    # Align header names with the statistics workbooks
    dfs = dfs.rename(columns={'LA NAME': 'LOCAL AUTHORITY', 'SCHOOL NAME': 'SCHOOL'})

    # Drop independent schools
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT']

    # These measures are not in the contact sheets
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Tag every row with the year label
    dfs['YEAR'] = year_label

    # Emit one frame per school type
    for t in sls:

        # Rows where this type's flag column is filled in
        dfst = dfs[dfs[t].notna()]

        # Derive the shared columns from the type-specific ones
        dfst = dfst.assign(**{
            'SCHOOL TYPE': dfst[t],
            'PUPIL ROLL': dfst[t + ' ROLL'].fillna(0),
            'TEACHERS': dfst[t + ' FTE'],
        })

        # Keep only the columns used in the combined table
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Queue the frame for the final concat
        frames.append(dfst)

In [None]:
# Sheet year -> year label written to the YEAR column
dt = {'2013':'1213'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k, year_label in dt.items():

    # Read the contact-details sheet for this year
    dfs = pd.read_excel(
        './data/School+Contact+Details+2018.xlsx',
        sheet_name='Open at September ' + k,
        skiprows=5,
        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'),
    )

    # Clean the cells first, then the headers
    dfs = tidy(dfs)
    dfs = tidy_older_headers(dfs)

    # Overwrite the headers positionally to get unique column names
    dfs.columns = [
        'SEEDCODE', 'LA NAME', 'CENTRE TYPE', 'SCHOOL NAME', 'ADDRESS 1',
        'ADDRESS 2', 'ADDRESS 3', 'ADDRESS 4', 'POST CODE 1', 'POST CODE 2',
        'E-MAIL', 'WEBSITE', 'PHONE', 'FAX', 'PRIMARY', 'SECONDARY', 'SPECIAL',
        'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL', 'PRIMARY FTE',
        'SECONDARY FTE', 'SPECIAL FTE', '6-FOLD URBAN/RURAL MEASURE',
        '8-FOLD URBAN/RURAL MEASURE', 'DENOMINATION',
        'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS',
        'PROPORTION OF PUPILS WHO LIVE IN 20% MOST DEPRIVED DATAZONES IN SCOTLAND',
    ]

    # Align header names with the statistics workbooks
    dfs = dfs.rename(columns={'LA NAME': 'LOCAL AUTHORITY', 'SCHOOL NAME': 'SCHOOL'})

    # Drop independent schools
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT']

    # These measures are not in the contact sheets
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Tag every row with the year label
    dfs['YEAR'] = year_label

    # Emit one frame per school type
    for t in sls:

        # Rows where this type's flag column is filled in
        dfst = dfs[dfs[t].notna()]

        # Derive the shared columns from the type-specific ones
        dfst = dfst.assign(**{
            'SCHOOL TYPE': dfst[t],
            'PUPIL ROLL': dfst[t + ' ROLL'].fillna(0),
            'TEACHERS': dfst[t + ' FTE'],
        })

        # Keep only the columns used in the combined table
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Queue the frame for the final concat
        frames.append(dfst)

In [None]:
# Sheet year -> year label written to the YEAR column
dt = {'2012':'1112'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k, year_label in dt.items():

    # Read the contact-details sheet for this year
    dfs = pd.read_excel(
        './data/School+Contact+Details+2018.xlsx',
        sheet_name='Open at September ' + k,
        skiprows=5,
        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'),
    )

    # Clean the cells first, then the headers
    dfs = tidy(dfs)
    dfs = tidy_older_headers(dfs)

    # Overwrite the headers positionally to get unique column names
    # (this sheet lists LA name and centre type before the seed code)
    dfs.columns = [
        'LA NAME', 'CENTRE TYPE', 'SEEDCODE', 'SCHOOL NAME', 'ADDRESS 1',
        'ADDRESS 2', 'ADDRESS 3', 'ADDRESS 4', 'POST CODE 1', 'POST CODE 2',
        'E-MAIL', 'WEBSITE', 'PHONE', 'FAX', 'PRIMARY', 'SECONDARY', 'SPECIAL',
        'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL', 'PRIMARY FTE',
        'SECONDARY FTE', 'SPECIAL FTE', 'DENOMINATION',
        'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS',
    ]

    # Align header names with the statistics workbooks
    dfs = dfs.rename(columns={'LA NAME': 'LOCAL AUTHORITY', 'SCHOOL NAME': 'SCHOOL'})

    # Drop independent schools
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT']

    # These measures are not in the contact sheets
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Tag every row with the year label
    dfs['YEAR'] = year_label

    # Emit one frame per school type
    for t in sls:

        # Rows where this type's flag column is filled in
        dfst = dfs[dfs[t].notna()]

        # Derive the shared columns from the type-specific ones
        dfst = dfst.assign(**{
            'SCHOOL TYPE': dfst[t],
            'PUPIL ROLL': dfst[t + ' ROLL'].fillna(0),
            'TEACHERS': dfst[t + ' FTE'],
        })

        # Keep only the columns used in the combined table
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Queue the frame for the final concat
        frames.append(dfst)

In [None]:
# Sheet year -> year label written to the YEAR column
dt = {'2011':'1011', '2010':'0910'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k, year_label in dt.items():

    # Read the contact-details sheet for this year
    dfs = pd.read_excel(
        './data/School+Contact+Details+2018.xlsx',
        sheet_name='Open at September ' + k,
        skiprows=5,
        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'),
    )

    # Clean the cells first, then the headers
    dfs = tidy(dfs)
    dfs = tidy_older_headers(dfs)

    # Overwrite the headers positionally to get unique column names
    dfs.columns = [
        'LA NAME', 'CENTRE TYPE', 'SEEDCODE', 'SCHOOL NAME', 'ADDRESS 1',
        'ADDRESS 2', 'ADDRESS 3', 'ADDRESS 4', 'POST CODE 1', 'POST CODE 2',
        'E-MAIL', 'WEBSITE', 'PHONE', 'FAX', 'PRIMARY', 'SECONDARY', 'SPECIAL',
        'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL', 'PRIMARY FTE',
        'SECONDARY FTE', 'SPECIAL FTE', '6-FOLD URBAN/RURAL MEASURE',
        '8-FOLD URBAN/RURAL MEASURE', 'DENOMINATION',
        'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS',
    ]

    # Align header names with the statistics workbooks
    dfs = dfs.rename(columns={'LA NAME': 'LOCAL AUTHORITY', 'SCHOOL NAME': 'SCHOOL'})

    # Drop independent schools
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT']

    # These measures are not in the contact sheets
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Tag every row with the year label
    dfs['YEAR'] = year_label

    # Emit one frame per school type
    for t in sls:

        # Rows where this type's flag column is filled in
        dfst = dfs[dfs[t].notna()]

        # Derive the shared columns from the type-specific ones
        dfst = dfst.assign(**{
            'SCHOOL TYPE': dfst[t],
            'PUPIL ROLL': dfst[t + ' ROLL'].fillna(0),
            'TEACHERS': dfst[t + ' FTE'],
        })

        # Keep only the columns used in the combined table
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Queue the frame for the final concat
        frames.append(dfst)

In [None]:
# Sheet year -> year label written to the YEAR column
dt = {'2009':'0809'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k, year_label in dt.items():

    # Read the contact-details sheet for this year (3 rows skipped here)
    dfs = pd.read_excel(
        './data/School+Contact+Details+2018.xlsx',
        sheet_name='Open at September ' + k,
        skiprows=3,
        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'),
    )

    # Clean the cells first, then the headers
    dfs = tidy(dfs)
    dfs = tidy_older_headers(dfs)

    # Overwrite the headers positionally to get unique column names
    dfs.columns = [
        'LOCAL AUTHORITY', 'CENTRE TYPE', 'CENTRE', 'SCHOOL NAME', 'ADDRESS 1',
        'ADDRESS 2', 'ADDRESS 3', 'POST CODE 1', 'POST CODE 2', 'E-MAIL',
        'WEBSITE', 'PHONE', 'FAX', 'PRE-SCHOOL', 'PRIMARY', 'SECONDARY',
        'SPECIAL', 'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL',
        'PRIMARY FTE', 'SECONDARY FTE', 'SPECIAL FTE', 'DATAZONE',
        '6-FOLD URBAN/RURAL MEASURE', '8-FOLD URBAN/RURAL MEASURE',
        'DENOMINATION', 'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS',
    ]

    # Align header names with the statistics workbooks; the 'LA NAME' entry
    # matches nothing here because the list above already says 'LOCAL AUTHORITY'
    dfs = dfs.rename(columns={
        'LA NAME': 'LOCAL AUTHORITY',
        'SCHOOL NAME': 'SCHOOL',
        'CENTRE': 'SEEDCODE',
    })

    # Drop independent schools
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT']

    # These measures are not in the contact sheets
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Tag every row with the year label
    dfs['YEAR'] = year_label

    # Emit one frame per school type
    for t in sls:

        # Rows where this type's flag column is filled in
        dfst = dfs[dfs[t].notna()]

        # Derive the shared columns from the type-specific ones
        dfst = dfst.assign(**{
            'SCHOOL TYPE': dfst[t],
            'PUPIL ROLL': dfst[t + ' ROLL'].fillna(0),
            'TEACHERS': dfst[t + ' FTE'],
        })

        # Keep only the columns used in the combined table
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Queue the frame for the final concat
        frames.append(dfst)

In [None]:
# Create dictionary for year column
# The label follows the same convention as every later year (sheet year N ->
# 'N-1' + 'N', e.g. 2009 -> '0809'); it previously read '0807', which was
# reversed relative to that convention.
dt = {'2008':'0708'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k in dt:

    # Read in data (3 rows skipped; listed marker strings become NaN)
    dfs = pd.read_excel('./data/School+Contact+Details+2018.xlsx',
                        sheet_name='Open at September ' + k,
                        skiprows=3,
                        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'))
    # Tidy data
    dfs = tidy(dfs)

    # Tidy headers
    dfs = tidy_older_headers(dfs)

    # Fix duplicate column names by overwriting the headers positionally
    dfs.columns = ['LOCAL AUTHORITY', 'CENTRE TYPE', 'CENTRE', 'SCHOOL NAME', 'ADDRESS 1',
                   'ADDRESS 2', 'ADDRESS 3', 'POST CODE 1', 'POST CODE 2', 'EMAIL',
                   'WEBSITE', 'PHONE', 'FAX', 'PRE-SCHOOL', 'PRIMARY', 'SECONDARY',
                   'SPECIAL', 'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL',
                   'PRIMARY FTE', 'SECONDARY FTE', 'SPECIAL FTE', 'URBAN/RURAL MEASURE',
                   'DENOMINATION', 'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS']

    # Rename column headers ('LOCAL AUTHORITY' is already set above)
    dfs = dfs.rename(columns={'SCHOOL NAME': 'SCHOOL', 'CENTRE': 'SEEDCODE'})

    # Select LA schools; copy so the column assignments below write to an
    # independent frame rather than a slice of the filtered one
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT'].copy()

    # Add missing columns
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Add year
    dfs['YEAR'] = dt[k]

    # Loop through types of school
    for t in sls:

        # Filter by school type (t): rows where the type flag is filled in
        dfst = dfs[dfs[t].notna()].copy()

        # Add school type column
        dfst['SCHOOL TYPE'] = dfst[t]

        # Add pupil roll column (missing roll counted as 0)
        dfst['PUPIL ROLL'] = dfst[t + ' ROLL'].fillna(0)

        # Add teachers column
        dfst['TEACHERS'] = dfst[t + ' FTE']

        # Select subset of columns
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Append df to list
        frames.append(dfst)

In [None]:
# Create dictionary for year column
# The label follows the same convention as every later year (sheet year N ->
# 'N-1' + 'N', e.g. 2009 -> '0809'); it previously read '0706', which was
# reversed relative to that convention.
dt = {'2007':'0607'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k in dt:

    # Read in data (10 rows skipped; listed marker strings become NaN)
    dfs = pd.read_excel('./data/School+Contact+Details+2018.xlsx',
                        sheet_name='Open at September ' + k,
                        skiprows=10,
                        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'))
    # Tidy data
    dfs = tidy(dfs)

    # Tidy headers
    dfs = tidy_older_headers(dfs)

    # Fix duplicate column names by overwriting the headers positionally
    dfs.columns = ['LOCAL AUTHORITY', 'CENTRE TYPE', 'CENTRE', 'SCHOOL NAME', 'ADDRESS 1',
                   'ADDRESS 2', 'ADDRESS 3', 'POST CODE 1', 'POST CODE 2', 'EMAIL',
                   'WEBSITE', 'PHONE', 'FAX', 'PRE-SCHOOL', 'PRIMARY', 'SECONDARY',
                   'SPECIAL', 'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL', 'PRIMARY FTE',
                   'SECONDARY FTE', 'THROUGH (INDEPENDENT ONLY)', 'SPECIAL FTE',
                   'URBAN/RURAL MEASURE', 'DENOMINATION',
                   'PROPORTION OF PUPILS REGISTERED FOR FREE SCHOOL MEALS',
                   'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS']

    # Rename column headers ('LOCAL AUTHORITY' is already set above)
    dfs = dfs.rename(columns={'SCHOOL NAME': 'SCHOOL', 'CENTRE': 'SEEDCODE'})

    # Select LA schools; copy so the column assignments below write to an
    # independent frame rather than a slice of the filtered one
    dfs = dfs.loc[dfs['CENTRE TYPE'] != 'INDEPENDENT'].copy()

    # Add missing columns
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Add year
    dfs['YEAR'] = dt[k]

    # Loop through types of school
    for t in sls:

        # Filter by school type (t): rows where the type flag is filled in
        dfst = dfs[dfs[t].notna()].copy()

        # Add school type column
        dfst['SCHOOL TYPE'] = dfst[t]

        # Add pupil roll column (missing roll counted as 0)
        dfst['PUPIL ROLL'] = dfst[t + ' ROLL'].fillna(0)

        # Add teachers column
        dfst['TEACHERS'] = dfst[t + ' FTE']

        # Select subset of columns
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Append df to list
        frames.append(dfst)

In [None]:
# Create dictionary for year column
# The label follows the same convention as every later year (sheet year N ->
# 'N-1' + 'N', e.g. 2009 -> '0809'); it previously read '0605', which was
# reversed relative to that convention.
dt = {'2006':'0506'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']


# Fix local authority for grant aided schools for this year
def fix_LA(row):
    """Return 'GRANT AIDED' for grant-aided rows, else the row's Local Authority."""
    if row['School Funding Type'] == 'Grant aided':
        return 'GRANT AIDED'
    return row['Local Authority']


for k in dt:

    # Read in data (6 rows skipped; listed marker strings become NaN)
    dfs = pd.read_excel('./data/School+Contact+Details+2018.xlsx',
                        sheet_name='Open at September ' + k,
                        skiprows=6,
                        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'))

    ### Special fixes for 0506 data (applied to the raw, untidied headers)

    # Fix suffix for school name for this year
    dfs['School Name'] = dfs['School Name'].str.removesuffix('(3)')

    # Overwrite the local authority of grant aided schools
    dfs['Local Authority'] = dfs.apply(fix_LA, axis=1)

    ### End of special fixes for 0506 data

    # Tidy data
    dfs = tidy(dfs)

    # Tidy headers
    dfs = tidy_older_headers(dfs)

    # Fix duplicate column names by overwriting the headers positionally
    dfs.columns = ['LOCAL AUTHORITY', 'SCHOOL NAME', 'SEED CODE', 'SCHOOL FUNDING TYPE',
                   'ADDRESS 1', 'ADDRESS 2', 'POST CODE 1', 'POST CODE 2', 'PHONE', 'FAX',
                   'E-MAIL', 'WEB ADDRESS', 'PRE SCHOOL', 'PRIMARY', 'SECONDARY',
                   'SPECIAL', 'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL', 'PRIMARY FTE',
                   'SECONDARY FTE', 'THROUGH (INDEPENDENT ONLY)', 'SPECIAL FTE', 'LOCATION',
                   'DENOMINATION', 'PROPORTION OF PUPILS FROM MINORITY ETHNIC GROUPS',
                   'PROPORTION OF PUPILS REGISTERED FOR FREE SCHOOL MEALS']

    # Rename column headers ('LOCAL AUTHORITY' is already set above)
    dfs = dfs.rename(columns={'SCHOOL NAME': 'SCHOOL', 'SEED CODE': 'SEEDCODE'})

    # Select LA schools; copy so the column assignments below write to an
    # independent frame rather than a slice of the filtered one
    dfs = dfs.loc[dfs['SCHOOL FUNDING TYPE'] != 'INDEPENDENT'].copy()

    # Add missing columns
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Add year
    dfs['YEAR'] = dt[k]

    # Loop through types of school
    for t in sls:

        # Filter by school type (t): rows where the type flag is filled in
        dfst = dfs[dfs[t].notna()].copy()

        # Add school type column
        dfst['SCHOOL TYPE'] = dfst[t]

        # Add pupil roll column (missing roll counted as 0)
        dfst['PUPIL ROLL'] = dfst[t + ' ROLL'].fillna(0)

        # Add teachers column
        dfst['TEACHERS'] = dfst[t + ' FTE']

        # Select subset of columns
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Append df to list
        frames.append(dfst)

In [None]:
# Create dictionary for year column
# The label follows the same convention as every later year (sheet year N ->
# 'N-1' + 'N', e.g. 2009 -> '0809'); it previously read '0504', which was
# reversed relative to that convention.
dt = {'2005':'0405'}

# School types, each with its own flag / roll / FTE columns in the sheet
sls = ['PRIMARY', 'SECONDARY', 'SPECIAL']

for k in dt:

    # Read in data (5 rows skipped; listed marker strings become NaN)
    dfs = pd.read_excel('./data/School+Contact+Details+2018.xlsx',
                        sheet_name='Open at September ' + k,
                        skiprows=5,
                        na_values=('c','.', '0', 'n/a', 'N/A', '0(2)'))
    # Tidy data
    dfs = tidy(dfs)

    # Tidy headers
    dfs = tidy_older_headers(dfs)

    # Fix duplicate column names by overwriting the headers positionally
    dfs.columns = ['LOCAL AUTHORITY', 'SEED CODE', 'SCHOOL NAME', 'SCHOOL FUNDING TYPE',
                   'ADDRESS 1', 'ADDRESS 2', 'POST CODE 1', 'POST CODE 2', 'PHONE', 'FAX',
                   'E-MAIL', 'WEB ADDRESS', 'PRE SCHOOL', 'PRIMARY', 'SECONDARY',
                   'SPECIAL', 'SEN UNIT', 'PRIMARY ROLL', 'SECONDARY ROLL', 'SPECIAL ROLL',
                   'PRIMARY FTE', 'SECONDARY FTE', 'SPECIAL FTE', 'LOCATION', 'DENOMINATION']

    # Rename column headers ('LOCAL AUTHORITY' is already set above)
    dfs = dfs.rename(columns={'SCHOOL NAME': 'SCHOOL', 'SEED CODE': 'SEEDCODE'})

    # Select LA schools; copy so the column assignments below write to an
    # independent frame rather than a slice of the filtered one
    dfs = dfs.loc[dfs['SCHOOL FUNDING TYPE'] != 'INDEPENDENT'].copy()

    # Add missing columns
    dfs['ASN PUPILS'] = np.nan
    dfs['ATTENDANCE RATE'] = np.nan

    # Add year
    dfs['YEAR'] = dt[k]

    # Loop through types of school
    for t in sls:

        # Filter by school type (t): rows where the type flag is filled in
        dfst = dfs[dfs[t].notna()].copy()

        # Add school type column
        dfst['SCHOOL TYPE'] = dfst[t]

        # Add pupil roll column (missing roll counted as 0)
        dfst['PUPIL ROLL'] = dfst[t + ' ROLL'].fillna(0)

        # Add teachers column
        dfst['TEACHERS'] = dfst[t + ' FTE']

        # Select subset of columns
        dfst = dfst[['SEEDCODE', 'LOCAL AUTHORITY', 'SCHOOL TYPE', 'SCHOOL', 'TEACHERS',
                     'PUPIL ROLL', 'ASN PUPILS', 'ATTENDANCE RATE', 'YEAR']]

        # Append df to list
        frames.append(dfst)

In [None]:
# Preview the first rows of whichever frame was loaded last (the 2005
# contact sheet when the cells are run in order) as a quick sanity check
dfs.head()

## Combine frames and export

In [None]:
# Stack every queued frame into one long table with a fresh 0..n-1 index
adfs = pd.concat(frames, axis=0, ignore_index=True)

# Treat zeros as missing values
adfs = adfs.replace(0, np.nan)

# YEAR is stored as text
adfs['YEAR'] = adfs['YEAR'].astype(str)

# Identifier and count columns use the nullable integer dtype so that
# missing values can sit alongside whole numbers
for int_col in ('SEEDCODE', 'PUPIL ROLL', 'ASN PUPILS', 'FSM PUPILS'):
    adfs[int_col] = adfs[int_col].astype('Int64')

In [None]:
## Update school names with contact list document

# Read in contact list data
cdf = pd.read_excel('./data/school+contact+list+31+October+2024.xlsx', sheet_name='Open Schools', skiprows=5)

# Tidy data
cdf = tidy(cdf)

# Build seed code -> value lookups (for a repeated seed code the last row wins)
codeName = dict(zip(cdf['SEED CODE'], cdf['SCHOOL NAME']))
codeLA = dict(zip(cdf['SEED CODE'], cdf['LA NAME']))
codeFund = dict(zip(cdf['SEED CODE'], cdf['CENTRE TYPE']))

# Funding type comes from the contact list; schools whose seed code is not in
# the list fall back to 'LOCAL AUTHORITY' as only a few are Grant Aided.
# (Previously the default was assigned first and then overwritten by the
# lookup, which left unmatched schools as NaN.)
adfs['SCHOOL FUNDING'] = adfs['SEEDCODE'].map(codeFund).fillna('LOCAL AUTHORITY')

# Replace School and Local Authority with the contact-list value, keeping
# the existing value wherever the lookup has no entry
adfs['SCHOOL'] = adfs['SEEDCODE'].map(codeName).combine_first(adfs['SCHOOL'])
adfs['LOCAL AUTHORITY'] = adfs['SEEDCODE'].map(codeLA).combine_first(adfs['LOCAL AUTHORITY'])

In [None]:
## Calc new stats

# Shared denominator for the percentage columns
roll = adfs['PUPIL ROLL']

# Pupils per teacher
adfs['PUPIL TEACHER RATIO'] = roll.div(adfs['TEACHERS'])

# Share of the roll with a recorded additional support need, in percent
adfs['ASN %'] = adfs['ASN PUPILS'].div(roll).mul(100)

# Share of the roll registered for free school meals, in percent
adfs['FSM %'] = adfs['FSM PUPILS'].div(roll).mul(100)

In [None]:
# Output df to .csv
# Writes the combined table to ./csvs/schoolStats.csv without the row index
adfs.to_csv('./csvs/schoolStats.csv', index=False)

# Preview the first rows of the exported table
adfs.head()

## Old code

In [None]:
# Concat list into long df
adfs = pd.concat(frames, axis = 0, ignore_index = True)

# Convert zeros to nan
adfs = adfs.replace(0, np.nan)

# Convert dtype several columns
adfs['YEAR'] = adfs['YEAR'].astype(str)
adfs['SEEDCODE'] = adfs['SEEDCODE'].astype('Int64')
adfs['PUPIL ROLL'] = adfs['PUPIL ROLL'].astype('Int64')
adfs['ASN PUPILS'] = adfs['ASN PUPILS'].astype('Int64')

# Tidy up loose ends
adfs['LOCAL AUTHORITY'] = adfs['LOCAL AUTHORITY'].str.replace('CITY OF EDINBURGH', 'EDINBURGH CITY')
adfs['LOCAL AUTHORITY'] = adfs['LOCAL AUTHORITY'].str.replace('EDINBURGH CITY OF', 'EDINBURGH CITY')
adfs['LOCAL AUTHORITY'] = adfs['LOCAL AUTHORITY'].str.replace('EILEAN SIAR', 'NA HEILEANAN SIAR')
adfs['LOCAL AUTHORITY'] = adfs['LOCAL AUTHORITY'].str.replace('GRANT MAINTAINED', 'GRANT AIDED')
adfs['LOCAL AUTHORITY'] = adfs['LOCAL AUTHORITY'].str.replace('GRANTMAINTAINED', 'GRANT AIDED')

## Fix 0506 school names with seed codes from 0405 document

# Load the 'Open at September 2005' sheet of the school contact details workbook
dfn = pd.read_excel(
    './data/School+Contact+Details+2018.xlsx',
    sheet_name='Open at September 2005',
    skiprows=5,
    na_values=('c', '.', '0', 'n/a', 'N/A', '0(2)'),
)

# Clean up headers and cell values
dfn = tidy(dfn)

# Build a seed code -> school name lookup
codes1 = dict(zip(dfn['SEED CODE'], dfn['SCHOOL NAME']))

# Look up each row's seed code and keep the result in a working column
names_from_2005 = adfs['SEEDCODE'].map(codes1)
adfs['SCHOOL UPDATED'] = names_from_2005

# Prefer the looked-up name; rows without a match keep their existing SCHOOL value
adfs['SCHOOL'] = names_from_2005.combine_first(adfs['SCHOOL'])

## Map updated school names with seed codes from 2023 document

# Load the '2023 School Level Statistics' sheet of the 2023 summary workbook
dfm = pd.read_excel(
    './data/School+level+summary+statistics+2023+v2.xlsx',
    sheet_name='2023 School Level Statistics',
    skiprows=1,
    na_values=('c', '#', '*', 'z', 'x'),
)

# Clean up headers and cell values, then keep only the two columns needed
dfm = tidy(dfm)
dfm = dfm[['SEEDCODE', 'SCHOOL NAME']]

# Build a seed code -> school name lookup
codes2 = dict(zip(dfm['SEEDCODE'], dfm['SCHOOL NAME']))

# Look up each row's seed code and keep the result in a working column
names_from_2023 = adfs['SEEDCODE'].map(codes2)
adfs['SCHOOL UPDATED'] = names_from_2023

# Prefer the looked-up name; rows without a match keep their existing SCHOOL value
adfs['SCHOOL'] = names_from_2023.combine_first(adfs['SCHOOL'])

## Tidy up remaining names with seed codes manually

# Hand-maintained seed code -> school name lookup for the dodgy values.
# Each seed code must appear only once: Python keeps the LAST value for a
# repeated key in a dict literal and silently discards the earlier one.
#
# NOTE(review): 8314632 used to be listed twice, first as
# 'CUMBERNAULD HIGH SCHOOL' and then as "ST COLUMBA'S HIGH SCHOOL". Only the
# second entry ever took effect, so the dead first entry has been removed to
# leave the mapping unchanged. Please confirm ST COLUMBA'S is the intended
# name for this seed code.
codes3 = {
    8400032: 'GLASGOW GAELIC SECONDARY SCHOOL',
    8359334: 'ABRONHILL HIGH SCHOOL',
    8357234: 'CUMBERNAULD HIGH SCHOOL',
    8314632: "ST COLUMBA'S HIGH SCHOOL",
    8314438: "ST ANDREW'S HIGH SCHOOL",
    6203930: 'PAIBLE SCHOOL',
    6102530: 'SKERRIES SCHOOL',
    5939534: 'MAXWELLTOWN HIGH SCHOOL',
    5407532: 'KIRKLAND HIGH SCHOOL',
    5406331: 'BUCKHAVEN HIGH SCHOOL',
    5351138: "ST COLUMBA'S RC HIGH SCHOOL",
    5331234: "ST SAVIOUR'S RC HIGH SCHOOL",
    5330238: 'MENZIESHILL HIGH SCHOOL',
    5329736: 'LAWSIDE RC ACADEMY',

    5113520: 'BADCAUL PRIMARY',
    5116023: 'KINLOCHEWE PRIMARY',
    5126428: 'ELGOL PRIMARY',
    5127521: 'STRUAN PRIMARY',
    5139023: 'FOYERS PRIMARY',
    5145724: 'GERGASK PRIMARY',
    5212529: 'EASTERFIELD PRIMARY SCHOOL',
    5216621: 'FISHERFORD PRIMARY SCHOOL',
    5224829: 'CLATT PRIMARY SCHOOL',
    5226325: 'GARTLY PRIMARY SCHOOL',
    5240042: 'VIRTUAL SCHOOL',
    5536847: "HOWDENHALL & ST KATHARINE'S",
    8338744: 'SECONDARY WELLBEING',
    8414041: 'EVIP',
    8421544: "ST CHARLES' LANGUAGE & COMMUNICATION RESOURCE",

    8215634: 'GARNOCK COMMUNITY CAMPUS',
}

# Look up each row's seed code in the manual dictionary and keep the result
# in the working column
manual_names = adfs['SEEDCODE'].map(codes3)
adfs['SCHOOL UPDATED'] = manual_names

# Prefer the looked-up name; rows without a match keep their existing SCHOOL value
adfs['SCHOOL'] = manual_names.combine_first(adfs['SCHOOL'])

# Remove the working column
adfs = adfs.drop(columns='SCHOOL UPDATED')

In [None]:
# Show the last rows of adfs
adfs.tail()

In [None]:
# Save adfs as a CSV file, leaving out the row index
legacy_csv_path = './csvs/school_stats.csv'
adfs.to_csv(legacy_csv_path, index=False)