In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
def readfile(year, ext, sheet, badrows, badfooter,
             data_dir='./Summary Statistics for Schools in Scotland/'):
    """Read one sheet of a yearly workbook and return it tidied.

    Parameters
    ----------
    year : str
        File stem of the workbook, e.g. ``'2013'``.
    ext : str
        File extension including the leading dot, e.g. ``'.xls'``.
    sheet : str
        Name of the sheet to read.
    badrows : int
        Forwarded to ``skiprows`` of ``pd.read_excel``.
    badfooter : int
        Forwarded to ``skipfooter`` of ``pd.read_excel``.
    data_dir : str or path-like, optional
        Folder holding the workbooks. Defaults to the folder this notebook
        has always read from, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The raw sheet after being passed through ``tidy``.
    """
    # Build the path from its parts instead of concatenating strings
    workbook = Path(data_dir) / f'{year}{ext}'

    # No header row is assumed; '..', '#' and ':' are read as missing values
    raw = pd.read_excel(
        workbook,
        sheet_name=sheet,
        header=None,
        na_values=['..', '#', ':'],
        skiprows=badrows,
        skipfooter=badfooter,
    )
    return tidy(raw)
    
def tidy(df):
    """Clean the label column of a raw sheet and drop empty rows/columns.

    Steps, in order:
      1. remove footnote markers such as ``(1)`` and any leftover
         parentheses from every string cell;
      2. strip, title-case and normalise the labels in column ``0``;
      3. drop rows whose label is missing;
      4. drop columns that contain nothing but missing values and zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet with integer column labels; column ``0`` holds the row
        labels. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # Footnote markers to remove, applied in this order
    footnotes = ["(1) (2)", " (1) (2)", " (1)", "(1)", " (2)", "(2)", " (3)", "(3)",
                 " (4)", "(4)", " (5)", "(5)", " (6)", "(6)"]

    # Label substitutions towards common values; order matters because later
    # patterns rely on earlier ones (e.g. 'Asn' -> 'ASN' after title-casing)
    renames = [
        ('Asn', 'ASN'),
        ('Additional Support Needs ASN', 'ASN'),
        ('English As An Additional Language', 'EAL'),
        ('Csp', 'CSP'),
        ('Iep', 'IEP'),
        ('IEP Individualised Education Programme', 'IEP'),
        ('CSP Co-Ordinated Support Plan', 'CSP'),
        ('Assessed/Declared Disabled', 'Disabled'),
        ('Total Pupils With ASN', 'All Pupils With ASN'),
    ]

    def _clean_cell(value):
        # Numbers and NaN pass through untouched
        if type(value) is not str:
            return value
        for marker in footnotes:
            value = value.replace(marker, '')
        # Remaining parentheses are removed so later exact matches work
        return value.replace('(', '').replace(')', '')

    # DataFrame.map supersedes the deprecated DataFrame.applymap (pandas 2.1+);
    # look it up on the class so a column called 'map' cannot shadow it.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap

    # One pass over the frame; this returns a new frame, so the caller's
    # input is left untouched
    df = elementwise(df, _clean_cell)

    # Strip AFTER removing markers: removing '(1) (2)' from 'Ethnicity (1) (2)'
    # leaves a trailing space that would break exact label matching
    labels = df[0].str.strip().str.title()

    # Replace en dash with a plain hyphen
    labels = labels.str.replace(' – ', ' - ', regex=False)

    # All patterns are literal text, so regex matching is switched off
    for old, new in renames:
        labels = labels.str.replace(old, new, regex=False)
    df[0] = labels

    # Remove rows with NaN in first column
    df = df[df[0].notna()]

    # Drop columns in which every remaining cell is either missing or zero
    empty_or_zero = (df.isnull() | (df == 0)).all()
    df = df.drop(columns=empty_or_zero[empty_or_zero].index.to_list())

    return df

In [None]:
### Import the relevant sheet(s) from every yearly workbook
# The workbook layout changed over the years, so each entry describes one
# layout: (years, file extension, sheet name, rows to skip, footer rows to skip).
# 2022 and 2023 are listed twice because two sheets ('20' and '21') are read
# from each of those workbooks.
SHEET_SPECS = [
    (['2013'], '.xlsx', 'Table 3.5', 2, 1),
    (['2014'], '.xls', 'Table 3.5', 2, 1),
    (['2015'], '.xls', 'Table 4.5', 2, 2),
    (['2016', '2017', '2018'], '.xls', 'Table 4.5', 3, 2),
    (['2019', '2020', '2021'], '.xlsx', '4.5', 3, 3),
    (['2022', '2023'], '.xlsx', '20', 3, 0),
    (['2022', '2023'], '.xlsx', '21', 3, 3),
]

# Collect [tidied sheet, year] pairs, in the order the specs are listed
frames = [
    [readfile(year, ext, sheet, badrows, badfooter), year]
    for years, ext, sheet, badrows, badfooter in SHEET_SPECS
    for year in years
]

In [None]:
## Split every sheet into the sub-tables stacked on it
# Labels in the first column that mark the start of a new sub-table
markers = ['Ethnicity', 'ASN', 'EAL', 'Total Pupils']

# One [dict of sub-tables, year] entry per sheet
tables_ls = []

for sheet, year in frames:

    # Running count of marker rows: each row gets the number of markers seen
    # up to and including itself, so all rows of one sub-table share an id
    # (rows above the first marker get id 0)
    block_id = sheet[0].isin(markers).cumsum()

    # Group on that id: one frame per sub-table, keyed 'process<id>'
    subtables = {}
    for key, block in sheet.groupby(block_id):
        subtables['process' + str(key)] = block.iloc[0:]

    # Keep the year next to the sub-tables
    tables_ls.append([subtables, year])

In [None]:
## Give every sub-table proper column headers
# Names of the value columns, taken from the key at the top of the sheet
value_cols = ['Female', 'Male', 'Total']

for entry in tables_ls:

    # The dict of sub-tables is the first element of each entry
    tables = entry[0]

    for t in tables:

        # The top-left cell identifies which sub-table this is
        header = tables[t].iloc[0, :1].item()

        # Sub-tables with other headers are left as they are
        if header not in ('Ethnicity', 'ASN', 'EAL', 'Total Pupils'):
            continue

        # The first column is named after the sub-table itself
        tables[t].columns = [header] + value_cols

        # For these two the first row only carries the title, so drop it;
        # for 'EAL' and 'Total Pupils' the first row is kept
        if header in ('Ethnicity', 'ASN'):
            tables[t] = tables[t].iloc[1:].reset_index(drop=True)

In [None]:
# Debugging aid: uncomment to inspect the last sub-table processed above
#tables[t]

In [None]:
## Convert every sub-table into long format

# Collect the long-format pieces. Note: this rebinds `frames`, which held the
# raw sheets until now.
frames = []

for tables, year in tables_ls:

    for table in tables.values():

        # The first column holds the category labels; its header is the
        # title of the sub-table
        title = table.columns[0]

        # Melt one value column at a time
        for h in table.columns[1:]:

            wdf = (
                pd.melt(table, id_vars=[title], value_vars=[h],
                        var_name='Type', value_name='Value')
                # Rename via the public API: writing into `columns.values`
                # mutates the backing array of an Index that pandas treats
                # as immutable
                .rename(columns={title: 'Category'})
                # Record which sub-table and which year each row came from
                .assign(Table=title, Year=year)
            )

            frames.append(wdf)

In [None]:
## Concatenate all long-format frames into the final df
ldf = pd.concat(frames)

In [None]:
## Final tidy-up, written as a single chain
ldf = (
    # Put the columns in their final order
    ldf.loc[:, ['Table', 'Category', 'Type', 'Value', 'Year']]
    .assign(
        # Fix a known bad category value
        Category=lambda d: d['Category'].str.replace('British Isles Islsles', 'British'),
        # Case-insensitive: 'Gender' -> 'Sex' and 'Age years' -> 'Age'; each
        # match must end at a word boundary
        Table=lambda d: (
            d['Table']
            .str.replace(r'(?i)Gender\b', r'Sex', regex=True)
            .str.replace(r'(?i)Age years\b', r'Age', regex=True)
        ),
    )
    # Remove exact duplicate rows, then renumber the index from 0
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the final long-format frame
ldf.head()

In [None]:
# Save the final frame as CSV; with pandas' default settings the index is
# written too, as an unnamed first column
ldf.to_csv('pupil_characteristics.csv')

In [None]:
# Print column dtypes and non-null counts as a final check
ldf.info()