In [None]:
import pandas as pd
import numpy as np

In [None]:
def readfile(year, ext, sheet, badrows, badfooter):
    """Load one sheet of a workbook under ./scotland/ and return it tidied.

    Parameters
    ----------
    year : str
        File stem; concatenated with ``ext`` to form the file name.
    ext : str
        File extension including the dot, e.g. ``'.xlsx'``.
    sheet : str or int
        Passed to ``pd.read_excel`` as ``sheet_name``.
    badrows : int or list-like
        Passed to ``pd.read_excel`` as ``skiprows``.
    badfooter : int
        Passed to ``pd.read_excel`` as ``skipfooter``.

    Returns
    -------
    pandas.DataFrame
        The sheet after it has been passed through ``tidy``.
    """
    workbook = './scotland/' + year + ext

    # No header row is used, so columns are numbered 0..n; the listed
    # placeholder symbols are read as missing values.
    raw = pd.read_excel(
        workbook,
        sheet_name=sheet,
        header=None,
        na_values=['..', '#', ':'],
        skiprows=badrows,
        skipfooter=badfooter,
    )
    return tidy(raw)
    
def tidy(df):
    """Normalise a raw sheet whose row labels live in column ``0``.

    Steps, in order:

    1. Delete footnote markers such as ``(1)`` or ``(1) (2)`` and any
       remaining parentheses from every string cell.
    2. Strip, title-case and harmonise the labels in column ``0``.
    3. Drop rows whose label is missing.
    4. Drop columns that contain only missing values and/or zeroes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer column labels (as produced by ``header=None``).
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Strip whitespace
    df[0] = df[0].str.strip()

    # Footnote markers to remove, tried in this order
    sls = ["(1) (2)", " (1) (2)", " (1)", "(1)", " (2)", "(2)", " (3)", "(3)", " (4)", "(4)",
           " (5)", "(5)", " (6)", "(6)"]

    def _clean_cell(x):
        # Numbers and NaN pass through unchanged
        if not isinstance(x, str):
            return x
        for s in sls:
            x = x.replace(s, '')
        # Remove any remaining parentheses so later whole-word matching works
        return x.replace('(', '').replace(')', '')

    # DataFrame.applymap was renamed DataFrame.map in pandas 2.1; use whichever
    # this pandas provides. A single pass replaces one pass per marker.
    elementwise = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'
    df = getattr(df, elementwise)(_clean_cell)

    # Strip again: deleting "(1) (2)" from "Schools (1) (2)" leaves "Schools ",
    # which would otherwise never equal the plain label "Schools".
    labels = df[0].str.strip()

    # Capitalise labels
    labels = labels.str.title()

    # Replace spaced en dash with spaced hyphen
    labels = labels.str.replace(' – ', ' - ', regex=False)

    # Map label variants onto common values (order matters: plural before singular)
    for old, new in (('Elc', 'ELC'),
                     ('Ptrs', 'PTRs'),
                     ('Pre-Schools', 'ELC'),
                     ('Pre-School', 'ELC'),
                     ('Pupil Teacher Ratios', 'PTRs')):
        labels = labels.str.replace(old, new, regex=False)

    df[0] = labels

    # Remove rows with NaN in first column
    df = df[df[0].notna()]

    # Drop columns in which every remaining cell is either missing or zero
    nones_zeroes = df.isnull() | (df == 0)
    all_empty = nones_zeroes.all()
    cols = all_empty[all_empty].index.to_list()
    df = df.drop(columns=cols)

    return df

In [None]:
# Each entry: (years sharing a layout, sheet name, rows to skip at the top,
# rows to skip at the bottom)
sheet_specs = [
    (['2013'], 'Table 1.1', 2, 5),
    (['2020'], '2.1', 3, 6),
]

# Collected as [tidied sheet, year] pairs
frames = []

for years, sheet, badrows, badfooter in sheet_specs:
    for y in years:
        # Read and tidy the sheet for this year
        wdf = readfile(y, '.xlsx', sheet, badrows, badfooter)
        # Keep the year alongside the frame
        frames.append([wdf, y])

In [None]:
#wdf.head()

In [None]:
# Labels that open each sub-table on a sheet
ls = ['Schools', 'Pupils', 'Teachers', 'PTRs']

# Collected as [dict of sub-tables, year] pairs, one per sheet
tables_ls = []

for df, year in frames:

    # A row whose first cell is one of the marker labels starts a new sub-table.
    # The running count of markers seen so far therefore numbers every row by
    # the sub-table it belongs to (rows above the first marker get 0).
    groups = df[0].isin(ls).cumsum()

    # Split the sheet on that number: one frame per sub-table, keyed 'process<k>'
    tables = {}
    for k, g in df.groupby(groups):
        tables['process' + str(k)] = g.iloc[0:]

    # Keep the year alongside the sub-tables
    tables_ls.append([tables, year])

In [None]:
#tables

In [None]:
# Value-column labels copied by hand from the key at the top of each sheet,
# keyed by the year the sheet was loaded for
year_headers = {
    '2013': ['2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'],
    '2020': ['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020'],
}

for tables, year in tables_ls:

    for t in tables:

        table = tables[t]

        # Years without an entry above keep their existing value-column labels
        value_labels = year_headers.get(year, list(table.columns[1:]))

        # First header is the sub-table title (first cell of the first row),
        # followed by the value-column labels
        table.columns = list(table.iloc[0, :1]) + list(value_labels)

        # The title row has now been used as the header, so drop it
        tables[t] = table.iloc[1:].reset_index(drop=True)

In [None]:
#tables[t]

In [None]:
## Convert tables from before 2022 into long format

# Long-format frames, one per sub-table
frames = []

for tables, year in tables_ls:

    for t in tables:

        table = tables[t]

        # The first column header holds the sub-table title
        title = table.columns[0]

        # Nothing to melt if the table has no value columns
        if len(table.columns) < 2:
            continue

        # Melt every value column at once; rows come out grouped by column in
        # the original column order, the same order as melting column by column
        wdf = pd.melt(table, id_vars=[title], var_name='Type', value_name='Value')

        # Rename by assigning a fresh set of labels. Writing into
        # `columns.values` mutates the Index's backing array behind pandas'
        # back (stale lookup caches, and an error when the array is read-only).
        wdf.columns = ['Category', 'Type', 'Value']

        # Record which sub-table and which sheet year the rows came from
        wdf['Table'] = title
        wdf['Year'] = year

        frames.append(wdf)

In [None]:
#wdf

In [None]:
# ## Tidy up

# Category labels that should be folded into a common name
ids = {'Total Excludes ELC': 'Total Excluding ELC',
       'Total School Excluding ELC': 'Total Excluding ELC',
       'Centrally Employed School': 'Centrally Employed',
       'Total Based In Schools': 'School Based',
       'Publicly Funded Schools': 'Total Excluding ELC',
       'Total Schools': 'Total Excluding ELC'
       }

# Wrap each label in word-boundary anchors so it only matches as whole words
replacement = {rf'\b{k}\b': v for k, v in ids.items()}

# Stack the long-format frames, fix the column order, harmonise the category
# names, then drop repeated rows and renumber the index
ldf = (
    pd.concat(frames)
    [['Table', 'Category', 'Type', 'Value', 'Year']]
    .assign(Category=lambda d: d['Category'].replace(replacement, regex=True))
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
#ldf

In [None]:
# Save the combined long-format table to numbers.csv in the working directory.
# No `index` argument is given, so the row index is written as the first column.
ldf.to_csv('numbers.csv')

In [None]:
# Sanity check: list every distinct category label left after the renaming.
# The commented-out line below narrows the same check to a single table.
#ldf.loc[ldf['Table'] == 'Employment type']['Category'].unique().tolist()
ldf['Category'].unique().tolist()

In [None]:
# Summarise the final frame: column dtypes, non-null counts and memory usage
ldf.info()