In [None]:
import pandas as pd
import numpy as np

In [None]:
def readfile(year, ext, sheet, badrows, badfooter,
             data_dir='./Summary Statistics for Schools in Scotland/'):
    """Read one sheet of a yearly workbook and return it tidied.

    Parameters
    ----------
    year : str or int
        File stem of the workbook (e.g. ``'2013'``); converted with ``str``.
    ext : str
        File extension including the dot (``'.xls'`` or ``'.xlsx'``).
    sheet : str or int
        Forwarded to ``pandas.read_excel`` as ``sheet_name``.
    badrows : int or list-like
        Forwarded as ``skiprows`` (rows to ignore at the top of the sheet).
    badfooter : int
        Forwarded as ``skipfooter`` (rows to ignore at the bottom).
    data_dir : str, optional
        Folder holding the workbooks. Defaults to the folder this notebook
        has always read from, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The sheet, read without a header row, after passing through ``tidy``.
    """
    # Local import so the function does not depend on an import elsewhere
    import os

    path = os.path.join(data_dir, str(year) + ext)

    df = pd.read_excel(
        path,
        sheet_name=sheet,
        header=None,                  # header rows are handled later, per sub-table
        na_values=['..', '#', ':'],   # placeholder symbols read as missing
        skiprows=badrows,
        skipfooter=badfooter,
    )
    return tidy(df)
    
def tidy(df):
    """Clean a raw sheet read with ``header=None``.

    Steps, in order:

    1. Strip surrounding whitespace from the first column.
    2. Remove footnote markers such as ``(1)`` and any leftover parentheses
       from every string cell.
    3. Strip the first column again (removing a marker can leave a trailing
       space) and title-case it.
    4. Map a few first-column spellings onto common values (``ELC``, ``PTRs``).
    5. Drop rows whose first column is missing.
    6. Drop columns that contain nothing but missing values and zeroes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer column labels; column ``0`` must hold text.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The frame passed in is left untouched.
    """
    # Footnote markers, removed in exactly this order from every string cell
    markers = ["(1) (2)", " (1) (2)", " (1)", "(1)", " (2)", "(2)", " (3)", "(3)", " (4)", "(4)",
               " (5)", "(5)", " (6)", "(6)"]

    def _clean_cell(x):
        # Non-string cells (numbers, NaN) pass through unchanged
        if type(x) is not str:
            return x
        for marker in markers:
            x = x.replace(marker, '')
        # Remove any remaining parentheses so later exact matches work
        return x.replace('(', '').replace(')', '')

    # Work on a copy so the caller's frame is not modified
    df = df.copy()

    # Strip whitespace
    df[0] = df[0].str.strip()

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; look the
    # method up on the class so a column called 'map' cannot shadow it
    elementwise = getattr(pd.DataFrame, 'map', None)
    if elementwise is None:
        elementwise = pd.DataFrame.applymap
    df = elementwise(df, _clean_cell)

    # Removing "(1) (2)" from "Gender (1) (2)" leaves "Gender ", so strip
    # again before capitalising
    df[0] = df[0].str.strip().str.title()

    # Replace some categories with common values; order matters because
    # 'Pre-Schools' must be handled before 'Pre-School'
    for old, new in [('Elc', 'ELC'),
                     ('Ptrs', 'PTRs'),
                     ('Pre-Schools', 'ELC'),
                     ('Pre-School', 'ELC'),
                     ('Pupil Teacher Ratios', 'PTRs')]:
        df[0] = df[0].str.replace(old, new)

    # Remove rows with NaN in first column
    df = df[df[0].notna()]

    # Drop columns in which every cell is either missing or zero
    empty_or_zero = (df.isnull() | (df == 0)).all()
    cols = empty_or_zero[empty_or_zero].index.to_list()
    df = df.drop(cols, axis=1)

    return df

In [None]:
### import data before 2022
# Where the table lives in each year's workbook:
# (years, file extension, sheet name, rows to skip at top, rows to skip at bottom)
workbook_layouts = [
    (['2013'],         '.xlsx', 'Table 2.3', 3, 4),
    (['2014'],         '.xls',  'Table 2.3', 3, 4),
    (['2015'],         '.xls',  'Table 3.4', 3, 3),
    (['2016'],         '.xls',  'Table 3.4', 4, 3),
    (['2017', '2018'], '.xls',  'Table 3.4', 4, 1),
    (['2019'],         '.xlsx', '3.4',       4, 1),
    (['2020', '2021'], '.xlsx', '3.4',       4, 2),
]

# Collects [tidied sheet, year] pairs, in year order
frames = []

# Loop through each layout and the years that share it
for years, ext, sheet, badrows, badfooter in workbook_layouts:
    for y in years:

        # Read in sheet
        wdf = readfile(y, ext, sheet, badrows, badfooter)

        # Append to list of df (with year)
        frames.append([wdf, y])

In [None]:
### import data after 2022
# Collects [tidied sheet, year] pairs, one per sheet per year
nframes = []

# Column names shared by every sheet, after the first (category) column
school_types = ['Primary', 'Secondary', 'Special', 'Centrally Employed', 'Total']

# One entry per sheet: (sheet name, rows to skip at bottom, table name, row labels).
# Row labels are one of:
#   None - keep the labels read from the sheet
#   list - always overwrite the first column (raises if the row count differs)
#   dict - keyed by row count; overwrite only when the sheet has that many rows
sheet_layouts = [
    ('8',  0, 'Gender', None),
    ('9',  0, 'Age', ['20-24', '25-34', '35-44', '45-54', '55+', 'Av Age']),
    ('10', 1, 'Ethnicity', None),
    ('11', 0, 'Employment Type', ['Permanent', 'Temporary', 'Induction']),
    ('12', 0, 'Grade', {
        9: ['Head', 'Head % ♀', 'Depute', 'Depute % ♀', 'Principal',
            'Principal % ♀', 'Teacher', 'Teacher % ♀', 'All % ♀'],
        11: ['Head', 'Head % ♀', 'Depute', 'Depute % ♀', 'Principal', 'Principal % ♀',
             'Chartered', 'Chartered % ♀', 'Teacher', 'Teacher % ♀', 'All % ♀'],
    }),
    ('13', 1, 'Mode Of Working', ['FT', 'FT % ♀', 'PT', 'PT % ♀']),
]

# Create list of years
years = ['2022', '2023']

# Loop through years
for y in years:

    # Loop through the sheets of that year's workbook
    for sheet, badfooter, table_name, row_labels in sheet_layouts:

        # Read in sheet
        ndf = readfile(y, '.xlsx', sheet, 3, badfooter)

        # Manually name columns from key at the top of sheet
        ndf.columns = [table_name] + school_types

        # Pick the labels that match this sheet's row count, if they vary
        labels = row_labels.get(len(ndf)) if isinstance(row_labels, dict) else row_labels

        # Rename row values in first column
        if labels is not None:
            ndf[table_name] = labels

        # Append to list of df (with year). Each sheet is appended exactly
        # once; the Mode Of Working sheet used to be appended twice.
        nframes.append([ndf, y])

In [None]:
#ndf.head(10)
#ndf['Age'].values
#ndf['Grade']

In [None]:
## Pull out subtables from sheet
# Collects [dict of sub-tables, year] pairs, one per sheet in `frames`
tables_ls = []

# First-column values that mark the title row of a sub-table
ls = ['Gender', 'Sex', 'Age Years', 'Ethnicity', 'Employment Type', 'Post Employment Type', 'Grade',
           'Mode Of Working']

# Loop through the [sheet, year] pairs
for df, year in frames:

    # Running count of title rows seen so far. Every row ends up labelled
    # with the number of the sub-table it belongs to; rows above the first
    # title row get 0.
    groups = df[0].isin(ls).cumsum()

    # Split the sheet on that label: 'process0', 'process1', ... each map to
    # the rows of one sub-table, title row first
    tables = {}
    for k, g in df.groupby(groups):
        tables['process' + str(k)] = g.iloc[0:]

    # Append to list of df with year
    tables_ls.append([tables, year])

In [None]:
#tables_ls[1][0]
#read_df.head()
#tables['process0']

In [None]:
## Tidy up column headers for each table
# Names for the data columns (everything after the first column), chosen by
# how many columns the table has; taken from the key at the top of the sheet
data_columns_by_width = {
    7: ['ELC', 'Primary', 'Secondary', 'Special', 'Centrally Employed', 'Total'],
    6: ['Primary', 'Secondary', 'Special', 'Centrally Employed', 'Total'],
}

# Loop through the [tables, year] pairs
for entry in tables_ls:

    # Select the dictionary of tables, which is the first element of the pair
    tables = entry[0]

    # Loop through tables dictionary
    for t in tables:

        table = tables[t]

        # Tables of any other width keep their existing data column labels
        data_columns = data_columns_by_width.get(len(table.columns), table.columns[1:])

        # First column is named after the table title (first cell of the
        # first row); the remaining columns get the names chosen above
        table.columns = np.concatenate([table.iloc[0, :1], data_columns])

        # Drop the title row now that it has become the header
        tables[t] = table.iloc[1:].reset_index(drop=True)

In [None]:
#tables['process1'].iloc[0]
#tables['process6'].head(10)
#tables['process6'].columns[0]
#tables_ls[0]
#tables[t].head()
#tables

In [None]:
## Convert tables from before 2022 into long format

# Collects the long-format frames. NOTE: this rebinds `frames`, which until
# now held the raw [sheet, year] pairs.
frames = []

# Loop through the [tables, year] pairs
for entry in tables_ls:

    # Select the dictionary of tables, which is the first element of the pair
    tables = entry[0]

    # Loop through tables
    for t in tables:

        table = tables[t]

        if table.columns[0] == 'Mode Of Working':

            # Rename row values in first column
            table['Mode Of Working'] = ['FT', 'FT % ♀', 'PT', 'PT % ♀']

        if table.columns[0] == 'Grade':

            # The label set depends on how many rows this year's table has
            if len(table) == 8:
                table['Grade'] = ['Head', 'Head % ♀', 'Depute', 'Depute % ♀', 'Principal',
                                  'Principal % ♀', 'Teacher', 'All % ♀']
            elif len(table) == 9:
                table['Grade'] = ['Head', 'Head % ♀', 'Depute', 'Depute % ♀', 'Principal',
                                  'Principal % ♀', 'Teacher', 'Teacher % ♀', 'All % ♀']

        if table.columns[0] == 'Age Years':

            # Rename header for first column. rename() returns a new frame
            # instead of writing into the Index's backing array.
            table = table.rename(columns={'Age Years': 'Age'})

            if len(table) == 7:
                table['Age'] = ['20-24', '25-34', '35-44', '45-54', '55+', 'Unknown', 'Av Age']
            elif len(table) == 6:
                table['Age'] = ['20-24', '25-34', '35-44', '45-54', '55+', 'Av Age']

        if table.columns[0] == 'Post Employment Type':

            # Rename header for first column; the next check then applies to it
            table = table.rename(columns={'Post Employment Type': 'Employment Type'})

        if table.columns[0] == 'Employment Type':

            # Rename row values in first column
            table['Employment Type'] = ['Permanent', 'Temporary', 'Induction']

        # Store the (possibly renamed) table back in the dictionary
        tables[t] = table

        # Title of the table, which is also the name of its first column
        title = table.columns[0]

        # Loop through the data columns in table
        for h in table.columns[1:]:

            # Melt one column into long format: (title, 'Type', 'Value')
            wdf = pd.melt(table, id_vars=[title], value_vars=[h],
                          var_name='Type', value_name='Value')

            # Set first column header to be Category
            wdf.columns = ['Category', 'Type', 'Value']

            # Create Table column with value equal to title of table
            wdf['Table'] = title

            # Create Year column with value from second element of the pair
            wdf['Year'] = entry[1]

            # Append to list of df
            frames.append(wdf)

In [None]:
#tables[t]
#wdf.head()
#frames[0]

In [None]:
## Convert tables from 2022+ into long format

# Loop through the [table, year] pairs read from the 2022+ workbooks
for n in nframes:

    table = n[0]

    # Title of the table, which is also the name of its first column
    title = table.columns[0]

    # Loop through the data columns in table
    for h in table.columns[1:]:

        # Melt one column into long format: (title, 'Type', 'Value')
        wdf = pd.melt(table, id_vars=[title], value_vars=[h],
                      var_name='Type', value_name='Value')

        # Set first column header to be Category. Assign the full header
        # rather than writing into the Index's backing array.
        wdf.columns = ['Category', 'Type', 'Value']

        # Create Table column with value equal to title of table
        wdf['Table'] = title

        # Create Year column with value from second element of the pair
        wdf['Year'] = n[1]

        # Append to list of df
        frames.append(wdf)

In [None]:
#n[0].head(10)
#wdf

In [None]:
## Concatenate all the long-format frames into the final df
# Each frame keeps its own row index here; the index is reset in the tidy-up step
ldf = pd.concat(frames)

In [None]:
## Tidy up
# Built as a single chain so every step works on a fresh frame; assigning
# into the result of ldf[[...]] can raise SettingWithCopyWarning.
# NOTE(review): the two Table patterns are described as "full match only" but
# are only anchored by \b at the end, so they also match at the end of a
# longer title - confirm that is intended.
ldf = (
    # Reorder columns
    ldf[['Table', 'Category', 'Type', 'Value', 'Year']]
    .assign(
        Category=lambda d: (
            d['Category']
            # Replace dash symbol
            .str.replace(' – ', ' - ', regex=False)
            # Sort bad value
            .str.replace('British Isles Islsles', 'British', regex=False)
        ),
        Table=lambda d: (
            d['Table']
            # Replace Gender with Sex
            .str.replace(r'(?i)Gender\b', r'Sex', regex=True)
            # Replace Age (years) with Age
            .str.replace(r'(?i)Age years\b', r'Age', regex=True)
        ),
    )
    # Drop duplicates
    .drop_duplicates()
    # Reset index
    .reset_index(drop=True)
)

In [None]:
#ldf
#ldf.loc[ldf['Table'] == 'Ethnicity']['Category'].unique().tolist()

In [None]:
# Write the final table; the row index goes out as the first, unnamed column
ldf.to_csv('teacher_characteristics.csv', index=True)

In [None]:
# Print a summary of the final frame (columns, non-null counts, dtypes)
ldf.info()