In [60]:
# Import libraries
import pandas as pd
import numpy as np

In [61]:
# Define function to read in excel file
def readfile(year, ext, sheet):
    """Read one worksheet of a workbook in ./staff into a raw DataFrame.

    Parameters
    ----------
    year : str
        File name stem (the callers pass e.g. 'psa2022' or '2016').
    ext : str
        File extension including the dot (e.g. '.xlsx', '.xls').
    sheet : str
        Name of the worksheet to load.

    Returns
    -------
    pandas.DataFrame
        The sheet contents, read with header=None so that no row is
        used as the column header.
    """
    # Path is built as directory + stem + extension
    workbook_path = ''.join(('./staff/', year, ext))

    return pd.read_excel(workbook_path, sheet_name=sheet, header=None)
    
# Define function to tidy up the initial df
def tidy(df):
    """Normalise the text cells of a raw sheet and drop sparse rows/columns.

    Every cell whose type is exactly ``str`` is upper-cased, stripped of
    surrounding whitespace, has parentheses and line breaks removed, has a
    few separators normalised (' AND ' -> ' & ', spaced slashes -> '/',
    spaced en dash -> ' - ') and has trailing footnote markers removed
    ('[NOTE n]', ' n' or 'n' for n in 0..7). Non-string cells are left as
    they are.

    Text that parses as a number (e.g. '66', '2017', '1,234') keeps its
    trailing digits: they are data, not footnote markers, and stripping them
    would corrupt the value before any later numeric conversion.

    Afterwards rows, and then columns, with fewer than 4 non-null values are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet contents.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is not modified.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map, so use whichever this pandas version provides.
    if hasattr(pd.DataFrame, 'map'):
        elementwise = df.map
    else:
        elementwise = df.applymap

    # One pass over the cells instead of one full pass per substitution
    df = elementwise(_clean_cell)

    #### Drop rows and columns with a lot of empty values

    # Keep only rows with at least 4 non-null values
    df = df.dropna(thresh=4)

    # Keep only columns with at least 4 non-null values
    df = df.dropna(axis='columns', thresh=4)

    return df


# Literal substitutions applied, in this order, to every text cell
_TEXT_REPLACEMENTS = (
    ('(', ''),          # remove parentheses
    (')', ''),
    ('\n', ''),         # remove line breaks
    (' AND ', ' & '),   # normalise separators
    (' – ', ' - '),
    (' / ', '/'),
    ('/ ', '/'),
    (' /', '/'),
)


def _looks_numeric(text):
    """Return True if text parses as a number once thousands commas are removed."""
    try:
        float(text.replace(',', ''))
    except ValueError:
        return False
    return True


def _clean_cell(x):
    """Apply all of tidy()'s text normalisation to a single cell value."""
    # Only exact str cells are cleaned; numbers, NaN, dates pass through
    if type(x) is not str:
        return x

    text = x.upper().strip()

    for old, new in _TEXT_REPLACEMENTS:
        text = text.replace(old, new)

    # Remove trailing footnote markers 0..7
    for i in range(8):
        digit = str(i)
        text = text.removesuffix(" [NOTE " + digit + "]")
        text = text.removesuffix("[NOTE " + digit + "]")

        # A trailing digit of a number is part of the value, not a footnote
        if not _looks_numeric(text):
            text = text.removesuffix(" " + digit)
        if not _looks_numeric(text):
            text = text.removesuffix(digit)

    return text

In [62]:
# List collecting every long-format df (one per value column per year)
frames = []


def to_long_frames(wdf, year):
    """Turn one tidied wide table into a list of long-format frames.

    The first row of ``wdf`` is used as the header and removed from the
    data, and the first column is named 'COUNCIL'. Each remaining column is
    then melted on its own into a frame with the columns COUNCIL, TYPE (the
    original column label), VALUE and YEAR.

    Parameters
    ----------
    wdf : pandas.DataFrame
        Tidied wide table whose first row holds the column labels.
    year : str
        Value written to the YEAR column of every returned frame.

    Returns
    -------
    list of pandas.DataFrame
        One long-format frame per column after the first, in column order.
    """
    # Set columns from first row, then drop that row and reset the index
    wdf.columns = wdf.iloc[0].values
    wdf = wdf[1:].reset_index(drop=True)

    # Rename the first column by position. A fresh list of labels is
    # assigned because a pandas Index is meant to be immutable; writing
    # into `columns.values` changes its backing array behind pandas' back.
    labels = list(wdf.columns)
    labels[0] = 'COUNCIL'
    wdf.columns = labels

    long_frames = []

    # Melt each value column separately so every frame keeps its own 0..n-1 index
    for h in wdf.columns[1:]:
        ldf = pd.melt(wdf, id_vars=['COUNCIL'], value_vars=[h],
                      var_name='TYPE', value_name='VALUE')

        # Add column with year
        ldf['YEAR'] = year

        long_frames.append(ldf)

    return long_frames


# Where each year's table lives: (file name prefix, extension, sheet, years)
sheet_sources = [
    ('psa', '.xlsx', '2.1', ['2022', '2021', '2020']),
    ('psa', '.xlsx', '1.1', ['2019', '2018', '2017']),
    ('', '.xls', '8.11Full',
     ['2016', '2015', '2014', '2013', '2012', '2011', '2010']),
]

for prefix, ext, sheet, years in sheet_sources:

    # Loop through years
    for y in years:

        # Read in sheet and tidy it
        wdf = tidy(readfile(prefix + y, ext, sheet))

        # `ldf` is kept as the loop variable on purpose: the next cell
        # displays it, so the last frame must stay bound after the loop.
        for ldf in to_long_frames(wdf, y):

            # Append to list of df (with year)
            frames.append(ldf)

In [63]:
# Preview the first rows of the most recently built long-format frame
ldf.head(n=5)

Unnamed: 0,COUNCIL,TYPE,VALUE,YEAR
0,ABERDEEN CITY,TECHNICIAN,66,2010
1,ABERDEENSHIRE,TECHNICIAN,93,2010
2,ANGUS,TECHNICIAN,22,2010
3,ARGYLL & BUTE,TECHNICIAN,30,2010
4,CLACKMANNANSHIRE,TECHNICIAN,7,2010


In [64]:
# Stack every long-format frame row-wise into the final df
# (each frame's own index is kept, so index values repeat)
fdf = pd.concat(frames, axis=0)

In [65]:
# Parse 'VALUE' as numbers; anything that is not numeric becomes NaN
values_as_numbers = pd.to_numeric(fdf['VALUE'], errors='coerce')
fdf['VALUE'] = values_as_numbers

# Remove, in place, the rows whose 'VALUE' could not be parsed
fdf.dropna(axis='index', subset=['VALUE'], inplace=True)

In [66]:
# Harmonise council names that changed between years.
# Keys are the old spellings (matched as substrings), values the new ones.
council_renames = {
    'EILEAN SIAR': 'NA H-EILEANAN SIAR',
    'EDINBURGH, CITY OF': 'CITY OF EDINBURGH',
}

for old_name, new_name in council_renames.items():
    # regex=False: the names are literal text; the default for `regex`
    # differs between pandas versions, so state it explicitly.
    fdf['COUNCIL'] = fdf['COUNCIL'].str.replace(old_name, new_name, regex=False)

In [67]:
# Write the final long-format table to disk (the index is written too)
output_csv = './csvs/other_staff_numbers.csv'
fdf.to_csv(output_csv)