In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Create function to read in excel file
def readfile(sheet, path='./data/FS560267659 budget and spend data.xlsx'):
    """Read one worksheet of the budget workbook and return it cleaned.

    Parameters
    ----------
    sheet : str or int
        Worksheet selector, forwarded to ``pd.read_excel`` as ``sheet_name``.
    path : str, optional
        Location of the Excel workbook. Defaults to the budget-and-spend
        workbook under ``./data`` so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The worksheet contents after being passed through ``tidy``.
    """
    df = pd.read_excel(path, sheet_name=sheet)
    return tidy(df)

def tidy(df):
    
    #### Fix usual issues with all strings
    
    # Capitalise headers
    df.columns = df.columns.astype(str).str.upper()
    
    # Capitalise columns
    df = df.map(lambda x: x.upper() if type(x) is str else x)

    # Strip whitespace
    df = df.map(lambda x: x.strip() if type(x) is str else x)

    # Remove parenthesis
    df = df.map(lambda x: x.replace('(', '') if type(x) is str else x)
    df = df.map(lambda x: x.replace(')', '') if type(x) is str else x)
    
    # Remove dashes
    df = df.map(lambda x: x.replace('-', '') if type(x) is str else x)
    
    # Remove full stops
    df = df.map(lambda x: x.replace('.', '') if type(x) is str else x)
    
    # Remove commas
    df = df.map(lambda x: x.replace(',', '') if type(x) is str else x)
    
    # Remove linebreaks
    df = df.map(lambda x: x.replace('\n', '') if type(x) is str else x)

    # Replace annoying substrings
    df = df.map(lambda x: x.replace(' AND ', ' & ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' – ', ' - ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' / ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace('/ ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' /', '/') if type(x) is str else x)
    
    # Strip whitespace again
    df = df.map(lambda x: x.strip() if type(x) is str else x)
    
    # Drop rows with minimum number of 2 non-null values
    df = df.dropna(thresh=2)
    
    return df

In [None]:
# Worksheet name -> school code used to tag every row read from that sheet
sheet_to_school = {
    '10416': 'MHS',
    '10094': 'MPS',
    '10091': 'LPS',
    '10072': 'APS',
    '10086': 'IPS',
}

# Create list for dataframes
frames = []

## Read in files
# Each sheet is read, tagged and appended by the same three lines, so a
# school code can never be written onto a different sheet's frame.
for sheet, school in sheet_to_school.items():
    school_df = readfile(sheet)
    school_df['SCH'] = school
    frames.append(school_df)

In [None]:
# Stack the per-school frames into one frame and give the description
# column its full name in the same step
fdf = pd.concat(frames).rename(columns={'DESCR': 'DESCRIPTION'})

In [None]:
# Collapse both spellings of the supply-teacher wording to 'SUPPLY' in
# every text cell of the frame
fdf = fdf.map(
    lambda cell: cell.replace('TEACHERS SUPPLY', 'SUPPLY').replace('TEACHER SUPPLY', 'SUPPLY')
    if type(cell) is str else cell
)

# School names to blank out of the description column. They are applied in
# list order, and longer variants sit before the shorter ones they contain.
school_names = [
    'MALLAIG HIGH SCHOOL',
    'MALLAIG SECONDARY SCHOOL',
    'MALLAIG SECONDARY',
    'MALLAIG SEC SCHOOL',
    'MALLAIG SEC',
    ' PS ',
    'MALLAIG',
    'LADY LOVAT',
    'ARISAIG',
    'INVERIE',
]

# Multi-word categories fused into a single token so that the first-word
# split further down keeps them whole
fused_categories = {
    'LRN SUP': 'LRNSUP',
    'MOD LANG': 'MODLANG',
    'MOD STD': 'MODSTD',
    'MUSIC & DRAMA': 'MUSIC&DRAMA',
    'FIRST AID': 'FIRSTAID',
    'FOOD FOR THOUGHT': 'FOODFORTHOUGHT',
    'ADD FUNDS': 'ADDFUNDS',
}

description = fdf['DESCRIPTION']
for school_name in school_names:
    description = description.str.replace(school_name, '')
for spaced, fused in fused_categories.items():
    description = description.str.replace(spaced, fused)
fdf['DESCRIPTION'] = description

# Keep only rows that have no missing value in any column
fdf = fdf.dropna()

# Strip whitespace left behind by the removals above
fdf = fdf.map(lambda cell: cell.strip() if type(cell) is str else cell)

# Split each description once at its first run of whitespace: the first
# word becomes the category, the remainder stays as the description
description_parts = fdf['DESCRIPTION'].str.split(n=1)
fdf['CATEGORY'] = description_parts.str[0]
fdf['DESCRIPTION'] = description_parts.str[1]

In [None]:
# Preview the first rows of the cleaned, combined frame
fdf.head()

In [None]:
# Columns holding the figures, one per financial year and type
value_columns = [
    '2324 BUDGET', '2324 ACTUALS',
    '2223 BUDGET', '2223 ACTUALS',
    '2122 BUDGET', '2122 ACTUALS',
    '2021 BUDGET', '2021 ACTUALS',
    '1920 BUDGET', '1920 ACTUALS',
    '1819 BUDGET', '1819 ACTUALS',
    '1718 BUDGET', '1718 ACTUALS',
    '1617 BUDGET', '1617 ACTUALS',
    '1516 BUDGET', '1516 ACTUALS',
]

# Columns that identify a row and are repeated for every melted value
id_columns = ['EXP HEAD', 'DESCRIPTION', 'SCH', 'CATEGORY']

# Convert into long format: one row per (identifier, value column) pair
ldf = fdf.melt(
    id_vars=id_columns,
    value_vars=value_columns,
    var_name='VARIABLE',
    value_name='VALUE',
)

# Split the melted column name at its first space into year and type
year_and_type = ldf['VARIABLE'].str.split(' ', n=1, expand=True)
ldf['YEAR'] = year_and_type[0]
ldf['TYPE'] = year_and_type[1]

# The combined name is no longer needed once it has been split
ldf = ldf.drop(columns='VARIABLE')

# Drop values equal to 0 and put the columns into their final order
ldf = ldf.loc[
    ldf['VALUE'] != 0,
    ['EXP HEAD', 'SCH', 'CATEGORY', 'DESCRIPTION', 'YEAR', 'TYPE', 'VALUE'],
]

In [None]:
# Preview the first rows of the long-format frame
ldf.head()

In [None]:
# Export to csv
out_path = Path('./csvs/budget.csv')

# Create the output folder when it is missing; to_csv does not create
# parent directories, so a fresh checkout would otherwise fail here
out_path.parent.mkdir(parents=True, exist_ok=True)

ldf.to_csv(out_path, index=False)