In [57]:
import pandas as pd
import numpy as np

In [58]:
# Function to tidy columns and cells
def tidy(df):
    
    #### Fix usual issues with all strings
    
    # Capitalise headers
    df.columns = df.columns.astype(str).str.upper()
    
    # Capitalise columns
    df = df.map(lambda x: x.upper() if type(x) is str else x)

    # Strip whitespace
    df = df.map(lambda x: x.strip() if type(x) is str else x)

    # Remove parenthesis
    df = df.map(lambda x: x.replace('(', '') if type(x) is str else x)
    df = df.map(lambda x: x.replace(')', '') if type(x) is str else x)
    
    # Remove dashes
    df = df.map(lambda x: x.replace('-', '') if type(x) is str else x)
    
    # Remove full stops
    df = df.map(lambda x: x.replace('.', '') if type(x) is str else x)
    
    # Remove commas
    df = df.map(lambda x: x.replace(',', '') if type(x) is str else x)
    
    # Remove linebreaks
    df = df.map(lambda x: x.replace('\n', '') if type(x) is str else x)
    
    # Remove double spaces
    df = df.map(lambda x: x.replace('  ', ' ') if type(x) is str else x)

    # Replace annoying substrings
    df = df.map(lambda x: x.replace(' AND ', ' & ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' – ', ' - ') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' / ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace('/ ', '/') if type(x) is str else x)
    df = df.map(lambda x: x.replace(' /', '/') if type(x) is str else x)
    
    # Strip whitespace again
    df = df.map(lambda x: x.strip() if type(x) is str else x)
    
    # Drop rows with minimum number of 2 non-null values
    df = df.dropna(thresh=2)
    
    return df

In [60]:
# Load the 'Open Schools' sheet (skipping its first 5 rows), tidy it, order the
# rows by local authority then school name, and keep only the key columns.
# NOTE: the secondary-only filter is deliberately left switched off:
#   df.loc[df['SECONDARY DEPARTMENT'] == 'YES']
df = (
    pd.read_excel(
        './data/school+contact+list+31+October+2024.xlsx',
        sheet_name='Open Schools',
        skiprows=5,
    )
    .pipe(tidy)
    .sort_values(['LA NAME', 'SCHOOL NAME'])
    .reset_index(drop=True)
    .loc[:, [
        'SEED CODE',
        'LA NAME',
        'CENTRE TYPE',
        'SCHOOL NAME',
        'ADDRESS LINE1',
        'ADDRESS LINE2',
        'ADDRESS LINE3',
        'POST CODE',
        'PRE-SCHOOL DEPARTMENT',
        'PRIMARY DEPARTMENT',
        'SECONDARY DEPARTMENT',
        'SPECIAL DEPARTMENT',
        'PROVISION FOR GAELIC LEARNERS',
    ]]
)

In [61]:
# Write the selected school columns to a .csv file, omitting the row index
df.to_csv(path_or_buf='./csvs/school_info.csv', index=False)