## Import libraries and define functions

In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Define tidy up initial df
def tidy(df):
    """Return a cleaned copy of ``df`` with normalised string cells and headers.

    String cells are upper-cased, stripped, have parentheses and linebreaks
    removed, and have a few separators normalised (' AND ' -> ' & ',
    spaced en dash -> spaced hyphen, spaces around '/' removed).
    Non-string cells are returned unchanged.

    Column headers are converted to ``str``, upper-cased, have parentheses
    removed and are stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. The cleaned table is a new object built by the
        element-wise map, so the headers are reassigned on the result only.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """

    # Ordered literal replacements; the three '/' rules must stay in this
    # order because each one changes what the next one can match
    substitutions = (
        ('(', ''),
        (')', ''),
        ('\n', ''),
        (' AND ', ' & '),
        (' – ', ' - '),
        (' / ', '/'),
        ('/ ', '/'),
        (' /', '/'),
    )

    def clean_cell(value):
        # Leave numbers, NaN, dates etc. untouched
        if not isinstance(value, str):
            return value

        text = value.upper().strip()
        for old, new in substitutions:
            text = text.replace(old, new)

        # Strip again: removing parentheses or linebreaks can expose
        # whitespace that the first strip could not reach, e.g. '( A )'
        return text.strip()

    def clean_header(label):
        # Strip last so padding inside parentheses is removed as well
        return label.upper().replace('(', '').replace(')', '').strip()

    #### Fix usual issues with all strings (single pass over the cells)

    # DataFrame.map only exists in newer pandas; older releases call it
    # applymap. Look on the class so a column named 'map' cannot shadow it.
    if hasattr(type(df), 'map'):
        df = df.map(clean_cell)
    else:
        df = df.applymap(clean_cell)

    #### Fix headers

    df.columns = df.columns.astype(str).map(clean_header)

    return df

## Read in and tidy data

In [None]:
# Read in the Excel workbook; skiprows=2 skips the first two rows of the sheet
raw = pd.read_excel('./data/FOI 24 25 Review 01 Records_updated.xlsx', skiprows=2)

In [None]:
# Shorter names for the two long headers (applied after tidying)
column_aliases = {
    'REGISTRATION STATUS': 'STATUS',
    'LAST KNOWN EMPLOYER': 'EMPLOYER',
}

# Tidy a copy of the raw import, then rename the long headers
df = tidy(raw.copy()).rename(columns=column_aliases)

df.head()

## Add new column with level

In [None]:
# Keep only rows that have a QUALIFICATIONS value; the substring checks
# done on that column afterwards raise an error on NaN
has_qualifications = df['QUALIFICATIONS'].notna()
df = df.loc[has_qualifications]

# Define function to return level by looking through QUALIFICATIONS column cells
def level(row):
    """Return a one-token teaching-level code for ``row``.

    Looks for marker phrases in ``row['QUALIFICATIONS']`` (which must be a
    string; the lookup and the substring checks are not guarded).

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything supporting ``row['QUALIFICATIONS']``.

    Returns
    -------
    str
        'S', 'P', 'F', '3-18', 'N', 'T' or 'C' when exactly one level is
        found, 'M' when more than one distinct level is found, and 'O'
        when no marker phrase is present.
    """

    # Marker phrase -> level code. Secondary and primary each have two
    # phrasings that map to the same code.
    markers = (
        ('SECONDARY EDUCATION', 'S'),
        ('SECONDARY SCHOOL', 'S'),
        ('PRIMARY EDUCATION', 'P'),
        ('PRIMARY SCHOOL', 'P'),
        ('FURTHER EDUCATION', 'F'),
        ('3-18', '3-18'),
        ('NAMED SCHOOL ONLY', 'N'),
        ('TEACHER EDUCATION INSTITUTION', 'T'),
        ('CROSS SECTOR', 'C'),
    )

    qualifications = row['QUALIFICATIONS']

    # Collect into a set so two phrasings of the same level count once;
    # otherwise e.g. 'SECONDARY EDUCATION ... SECONDARY SCHOOL ...' would
    # be reported as multiple levels although only one level is present
    found = {code for phrase, code in markers if phrase in qualifications}

    # No marker phrase at all: OTHER
    if not found:
        return 'O'

    # More than one distinct level: MULTI
    if len(found) > 1:
        return 'M'

    # Exactly one level: return its code
    return next(iter(found))
    
# Classify every row: axis=1 passes each row to level() in turn
df['LEVEL'] = df.apply(level, axis=1)

df.head()

## Reduce file size (starting from 6.1 MB)

In [None]:
# Ordered (literal text, short code) pairs per column. The pairs are
# substring replacements applied top to bottom, so order matters:
# 'FEMALE' must be handled before 'MALE' because it contains it.
abbreviations = {
    # Shorten sex column
    'SEX': [
        ('FEMALE', 'F'),
        ('MALE', 'M'),
        ('PREFER NOT TO SAY', 'X'),
    ],
    # Shorten status column
    'STATUS': [
        ('FULL ASSOCIATE', 'A'),
        ('FULL GENERAL', 'F'),
        ('PROVISIONAL', 'P'),
    ],
    # Shorten employer column
    'EMPLOYER': [
        ('FURTHER EDUCATION SECTOR', 'F'),
        ('LOCAL AUTHORITY', 'L'),
        ('INDEPENDENT SCHOOLS', 'I'),
        ('NOT CURRENTLY WORKING', 'N'),
        ('OTHER', 'O'),
    ],
}

for column, pairs in abbreviations.items():
    for long_form, short_form in pairs:
        # regex=False: the patterns are plain text, so do not depend on
        # the pandas-version-specific default for pattern interpretation
        df[column] = df[column].str.replace(long_form, short_form, regex=False)

In [None]:
# Shorten qualification column
# Ordered (literal text, replacement) pairs, applied top to bottom as
# substring replacements. Later rules see the output of earlier ones, so
# the order must not be changed.
qualification_abbreviations = [
    ('FURTHER EDUCATION FE', 'F'),
    ('SECONDARY EDUCATION', 'S'),
    ('SECONDARY SCHOOL - ', 'S'),

    # Longer phrase first, otherwise the shorter rule below would eat its prefix
    ('PRIMARY EDUCATION PRIMARY EDUCATION WITH TRANSITION', 'P WITH TRANSITION'),
    ('PRIMARY EDUCATION PRIMARY', 'P'),
    # NOTE(review): presumably targets the leftover of the rule above
    # ('PRIMARY EDUCATION PRIMARY SCHOOL - ' -> 'P SCHOOL - ') - confirm
    ('P SCHOOL - ', 'P'),
    ('CROSS SECTOR', 'CROSS-SECTOR'),
    ('NAMED SCHOOL ONLY', 'NSO'),
    # Collapse the doubled code produced by the rule above
    ('NSO NSO', 'NSO'),
    ('TEACHER EDUCATION INSTITUTION TEACHER EDUCATION INSTITUTION',
     'TEACHER EDUCATION INSTITUTION'),

    ('TECHNOLOGICAL EDUCATION', 'TECH'),
    ('TECHNICAL EDUCATION', 'TECH'),
    ('PHYSICAL EDUCATION', 'PE'),
    # Contains '.', which would match any character if read as a regex
    ('PHYSICAL ED.', 'PE'),
    ('BUSINESS EDUCATION', 'BUSINESS'),
    ('BUSINESS STUDIES', 'BUSINESS'),
    ('RELIGIOUS EDUCATION', 'RE'),
    ('HOME ECONOMICS', 'HE'),
    ('CLASSICS CLASSICAL STUDIES', 'CLASSICS'),
    ('ASN 3-18 ADDITIONAL SUPPORT NEEDS', '3-18 ASN'),
    ('COMPUTING SCIENCE', 'COMPUTING'),
    ('MATHEMATICS', 'MATHS'),

    # Same rule as above, applied a second time; kept so the output is
    # unchanged
    ('ASN 3-18 ADDITIONAL SUPPORT NEEDS', '3-18 ASN'),
    ('TESOL 3-18 TESOL', '3-18 TESOL'),
    ('DANCE 3-18 DANCE', '3-18 DANCE'),

    ('MODERN LANGUAGES - ', ''),
    (' WITH SCIENCE', ''),
    ('- ENGLISH', 'ENGLISH'),

    ('S E N', 'SEN'),
    ('S F L', 'SFL'),
    ('SUPPORT FOR LEARNING', 'SFL'),
]

shortened = df['QUALIFICATIONS']
for long_form, short_form in qualification_abbreviations:
    # regex=False: every pattern is plain text; do not depend on the
    # pandas-version-specific default for pattern interpretation
    shortened = shortened.str.replace(long_form, short_form, regex=False)
df['QUALIFICATIONS'] = shortened

# Strip whitespace left behind by the replacements (string cells only).
# DataFrame.map only exists in newer pandas; older releases call it
# applymap. Look on the class so a column named 'map' cannot shadow it.
if hasattr(type(df), 'map'):
    df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
else:
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [None]:
# Sort rows by these columns, most significant key first
sort_order = ['STATUS', 'LEVEL', 'QUALIFICATIONS', 'EMPLOYER', 'SEX']
df = df.sort_values(by=sort_order)

In [None]:
# Preview the first rows of the table
df.head()

In [None]:
# Write the table to CSV; index=False leaves the row index out of the file
df.to_csv('./csvs/gtcs.csv', index=False)