In [None]:
#First Stage of automating exception reports

In [None]:
#Downloading the raw data from Amazon Web Services (AWS) S3
#This is the cloud warehouse where we store data for use on the analytical platform

#This is a package that allows us to download data from AWS S3 in a more usable way
#You may need to install it in the Terminal --> pip install arrow-pd-parser

In [None]:
from arrow_pd_parser import reader, writer
import boto3
import pandas as pd
import numpy as np
import io
import xlsxwriter

In [None]:
# Specifying the reader Both reader statements are equivalent and call the same readers under the hood
#Location of data - setting up the folder and file names
bucket = "s3://alpha-piatool/"
input_folder = "sopdq_AV/inputs/"
output_folder = "sopdq_AV/outputs/"
raw_data = "WFPT Staff Data Mar22.csv"

In [None]:
#Loading the raw data stored by David Yuen 
raw_data = reader.read(bucket+input_folder+raw_data, file_format="csv")

In [None]:
# Preview the first two rows of the loaded raw data
raw_data.head(2)

In [None]:
#Tracie Kilbey creates four sheets from the raw dataset, which she then uses for the exception reports
#Let's start with creating the first sheet, as it's the simplest - "WfPT all"
#This is just the raw data, but we'll make a few changes to column headings to make it easier to use

In [None]:
# Take an independent copy: a plain assignment would make WfPT_all and raw_data the
# same object, so any later change to WfPT_all (e.g. renaming headings) would also
# change raw_data, which the filters further down read by column name.
WfPT_all = raw_data.copy()

In [None]:
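# Disabled: the heading clean-up mentioned above is not currently applied,
# so the outputs keep the source file's original column names.
# WfPT_all.columns = WfPT_all.columns.str.replace(" ", "_")
# WfPT_all.columns = map(str.lower, WfPT_all.columns)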

In [None]:
# List the column names as loaded; the filters below select on names such as "NOMS-MOJ"
raw_data.columns

In [None]:
#Write the "WfPT all" data to the S3 output folder as a csv
writer.write(
    df=WfPT_all,
    output_path=f"{bucket}{output_folder}WfPT all.csv",
    file_format="csv",
)

In [None]:
#Completed first file - row count, then a two-row preview.
#Only the last expression of a cell is displayed, so the count is printed explicitly.
print(len(WfPT_all))
WfPT_all.head(2)

In [None]:
# Now create second file - Probation inc HQ
# Filter Column "NOMS-MOJ" to leave only "NPS" or "NOMS HQ"

In [None]:
# Keep only the rows whose "NOMS-MOJ" value is one of the two probation groups
probation_groups = ["NOMS HQ", "NPS"]
Probation_inc_HQ = raw_data.loc[raw_data["NOMS-MOJ"].isin(probation_groups)]

In [None]:
#Write the "Probation inc HQ" data to the S3 output folder as a csv
writer.write(
    df=Probation_inc_HQ,
    output_path=f"{bucket}{output_folder}Probation inc HQ.csv",
    file_format="csv",
)

In [None]:
#Completed second file - row count, then a two-row preview.
#Only the last expression of a cell is displayed, so the count is printed explicitly.
print(len(Probation_inc_HQ))
Probation_inc_HQ.head(2)

In [None]:
# Now create the third file - Probation some HQ
#For those working in Probation Delivery Units (PDUs) and Region Offices, 
#identify those who have NOMS-MOJ (column S) value of "NPS".
# Add in Other HQ staff – see establishment names below (use the Prog CC and Admin CC)
# Cost Centre = 10207830, 10207831, 10207812

In [None]:
# Two-row preview of the raw data, for reference while building the filters below
raw_data.head(2)

In [None]:
# "Probation some HQ" = every NPS row, plus the NOMS HQ rows in the three listed cost centres
noms_moj = raw_data["NOMS-MOJ"]
a = raw_data.loc[noms_moj.eq("NPS")]
b = raw_data.loc[noms_moj.eq("NOMS HQ")]
c = b.loc[b["Cost Centre"].isin([10207830, 10207831, 10207812])]
probation_some_hq = pd.concat([a, c])

In [None]:
# Row count, then a two-row preview of the third file.
# Only the last expression of a cell is displayed, so the count is printed explicitly.
print(len(probation_some_hq))
probation_some_hq.head(2)

In [None]:
#Write the "Probation some HQ" data to the S3 output folder as a csv
writer.write(
    df=probation_some_hq,
    output_path=f"{bucket}{output_folder}Probation some HQ.csv",
    file_format="csv",
)

In [None]:
#Create the fourth sheet - APs
## Filter the NOMS-MOJ field for "NOMS HQ" and then filter for all establishments with "APs" at the end of the name.
## Filter for National Approved Premises and AP Professionalisation Project (as in the table below).
### Cost centres = 10207588, 10207586


In [None]:
# APs extract = NOMS HQ rows whose Establishment mentions "APs", plus every row
# in the two Approved Premises cost centres.
a = raw_data[raw_data["NOMS-MOJ"] == "NOMS HQ"]
# na=False: a blank Establishment counts as "no match"; without it the mask contains
# NaN and the row selection raises.
# NOTE(review): the instructions say "APs" at the END of the name, but contains()
# matches it anywhere - confirm which is intended before tightening this.
b = a[a["Establishment"].str.contains("APs", na=False)]
c = raw_data[raw_data["Cost Centre"].isin([10207588, 10207586])]
# A row can satisfy both rules (NOMS HQ + "APs" name AND one of the cost centres).
# Take such rows from `b` only so they are not written to the sheet twice.
# Assumes raw_data has a unique row index - TODO confirm.
aps = pd.concat([b, c[~c.index.isin(b.index)]])

In [None]:
# Row count of the APs extract
len(aps)

In [None]:
#Write the "APs" data to the S3 output folder as a csv
writer.write(
    df=aps,
    output_path=f"{bucket}{output_folder}APs.csv",
    file_format="csv",
)

In [None]:
####################################################################################################################
# Next step - to combine csv files into one workbook on excel
####################################################################################################################

In [None]:
## Name of csv files
# WfPT_all
# Probation_inc_HQ
# probation_some_hq
# aps

In [None]:
# Build the four-sheet workbook in memory, then upload it to S3 in a single request.
# The ExcelWriter is deliberately NOT bound to the name `writer`: that name is the
# arrow_pd_parser module imported at the top of the notebook, and rebinding it would
# break every later (or re-run) `writer.write(...)` call.
workbook_sheets = {
    'WfPT All': WfPT_all,
    'Probation inc HQ': Probation_inc_HQ,
    'Probation some HQ': probation_some_hq,
    'APs': aps,
}
with io.BytesIO() as output:
    with pd.ExcelWriter(output, engine='xlsxwriter') as excel_writer:
        for sheet_name, sheet_df in workbook_sheets.items():
            sheet_df.to_excel(excel_writer, sheet_name=sheet_name)
    # Read the bytes only after the ExcelWriter has closed and flushed the workbook
    data = output.getvalue()
s3 = boto3.resource('s3')
# Bucket name and key are derived from the settings at the top so the workbook
# always lands next to the csv outputs.
s3.Bucket(bucket.replace("s3://", "").strip("/")).put_object(
    Key=output_folder + 'WFPT Staff Data Mar22 Probation.xlsx', Body=data
)


In [None]:
####################################################################################################################
# Next step - Add look up files
####################################################################################################################

In [None]:
#Loading the look up file stored in the inputs folder.
#Both sheets are requested in one call so the workbook is fetched from S3 and parsed once;
#with a list for sheet_name, read_excel returns a dict keyed by the entries of that list.
lookup_workbook = bucket+input_folder+"Lookups for Probation DQ v1.4.xlsx"
lookup_sheets = pd.read_excel(lookup_workbook, sheet_name=[1, "Cost Centre types"], skiprows=2)

look_ups = lookup_sheets[1]  # second sheet of the workbook (position 1)
cost_centre_consistency = lookup_sheets["Cost Centre types"]

In [None]:
# Inspect the column labels pandas assigned to the lookup sheet; the selections in the next cell rely on them
look_ups.columns

In [None]:
# Split the combined lookup sheet into one frame per lookup table, by column label
cost_centre_cols = ['Cost Centre', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3','Cost Centre Description']
location_name_cols = ['Location Name', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Location Post Code']

cost_centre_lookup = look_ups[cost_centre_cols]
location_name_lookup = look_ups[location_name_cols]
internal_function_lookup = look_ups[['Internal Function']]
job_title_lookup = look_ups[['Job Title']]
job_band_lookup = look_ups[['MoJ/NOMS Grade']]

In [None]:
################################################################################################################
# Main Data Quality checks 
## Cost Centres
## HR locations
## Internal functions
## Job (Title)
## *Job (Band) not required but added in any event
################################################################################################################

In [None]:
#Cost Centre lookup: promote the first row to be the column titles.
#Run once per kernel - re-running this cell would promote a data row instead.
new_header = cost_centre_lookup.iloc[0]           # first row holds the real column titles
cost_centre_lookup = cost_centre_lookup.iloc[1:]  # everything below it is data
cost_centre_lookup.columns = new_header

In [None]:
# Keep only the four lookup columns that are needed, then preview the result
cost_centre_keep = ['Cost Centre Number','Region','Pdu Name (as used in the target)/LDU name', "Cost centre name in SOP"]
cost_centre_lookup = cost_centre_lookup[cost_centre_keep]
cost_centre_lookup.head(3)

In [None]:
#HR Location lookup: start from the location-name slice and promote its first row to column titles
hr_location_lookup = location_name_lookup
new_header = hr_location_lookup.iloc[0]           # first row holds the real column titles
hr_location_lookup = hr_location_lookup.iloc[1:]  # everything below it is data
hr_location_lookup.columns = new_header

In [None]:
# Check the promoted headings and the first three rows
hr_location_lookup.head(3)

In [None]:
#Internal Function lookup: promote the first row to be the column title.
#Run once per kernel - re-running this cell would promote a data row instead.
new_header = internal_function_lookup.iloc[0]                 # first row holds the real column title
internal_function_lookup = internal_function_lookup.iloc[1:]  # everything below it is data
internal_function_lookup.columns = new_header

In [None]:
# Check the promoted heading and the first three rows
internal_function_lookup.head(3)

In [None]:
#Job (title) lookup: promote the first row to be the column title.
#Run once per kernel - re-running this cell would promote a data row instead.
new_header = job_title_lookup.iloc[0]         # first row holds the real column title
job_title_lookup = job_title_lookup.iloc[1:]  # everything below it is data
job_title_lookup.columns = new_header

In [None]:
# Check the promoted heading and the first three rows
job_title_lookup.head(3)

In [None]:
#Job (band) lookup: promote the first row to be the column title.
#Run once per kernel - re-running this cell would promote a data row instead.
new_header = job_band_lookup.iloc[0]        # first row holds the real column title
job_band_lookup = job_band_lookup.iloc[1:]  # everything below it is data
job_band_lookup.columns = new_header

In [None]:
# Check the promoted heading and the first three rows
job_band_lookup.head(3)

In [None]:
# Keep the two columns needed for the consistency check and store the cost centre as a
# nullable integer ("Int64" allows blanks). Selection and conversion are chained so the
# cast is never assigned into a column-subset frame, which triggers SettingWithCopyWarning.
cost_centre_consistency = (
    cost_centre_consistency[["Cost Centre", "Cost Centre Type"]]
    .astype({"Cost Centre": "Int64"})
)
cost_centre_consistency.head(3)

In [None]:
# Save all lookups in a single excel file, built in memory and uploaded to S3.
# The ExcelWriter is deliberately NOT bound to the name `writer`: that name is the
# arrow_pd_parser module imported at the top of the notebook, and rebinding it would
# break every later (or re-run) `writer.write(...)` call.
lookup_sheets_out = {
    'Cost Centre Lookup': cost_centre_lookup,
    'Internal Function Lookup': internal_function_lookup,
    'HR Location Lookup': hr_location_lookup,
    'Job Title Lookup': job_title_lookup,
    "Cost Centre type": cost_centre_consistency,
}
with io.BytesIO() as output:
    with pd.ExcelWriter(output, engine='xlsxwriter') as excel_writer:
        for sheet_name, sheet_df in lookup_sheets_out.items():
            sheet_df.to_excel(excel_writer, sheet_name=sheet_name)
    # Read the bytes only after the ExcelWriter has closed and flushed the workbook
    data = output.getvalue()
s3 = boto3.resource('s3')
# Bucket name and key are derived from the settings at the top so the workbook
# always lands in the same output folder as the other files.
s3.Bucket(bucket.replace("s3://", "").strip("/")).put_object(
    Key=output_folder + 'WFPT lookups.xlsx', Body=data
)


In [None]:
################################################################################
# Workforce Probation Error Report
################################################################################

In [None]:
## Step 1: Create Sheet "WfPT Probation Staff in Post"
### Use Probation Some HQ 
### Use cost_centre_consistency

In [None]:
# Row count of the "Probation some HQ" extract, the left-hand input of the merge below
len(probation_some_hq)

In [None]:
# Column names available on the extract for the merge and checks below
probation_some_hq.columns

In [None]:
# Attach the cost centre type to every staff row.
# The lookup is first reduced to one row per non-blank cost centre: a left merge repeats
# a staff row once for each matching lookup row, and blank keys match each other, so
# blank or repeated lookup rows would otherwise inflate the staff-in-post count.
# NOTE(review): where a cost centre appears more than once, the first type is kept - confirm.
cost_centre_types = (
    cost_centre_consistency
    .dropna(subset=["Cost Centre"])
    .drop_duplicates(subset=["Cost Centre"])
)
wfpt_probation_sip = pd.merge(probation_some_hq, cost_centre_types, on = "Cost Centre", how = "left")
assert len(wfpt_probation_sip) == len(probation_some_hq), "merge changed the number of staff rows"

In [None]:
# Build the consistency-check key: the three fields below, as text, joined with ", "
cols = ['Cost Centre Type', 'Internal Function', 'Job']


def _join_fields(row):
    """Return the row's values converted to text and joined with ', '."""
    return ', '.join(row.values.astype(str))


wfpt_probation_sip["Joined CCType,IF, Job For Consistency Check"] = wfpt_probation_sip[cols].apply(_join_fields, axis=1)

In [None]:
##################################################################################################################

In [None]:
## Step 2: Create "Cost Centre Error Flag" column

# Merge main to lookup on cost centre
# If matched, "Valid", 
# If NA, "Missing", 
# If not NA and not matched, "Invalid"

In [None]:
# List of valid cost centres, one row each, keyed by a numeric "Cost Centre" column.
# The numeric key is created BEFORE de-duplicating: if the lookup column holds a mix of
# numbers and text, 10207830 and "10207830" differ as raw values but become the same key,
# and a repeated key would repeat staff rows in the merge that follows.
# assign() is used so the new column is never written into a filtered frame
# (which triggers SettingWithCopyWarning).
cost_centre_lookup2 = cost_centre_lookup[["Cost Centre Number"]].dropna(subset=["Cost Centre Number"])
cost_centre_lookup2 = cost_centre_lookup2.assign(
    **{"Cost Centre": pd.to_numeric(cost_centre_lookup2["Cost Centre Number"])}
)
cost_centre_lookup2 = cost_centre_lookup2.drop_duplicates(subset=["Cost Centre"])

In [None]:
# Left-join the valid cost centre list onto the staff rows; unmatched rows get a blank "Cost Centre Number"
wfpt_probation_sip2 = wfpt_probation_sip.merge(
    cost_centre_lookup2,
    how="left",
    on="Cost Centre",
)

In [None]:
# Flag each staff row's cost centre:
#   "Valid"   - matched in the lookup (the merge filled in "Cost Centre Number")
#   "Missing" - no cost centre recorded on the staff row
#   "Invalid" - a cost centre is recorded but it is not in the lookup
# The match is tested with notna() instead of comparing the two columns: the lookup had
# its blanks dropped before the merge, so a non-blank "Cost Centre Number" means the row
# matched. Comparing with == would mark matched rows "Invalid" whenever the lookup value
# is stored as text, and can produce an NA mask when the cost centre is blank.
cost_centre_matched = wfpt_probation_sip2["Cost Centre Number"].notna()
cost_centre_blank = wfpt_probation_sip2["Cost Centre"].isna()
wfpt_probation_sip2["Cost Centre Error Flag"] = np.where(cost_centre_matched, "Valid",
                                                         np.where(cost_centre_blank, "Missing", "Invalid"))

In [None]:
# Summary counts in the order: Invalid, Valid, Missing, total rows
cost_centre_flags = wfpt_probation_sip2["Cost Centre Error Flag"]
print(
    (cost_centre_flags == "Invalid").sum(),
    (cost_centre_flags == "Valid").sum(),
    (cost_centre_flags == "Missing").sum(),
    len(wfpt_probation_sip2),
)

In [None]:
## Step 3: Create "HR Location Error Flag" column

# Merge main to lookup on HR location
# If matched, "Valid", 
# If NA, "Missing", 
# If not NA and not matched, "Invalid"