In [None]:
#First Stage of automating exception reports

In [None]:
#Downloading the raw data from Amazon Web Services (AWS) S3
#This is the cloud warehouse where we store data for use on the analytical platform

#This is a package that allows us to download data from AWS S3 in a more usable way
#You may need to install it in the Terminal --> pip install arrow-pd-parser
from arrow_pd_parser import reader, writer

#Alternatively you can use Boto3 --> pip install boto3
import boto3

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw data on S3: bucket, folder (key prefix) and file name.
bucket = "s3://alpha-piatool/"
folder = "sopdq_AV/"
# The file name gets its own variable: `raw_data` is bound to the loaded DataFrame
# in the loading cell, so reusing that name here would overwrite the data whenever
# this cell is re-run.
raw_data_file = "WFPT Staff Data Mar22.csv"
filename = bucket + folder + raw_data_file

In [None]:
# Read the raw CSV (stored on S3 by David Yuen) into `raw_data`.
raw_data = reader.read(
    filename,
    file_format="csv",
)

In [None]:
# Preview the first two rows of the raw data.
raw_data.head(n=2)

In [None]:
#Tracie Kilbey creates four sheets from the raw dataset, which she then uses for the exception reports
#Let's start with creating the first sheet, as it's the simplest - "WfPT all"
#This is just the raw data, but we'll make a few changes to column headings to make it easier to use

In [None]:
WfPT_all = raw_data

In [None]:
WfPT_all.columns = WfPT_all.columns.str.replace(" ", "_")
WfPT_all.columns = map(str.lower, WfPT_all.columns)

In [None]:
#Save this file in AWS S3 
writer.write(df=WfPT_all, output_path=bucket+folder+"WfPT all.csv", file_format="csv")

In [None]:
#Completed first file
WfPT_all
len(WfPT_all)

In [None]:
# Now create second file - Probation inc HQ
# Filter Column "NOMS-MOJ" to leave only "NPS" or "NOMS HQ"

In [None]:
Probation_inc_HQ = raw_data[raw_data["noms-moj"].isin(["NOMS HQ", "NPS"])]

In [None]:
# Write the second sheet to S3 as a CSV file.
writer.write(
    df=Probation_inc_HQ,
    output_path=f"{bucket}{folder}Probation inc HQ.csv",
    file_format="csv",
)

In [None]:
#Completed second file
# Only the last expression of a cell is rendered, so the preview is displayed explicitly.
display(Probation_inc_HQ.head(2))
len(Probation_inc_HQ)

In [None]:
# Now create the third file - Probation some HQ
#For those working in Probation Delivery Units (PDUs) and Region Offices, 
#identify those who have NOMS-MOJ (column S) value of "NPS".
# Add in Other HQ staff – see establishment names below (use the Prog CC and Admin CC)
# Cost Centre = 10207830, 10207831, 10207812

In [None]:
# Preview the first two rows of the raw data before building the third sheet.
raw_data.head(n=2)

In [None]:
# The headings on WfPT_all were normalised earlier (lower-case, spaces -> underscores),
# so the columns are addressed as "noms-moj" and "cost_centre"; the original-case
# names ("NOMS-MOJ", "Cost Centre") no longer exist after that step.

# Cost centres of the other HQ staff to include.
other_hq_cost_centres = [10207830, 10207831, 10207812]

noms_moj = WfPT_all["noms-moj"]
nps_staff = WfPT_all[noms_moj == "NPS"]
selected_hq_staff = WfPT_all[
    (noms_moj == "NOMS HQ") & WfPT_all["cost_centre"].isin(other_hq_cost_centres)
]

# NPS rows first, then the selected HQ rows; the two groups cannot overlap
# because a row has only one "noms-moj" value.
probation_some_hq = pd.concat([nps_staff, selected_hq_staff])

In [None]:
# Only the last expression of a cell is rendered, so the preview is displayed explicitly.
display(probation_some_hq.head(2))
len(probation_some_hq)

In [None]:
# Write the third sheet to S3 as a CSV file.
writer.write(
    df=probation_some_hq,
    output_path=f"{bucket}{folder}Probation some HQ.csv",
    file_format="csv",
)

In [None]:
#Create the fourth sheet - APs
## Filter for the NOMS-MOJ field of "NOMS HQ" and then filter for all establishments with "APs" at the end of the name.
## Filter for National Approved Premises and AP Professionalisation Project (as in the table below).
### Cost centres = 10207588, 10207586


In [None]:
# The headings on WfPT_all were normalised earlier (lower-case, spaces -> underscores),
# so the columns are addressed as "noms-moj", "establishment" and "cost_centre".

# Cost centres that are always included in the APs sheet.
ap_cost_centres = [10207588, 10207586]

is_noms_hq = WfPT_all["noms-moj"] == "NOMS HQ"
# The rule is "APs" at the END of the establishment name, so the pattern is anchored
# (trailing whitespace tolerated). na=False treats a missing name as "no match";
# without it a missing value would make the mask unusable for filtering.
# NOTE(review): this is stricter than matching "APs" anywhere in the name - confirm
# no wanted establishment carries "APs" mid-name.
name_ends_with_aps = WfPT_all["establishment"].str.contains(r"APs\s*$", na=False)
in_ap_cost_centre = WfPT_all["cost_centre"].isin(ap_cost_centres)

# A single combined mask keeps a row that satisfies both rules from appearing twice.
aps = WfPT_all[(is_noms_hq & name_ends_with_aps) | in_ap_cost_centre]

In [None]:
# Number of rows in the APs sheet.
len(aps.index)

In [None]:
# Write the fourth sheet to S3 as a CSV file.
writer.write(
    df=aps,
    output_path=f"{bucket}{folder}APs.csv",
    file_format="csv",
)

In [None]:
#Save all csv's as separate sheets in a single Excel file - "WFPT Staff Data Mar22 Probation.xlsx"

from openpyxl import Workbook

# Build an in-memory workbook with one sheet per output, in report order.
# A new Workbook starts with a single default sheet, which becomes "WfPT all".
wb = Workbook()
wb.active.title = "WfPT all"
for sheet_title in ("Probation inc HQ", "Probation some HQ", "APs"):
    wb.create_sheet(sheet_title)
# NOTE(review): the sheets are created empty and the workbook is not saved in this
# cell - the data still has to be written into them.

In [None]:
# Small example frame, written as CSV to the S3 folder directly with pandas.
books_df = pd.DataFrame(
    {
        "Title": ["Book I", "Book II", "Book III"],
        "Price": [56.6, 59.87, 74.54],
    }
)

key = "books.csv"

books_df.to_csv(f"{bucket}{folder}{key}")

In [None]:
# Re-state the S3 location and echo the combined prefix.
bucket, folder = "s3://alpha-piatool/", "sopdq_AV/"
f"{bucket}{folder}"

In [None]:
import boto3

# Download the example CSV written above. The object key must be the full path
# inside the bucket ("sopdq_AV/books.csv"); "sopdq_AV" on its own is only the
# folder prefix, not an object. The argument to download_file is the local
# file name to save to.
s3 = boto3.resource('s3')
s3.Object('alpha-piatool', 'sopdq_AV/books.csv').download_file('books.csv')
