## 1. Import packages and set options

In [None]:
import pandas as pd  # a module which provides the data structures and functions to store and manipulate tables in dataframes
import pydbtools as pydb  # A module which allows SQL queries to be run on the Analytical Platform from Python, see https://github.com/moj-analytical-services/pydbtools
import boto3  # allows you to directly create, update, and delete AWS resources from Python scripts
import numpy as np
import re
import math
#import pandasql

# Widen pandas' display limits so wide tables are easier to read in notebook output
for option_name, option_value in {
    "display.max_columns": 100,
    "display.width": 900,
    "display.max_colwidth": 200,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the Deprivation of Liberty (DoL) extract from the S3 bucket into a dataframe
dol_table = pd.read_csv(
    "s3://alpha-family-data/CSVs/Deprivation_of_Liberty/DoL_extract.csv",
    low_memory=False,
)
# Lower-case every column header so the rest of the notebook refers to columns in one case
dol_table.columns = dol_table.columns.str.lower()

# Parse the day/month/year text dates into datetimes
for date_column in ("appndate", "dateofbirth"):
    dol_table[date_column] = pd.to_datetime(dol_table[date_column], format="%d/%m/%Y")

# Calendar year and quarter of the application date
dol_table['year'] = dol_table['appndate'].dt.year
dol_table['quarter'] = dol_table['appndate'].dt.quarter

# Age in completed years on the application date.
# This compares calendar dates rather than dividing a day count by 365.25: the
# division under-counts on (or just after) a birthday, e.g. born 01/01/2005 with an
# application on 01/01/2022 is 6209 days -> 16.999 -> 16 instead of 17.
birthday_not_yet_reached = (
    (dol_table['appndate'].dt.month < dol_table['dateofbirth'].dt.month)
    | (
        (dol_table['appndate'].dt.month == dol_table['dateofbirth'].dt.month)
        & (dol_table['appndate'].dt.day < dol_table['dateofbirth'].dt.day)
    )
)
# A missing date on either side leaves the age as NaN (comparisons with NaN are False,
# and NaN propagates through the subtraction). Cast to float so the column keeps the
# same dtype whether or not any dates are missing.
dol_table['childage'] = (
    dol_table['appndate'].dt.year
    - dol_table['dateofbirth'].dt.year
    - birthday_not_yet_reached.astype(int)
).astype('float64')

# Age band: conditions are checked in order and the first match wins, so each
# "< upper bound" test only sees ages not already claimed by an earlier band.
# NOTE(review): a negative age (date of birth after the application date) falls into
# 'Under 10' here - confirm whether such rows should be flagged instead.
child_age = dol_table['childage']
dol_table['ageband'] = np.select(
    [
        child_age.isnull(),
        child_age < 10,
        child_age < 13,
        child_age < 16,
        child_age < 19,
    ],
    ['Unknown', 'Under 10', '10-12 years', '13-15 years', '16-18 years'],
    default='Other',
)

#Distinct applications
#dol_apps_distinct = dol_table.drop_duplicates(subset = ['casenumber', 'appndate', 'appninfo'])
#Putting case in order and ranking equivalent to a row over() statement in sql
#dol_apps_distinct["rank"] = dol_apps_distinct.groupby("casenumber")["appndate"].rank(method="first", ascending=True)



In [None]:
# Spot check: show every row where the derived age is exactly 17
dol_table.loc[dol_table["childage"].eq(17)]

In [None]:
#dol_table
#dol_grouped
#dol_grouped_total
#dol_apps_distinct

In [None]:
# Register the prepared dataframe as the temp table "dol_new"; the SQL further down
# reads it as __temp__.dol_new.
pydb.dataframe_to_temp_table(dol_table, "dol_new")

In [None]:
# Count of non-null case numbers for each party name, largest first
(
    dol_table
    .groupby("partyname", as_index=False)["casenumber"]
    .count()
    .sort_values("casenumber", ascending=False)
)

In [None]:
# Read every row of __temp__.dol_new back into a dataframe and display it.
# NOTE: the name `test` is reassigned later in the notebook when the final
# __temp__.dol_csv table is read.
test = pydb.read_sql_query("SELECT * from __temp__.dol_new")
test

In [None]:
# Read the date dimension extract from the S3 bucket into a dataframe
dim_date = pd.read_csv(
    "s3://alpha-family-data/CSVs/Deprivation_of_Liberty/DIM_DATE.csv",
    low_memory=False,
)

# Parse the day-monthname-2-digit-year text dates into datetimes
for date_column in ("DATE_KEY", "WEEK_BEGIN_DATE", "SAME_DAY_YEAR_AGO"):
    dim_date[date_column] = pd.to_datetime(dim_date[date_column], format="%d-%b-%y")

In [None]:
# Display the date dimension dataframe to check the parsed date columns
dim_date

In [None]:
# Read the council lookup from S3 and register it as the temp table
# "dol_region_lookup", which the SQL below joins to on party name.
dol_region_lookup = pd.read_csv(
    "s3://alpha-family-data/CSVs/Deprivation_of_Liberty/Council_Lookup.csv",
    low_memory=False,
)
pydb.dataframe_to_temp_table(dol_region_lookup, "dol_region_lookup")

In [None]:
# Builds __temp__.dol_apps_all.
# Inner query: DISTINCT rows of the listed dol_new columns, left-joined to the council
#   lookup on party name (so unmatched parties keep NULL party/region/party_type).
# Outer query adds:
#   app_count      - position of the row within its case, ordered by application date
#   Closed_or_Open - 'Open' when closed is NULL, otherwise 'Closed'
#   App_type       - 'Initial' for the first row of a case, 'Extended' for later rows.
#                    ROW_NUMBER() starts at 1, so the ELSE 'Unknown' branch is never reached.
# NOTE(review): ORDER BY appndate alone does not break ties between rows of the same
#   case that share an application date - confirm whether a tie-breaker is needed.
# The SQL is a plain string (no f prefix): it contains no placeholders to interpolate.
pydb.create_temp_table(
"""
SELECT
*,
ROW_NUMBER() OVER(PARTITION BY casenumber
                       ORDER BY appndate) AS app_count,

CASE WHEN Closed is NULL THEN 'Open'
ELSE 'Closed'
END AS Closed_or_Open,

CASE WHEN ROW_NUMBER() OVER(PARTITION BY casenumber
                       ORDER BY appndate) = 1
THEN 'Initial'
WHEN ROW_NUMBER() OVER(PARTITION BY casenumber
                       ORDER BY appndate) > 1
THEN 'Extended'
ELSE 'Unknown'
END AS App_type

FROM(
SELECT
DISTINCT
t1.Year, 
t1.Quarter, 
t1.ageband,
t1.casenumber,
t1.appndate,
t1.appninfo,
t1.closureinfo,
t1.c21events,
t1.latestc21,
t1.opened,
t1.closed,
t2.party,
t2.region,
t2.party_type
FROM 
__temp__.dol_new t1
LEFT JOIN 
__temp__.dol_region_lookup t2
ON t1.partyname = t2.Party)
""",
"dol_apps_all")

In [None]:
# Application count: builds __temp__.dol_apps_agg.
# Counts the rows of dol_apps_all per year, quarter, application type, region and
# party type. ageband and gender are filled with the constant 'N/A' so the column
# list lines up with the other count tables.
# The SQL is a plain string (no f prefix): it contains no placeholders to interpolate.
pydb.create_temp_table(
"""
SELECT 
t1.Year, 
t1.Quarter, 
'N/A' as ageband,
'Applications' as Count_type,
t1.App_type,
t1.Region,
t1.Party_type,
'N/A' as gender,
Count(*) as Count
FROM 
__temp__.dol_apps_all t1
GROUP BY
t1.Year, 
t1.Quarter, 
t1.App_type,
t1.Region,
t1.Party_type
""",
"dol_apps_agg")

In [None]:
# Child count: builds __temp__.dol_child.
# Counts the rows of dol_new (left-joined to the council lookup on party name) per
# year, quarter, age band, region, party type and gender. App_type is the constant 'All'.
# NOTE(review): Count(*) counts extract rows, not distinct children - confirm that one
#   row per child is the grain of dol_new, otherwise repeats are counted more than once.
# The SQL is a plain string (no f prefix): it contains no placeholders to interpolate.
pydb.create_temp_table(
"""
SELECT 
t1.Year, 
t1.Quarter, 
t1.ageband,
'Child' as Count_type,
'All' as App_type,
t2.Region,
t2.Party_type,
t1.gender,
Count(*) as Count
FROM 
__temp__.dol_new t1
LEFT JOIN 
__temp__.dol_region_lookup t2
ON t1.partyname = t2.Party
GROUP BY
t1.Year, 
t1.Quarter, 
t1.ageband,
t2.Region,
t2.Party_type,
t1.gender
""",
"dol_child")


# Case count: builds __temp__.dol_cases.
# Keeps only the first row of each case in dol_apps_all (app_count = 1), so each case
# is counted once, and groups by year, quarter, region and party type.
# ageband, App_type and gender are filled with the constant 'N/A'.
# The SQL is a plain string (no f prefix): it contains no placeholders to interpolate.
pydb.create_temp_table(
"""
SELECT 
t1.Year, 
t1.Quarter, 
'N/A' as ageband,
'Case' as Count_type,
'N/A' as App_type,
t1.Region,
t1.Party_type,
'N/A' as gender,
Count(*) as Count
FROM 
__temp__.dol_apps_all t1
WHERE app_count = 1
GROUP BY
t1.Year, 
t1.Quarter, 
t1.Region,
t1.Party_type


""",
"dol_cases")




In [None]:
# Stacks the case, application and child counts into one table, __temp__.dol_csv.
# UNION ALL with SELECT * matches columns by position, not by name. This works because
# the three source queries all list their columns in the same order:
#   Year, Quarter, ageband, Count_type, App_type, Region, Party_type, gender, Count.
# Keep that order in step if any of the three queries is edited.
# The SQL is a plain string (no f prefix): it contains no placeholders to interpolate.
pydb.create_temp_table(
"""
SELECT
*
FROM __temp__.dol_cases

UNION ALL

SELECT
*
FROM __temp__.dol_apps_agg

UNION ALL

SELECT
*
FROM __temp__.dol_child

""",
"dol_csv")


In [None]:
# NOTE: this cell held a bare SQL fragment, which is not valid Python and stopped
# "Restart & Run All" with a SyntaxError. It is kept below as a comment only.
# The fragment also names closed_or_open, which is not in any of the column lists
# that feed __temp__.dol_csv.
#
# ORDER BY
# year, quarter, count_type, region, party_type, app_type, ageband, closed_or_open

In [None]:
# Read the combined table back and sort it for output.
# app_type is part of the sort key: without it, 'Initial' and 'Extended' application
# rows that match on every other key would come out in arbitrary order, so the
# exported file could differ between runs on identical data.
test = pydb.read_sql_query("SELECT * FROM __temp__.dol_csv").sort_values(
    by=["year", "quarter", "count_type", "region", "party_type", "app_type", "ageband", "gender"]
)
# Display only the child-count rows as a check
test[test["count_type"] == "Child"]

In [None]:
# Write the sorted table to S3 as CSV, leaving out the dataframe index
test.to_csv(
    's3://alpha-family-data/CSVs/Deprivation_of_Liberty/dol_csv.csv',
    index=False,
)