# Children Act Applications

### This code includes the following processes:
1. Children Act applications data extraction
2. Creation of main applications data
3. extration and creation of parties data
4. Events and cases tables
5. Individual children
6. High court cases

#### Import packages and set options

In [6]:
import os  # for file paths
import pandas as pd
import awswrangler as wr
import pydbtools as pydb  # see https://github.com/moj-analytical-services/pydbtools
import altair as alt  # for plotting, see https://altair-viz.github.io/getting_started/overview.html

# few things for viewing dataframes better
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 900)
pd.set_option("display.max_colwidth", 200)

#### Defining variables

In [7]:
# define some variables to be used throughout the notebook
database = "familyman_dev_v2"
snapshot_date = "2021-07-08"
database2 = "fcsq"

In [8]:
# create path for within athena FCSQ database in the S3 folder
fcsq_db_path = f"s3://alpha-family-data/fcsq_processing/CA_apps/"

#### Example code
##### Running SQL in Python

In [9]:
# Here we use an f-string which allows variables to be incorporated into the string.
# The database contains many snapshots in time. These need to be defined for each table
# to avoid returning duplicates

# triple quotation marks allow a string to be defined within the quaotation marks even on new lines
# makes long sql strings much more readable.
# single quotation marks limit the string to a single line.

s1 = f"""
select * from {database}.cases 
where mojap_snapshot_date = date '{snapshot_date}'
limit 10
"""

# pydbtools read_sql_query function will return the results of the query into a pandas dataframe
simple_results = pydb.read_sql_query(s1)
simple_results

Unnamed: 0,case_number,security,second_case_number,parent_case_number,closed,case_model,admin_court_id,date_printed,mojap_file_land_timestamp,mojap_snapshot_date
0,NN95D01595,5,,,,FM1,NN,NaT,1625742440,2021-07-08
1,NN95D01598,5,,,,FM1,NN,NaT,1625742440,2021-07-08
2,NN95D01600,5,,,,FM1,NN,NaT,1625742440,2021-07-08
3,NN95D01602,5,,,,FM1,NN,NaT,1625742440,2021-07-08
4,NN95D01605,5,,,,FM1,NN,NaT,1625742440,2021-07-08
5,NN95D01606,5,,,,FM1,NN,NaT,1625742440,2021-07-08
6,NN95D01607,5,,,,FM1,NN,NaT,1625742440,2021-07-08
7,NN95D01608,5,,,,FM1,NN,NaT,1625742440,2021-07-08
8,NN95D01609,5,,,,FM1,NN,NaT,1625742440,2021-07-08
9,NN95D01611,5,,,,FM1,NN,NaT,1625742440,2021-07-08


##### Creating temporary tables

In [10]:
# Here we are creating a temporary table that can be queried later
#  from the __temp__ database
pydb.create_temp_table(s1, "case_table")

# Query the temporary table just created. The database to query is called __temp__, this is
# an alias for a sandbox database that is created for each user. For more details, see
# the pydbtools docs
temp_table = pydb.read_sql_query("select count(*) as count from __temp__.case_table")

temp_table

Unnamed: 0,count
0,10


##### Creating tables in Athena
###### When creating tables in Athena, the underlying data also needs to be written to a folder in S3

In [11]:
# create path for new table (initial bucket path and sub-folder defined earlier)
test_table_s3_path = os.path.join(fcsq_db_path, "test_table/")

In [12]:
# create table in Athena FCSQ database, stating the S3 path to store underlying data
test_table = f"""
CREATE TABLE {database2}.test_table WITH
(
    external_location='{test_table_s3_path}'
) AS
SELECT *
FROM __temp__.case_table
"""
# execute the SQL query
_ = pydb.start_query_execution_and_wait(test_table)

##### Deleting tables in Athena

In [13]:
# Delete all the underlying data stored within the S3 location
if wr.s3.list_objects(test_table_s3_path):
    print("deleting objs")
    wr.s3.delete_objects(test_table_s3_path)

deleting objs


In [14]:
# drop the table in Athena
drop_test_table = f"""
DROP TABLE {database2}.test_table
"""
# execute the SQL query
_ = pydb.start_query_execution_and_wait(drop_test_table)

In [15]:
# This also works but not using old versions of pybd. Note the recommended option is the start_query...wait option
pydb.read_sql_query(f"""DROP TABLE {database2}.test_table""")

InvalidRequestException: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 7:1: mismatched input 'DROP'. Expecting: '(', <query>

In [None]:
# can also embed the code within the brackets rather than creating as a separate object
_ = pydb.start_query_execution_and_wait(f"""DROP TABLE {database2}.test_table""")

## 1. Applications - creating childout parties data

#### 1.1. Applications - table of all applications, filtered by specific order types

In [16]:
# Creating a table with all the applications for CA, codes selected from the order type lookup file.
# The cross join unnest function flattens the application values to one row per app type
ca1 = f"""
SELECT 
    e.case_number,
    e.receipt_date,
    EXTRACT(year FROM e.receipt_date) AS year,
    EXTRACT(quarter FROM e.receipt_date) AS quarter,
    f.event,
    f.field_model,
    f.value ord_types,
    TRIM(ord_type) as ord_type,
    CAST(SUBSTR(CAST(f.event AS varchar),1,3) AS integer) AS court_code
  FROM 
    {database}.event_fields F
    INNER JOIN {database}.events e
      ON f.event = e.event
    CROSS JOIN UNNEST(SPLIT(f.value,',')) AS t(ord_type)
  WHERE 
    field_model IN('U22_AT','G50_AT')
    AND   TRIM(ord_type) IN (SELECT 
                               order_type 
                             FROM 
                              {database2}.order_type_lookup
                             WHERE 
                               child_act = 'Y')
    AND e.error = 'N'
    AND f.mojap_snapshot_date = DATE'{snapshot_date}'
    AND e.mojap_snapshot_date = DATE'{snapshot_date}'
"""

pydb.create_temp_table(ca1, "ca_apps")

#### 1.2. Children (events) - joining the children onto the application data where children are recorded on the event

In [17]:
# Only events with children recorded are included. Children not recorded in the value field are dealt with in the following step
# As with the previous code above, the cross join unnest flattens the child data to 1 row per child recorded against the application (event)
ca2 = f"""
  SELECT 
    a.*,
    f.value children,
    TRY_CAST(TRIM(child_role_id) as bigint) child_role_id
  FROM 
    __temp__.ca_apps a
    LEFT JOIN {database}.event_fields f
      ON f.event = a.event
   CROSS JOIN UNNEST(SPLIT(f.value,',')) AS t(child_role_id)
  WHERE f.field_model IN('U22_CH','G50_CH')
    AND child_role_id <> ''
    AND f.mojap_snapshot_date = DATE'{snapshot_date}'
"""

pydb.create_temp_table(ca2, "ca_apps_child_event")

#### 1.3. Children (case) - joining the children onto the application data using the roles table (via case number) where children are not recorded under the event. 

In [18]:
# Take all events where no children were recorded against the event in the previous table and get children details from the roles/parties tables.
# It is assumed all children on the case were involved in the application
# Adding date of birth and gender from the parties table
ca3 = f"""
  SELECT
    a.*,
    r.role child_role_id,
    p.dob,
    p.gender,
    r.delete_flag
  FROM
    __temp__.ca_apps a
    JOIN {database}.roles r 
      on a.case_number = r.case_number
    JOIN {database}.parties p 
      on r.party = p.party 
  WHERE
    event not in (SELECT event FROM __temp__.ca_apps_child_event)
    AND role_model in ('CHLDC', 'CHLDZ')
    AND r.mojap_snapshot_date = DATE'{snapshot_date}'
    AND p.mojap_snapshot_date = DATE'{snapshot_date}'
"""

pydb.create_temp_table(ca3, "ca_apps_child_case")

#### 1.4. Appending children from events and children from case

In [19]:
# Unioning the dataset where child details were taken from the event to the table where the child details were taken from the roles table
# Gender and DoB of the child are added to the children from events table
ca4 = f"""
  SELECT 
    a.case_number,
    a.receipt_date,
    a.year,
    a.quarter,
    a.event,
    a.field_model,
    a.court_code,
    a.ord_type,
    a.child_role_id,
    p.dob,
    p.gender,
    r.delete_flag
  FROM
    __temp__.ca_apps_child_event a
    JOIN {database}.roles r on a.child_role_id = r.role
    JOIN {database}.parties p on r.party = p.party 
  WHERE
    r.mojap_snapshot_date = DATE'{snapshot_date}'
    AND p.mojap_snapshot_date = DATE'{snapshot_date}'
  UNION ALL
  SELECT
    case_number,
    receipt_date,
    year,
    quarter,
    event,
    field_model,
    court_code,
    ord_type,
    child_role_id,
    dob,
    gender,
    delete_flag
  FROM
    __temp__.ca_apps_child_case
"""

pydb.create_temp_table(ca4, "ca_apps_all_children")

#### 1.5 Ranking duplicate child/order type data within a case

In [20]:
# The apps all children table contains some records with the same order type for the same child in the same case. Here we
# order the cases so that in those cases initial apps are ranked earlier than subsequent apps, and where the ord type is the same the earliest app is ranked highest
ca5 = f"""
SELECT
    *,
    substr(case_number, 5,1) as case_type,
    ROW_NUMBER() OVER(PARTITION BY year, case_number, child_role_id, ord_type 
                       ORDER BY case_number, child_role_id, ord_type, field_model DESC, receipt_date ASC) 
      AS dup_rank
FROM 
   __temp__."ca_apps_all_children"
"""

pydb.create_temp_table(ca5, "ca_apps_dup_rank")

#### 1.6 Selecting the earliest duplicate child/order record within a case

In [21]:
# Select the ealliest record for each duplicate order type per child
# Add whether public or private law case type (potentially revisit this as private law may contain adoption cases)
ca6 = f"""
SELECT
    year,
    quarter,
    case_number,
    receipt_date,
    event,
    field_model,
    ord_type,
    child_role_id,
    court_code,
    CASE WHEN ord_type in ('CRO','SSC','DCO','OSA','SO','DSO','OC','OCST','ARC','ARST','ESO','XESO','CAO','EPO','XEPO','DEPO','WEP')
      OR case_type = 'C' THEN 'C' Else 'P' end as case_type
    
FROM 
   __temp__.ca_apps_dup_rank
WHERE
  dup_rank = 1
  and delete_flag = 'N'
"""

pydb.create_temp_table(ca6, "ca_apps_child_count")

## 2. Order count data

#### 2.1 Creating order count data

In [22]:
# Remove the child ID and group up so we only count an order type within an event once, rather than per child
ca7 = f"""
SELECT 
  DISTINCT 
    year,
    quarter,
    case_number,
    case_type,
    receipt_date,
    event,
    court_code,
    ord_type
FROM 
  __temp__.ca_apps_child_count
"""

pydb.create_temp_table(ca7, "ca_apps_order_count")

## 3. Application count data

#### 3.1 Creating applications data

In [23]:
# Counting applications (individual events). Here multiple orders applied for under one event are only counted once
ca8 = f"""
SELECT 
  DISTINCT 
    year,
    quarter,
    case_number,
    case_type,
    receipt_date,
    event,
    court_code
FROM 
  __temp__.ca_apps_child_count
"""

pydb.create_temp_table(ca8, "ca_apps_event_count")

## 4. Case count data

#### 4.1 Creating cases data

In [24]:
# Keeping just one record per case, and selecting the earliest record
ca9 = f"""
SELECT    
  case_type,    
  case_number,
  (MIN(receipt_date)) AS MIN_of_RECEIPT_DATE,
  EXTRACT (YEAR FROM (MIN(receipt_date))) AS Year,
  EXTRACT (QUARTER FROM (MIN(receipt_date))) AS Quarter
FROM 
  __temp__.ca_apps_event_count
GROUP BY 
  case_type, 
  case_number
"""

pydb.create_temp_table(ca9, "ca_apps_case_count")

## 5. Individual children

#### 5.1 Individual children by year

In [None]:
# Counting applications for individual children within a year
ca10 = f"""
SELECT    
  *,    
  ROW_NUMBER() OVER(PARTITION BY year, child_role_id, case_type
                       ORDER BY child_role_id, year, receipt_date ASC) 
      AS child_count_yr
FROM 
  __temp__.ca_apps_child_count
"""

pydb.create_temp_table(ca10, "ca_apps_rank_child_yr")

In [None]:
pydb.read_sql_query("select * from __temp__.ca_apps_rank_child_yr")

In [None]:
# Keeping only the earliest application per child per year
ca10 = f"""
SELECT    
  year, 
  quarter,
  case_number,
  child_role_id,
  case_type
FROM 
  __temp__.ca_apps_rank_child_yr
WHERE
  child_count_yr = 1
"""

pydb.create_temp_table(ca10, "ca_apps_ind_child_yr")

#### Individual children by quarter

In [None]:
# Counting applications for individual children within a quarter
ca11 = f"""
SELECT    
  *,    
  ROW_NUMBER() OVER(PARTITION BY year, quarter, child_role_id, case_type
                       ORDER BY child_role_id, year, quarter, receipt_date ASC) 
      AS child_count_qtr
FROM 
  __temp__.ca_apps_child_count
"""

pydb.create_temp_table(ca11, "ca_apps_rank_child_qtr")

In [None]:
# Keeping only the earliest application per child per quarter
ca10 = f"""
SELECT    
  year, 
  quarter,
  case_number,
  child_role_id,
  case_type
FROM 
  __temp__.ca_apps_rank_child_qtr
WHERE
  child_count_qtr = 1
"""

pydb.create_temp_table(ca10, "ca_apps_ind_child_qtr")

## 6. High Court

In [None]:
# Keeping only the earliest application per child per quarter
ca11 = f"""
SELECT    
  c.year, 
  c.quarter,
  c.case_type,
  c.case_number,
  e.creating_court,
  f.value
FROM 
  __temp__.ca_apps_case_count c
 LEFT JOIN {database}.events e
   ON c.case_number = e.case_number 
 LEFT JOIN {database}.event_fields F
   ON e.case_number = f.case_number
WHERE
  f.field_model = 'FM2C_HC'
  AND f.value = 'Y'
  AND f.mojap_snapshot_date = DATE'{snapshot_date}'
  AND e.mojap_snapshot_date = DATE'{snapshot_date}'
  AND e.error = 'N'
"""

pydb.create_temp_table(ca11, "ca_apps_high_court")

In [None]:
# extracting high court cases listed against the case field
ca11 = f"""
WITH high_court_cases AS (

SELECT
  DISTINCT
  c.case_number,
  c.value as case_HC_value,
  e.creating_court,
  CASE WHEN e.creating_court in ('EC','FD','IL','LB','WT','ZC')
        THEN 'Central London DFJ'
          ELSE 'Not Central London DFJ'
    END AS HC_London_Ind,
  ROW_NUMBER() OVER(PARTITION BY c.case_number, c.value
                       ORDER BY c.case_number, c.value DESC, receipt_date ASC) 
      AS case_rank
FROM
  {database}.case_fields c
  LEFT JOIN {database}.events e
    on c.case_number = e.case_number
WHERE
  (c.field_model = 'FM2C_HC' AND c.value = 'Y') 
  AND e.event_model = 'U22'
  AND e.error = 'N'
  AND c.mojap_snapshot_date = DATE'{snapshot_date}'
  AND e.mojap_snapshot_date = DATE'{snapshot_date}'
  
)

SELECT
  *
FROM 
  high_court_cases
WHERE 
  case_rank = 1;


"""

pydb.create_temp_table(ca11, "ca_high_court_cases")

In [None]:
pydb.read_sql_query("select * from __temp__.ca_high_court_cases where case_rank = 2")

In [None]:
pydb.read_sql_query(
    "select case_number, count(*) count from __temp__.ca_high_court_cases group by case_number having count (case_number) > 1 "
)

In [None]:
pydb.read_sql_query(
    "select * from __temp__.ca_high_court_cases where case_number in ('HG05P00436','CV04P00272') "
)

In [None]:
# link high court cases to original case table
ca12 = f"""
SELECT
  c.*,
  h.case_HC_value,
  h.creating_court AS HC_U22_court,
  h.HC_London_ind
FROM
  __temp__.ca_apps_case_count c
  LEFT JOIN __temp__.ca_high_court_cases h
    ON c.case_number = h.case_number
"""
pydb.create_temp_table(ca12, "ca_cases_HC")

In [None]:
pydb.read_sql_query(
    "select * from __temp__.ca_cases_HC where case_hc_value is not null"
)

## Parties

#### Creating a table with applicants and respondents

In [30]:
ca13 = f"""
  SELECT
    DISTINCT
    r.role_model,
    CASE WHEN r.role_model in ('APLC','APLZ','APLA')
          THEN 'Applicant'
         WHEN r.role_model in ('RSPC','RSPZ','RSPA')
          THEN 'Respondent'
      END AS case_role, 
    r.role role_id,
    r.party as party_id,
    r.case_number,
    p.gender
  FROM
    {database}.roles r 
    JOIN {database}.parties p 
      on r.party = p.party 
  WHERE
    r.role_model in ('APLC','APLZ','APLA','RSPC','RSPZ','RSPA')
    AND r.mojap_snapshot_date = DATE'{snapshot_date}'
    AND p.mojap_snapshot_date = DATE'{snapshot_date}'
    AND r.delete_flag = 'N'
"""

pydb.create_temp_table(ca13, "app_resp")

#### Linking applicants to case starts

In [48]:
ca14 = f"""
  SELECT
    c.year,
    c.quarter,
    c.MIN_of_RECEIPT_DATE,
    c.case_number,
    c.case_type,
    r.role_model,
    r.case_role, 
    r.role_id,
    r.party_id,
    r.gender,
    ROW_NUMBER() OVER(PARTITION BY c.case_number
                      ORDER BY c.case_number) 
      AS applicant_count
  FROM
    __temp__.ca_apps_case_count c 
    JOIN __temp__.app_resp r 
      on c.case_number = r.case_number 
  WHERE
    r.case_role = 'Applicant'
"""

pydb.create_temp_table(ca14, "applicants")

In [43]:
pydb.read_sql_query("select * from __temp__.applicants where case_number = 'US11P00027'")

Unnamed: 0,year,quarter,min_of_receipt_date,case_number,role_model,case_role,role_id,party_id,gender,applicant_count
0,2011,1,2011-01-13,US11P00027,APLC,Applicant,930808,981367,2,1
1,2011,1,2011-01-13,US11P00027,APLC,Applicant,930807,981366,1,2


#### Linking respondents to case starts

In [49]:
ca15 = f"""
  SELECT
    c.year,
    c.quarter,
    c.MIN_of_RECEIPT_DATE,
    c.case_number,
    c.case_type,
    r.role_model,
    r.case_role, 
    r.role_id,
    r.party_id,
    r.gender,
    ROW_NUMBER() OVER(PARTITION BY c.case_number
                      ORDER BY c.case_number) 
      AS respondent_count
  FROM
    __temp__.ca_apps_case_count c 
    JOIN __temp__.app_resp r 
      on c.case_number = r.case_number 
  WHERE
    r.case_role = 'Respondent'
"""

pydb.create_temp_table(ca15, "respondents")

#### Counting applicants and respondents in each case

In [58]:
ca16 = f"""
  SELECT
    c.year,
    c.quarter,
    c.case_number,
    c.case_type,
    MAX(a.applicant_count) as No_of_applicants,
    MAX(r.respondent_count) as No_of_respondents
  FROM
    __temp__.ca_apps_case_count c 
    JOIN __temp__.applicants a 
      on c.case_number = a.case_number
    JOIN __temp__.respondents r
      on c.case_number = r.case_number  
  GROUP BY
    c.year,
    c.quarter,
    c.case_type,
    c.case_number
"""
pydb.create_temp_table(ca16, "party_count")


In [62]:
pydb.read_sql_query("select year, case_type, count(*) count from __temp__.party_count where year between 2017 and 2020 and no_of_applicants = 1 and case_type = 'C' group by year,case_type order by year")

Unnamed: 0,year,case_type,count
0,2017,C,19008
1,2018,C,18915
2,2019,C,18274
3,2020,C,17955


In [63]:
pydb.read_sql_query("select year, case_type, count(*) count from __temp__.party_count where year between 2017 and 2020 and no_of_applicants = 1 and case_type = 'P' group by year,case_type order by year")

Unnamed: 0,year,case_type,count
0,2017,P,48368
1,2018,P,49808
2,2019,P,52812
3,2020,P,53376
