In [None]:
import os  # for file paths
import pandas as pd
import awswrangler as wr
import pydbtools as pydb  # see https://github.com/moj-analytical-services/pydbtools

# Relax pandas' display limits so wide query results stay readable in notebook output
pd.set_option(
    "display.max_columns", 100,
    "display.width", 900,
    "display.max_colwidth", 200,
)

In [None]:
# Settings shared by the queries throughout this notebook

# Database where the Familyman data is stored
db1 = "familyman_dev_v3"
# Database where tables created as part of FCSQ processing are stored
db2 = "fcsq"

# The snapshot date and publication period are normally set in the main run file;
# they only need setting here when this notebook is run independently.
# NOTE(review): this assignment is unconditional, so if the notebook shares a
# namespace with the main run file it would replace the value set there - confirm.
snapshot_date = "2023-01-05"
#pub_year = 2011
#pub_qtr = 4

In [None]:
# S3 folder that the CSV exports at the end of this notebook are written to.
# A plain string is used because there is nothing to interpolate.
fcsq_db_path = "s3://alpha-family-data/power_bi"

### Orders Applied for (aggregated)

In [None]:
# Aggregate the orders applied for: one count per year, quarter and order type,
# restricted to years after 2010 and order_case_type 'P'
orders_applied_sql = f"""
SELECT    
  year,
  quarter,
  order_case_type,
  order_code,
  order_desc,
  count(*) AS orders_applied_for
FROM 
  {db2}.ca_apps_order_types
WHERE
  year > 2010 
  AND order_case_type = 'P'
GROUP BY 
  year,
  quarter,
  order_case_type,
  order_code,
  order_desc
"""

# Register the result as the temp table __temp__.ca_orders_applied
pydb.create_temp_table(orders_applied_sql, "ca_orders_applied")

### Extract relationship to child info from Familyman

In [None]:
# Pull the relationship-to-child (rtc) value held against applicant (APLC_RC) and
# respondent (RSPC_RC) role fields, for the Familyman snapshot set in snapshot_date
role_rtc_sql = f"""
SELECT    
  value AS rtc,
  field_model,
  role
FROM 
  {db1}.role_fields
WHERE
  field_model IN ('APLC_RC','RSPC_RC')
  AND mojap_snapshot_date = DATE'{snapshot_date}'
"""

# Register the result as the temp table __temp__.app_resp_rtc
pydb.create_temp_table(role_rtc_sql, "app_resp_rtc")

### RTC - Applicants

In [None]:
# Attach each applicant's relationship to child (rtc) to the orders applied for.
# Applicants are matched on role id, so any RSPC_RC records in app_resp_rtc are
# expected to drop out of the join without an explicit field_model filter.
# Both joins are LEFT JOINs: rows with no matching applicant or rtc record are kept
# and, like the '--' and 'Child' values, end up with rtc = 'Not recorded'.
applicant_rtc_sql = f"""
SELECT    
  a.case_number,
  a.year,
  a.quarter,
  a.order_case_type,
  a.order_code,
  a.order_desc,
  ap.role_id,
  r.field_model AS app_resp,
  CASE WHEN (r.rtc IN ('--','Child')
              OR r.rtc IS NULL)
        THEN 'Not recorded'
        ELSE r.rtc
    END AS rtc   
FROM 
  {db2}.ca_apps_order_types a
  LEFT JOIN {db2}.ca_applicants ap 
    ON a.case_number = ap.case_number
  LEFT JOIN __temp__.app_resp_rtc r
    ON ap.role_id = r.role

"""

# Register the result as the temp table __temp__.apps_rtc
pydb.create_temp_table(applicant_rtc_sql, "apps_rtc")

In [None]:
# Count applicant rows by year, quarter, order type and relationship to child,
# restricted to years after 2010 and order_case_type 'P'.
# NOTE(review): count(*) counts every row of apps_rtc, including rows where the
# LEFT JOIN found no applicant (role_id is NULL) - confirm that is intended.
applicant_counts_sql = f"""
SELECT    
  year,
  quarter,
  order_case_type,
  order_code,
  order_desc,
  rtc,
  count(*) AS applicants
FROM 
  __temp__.apps_rtc
WHERE
  year > 2010  
  AND order_case_type = 'P'
GROUP BY
  year,
  quarter,
  order_case_type,
  order_code,
  order_desc,
  rtc
"""

# Register the result as the temp table __temp__.agg_apps_rtc
pydb.create_temp_table(applicant_counts_sql, "agg_apps_rtc")

### RTC - Respondents

In [None]:
# Attach each respondent's relationship to child (rtc) to the orders applied for.
# Respondents are matched on role id, so any APLC_RC records in app_resp_rtc are
# expected to drop out of the join without an explicit field_model filter.
# Both joins are LEFT JOINs: rows with no matching respondent or rtc record are kept
# and, like the '--' and 'Child' values, end up with rtc = 'Not recorded'.
respondent_rtc_sql = f"""
SELECT    
  a.case_number,
  a.year,
  a.quarter,
  a.order_case_type,
  a.order_code,
  a.order_desc,
  rp.role_id,
  r.field_model AS app_resp,
  CASE WHEN (r.rtc IN ('--','Child')
              OR r.rtc IS NULL)
        THEN 'Not recorded'
        ELSE r.rtc
    END AS rtc   
FROM 
  {db2}.ca_apps_order_types a
  LEFT JOIN {db2}.ca_respondents rp 
    ON a.case_number = rp.case_number
  LEFT JOIN __temp__.app_resp_rtc r
    ON rp.role_id = r.role
"""

# Register the result as the temp table __temp__.resps_rtc
pydb.create_temp_table(respondent_rtc_sql, "resps_rtc")

In [None]:
# Count respondent rows by year, quarter, order type and relationship to child,
# restricted to years after 2010 and order_case_type 'P'.
# NOTE(review): count(*) counts every row of resps_rtc, including rows where the
# LEFT JOIN found no respondent (role_id is NULL) - confirm that is intended.
respondent_counts_sql = f"""
SELECT    
  year,
  quarter,
  order_case_type,
  order_code,
  order_desc,
  rtc,
  count(*) AS respondents
FROM 
  __temp__.resps_rtc
WHERE
  year > 2010   
  AND order_case_type = 'P'
GROUP BY
  year,
  quarter,
  order_case_type,
  order_code,
  order_desc,
  rtc
"""

# Register the result as the temp table __temp__.agg_resps_rtc
pydb.create_temp_table(respondent_counts_sql, "agg_resps_rtc")

### Exporting data

###### Orders applied data

In [None]:
# Read the aggregated orders-applied-for temp table back from Athena
ca_orders_applied_data = pydb.read_sql_query(
    "SELECT * FROM __temp__.ca_orders_applied"
)

In [None]:
# Hold the query result as a DataFrame ready for export
ca_orders_applied_df = pd.DataFrame(data=ca_orders_applied_data)

In [None]:
# Export the orders-applied-for counts as CSV to the power_bi folder on S3.
# The folder comes from fcsq_db_path (set near the top of the notebook) so the
# output location is defined in one place; the index is left out of the file.
ca_orders_applied_df.to_csv(
    f"{fcsq_db_path}/ca_orders_applied.csv",
    header=True,
    index=False,
)

###### Applicant RTC data

In [None]:
# Read the aggregated applicant relationship-to-child temp table back from Athena
agg_apps_rtc_data = pydb.read_sql_query(
    "SELECT * FROM __temp__.agg_apps_rtc"
)

In [None]:
# Hold the query result as a DataFrame ready for export
agg_apps_rtc_df = pd.DataFrame(data=agg_apps_rtc_data)

In [None]:
# Export the applicant relationship-to-child counts as CSV to the power_bi folder
# on S3. The folder comes from fcsq_db_path (set near the top of the notebook) so
# the output location is defined in one place; the index is left out of the file.
agg_apps_rtc_df.to_csv(
    f"{fcsq_db_path}/agg_apps_rtc.csv",
    header=True,
    index=False,
)

###### Respondent RTC data

In [None]:
# Read the aggregated respondent relationship-to-child temp table back from Athena
agg_resps_rtc_data = pydb.read_sql_query(
    "SELECT * FROM __temp__.agg_resps_rtc"
)

In [None]:
# Hold the query result as a DataFrame ready for export
agg_resps_rtc_df = pd.DataFrame(data=agg_resps_rtc_data)

In [None]:
# Export the respondent relationship-to-child counts as CSV to the power_bi folder
# on S3. The folder comes from fcsq_db_path (set near the top of the notebook) so
# the output location is defined in one place; the index is left out of the file.
agg_resps_rtc_df.to_csv(
    f"{fcsq_db_path}/agg_resps_rtc.csv",
    header=True,
    index=False,
)