# Matrimonial Matters County & UA Level Annual

## Contents
#### Setup
1. [import_packages](#import_packages) 
2. [define_key_variables](#define_key_variables) 



## 1. Import packages and set options 
<a name="import_packages"></a>

In [1]:
import pandas as pd  # a module which provides the data structures and functions to store and manipulate tables in dataframes
import pydbtools as pydb  # A module which allows SQL queries to be run on the Analytical Platform from Python, see https://github.com/moj-analytical-services/pydbtools
import boto3  # allows you to directly create, update, and delete AWS resources from Python scripts
import numpy as np
import re

# sets parameters to view dataframes for tables easier
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 900)
pd.set_option("display.max_colwidth", 200)

## 2. Define key variables to be used throughout the notebook 
<a name="define_key_variables"></a>

In [2]:
#this is the database we will be extracting from
database = "familyman_dev_v3" 

#this is the athena database we will be storing our tables in
fcsq_database = "fcsq"

#this is the s3 bucket we will be saving data to
s3 = boto3.resource("s3")
bucket = s3.Bucket("alpha-family-data")

#setting current year
current_year = 2023

#creating a variable to automate the column names from imported csvs
prev_endyear = (current_year - 1) - 2000

# Stage 1 - Divorce County and UA lookup
<a name="Divorce County and UA lookup"></a>

## Import ONS Postcode Directory 

### Create the ons_postcode table

In [3]:
ons_postcode_table = pd.read_csv("s3://alpha-family-data/CSVs/Divorce/Petitioner LA/Lookup/ONSPD_NOV_2022_UK.csv", low_memory=False)

In [4]:
pydb.dataframe_to_temp_table(ons_postcode_table, "ons_postcode")

#### ons_postcode validation

In [5]:
ons_postcode_count = pydb.read_sql_query("SELECT * from __temp__.ons_postcode limit 10")
ons_postcode_count

Unnamed: 0,pcd,pcd2,pcds,dointr,doterm,oscty,ced,oslaua,osward,parish,usertype,oseast1m,osnrth1m,osgrdind,oshlthau,nhser,ctry,rgn,streg,pcon,eer,teclec,ttwa,pct,itl,statsward,oa01,casward,park,lsoa01,msoa01,ur01ind,oac01,oa11,lsoa11,msoa11,wz11,ccg,bua11,buasd11,ru11ind,oac11,lat,long,lep1,lep2,pfa,imd,calncv,stp,oa21,lsoa21,msoa21
0,AB1 0AA,AB1 0AA,AB1 0AA,198001,199606.0,S99999999,S99999999,S12000033,S13002843,S99999999,0,385386.0,801193.0,1,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001364,01C30,S99999999,S01000011,S02000007,6,3C2,S00090303,S01006514,S02001237,S34002990,S03000012,S99999999,S99999999,3,1C3,57.101474,-2.242851,S99999999,S99999999,S23000009,6715,S99999999,S99999999,,,
1,AB1 0AB,AB1 0AB,AB1 0AB,198001,199606.0,S99999999,S99999999,S12000033,S13002843,S99999999,0,385177.0,801314.0,1,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001270,01C31,S99999999,S01000011,S02000007,6,4B3,S00090303,S01006514,S02001237,S34002990,S03000012,S99999999,S99999999,3,1C3,57.102554,-2.246308,S99999999,S99999999,S23000009,6715,S99999999,S99999999,,,
2,AB1 0AD,AB1 0AD,AB1 0AD,198001,199606.0,S99999999,S99999999,S12000033,S13002843,S99999999,0,385053.0,801092.0,1,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001364,01C30,S99999999,S01000011,S02000007,6,3C2,S00090399,S01006514,S02001237,S34003015,S03000012,S99999999,S99999999,3,6A1,57.100556,-2.248342,S99999999,S99999999,S23000009,6715,S99999999,S99999999,,,
3,AB1 0AE,AB1 0AE,AB1 0AE,199402,199606.0,S99999999,S99999999,S12000034,S13002864,S99999999,0,384600.0,799300.0,8,S08000020,S99999999,S92000003,S99999999,0,S14000058,S15000001,S09000001,S22000047,S03000013,S30000027,99ZZ00,S00002142,02C58,S99999999,S01000333,S02000061,6,3B1,S00091322,S01006853,S02001296,S34003292,S03000013,S99999999,S99999999,6,1A2,57.084444,-2.255708,S99999999,S99999999,S23000009,5069,S99999999,S99999999,,,
4,AB1 0AF,AB1 0AF,AB1 0AF,199012,199207.0,S99999999,S99999999,S12000033,S13002843,S99999999,1,384460.0,800660.0,8,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001266,01C30,S99999999,S01000007,S02000003,3,4D2,S00090299,S01006511,S02001236,S34003015,S03000012,S99999999,S99999999,3,6A4,57.096656,-2.258102,S99999999,S99999999,S23000009,6253,S99999999,S99999999,,,
5,AB1 0AG,AB1 0AG,AB1 0AG,199012,199207.0,S99999999,S99999999,S12000033,S13002843,S99999999,1,383890.0,800710.0,8,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001258,01C30,S99999999,S01000001,S02000003,3,5B4,S00090291,S01006506,S02001236,S34003124,S03000012,S99999999,S99999999,3,7C3,57.097085,-2.267513,S99999999,S99999999,S23000009,4691,S99999999,S99999999,,,
6,AB1 0AJ,AB1 0AJ,AB1 0AJ,198001,199606.0,S99999999,S99999999,S12000033,S13002843,S99999999,0,384779.0,800921.0,1,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001364,01C30,S99999999,S01000011,S02000007,6,3C2,S00090399,S01006514,S02001237,S34003015,S03000012,S99999999,S99999999,3,6A1,57.099011,-2.252854,S99999999,S99999999,S23000009,6715,S99999999,S99999999,,,
7,AB1 0AL,AB1 0AL,AB1 0AL,198001,199606.0,S99999999,S99999999,S12000033,S13002843,S99999999,0,384669.0,801228.0,1,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001364,01C30,S99999999,S01000011,S02000007,6,3C2,S00090381,S01006511,S02001236,S34002990,S03000012,S99999999,S99999999,3,6B3,57.101765,-2.254688,S99999999,S99999999,S23000009,6253,S99999999,S99999999,,,
8,AB1 0AN,AB1 0AN,AB1 0AN,198001,199606.0,S99999999,S99999999,S12000033,S13002843,S99999999,1,385225.0,800757.0,1,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001364,01C30,S99999999,S01000011,S02000007,6,3C2,S00090399,S01006514,S02001237,S34003015,S03000012,S99999999,S99999999,3,6A1,57.097553,-2.245483,S99999999,S99999999,S23000009,6715,S99999999,S99999999,,,
9,AB1 0AP,AB1 0AP,AB1 0AP,198001,199606.0,S99999999,S99999999,S12000033,S13002843,S99999999,0,385208.0,800834.0,1,S08000020,S99999999,S92000003,S99999999,0,S14000002,S15000001,S09000001,S22000047,S03000012,S30000026,99ZZ00,S00001364,01C30,S99999999,S01000011,S02000007,6,3C2,S00090399,S01006514,S02001237,S34003015,S03000012,S99999999,S99999999,3,6A1,57.098244,-2.245768,S99999999,S99999999,S23000009,6715,S99999999,S99999999,,,


## Import Local Authority Districts

### Create the la_districts table

In [6]:
la_districts_table = pd.read_csv("s3://alpha-family-data/CSVs/Divorce/Petitioner LA/Lookup/Local_Authority_Districts_(December_2022)_Names_and_Codes_in_the_United_Kingdom.csv")

In [7]:
pydb.dataframe_to_temp_table(la_districts_table, "la_districts")

#### la_districts validation

In [8]:
la_districts_count = pydb.read_sql_query("SELECT * from __temp__.la_districts")
la_districts_count

Unnamed: 0,lad22cd,lad22nm,lad22nmw,objectid
0,E07000141,South Kesteven,,1
1,E07000142,West Lindsey,,2
2,E07000143,Breckland,,3
3,E07000144,Broadland,,4
4,E07000145,Great Yarmouth,,5
...,...,...,...,...
369,E08000027,Dudley,,370
370,E08000028,Sandwell,,371
371,E08000029,Solihull,,372
372,E08000030,Walsall,,373


## Import Local Authority Districts to Counties

### Create the la_districts_counties table

In [9]:
la_districts_counties_table = pd.read_csv("s3://alpha-family-data/CSVs/Divorce/Petitioner LA/Lookup/Local_Authority_District_to_County_(December_2022)_Lookup_in_England.csv")

In [10]:
pydb.dataframe_to_temp_table(la_districts_counties_table, "la_districts_counties")

#### la_districts_counties validation

In [11]:
la_districts_counties_count = pydb.read_sql_query("SELECT * from __temp__.la_districts_counties")
la_districts_counties_count

Unnamed: 0,lad22cd,lad22nm,cty22cd,cty22nm,objectid
0,E07000008,Cambridge,E10000003,Cambridgeshire,1
1,E07000009,East Cambridgeshire,E10000003,Cambridgeshire,2
2,E07000010,Fenland,E10000003,Cambridgeshire,3
3,E07000011,Huntingdonshire,E10000003,Cambridgeshire,4
4,E07000012,South Cambridgeshire,E10000003,Cambridgeshire,5
...,...,...,...,...,...
245,E09000024,Merton,E13000002,Outer London,246
246,E09000026,Redbridge,E13000002,Outer London,247
247,E09000027,Richmond upon Thames,E13000002,Outer London,248
248,E09000029,Sutton,E13000002,Outer London,249


## Creating Lookup

### Create the lookup_working table

In [12]:
create_lookup_working =f"""
SELECT 
a.LAD{prev_endyear}CD AS code,
a.LAD{prev_endyear}NM AS la,
b.CTY{prev_endyear}NM AS county,
CASE WHEN a.LAD{prev_endyear}CD LIKE 'E%' THEN 'England'
WHEN a.LAD{prev_endyear}CD LIKE 'W%' THEN 'Wales'
END AS country
FROM __temp__.la_districts a
LEFT JOIN __temp__.la_districts_counties b
ON a.LAD22CD = b.LAD22CD 
WHERE a.LAD22CD LIKE 'E%' OR a.LAD22CD LIKE 'W%';
"""
pydb.create_temp_table(create_lookup_working,'lookup_working')

In [13]:
lookup_working = pydb.read_sql_query("SELECT * from __temp__.lookup_working")
lookup_working

Unnamed: 0,code,la,county,country
0,E07000141,South Kesteven,Lincolnshire,England
1,E07000142,West Lindsey,Lincolnshire,England
2,E07000143,Breckland,Norfolk,England
3,E07000144,Broadland,Norfolk,England
4,E07000145,Great Yarmouth,Norfolk,England
...,...,...,...,...
326,E08000027,Dudley,West Midlands,England
327,E08000028,Sandwell,West Midlands,England
328,E08000029,Solihull,West Midlands,England
329,E08000030,Walsall,West Midlands,England


### Create the divorce_county_ua_lookup table

In [14]:
create_divorce_county_ua_lookup =f"""
SELECT
code,
CASE WHEN county IN ('Greater Manchester', 'Merseyside', 'South Yorkshire', 'Tyne and Wear', 'West Midlands', 'West Yorkshire')
THEN CONCAT(' ',county,'(Met County)')
WHEN code LIKE 'W%'
THEN la
WHEN county IS NULL
THEN CONCAT(' ',la,'UA')
ELSE county
END AS county_ua,
country
FROM __temp__.lookup_working;
"""
pydb.create_temp_table(create_divorce_county_ua_lookup,'divorce_county_ua_lookup')

In [15]:
divorce_county_ua_lookup = pydb.read_sql_query("SELECT * from __temp__.divorce_county_ua_lookup")
divorce_county_ua_lookup

Unnamed: 0,code,county_ua,country
0,E07000141,Lincolnshire,England
1,E07000142,Lincolnshire,England
2,E07000143,Norfolk,England
3,E07000144,Norfolk,England
4,E07000145,Norfolk,England
...,...,...,...
326,E08000027,West Midlands(Met County),England
327,E08000028,West Midlands(Met County),England
328,E08000029,West Midlands(Met County),England
329,E08000030,West Midlands(Met County),England


# Stage 2 - Petitioner Postcode

## Import Petitioner Address Details 

### Create the petitioner_address_details table

In [16]:
petitioner_address_details_table = pd.read_csv("s3://alpha-family-data/CSVs/Divorce/Petitioner LA/Petitioner_Address_Details.csv", low_memory=False)

In [17]:
pydb.dataframe_to_temp_table(petitioner_address_details_table, "petitioner_address_details")

#### petitioner_address_details validation

In [18]:
petitioner_address_details_count = pydb.read_sql_query("SELECT * from __temp__.petitioner_address_details limit 10")
petitioner_address_details_count

Unnamed: 0,year,month,quarter,fm_case_cid,legal_case_id,pettnr_line_1_address,pettnr_line_2_address,pettnr_line_3_address,pettnr_postal_code,pettnr_line_4_address,pettnr_line_5_address,pettnr_line_6_address,pettnr_contact_details_confdntl_cind
0,2020,1,1,ZZ19D74058,1205371,,,,,,,,
1,2021,4,2,ZZ21D18144,2658348,,,,,,,,
2,2021,7,3,ZZ21D59619,3323101,,,,,,,,
3,2021,10,4,ZZ21D68994,3740779,,,,CM11 1LU,,,,share
4,2021,9,3,ZZ21D78446,3616793,,,,,,,,keep
5,2020,5,2,ZZ20D21210,1426083,,,,,,,,
6,2020,6,2,ZZ20D28785,1504747,,,,,,,,
7,2022,3,1,ZZ22D26732,4138379,,,,,,,,share
8,2021,4,2,ZZ21D07556,2915032,,,,,,,,
9,2020,12,4,ZZ20D73359,1876032,,,,,,,,


## Creating Final Output

### Create the petitioner_address table

In [19]:
create_petitioner_address =f"""
SELECT t1.Year, 
          t1.Month, 
          t1.Quarter,
          t1.PETTNR_LINE_1_ADDRESS,
          t1.PETTNR_LINE_2_ADDRESS,
          t1.PETTNR_LINE_3_ADDRESS,
          t1.PETTNR_LINE_4_ADDRESS,
          t1.PETTNR_LINE_5_ADDRESS, 
          t1.PETTNR_LINE_6_ADDRESS,
          t1.PETTNR_POSTAL_CODE,
          t1.PETTNR_CONTACT_DETAILS_CONFDNTL_CIND as CONFDNTL
FROM __temp__.petitioner_address_details t1;
"""
pydb.create_temp_table(create_petitioner_address,'petitioner_address')

In [20]:
petitioner_address = pydb.read_sql_query("SELECT * from __temp__.petitioner_address")
petitioner_address

Unnamed: 0,year,month,quarter,pettnr_line_1_address,pettnr_line_2_address,pettnr_line_3_address,pettnr_line_4_address,pettnr_line_5_address,pettnr_line_6_address,pettnr_postal_code,confdntl
0,2005,9,3,22 Curven Edge,Helmshore,,,,,,
1,2005,10,4,6 Fountain Close,Padiham,Burnley,Lancashire,,,,
2,2006,12,4,10 Hillcrest Avenue,Cliviger,Burnley,,,,,
3,2019,7,3,"4, COLWYN ROAD",CHEADLE HULME,CHEADLE,,,,SK86BX,N
4,2019,7,3,OLD DAIRY,KISLINGBURY STARMERS LANE,NORTHAMPTON,,,,NN74AE,N
...,...,...,...,...,...,...,...,...,...,...,...
3378203,2007,4,2,24 Willersey Road,Moseley,Birmingham,,,,B13 0AY,
3378204,2007,5,2,48b Springfield Road,Kings Heath,Birmingham,,,,,
3378205,2007,5,2,223 Islington Gates,110 Newhall Street,Birmingham,,,,B3 1JN,
3378206,2007,8,3,277 Alwold Road,Weoley Castle,Birmingham,,,,B29 5JH,


------------------

### Create the new_divorce_postcode table

In [21]:
create_new_divorce_postcode =f"""
SELECT t1.Year, 
    t1.Month, 
    t1.Quarter,
    REPLACE(UPPER(t1.PETTNR_LINE_1_ADDRESS), ' ', '') as Line1,
    REPLACE(UPPER(t1.PETTNR_LINE_2_ADDRESS), ' ', '') as Line2,
    REPLACE(UPPER(t1.PETTNR_LINE_3_ADDRESS), ' ', '') as Line3,
    REPLACE(UPPER(t1.PETTNR_LINE_4_ADDRESS), ' ', '') as Line4,
    REPLACE(UPPER(t1.PETTNR_LINE_5_ADDRESS), ' ', '') as Line5, 
    REPLACE(UPPER(t1.PETTNR_LINE_6_ADDRESS), ' ', '') as Line6,
    REPLACE(UPPER(t1.PETTNR_POSTAL_CODE), ' ', '') as postcode,
    UPPER(t1.PETTNR_CONTACT_DETAILS_CONFDNTL_CIND) as CONFDNTL
    
      FROM __temp__.petitioner_address_details t1;
"""
pydb.create_temp_table(create_new_divorce_postcode,'new_divorce_postcode')

In [22]:
new_divorce_postcode = pydb.read_sql_query("SELECT * from __temp__.new_divorce_postcode")
new_divorce_postcode

Unnamed: 0,year,month,quarter,line1,line2,line3,line4,line5,line6,postcode,confdntl
0,2013,2,1,26KINGSMEADWALK,SEAFORD,EASTSUSSEX,,,,BN252EX,N
1,2008,6,2,63RODINGROAD,LONDON,,,,,E50DN,
2,2009,11,4,29THACKERAYCLOSE,ISLEWORTH,MIDDLESEX,,,,TW76TJ,
3,2011,2,1,10BBODNEYROAD,LONDON,,,,,E81AY,N
4,2011,9,3,FLAT17LODDIGESHOUSE,LODDIGESROAD,LONDON,,,,E97PJ,N
...,...,...,...,...,...,...,...,...,...,...,...
3378203,1999,4,2,11ABIRCHWOODAVENUE,HATFIELD,HERTS,AL100PL,,,,N
3378204,1999,6,2,42HORNBEAMSPRING,KNEBWORTH,HERTS,,,,SG36BE,N
3378205,2020,8,3,11LAWNHURSTGROVE,AIGBURTH,LIVERPOOL,,,,L176ES,N
3378206,1998,7,3,21HILLFOOTGREEN,WOOLTON,LIVERPOOL,,,,L257UH,N


In [23]:
create_new_divorce_with_postcode_temp1 =f"""
SELECT *,
CASE 
WHEN regexp_like(line1, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line1, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line1, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line1, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line1, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line1, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]')

WHEN regexp_like(line1, '[A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line1, '[A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line1, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line1, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line1, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line1, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]')
ELSE NULL
END newpostcode1,


CASE 
WHEN regexp_like(line2, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line2, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line2, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line2, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line2, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line2, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]')

WHEN regexp_like(line2, '[A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line2, '[A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line2, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line2, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line2, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line2, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]')
ELSE NULL
END newpostcode2,


CASE 
WHEN regexp_like(line3, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line3, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line3, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line3, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line3, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line3, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]')

WHEN regexp_like(line3, '[A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line3, '[A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line3, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line3, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line3, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line3, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]')
ELSE NULL
END newpostcode3,


CASE 
WHEN regexp_like(line4, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line4, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line4, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line4, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line4, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line4, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]')

WHEN regexp_like(line4, '[A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line4, '[A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line4, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line4, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line4, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line4, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]')
ELSE NULL
END newpostcode4,


CASE 
WHEN regexp_like(line5, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line5, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line5, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line5, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line5, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line5, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]')

WHEN regexp_like(line5, '[A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line5, '[A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line5, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line5, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line5, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line5, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]')
ELSE NULL
END newpostcode5,

CASE 
WHEN regexp_like(line6, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line6, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line6, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line6, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line6, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line6, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]')

WHEN regexp_like(line6, '[A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line6, '[A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line6, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(line6, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(line6, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(line6, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]')
ELSE NULL
END newpostcode6,

CASE 
WHEN regexp_like(postcode, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(postcode, '[A-Z][A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(postcode, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(postcode, '[A-Z][A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(postcode, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(postcode, '[A-Z][A-Z][0-9][A-Z][0-9][A-Z][A-Z]')

WHEN regexp_like(postcode, '[A-Z][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(postcode, '[A-Z][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(postcode, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]') THEN regexp_extract_all(postcode, '[A-Z][0-9][0-9][0-9][A-Z][A-Z]')
WHEN regexp_like(postcode, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]') THEN regexp_extract_all(postcode, '[A-Z][0-9][A-Z][0-9][A-Z][A-Z]')
ELSE NULL
END newpostcode7


FROM __temp__.new_divorce_postcode;
"""
pydb.create_temp_table(create_new_divorce_with_postcode_temp1,'new_divorce_with_postcode_temp1')


In [24]:
code = pydb.read_sql_query("SELECT * from __temp__.new_divorce_with_postcode_temp1")
code

Unnamed: 0,year,month,quarter,line1,line2,line3,line4,line5,line6,postcode,confdntl,newpostcode1,newpostcode2,newpostcode3,newpostcode4,newpostcode5,newpostcode6,newpostcode7
0,2020,1,1,,,,,,,,,,,,,,,
1,2021,2,1,,,,,,,,,,,,,,,
2,2021,3,1,,,,,,,,,,,,,,,
3,2020,6,2,,,,,,,,,,,,,,,
4,2021,10,4,,,,,,,LS72SA,SHARE,,,,,,,[LS72SA]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3378203,2001,6,2,22PRINCESSROAD,GOLDTHORPE,ROTHERHAM,SOUTHYORKSHIRE,,,S639NP,,,,,,,,[S639NP]
3378204,2001,7,3,52AISBYDRIVE,ROSSINGTON,DONCASTER,,,,,,,,,,,,
3378205,2002,4,2,90ROTHERHAMROAD,GREATHOUGHTON,BARNSLEY,,,,,,,,,,,,
3378206,2003,3,1,5BROUGHGREEN,GILROYD,BARNSLEY,,,,S753PF,,,,,,,,[S753PF]


In [25]:
create_new_divorce_with_postcode_temp2 =f"""
SELECT year, 
month, 
quarter,
line1,
line2,
line3,
line4,
line5, 
line6,
postcode,
confdntl,

array_join(newpostcode1,  '') as newpostcode1,
array_join(newpostcode2,  '') as newpostcode2,
array_join(newpostcode3,  '') as newpostcode3,
array_join(newpostcode4,  '') as newpostcode4,
array_join(newpostcode5,  '') as newpostcode5,
array_join(newpostcode6,  '') as newpostcode6,
array_join(newpostcode7,  '') as newpostcode7

FROM __temp__.new_divorce_with_postcode_temp1;
"""
pydb.create_temp_table(create_new_divorce_with_postcode_temp2,'new_divorce_with_postcode_temp2')

In [26]:
code = pydb.read_sql_query("SELECT * FROM __temp__.new_divorce_with_postcode_temp2")
code

Unnamed: 0,year,month,quarter,line1,line2,line3,line4,line5,line6,postcode,confdntl,newpostcode1,newpostcode2,newpostcode3,newpostcode4,newpostcode5,newpostcode6,newpostcode7
0,2005,3,1,15ERDSWICKCLOSE,NORTHGATEVILLAGE,CHESTER,,,,,N,,,,,,,
1,2005,7,3,19HAZELROAD,BOGNORREGIS,WESTSUSSEX,,,,PO229DX,,,,,,,,PO229DX
2,2002,10,4,3STMARGARET'SDRIVE,CRAIGYDON,LLANDUDNO,CONWY,,,,,,,,,,,
3,2002,5,2,98REDBRIDGELANEWEST,WANSTEAD,LONDON,,,,E112LA,N,,,,,,,E112LA
4,2005,1,1,9KIRKLEES,CHELMSFORD,ESSEX,,,,CM12AG,,,,,,,,CM12AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3378203,2002,4,2,.,.,,,,,,N,,,,,,,
3378204,2003,5,2,CLARECOLLEGE,CAMBRIDGE,,,,,CB21TL,N,,,,,,,CB21TL
3378205,2003,8,3,14HILLSLANE,ELY,CAMBRIDGESHIRE,,,,,N,,,,,,,
3378206,2014,7,3,C/OMRLJREAD,MIDDLEMOATHOUSE,MELLIS,SUFFOLK,,,IP238EE,N,,,,,,,IP238EE


-------------------------

### Create the new_divorce_with_postcode table

In [27]:
create_new_divorce_with_postcode =f"""
SELECT year, 
month, 
quarter,
line1,
line2,
line3,
line4,
line5, 
line6,
postcode,
confdntl,

CASE WHEN newpostcode7 IS NOT NULL THEN newpostcode7
WHEN newpostcode6 IS NOT NULL THEN newpostcode6
WHEN newpostcode5 IS NOT NULL THEN newpostcode5
WHEN newpostcode4 IS NOT NULL THEN newpostcode4
WHEN newpostcode3 IS NOT NULL THEN newpostcode3
WHEN newpostcode2 IS NOT NULL THEN newpostcode2
WHEN newpostcode1 IS NOT NULL THEN newpostcode1

ELSE NULL
END newpostcode

FROM __temp__.new_divorce_with_postcode_temp2;
"""
pydb.create_temp_table(create_new_divorce_with_postcode,'new_divorce_with_postcode')

In [None]:
code = pydb.read_sql_query("SELECT * FROM __temp__.new_divorce_with_postcode")
code

### Create the divorce_postcode_1 table

In [None]:
create_divorce_postcode_1 =f"""
SELECT t1.Year, 
    t1.Month, 
    t1.Quarter, 
    t1.Line1, 
    t1.Line2, 
    t1.Line3, 
    t1.Line4, 
    t1.Line5,
    t1.Line6,
    t1.CONFDNTL,
    t1.postcode,
    /* Addr Postcode */
    REPLACE(t1.newpostcode, ' ', '') AS newpostcode
FROM __temp__.new_divorce_with_postcode t1;
"""
pydb.create_temp_table(create_divorce_postcode_1,'divorce_postcode_1')

In [None]:
divorce_postcode_1 = pydb.read_sql_query("SELECT * from __temp__.divorce_postcode_1 LIMIT 10")
divorce_postcode_1

### Create the ons_postcode_data table

In [None]:
create_ons_postcode_data =f"""
SELECT REPLACE(t1.pcd , ' ', '') AS PCD, 
t1.oslaua,
t1.ctry

FROM __temp__.ons_postcode t1

WHERE t1.ctry IN ('E92000001','W92000004');
"""
pydb.create_temp_table(create_ons_postcode_data,'ons_postcode_data')

In [None]:
ons_postcode_data = pydb.read_sql_query("SELECT * from __temp__.ons_postcode_data LIMIT 10")
ons_postcode_data

### Create the divorce_postcode_ons_match table

In [None]:
create_divorce_postcode_ons_match =f"""
SELECT t1.Year, 
          t1.Quarter, 
          t1.LINE1, 
          t1.LINE2, 
          t1.LINE3, 
          t1.LINE4, 
          t1.LINE5,
          t1.LINE6,
          t1.CONFDNTL,
          t1.postcode,
          t1.newpostcode, 
          t2.PCD, 
          t2.oslaua
FROM __temp__.divorce_postcode_1 t1
LEFT JOIN __temp__.ons_postcode_data t2 
    ON (t1.newpostcode = t2.PCD);
"""
pydb.create_temp_table(create_divorce_postcode_ons_match,'divorce_postcode_ons_match')

In [None]:
divorce_postcode_ons_match = pydb.read_sql_query("SELECT * from __temp__.divorce_postcode_ons_match LIMIT 10")
divorce_postcode_ons_match

### Create the divorce_postcode_la table

In [None]:
create_divorce_postcode_la =f"""
SELECT t1.Year, 
          t1.Quarter, 
          t1.LINE1, 
          t1.LINE2, 
          t1.LINE3, 
          t1.LINE4, 
          t1.LINE5,
          t1.LINE6,
          t1.CONFDNTL,
          t1.postcode, 
          t1.newpostcode, 
          t1.PCD, 
          t1.oslaua, 
          LTRIM(t2.county_ua) as county_ua,
          t2.country
FROM __temp__.divorce_postcode_ons_match t1
LEFT JOIN __temp__.divorce_county_ua_lookup t2 
ON (t1.oslaua = t2.code);

"""
pydb.create_temp_table(create_divorce_postcode_la,'divorce_postcode_la')

In [None]:
divorce_postcode_la = pydb.read_sql_query("SELECT * from __temp__.divorce_postcode_la LIMIT 10")
divorce_postcode_la

### Create the divorce_la_c8 table

In [None]:
create_divorce_la_c8 =f"""
SELECT *,
CASE 
WHEN CONFDNTL = 'KEEP' THEN 'Confidentiality requested' 
WHEN CONFDNTL = 'Y' THEN 'Confidentiality requested' 
WHEN Line1 IS NULL AND Line2 IS NULL AND Line3 IS NULL AND Line4 IS NULL AND Line5 IS NULL AND LINE6 IS NULL AND Postcode IS NULL THEN 'Confidentiality requested'
WHEN Line1 = '-' AND Line2 ='-' AND Postcode IS NULL THEN 'Confidentiality requested'
WHEN Line1 = '.' And Line2 = '.' AND Line3 IS NULL AND Postcode IS NULL THEN 'Confidentiality requested'
WHEN Line1 = 'XX' OR Line1 = 'XXX' OR Line1 = 'XXXX' OR Line1 = 'XXXXX' or Line1 = 'XXXXXX' OR Line1 = 'XXXXXXX' or Line1 = 'XXXXXXXX' then 'Confidentiality requested'
WHEN strpos(Line1,'PRIVATE') <> 0 then 'Confidentiality requested'
WHEN strpos(Line1,'WITHHELD') <> 0 then 'Confidentiality requested'
WHEN strpos(Line1,'CONFIDENT') <> 0 then 'Confidentiality requested'
WHEN strpos(Line2,'CONFIDENT') <> 0 then 'Confidentiality requested'
WHEN strpos(Line1,'C8') <> 0 then 'Confidentiality requested'
WHEN strpos(Line2,'C8') <> 0 then 'Confidentiality requested'

WHEN Newpostcode IS NOT NULL AND PCD IS NULL then 'Postcode invalid/not given or foreign'
WHEN Newpostcode IS NOT NULL AND PCD IS NOT NULL AND county_ua IS NULL then 'Postcode invalid/not given or foreign'
WHEN county_ua IS NULL THEN 'Postcode invalid/not given or foreign'

ELSE county_ua 
END county_ua2
FROM __temp__.divorce_postcode_la;

"""
pydb.create_temp_table(create_divorce_la_c8,'divorce_la_c8')

In [None]:
divorce_la_c8 = pydb.read_sql_query("SELECT * from __temp__.divorce_la_c8")
divorce_la_c8

#### Check Confidentiality Filter

In [None]:
check = pydb.read_sql_query("SELECT DISTINCT * FROM __temp__.DIVORCE_LA_C8 WHERE CONFDNTL = 'Y' or CONFDNTL = 'KEEP';")
check

### Create the divorce_county table

In [None]:
create_divorce_county =f"""
SELECT year,
quarter,
line1,
line2,
line3,
line4,
line5,
line6,
confdntl,
postcode, 
newpostcode,
pcd,
oslaua,

CASE 
WHEN county_ua2 = 'Isles of Scilly UA' then 'Cornwall & Isles of Scilly'
WHEN county_ua2 = 'Cornwall UA' then 'Cornwall & Isles of Scilly'

ELSE county_ua2 
END county_ua,

CASE 
WHEN county_ua2 = 'Confidentiality requested' then 'Confidentiality requested'
WHEN county_ua2 = 'Postcode invalid/not given or foreign' then 'Postcode invalid/not given or foreign'
ELSE country
END country

FROM __temp__.divorce_la_c8;

"""
pydb.create_temp_table(create_divorce_county,'divorce_county')

In [None]:
divorce_county = pydb.read_sql_query("SELECT * from __temp__.divorce_county")
divorce_county

### Create the petitioner_summary_la table

In [None]:
create_petitioner_summary_la =f"""
SELECT DISTINCT 'Petitioner' as Type,
t1.year,
t1.country,
t1.county_ua,
(COUNT(t1.county_ua)) AS COUNT_of_County

FROM __temp__.divorce_county t1

WHERE year > 2010
AND year < {current_year}

GROUP BY t1.year,
t1.county_ua,
t1.country;

"""
pydb.create_temp_table(create_petitioner_summary_la,'petitioner_summary_la')

In [None]:
petitioner_summary_la = pydb.read_sql_query("SELECT * from __temp__.petitioner_summary_la")
petitioner_summary_la

In [None]:
# Check that counts of county 
petitioner_summary_la[['count_of_county']].sum()

In [None]:
final_output = pydb.read_sql_query("""
SELECT *
from __temp__.petitioner_summary_la
ORDER BY year,
country,
county_ua
""")

In [None]:
# Export the final csv
final_output.to_csv("s3://alpha-family-data/CSVs/Divorce/CSV Matrimonial Matters County & UA Annual 2022.csv", index = False)