In [1]:
import pandas as pd
import numpy as np

In [2]:
# Using data generated from branch PRMT-1742-duplicates-analysis.
# This is needed to correctly handle duplicates.
# Once the upstream pipeline has a fix for duplicate EHRs, then we can go back to using the main output.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet"
]

transfer_input_files = [transfer_file_location + f for f in transfer_files]
transfers_raw = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))

# In the data from the PRMT-1742-duplicates-analysis branch, these columns have been added , but contain only empty values.
transfers_raw = transfers_raw.drop(["sending_supplier", "requesting_supplier"], axis=1)


# Given the findings in PRMT-1742 - many duplicate EHR errors are misclassified, the below reclassifies the relevant data

has_at_least_one_successful_integration_code = lambda errors: any((np.isnan(e) or e==15 for e in errors))
successful_transfers_bool = transfers_raw['request_completed_ack_codes'].apply(has_at_least_one_successful_integration_code)
transfers = transfers_raw.copy()
transfers.loc[successful_transfers_bool, "status"] = "INTEGRATED"

# Correctly interpret certail sender errors as failed.
# This is explained in PRMT-1974. Eventaully this will be fixed upstream in the pipeline. 
pending_sender_error_codes=[6,7,10,24,30,23,14,99]
transfers_with_pending_sender_code_bool=transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool=transfers['status']=='PENDING_WITH_ERROR'
transfers_which_need_pending_to_failure_change_bool=transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
transfers.loc[transfers_which_need_pending_to_failure_change_bool,'status']='FAILED'

# Add integrated Late status
eight_days_in_seconds=8*24*60*60
transfers_after_sla_bool=transfers['sla_duration']>eight_days_in_seconds
transfers_with_integrated_bool=transfers['status']=='INTEGRATED'
transfers_integrated_late_bool=transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool,'status']='INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistentently and to always reflect a transfers status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds=28*24*60*60
transfers_after_month_bool=transfers['sla_duration']>twenty_eight_days_in_seconds
transfers_pending_at_month_bool=transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool,'status']='PENDING'
transfers_with_early_error_bool=(~transfers.loc[:,'sender_error_code'].isna()) |(~transfers.loc[:,'intermediate_error_codes'].apply(len)>0)
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool,'status']='PENDING_WITH_ERROR'

# Supplier name mapping
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(asid_lookup_file)
lookup = asid_lookup[["ASID", "MName", "NACS","OrgName"]]

transfers = transfers.merge(lookup, left_on='requesting_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code'}, axis=1)
transfers = transfers.merge(lookup, left_on='sending_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code'}, axis=1)

transfers["sending_supplier"] = transfers["sending_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())

In [3]:
status_counts = transfers["status"].value_counts().rename("Total Transfers")
status_percents = (status_counts / status_counts.sum() * 100).rename("Percent")
status_table = pd.concat([status_counts, status_percents], axis=1)
status_table.index = [status_name.replace("_", " ").title() for status_name in status_table.index]
paper_fallback = status_table.copy().drop("Integrated").sum().round(2)
status_table["Percent"] = status_table["Percent"].round(2)

In [25]:
total_count = len(transfers)
print("Total transfers:", total_count)
print("First transfer:", transfers["date_requested"].min())
print("Last transfer:", transfers["date_requested"].max())

print("\nHigh level breakdown of statuses at 28 days:")

for status in status_table.index:
    count = status_table.loc[status, "Total Transfers"]
    percent = status_table.loc[status, "Percent"]
    print(f"{status}: {count} ({percent}%)")
    
fallback_count = paper_fallback["Total Transfers"].astype(int)
fallback_percent = paper_fallback["Percent"]
print(f"\nEstimated paper fallback: {fallback_count} ({fallback_percent}%)")

print("\nNB: These values may be calculated using a slightly different methodology from those published on the data platform")

Total transfers: 1343234
First transfer: 2020-09-01 04:51:16.148000
Last transfer: 2021-02-28 23:04:58.544000

High level breakdown of statuses at 28 days:
Integrated: 1175456 (87.51%)
Integrated Late: 71547 (5.33%)
Pending: 46860 (3.49%)
Failed: 46146 (3.44%)
Pending With Error: 3225 (0.24%)

Estimated paper fallback: 167778 (12.49%)

NB: These values may be calculated using a slightly different methodology from those published on the data platform


In [22]:
transfers.pivot_table(index="requesting_supplier", columns=["sending_supplier"], aggfunc="count", values="conversation_id")

sending_supplier,EMIS,Microtest,TPP,Unknown,Vision
requesting_supplier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
EMIS,788019,507,236887,3055,19723
Microtest,532,129,365,7,13
TPP,258060,381,322,1851,5014
Unknown,2053,9,1716,63,54
Vision,16234,3,3893,56,4288
