In [1]:
import pandas as pd
import numpy as np

In [2]:
# Monthly transfer extracts (Sep 2020 - Feb 2021) stored as parquet under one S3 prefix.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet"
]
transfer_input_files = [f"{transfer_file_location}{file_name}" for file_name in transfer_files]

# Read every monthly file and stack them into a single frame.
# The supplier columns are dropped straight away; this is only needed when
# using the transfers-duplicates-hypothesis datasets.
transfers_raw = (
    pd.concat([pd.read_parquet(path) for path in transfer_input_files])
    .drop(columns=["sending_supplier", "requesting_supplier"])
)

In [7]:
# ASID lookup extract (gzipped CSV); its ASID, MName and NACS columns are
# joined onto the transfers further down.
asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(filepath_or_buffer=asid_lookup_file)

In [3]:
# PRMT-1742 found that many duplicate-EHR errors are misclassified, so the
# affected transfers are reclassified here (on a copy; transfers_raw is untouched).
def _has_integrated_ack(ack_codes):
    """Return True when at least one ack code in the sequence is NaN or equals 15."""
    # NOTE(review): 15 is presumably the duplicate-EHR code discussed in PRMT-1742 - confirm.
    return any([np.isnan(code) or code == 15 for code in ack_codes])


transfers = transfers_raw.copy()
successful_transfers_bool = transfers_raw['request_completed_ack_codes'].apply(_has_integrated_ack)
transfers.loc[successful_transfers_bool, "status"] = "INTEGRATED"

In [4]:
# Sender error codes for which a PENDING_WITH_ERROR transfer is re-labelled FAILED.
pending_sender_error_codes = [6, 7, 10, 24, 30, 23, 14, 99]

transfers_with_pending_with_error_bool = transfers['status'].eq('PENDING_WITH_ERROR')
transfers_with_pending_sender_code_bool = transfers['sender_error_code'].isin(pending_sender_error_codes)

# Only rows satisfying both conditions change status.
transfers_which_need_pending_to_failure_change_bool = (
    transfers_with_pending_with_error_bool & transfers_with_pending_sender_code_bool
)
transfers.loc[transfers_which_need_pending_to_failure_change_bool, 'status'] = 'FAILED'

In [5]:
# Integrated transfers whose sla_duration exceeds eight days are re-labelled 'INTEGRATED LATE'.
# NOTE(review): this comparison assumes sla_duration is expressed in seconds - confirm.
eight_days_in_seconds = 60 * 60 * 24 * 8

transfers_with_integrated_bool = transfers['status'].eq('INTEGRATED')
transfers_after_sla_bool = transfers['sla_duration'].gt(eight_days_in_seconds)
transfers_integrated_late_bool = transfers_with_integrated_bool & transfers_after_sla_bool

transfers.loc[transfers_integrated_late_bool, 'status'] = 'INTEGRATED LATE'

In [8]:
# Short display names for the supplier names found in the lookup's MName column.
# NOTE(review): the None -> "Unknown" entry presumably targets practices with no
# lookup match - verify that it actually matches the missing values produced by
# the left joins below.
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

lookup = asid_lookup[["ASID", "MName", "NACS"]]

# Attach supplier details twice: once keyed on the requesting practice ASID
# (NACS discarded) and once keyed on the sending practice ASID (NACS kept as
# sending_ods_code). Left joins keep every transfer row.
transfers = (
    transfers
    .merge(lookup, how='left', left_on='requesting_practice_asid', right_on='ASID')
    .drop(columns="NACS")
    .rename(columns={'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid'})
    .merge(lookup, how='left', left_on='sending_practice_asid', right_on='ASID')
    .rename(columns={'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code'})
)

# Apply the short supplier names to both supplier columns.
for supplier_column in ("sending_supplier", "requesting_supplier"):
    transfers[supplier_column] = transfers[supplier_column].replace(
        supplier_renaming.keys(), supplier_renaming.values()
    )

In [12]:
# Count transfers per status for each day of the week on which they were requested
# (pandas numbers the days 0 = Monday ... 6 = Sunday).
transfers['Day of Week'] = transfers['date_requested'].dt.dayofweek
transfers_by_day = transfers.pivot_table(
    values='conversation_id',
    index='Day of Week',
    columns='status',
    aggfunc='count',
)
transfers_by_day

status,FAILED,INTEGRATED,INTEGRATED LATE,PENDING,PENDING_WITH_ERROR
Day of Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8604,218817,13277,7974,567
1,10099,266147,16166,8270,610
2,9833,248220,16053,8032,689
3,9156,228737,15734,7855,701
4,7688,193851,14846,6323,568
5,564,14292,2403,501,60
6,202,5392,867,132,4


In [24]:
# Total number of transfers per day of the week (sum across all statuses).
total_daily_transfers = transfers_by_day.sum(axis=1)
# End on the bare expression so the result is displayed when the cell is re-run;
# an assignment alone produces no cell output.
total_daily_transfers

Day of Week
0    249239
1    301292
2    282827
3    262183
4    223276
5     17820
6      6597
dtype: int64

In [16]:
# Each day's share of all transfers, as a percentage of the grand total.
daily_transfers_percentage = 100 * transfers_by_day.sum(axis=1) / transfers_by_day.sum().sum()
# End on the bare expression so the result is displayed when the cell is re-run;
# an assignment alone produces no cell output.
daily_transfers_percentage

Day of Week
0    18.555144
1    22.430343
2    21.055676
3    19.518788
4    16.622271
5     1.326649
6     0.491128
dtype: float64

In [23]:
# Within each day of the week, the percentage of that day's transfers ending in each status
# (each row is divided by its own row total, so rows sum to 100).
daily_transfers_chance_status = 100 * transfers_by_day.div(transfers_by_day.sum(axis=1), axis=0)
# End on the bare expression so the result is displayed when the cell is re-run;
# an assignment alone produces no cell output.
daily_transfers_chance_status

status,FAILED,INTEGRATED,INTEGRATED LATE,PENDING,PENDING_WITH_ERROR
Day of Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3.452108,87.794045,5.327015,3.199339,0.227492
1,3.351898,88.335236,5.365559,2.744846,0.202461
2,3.476684,87.763898,5.675908,2.839899,0.243612
3,3.492217,87.243261,6.001152,2.995999,0.267371
4,3.443272,86.821244,6.64917,2.831921,0.254394
5,3.164983,80.20202,13.484848,2.811448,0.3367
6,3.061998,81.734122,13.142337,2.00091,0.060634


In [34]:
# Same status breakdown as the day-of-week analysis, but bucketed by the period
# in which each transfer was requested.
# NOTE(review): the column is called 'Month' yet to_period('W') yields weekly
# periods, and this cell reassigns transfers_by_day / total_daily_transfers /
# daily_transfers_percentage / daily_transfers_chance_status, replacing the
# day-of-week results. Names are left unchanged here in case later cells rely
# on them - consider renaming.
transfers['Month'] = transfers['date_requested'].dt.to_period('W')
transfers_by_day = transfers.pivot_table(
    values='conversation_id',
    index='Month',
    columns='status',
    aggfunc='count',
)

# Row totals are computed once and reused for both percentage tables.
total_daily_transfers = transfers_by_day.sum(axis=1)
daily_transfers_percentage = 100 * total_daily_transfers / transfers_by_day.sum().sum()
daily_transfers_chance_status = 100 * transfers_by_day.div(total_daily_transfers, axis='index')
daily_transfers_chance_status

status,FAILED,INTEGRATED,INTEGRATED LATE,PENDING,PENDING_WITH_ERROR
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-08-31/2020-09-06,3.794488,87.399623,6.196094,2.493308,0.116487
2020-09-07/2020-09-13,3.555075,86.886437,7.022287,2.404407,0.131795
2020-09-14/2020-09-20,3.541009,86.009826,7.372316,2.92851,0.14834
2020-09-21/2020-09-27,3.711661,87.722968,4.982332,3.386572,0.196466
2020-09-28/2020-10-04,3.559322,85.754641,6.481033,3.995157,0.209847
2020-10-05/2020-10-11,3.667808,87.885173,5.664121,2.510754,0.272145
2020-10-12/2020-10-18,3.531356,88.344505,5.31448,2.598476,0.211184
2020-10-19/2020-10-25,3.435557,88.124798,5.536437,2.697873,0.205335
2020-10-26/2020-11-01,3.348594,88.700267,4.758528,2.953569,0.239041
2020-11-02/2020-11-08,3.448276,88.001515,5.596348,2.678252,0.27561
