# [“HYPOTHESIS”] Creation of tables to determine and assess the most common causes of failure to integrate

- Import data and adjust for duplicate EHR issue (see PRMT-1742)
- Implement the sender error pipeline change (see PRMT-1960)
- Add in supplier

Table 1:
- Relabel status to reflect integrated late (integrated after 8 days)
- Add in column for absolutely no errors, at any stage
- Create separate dataframes for each error type and concatenate
- Generate the pivot table
- Add in Error Descriptions/Name
- Add in Supplier Pathway %s
- Create Monthly Views


In [1]:
import pandas as pd
import numpy as np

## Import 6 months of data and adjust for duplicates issue

In [2]:
# Note - this dataset has a cut off point of one month
# Six monthly transfer extracts (Sep 2020 - Feb 2021) are read from S3 and stacked into one frame.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet",
]
transfer_input_files = [
    transfer_file_location + file_name
    for file_name in transfer_files
]

# The supplier columns are dropped here and re-derived from the ASID lookup further down.
# This is only needed when using transfers-duplicates-hypothesis datasets
transfers_raw = pd.concat(
    [pd.read_parquet(input_file) for input_file in transfer_input_files]
).drop(columns=['requesting_supplier', 'sending_supplier'])

In [3]:
# Load the ASID lookup table from S3; its "ASID" and "MName" columns are used
# further down to attach a supplier name to each practice ASID.
asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(asid_lookup_file)

In [4]:
# PRMT-1742 found that many duplicate EHR errors are misclassified, so the affected
# transfers are reclassified here: any transfer whose final acknowledgement codes
# include a NaN entry or code 15 is treated as INTEGRATED.
# NOTE(review): presumably a NaN entry represents an acknowledgement without an error code - confirm.
transfers = transfers_raw.copy()
successful_transfers_bool = transfers['request_completed_ack_codes'].apply(
    lambda ack_codes: any([np.isnan(code) or code == 15 for code in ack_codes])
)
transfers.loc[successful_transfers_bool, 'status'] = 'INTEGRATED'

In [5]:
# PRMT-1960: transfers currently labelled PENDING_WITH_ERROR that carry one of the
# sender error codes below are re-classified as FAILED.
pending_sender_error_codes = [6, 7, 10, 24, 30, 23, 14, 99]
transfers_with_pending_sender_code_bool = transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool = transfers['status'].eq('PENDING_WITH_ERROR')
transfers_which_need_pending_to_failure_change_bool = (
    transfers_with_pending_sender_code_bool
    & transfers_with_pending_with_error_bool
)
transfers.loc[transfers_which_need_pending_to_failure_change_bool, 'status'] = 'FAILED'

In [6]:
# Supplier name mapping: organisation names found in the ASID lookup -> short supplier labels
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)": "EMIS",
    "IN PRACTICE SYSTEMS LTD": "Vision",
    "MICROTEST LTD": "Microtest",
    "THE PHOENIX PARTNERSHIP": "TPP",
    None: "Unknown",
}

lookup = asid_lookup[["ASID", "MName"]]

# Two left joins against the same lookup: first for the requesting practice, then for
# the sending practice. Each join's ASID/MName columns are renamed immediately so the
# second join does not collide with the first.
transfers = (
    transfers
    .merge(lookup, left_on='requesting_practice_asid', right_on='ASID', how='left')
    .rename(columns={'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid'})
    .merge(lookup, left_on='sending_practice_asid', right_on='ASID', how='left')
    .rename(columns={'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid'})
)

# NOTE(review): the None -> "Unknown" entry is presumably meant to label practices that
# had no match in the lookup - confirm that missing values are actually replaced.
transfers["sending_supplier"] = transfers["sending_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())

In [7]:
# Despite the `_file` suffix this holds the parsed DataFrame of GP2GP response codes;
# it is joined on its ErrorCode column later to attach ResponseText descriptions.
error_code_lookup_file = pd.read_csv("https://raw.githubusercontent.com/nhsconnect/prm-gp2gp-data-sandbox/master/data/gp2gp_response_codes.csv")

## Table 1: High Level View of Issues

### Relabel status to reflect integrated late

In [8]:
# Transfers that integrated, but took longer than 8 days, get their own status label.
# NOTE(review): assumes sla_duration is expressed in seconds - confirm.
eight_days_in_seconds = 8 * 24 * 60 * 60
transfers_after_sla_bool = transfers['sla_duration'].gt(eight_days_in_seconds)
transfers_with_integrated_bool = transfers['status'].eq('INTEGRATED')
transfers_integrated_late_bool = transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool, 'status'] = 'INTEGRATED LATE'

### Add in column for absolutely no errors


In [9]:
# A transfer counts as error free only if no error code was recorded at any of the
# three stages: sender, intermediate, and final request acknowledgement.
transfers_without_sender_error_bool=transfers['sender_error_code'].isna()
transfers_without_intermediate_error_bool=transfers['intermediate_error_codes'].apply(len)==0
# Non-finite (NaN) entries in the final ack list are ignored, so only real codes count as errors.
transfers_without_final_ack_error_bool=transfers['request_completed_ack_codes'].apply(lambda lis: [x for x in lis if np.isfinite(x)]).apply(len)==0

transfers_without_any_error_bool=transfers_without_sender_error_bool & transfers_without_intermediate_error_bool & transfers_without_final_ack_error_bool

# Build the label column in a single step ('No Error' where the mask is True, NaN
# elsewhere). Creating a float NaN column first and then writing strings into it relies
# on silent dtype upcasting, which pandas >= 2.1 deprecates.
transfers['No error codes']=transfers_without_any_error_bool.map({True:'No Error',False:np.nan})

### Add in month column

In [10]:
# Calendar month of the request as a pandas Period; used as the pivot column in the tables below.
# Requires date_requested to be a datetime-like column (the .dt accessor is used).
transfers['month']=transfers['date_requested'].dt.to_period('M')

### Create separate dataframes for each error type and concatenate

In [11]:
# Keep only the columns needed to build the per-error-type pivot tables.
reduced_transfers = transfers[[
    'requesting_supplier',
    'sending_supplier',
    'sender_error_code',
    'intermediate_error_codes',
    'request_completed_ack_codes',
    'No error codes',
    'status',
    'conversation_id',
    'month',
]]

#### Sender Error Table

In [12]:
# Monthly counts of transfers that carry a sender error code, broken down by
# supplier pathway, error code and status.
sender_table = (
    reduced_transfers
    .drop(columns=['intermediate_error_codes', 'request_completed_ack_codes', 'No error codes'])
    .dropna(subset=['sender_error_code'])
    .rename(columns={'sender_error_code': 'Error Code'})
    .assign(**{'Error Type': 'Sender'})
    .pivot_table(
        index=['sending_supplier', 'requesting_supplier', 'Error Type', 'Error Code', 'status'],
        columns='month',
        aggfunc='count',
        values='conversation_id',
    )
    .fillna(0)
)
# Row total across all months, added before casting everything back to integers.
sender_table['Total Volume'] = sender_table.sum(axis=1)
sender_table = sender_table.astype(int)

In [13]:
# Preview the first rows of the sender error pivot.
sender_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,month,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,Total Volume
sending_supplier,requesting_supplier,Error Type,Error Code,status,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EMIS,EMIS,Sender,6.0,FAILED,83,94,73,96,106,84,536
EMIS,EMIS,Sender,7.0,FAILED,87,80,30,12,39,43,291
EMIS,EMIS,Sender,10.0,FAILED,471,487,354,326,458,511,2607
EMIS,EMIS,Sender,14.0,FAILED,1,2,4,2,5,1,15
EMIS,EMIS,Sender,14.0,INTEGRATED,0,0,0,1,0,1,2


#### Intermediate Error Table

In [14]:
# Monthly counts of intermediate error codes, broken down by supplier pathway,
# error code and status.
# NOTE(review): explode() yields one row per list element, so a transfer whose list
# repeats a code is counted once per occurrence rather than once per transfer.
intermediate_table = (
    reduced_transfers
    .drop(columns=['sender_error_code', 'request_completed_ack_codes', 'No error codes'])
    .loc[lambda frame: frame['intermediate_error_codes'].apply(len) > 0]
    .explode('intermediate_error_codes')
    .rename(columns={'intermediate_error_codes': 'Error Code'})
    .assign(**{'Error Type': 'Intermediate'})
    .pivot_table(
        index=['sending_supplier', 'requesting_supplier', 'Error Type', 'Error Code', 'status'],
        columns='month',
        aggfunc='count',
        values='conversation_id',
    )
    .fillna(0)
)
# Row total across all months, added before casting everything back to integers.
intermediate_table['Total Volume'] = intermediate_table.sum(axis=1)
intermediate_table = intermediate_table.astype(int)

In [15]:
# Preview the first rows of the intermediate error pivot.
intermediate_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,month,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,Total Volume
sending_supplier,requesting_supplier,Error Type,Error Code,status,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EMIS,EMIS,Intermediate,12,PENDING_WITH_ERROR,0,0,0,1,3,1,5
EMIS,EMIS,Intermediate,15,PENDING_WITH_ERROR,0,4,1,8,7,4,24
EMIS,EMIS,Intermediate,29,FAILED,218,166,79,89,63,142,757
EMIS,EMIS,Intermediate,29,INTEGRATED,71,92,58,28,30,68,347
EMIS,EMIS,Intermediate,29,INTEGRATED LATE,3,12,2,2,0,13,32


#### Final Request Acknowledgements Table

In [16]:
# Monthly counts of final request acknowledgement codes, broken down by supplier
# pathway, error code and status.
# NOTE(review): explode() yields one row per list element, so a transfer whose list
# repeats a code is counted once per occurrence rather than once per transfer.
reqack_table = (
    reduced_transfers
    .drop(columns=['sender_error_code', 'intermediate_error_codes', 'No error codes'])
    # Strip non-finite (NaN) entries so only real codes remain in each list.
    .assign(
        request_completed_ack_codes=lambda frame: frame['request_completed_ack_codes'].apply(
            lambda codes: [code for code in codes if np.isfinite(code)]
        )
    )
    .loc[lambda frame: frame['request_completed_ack_codes'].apply(len) > 0]
    .explode('request_completed_ack_codes')
    .rename(columns={'request_completed_ack_codes': 'Error Code'})
    .assign(**{'Error Type': 'Final Request Acknowledgment'})
    .pivot_table(
        index=['sending_supplier', 'requesting_supplier', 'Error Type', 'Error Code', 'status'],
        columns='month',
        aggfunc='count',
        values='conversation_id',
    )
    .fillna(0)
)
# Row total across all months, added before casting everything back to integers.
reqack_table['Total Volume'] = reqack_table.sum(axis=1)
reqack_table = reqack_table.astype(int)

In [17]:
# Preview the first rows of the final request acknowledgement pivot.
reqack_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,month,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,Total Volume
sending_supplier,requesting_supplier,Error Type,Error Code,status,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EMIS,EMIS,Final Request Acknowledgment,11.0,FAILED,116,131,87,37,47,88,506
EMIS,EMIS,Final Request Acknowledgment,11.0,INTEGRATED,295,351,155,156,208,210,1375
EMIS,EMIS,Final Request Acknowledgment,11.0,INTEGRATED LATE,113,44,41,56,85,74,413
EMIS,EMIS,Final Request Acknowledgment,12.0,FAILED,460,345,334,259,254,276,1928
EMIS,EMIS,Final Request Acknowledgment,12.0,INTEGRATED,3108,2712,2305,1845,2480,2594,15044


#### No error codes Table

In [18]:
# Monthly counts of transfers flagged as having no error code at any stage,
# broken down by supplier pathway and status.
noerror_table = (
    reduced_transfers
    .drop(columns=['intermediate_error_codes', 'request_completed_ack_codes', 'sender_error_code'])
    .dropna(subset=['No error codes'])
    .rename(columns={'No error codes': 'Error Code'})
    .assign(**{'Error Type': 'No Error Code'})
    .pivot_table(
        index=['sending_supplier', 'requesting_supplier', 'Error Type', 'Error Code', 'status'],
        columns='month',
        aggfunc='count',
        values='conversation_id',
    )
    .fillna(0)
)
# Row total across all months, added before casting everything back to integers.
noerror_table['Total Volume'] = noerror_table.sum(axis=1)
noerror_table = noerror_table.astype(int)

In [19]:
# Preview the first rows of the no-error pivot.
noerror_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,month,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,Total Volume
sending_supplier,requesting_supplier,Error Type,Error Code,status,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EMIS,EMIS,No Error Code,No Error,INTEGRATED,129663,112414,98785,83229,109525,119752,653368
EMIS,EMIS,No Error Code,No Error,INTEGRATED LATE,8499,6560,5469,7350,6924,6505,41307
EMIS,EMIS,No Error Code,No Error,PENDING,2548,1854,1856,1683,1973,1891,11805
EMIS,Microtest,No Error Code,No Error,INTEGRATED,21,10,11,14,8,2,66
EMIS,Microtest,No Error Code,No Error,INTEGRATED LATE,9,0,3,1,5,0,18


### Concatenate Tables

In [20]:
# Stack the four per-error-type pivots and flatten the MultiIndex into ordinary columns.
high_level_table=pd.concat([sender_table,intermediate_table,reqack_table,noerror_table])
high_level_table=high_level_table.reset_index()

# Attach the human-readable description for each error code.
high_level_table=high_level_table.merge(error_code_lookup_file,left_on='Error Code',right_on='ErrorCode',how='left')

# 'No Error' rows have no entry in the lookup, so label them explicitly.
transfers_with_no_errors_bool = high_level_table['Error Code']=='No Error'
high_level_table.loc[transfers_with_no_errors_bool,['ErrorName','ResponseText']]='No Error'

# Dropped since Error Code column exists
high_level_table=high_level_table.drop(['ErrorCode','ErrorName'],axis=1)

# Start index at 1 for readability. This has to happen after the merge above: merge
# returns a fresh 0-based index, so shifting the index before it had no effect.
# Any later merge on this table will reset the index to 0-based again.
high_level_table.index=high_level_table.index+1

In [21]:
# Preview the combined table with error descriptions attached.
high_level_table.head()

Unnamed: 0,sending_supplier,requesting_supplier,Error Type,Error Code,status,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,Total Volume,ResponseText
0,EMIS,EMIS,Sender,6,FAILED,83,94,73,96,106,84,536,Patient not at surgery
1,EMIS,EMIS,Sender,7,FAILED,87,80,30,12,39,43,291,GP2GP Messaging is not enabled on this system
2,EMIS,EMIS,Sender,10,FAILED,471,487,354,326,458,511,2607,Failed to successfully generate EHR Extract
3,EMIS,EMIS,Sender,14,FAILED,1,2,4,2,5,1,15,Message not sent because requesting practice i...
4,EMIS,EMIS,Sender,14,INTEGRATED,0,0,0,1,0,1,2,Message not sent because requesting practice i...


## Add in Supplier Pathway %

In [22]:
# Total number of transfers on each (sending supplier, requesting supplier) pathway;
# this is the denominator for the pathway percentage below.
total_transfers_supplier_pathway = (
    transfers
    .groupby(['sending_supplier', 'requesting_supplier'])
    .agg({'conversation_id': 'count'})
    .rename(columns={'conversation_id': 'Total Supplier Pathway Transfers'})
    .reset_index()
)

# Join the pathway totals onto a copy of the high level table.
full_high_level_table = high_level_table.copy().merge(
    total_transfers_supplier_pathway,
    left_on=['sending_supplier', 'requesting_supplier'],
    right_on=['sending_supplier', 'requesting_supplier'],
)

# Express each row's total volume as a percentage of its pathway's transfers,
# then discard the helper column.
full_high_level_table['% Supplier Pathway Transfers'] = (
    100 * full_high_level_table['Total Volume']
    / full_high_level_table['Total Supplier Pathway Transfers']
).round(2)
full_high_level_table = full_high_level_table.drop(columns='Total Supplier Pathway Transfers')

# high_level_table goes on to hold monthly percentages; full_high_level_table keeps the counts.
high_level_table = full_high_level_table.copy()

In [23]:
# Preview the table with the supplier pathway percentage column added.
full_high_level_table.head()

Unnamed: 0,sending_supplier,requesting_supplier,Error Type,Error Code,status,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,Total Volume,ResponseText,% Supplier Pathway Transfers
0,EMIS,EMIS,Sender,6,FAILED,83,94,73,96,106,84,536,Patient not at surgery,0.07
1,EMIS,EMIS,Sender,7,FAILED,87,80,30,12,39,43,291,GP2GP Messaging is not enabled on this system,0.04
2,EMIS,EMIS,Sender,10,FAILED,471,487,354,326,458,511,2607,Failed to successfully generate EHR Extract,0.33
3,EMIS,EMIS,Sender,14,FAILED,1,2,4,2,5,1,15,Message not sent because requesting practice i...,0.0
4,EMIS,EMIS,Sender,14,INTEGRATED,0,0,0,1,0,1,2,Message not sent because requesting practice i...,0.0


## Create Monthly views

In [24]:
# Transfers per supplier pathway per month: the denominators for the monthly percentages.
monthly_transfers_supplier_pathway=pd.pivot_table(transfers,index=['sending_supplier','requesting_supplier'],columns='month',aggfunc='count', values='conversation_id').fillna(0).astype(int)

# Look up each row's pathway denominators; the result is divided label-for-label below.
total_monthly_volumes_by_row=high_level_table[['sending_supplier','requesting_supplier']].merge(monthly_transfers_supplier_pathway, left_on=['sending_supplier','requesting_supplier'], right_index=True)
monthly_columns=monthly_transfers_supplier_pathway.columns

# Numerators are read from full_high_level_table, which still holds the raw monthly
# counts. Reading them from high_level_table made this cell non-idempotent: the cell
# overwrites those columns with percentages, so a re-run divided percentages by totals.
monthly_percentages=full_high_level_table[monthly_columns]/(total_monthly_volumes_by_row[monthly_columns])
# 0/0 (no transfers on a pathway in a month) produces NaN, reported here as 0%.
monthly_percentages=(monthly_percentages.fillna(0)*100).round(2)
high_level_table[monthly_columns]=monthly_percentages

In [25]:
# Preview the monthly percentages per row.
monthly_percentages.head()

Unnamed: 0,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02
0,0.05,0.07,0.06,0.09,0.08,0.06
1,0.06,0.06,0.03,0.01,0.03,0.03
2,0.3,0.36,0.3,0.32,0.35,0.36
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# The month columns are the only labels that are not plain strings, so they are
# picked out by type and placed after the descriptive columns.
monthly_columns = [
    column
    for column in high_level_table.columns
    if type(column) is not str
]
column_order = [
    'sending_supplier',
    'requesting_supplier',
    'Error Type',
    'ResponseText',
    'status',
    'Total Volume',
    '% Supplier Pathway Transfers',
    'Error Code',
] + monthly_columns

# Reorder the columns, then list the highest-volume rows first.
high_level_table = high_level_table[column_order].sort_values(by='Total Volume', ascending=False)

In [27]:
# Preview the final table: highest-volume rows first, with monthly percentages.
high_level_table.head()

Unnamed: 0,sending_supplier,requesting_supplier,Error Type,ResponseText,status,Total Volume,% Supplier Pathway Transfers,Error Code,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02
55,EMIS,EMIS,No Error Code,No Error,INTEGRATED,653368,82.91,No Error,82.92,83.06,83.39,80.57,83.0,83.98
101,EMIS,TPP,No Error Code,No Error,INTEGRATED,225674,87.45,No Error,86.57,88.05,88.3,85.85,87.41,88.39
212,TPP,EMIS,No Error Code,No Error,INTEGRATED,179472,75.76,No Error,76.79,76.07,75.81,73.12,74.17,77.61
33,EMIS,EMIS,Final Request Acknowledgment,A-B-A EHR Extract Received and Stored As Suppr...,INTEGRATED,52387,6.65,15,6.2,6.88,6.85,7.03,6.63,6.49
56,EMIS,EMIS,No Error Code,No Error,INTEGRATED LATE,41307,5.24,No Error,5.43,4.85,4.62,7.12,5.25,4.56


In [28]:
# Output as excel workbook (written to the notebook's working directory)
high_level_table.to_excel('top_level_problems_view.xlsx')