# PRMT-1984 [HYPOTHESIS] A high proportion of pending transfers are caused by a small group of practices

### Hypothesis
We believe that a large proportion of pending transfers are caused by a small group of receiving practices, regardless of the sending supplier. 

We will know this to be true when we can see that the practices causing a high number of EMIS-EMIS pending transfers are the same as those causing a high number of TPP-EMIS or Vision-EMIS pending transfers (and that the same holds for each supplier as a receiver).

# Import and setup Data

import pandas as pd
import numpy as np

# Location and names of the monthly transfer extracts used in this analysis.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet",
]
transfer_input_files = [f"{transfer_file_location}{file_name}" for file_name in transfer_files]

# Read every monthly extract and stack them into one frame.
transfers_raw = pd.concat([pd.read_parquet(path) for path in transfer_input_files])

# This is only needed when using transfers-duplicates-hypothesis datasets
transfers_raw = transfers_raw.drop(columns=["sending_supplier", "requesting_supplier"])


# Given the findings in PRMT-1742 - many duplicate EHR errors are misclassified, the below reclassifies the relevant data.
# A transfer counts as integrated when any of its request-completed ack codes is NaN or 15.
successful_transfers_bool = transfers_raw['request_completed_ack_codes'].apply(
    lambda ack_codes: any([(np.isnan(code) or code == 15) for code in ack_codes])
)
transfers = transfers_raw.copy()
transfers.loc[successful_transfers_bool, "status"] = "INTEGRATED"

# Pending (Sender Error) Code: transfers pending with error whose sender error code is in this list are marked FAILED.
pending_sender_error_codes = [6, 7, 10, 24, 30, 23, 14, 99]
transfers_with_pending_sender_code_bool = transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool = transfers['status'].eq('PENDING_WITH_ERROR')
transfers_which_need_pending_to_failure_change_bool = (
    transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
)
transfers.loc[transfers_which_need_pending_to_failure_change_bool, 'status'] = 'FAILED'

# Add integrated Late status: integrated transfers whose SLA duration exceeds 8 days.
eight_days_in_seconds = 8 * 24 * 60 * 60
transfers_after_sla_bool = transfers['sla_duration'] > eight_days_in_seconds
transfers_with_integrated_bool = transfers['status'] == 'INTEGRATED'
transfers_integrated_late_bool = transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool, 'status'] = 'INTEGRATED LATE'

# Late integrations that took longer than 28 days are moved back to a pending status.
twenty_eight_days_in_seconds = 28 * 24 * 60 * 60
transfers_after_month_bool = transfers['sla_duration'] > twenty_eight_days_in_seconds
transfers_pending_at_month_bool = transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool, 'status'] = 'PENDING'

# A transfer has an "early error" when it has a sender error code or at least one intermediate error code.
# NOTE: the previous expression `~series.apply(len) > 0` evaluated as `(~len) > 0`; bitwise-not of a
# non-negative length is always negative, so the intermediate-error condition could never be True.
transfers_with_sender_error_bool = transfers.loc[:, 'sender_error_code'].notna()
transfers_with_intermediate_error_bool = transfers.loc[:, 'intermediate_error_codes'].apply(len) > 0
transfers_with_early_error_bool = transfers_with_sender_error_bool | transfers_with_intermediate_error_bool
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool, 'status'] = 'PENDING_WITH_ERROR'

# Supplier name mapping: long supplier names from the ASID lookup -> short labels used in this notebook.
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)": "EMIS",
    "IN PRACTICE SYSTEMS LTD": "Vision",
    "MICROTEST LTD": "Microtest",
    "THE PHOENIX PARTNERSHIP": "TPP",
    None: "Unknown",
}

asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(asid_lookup_file)
lookup = asid_lookup[["ASID", "MName", "NACS", "OrgName"]]

# Attach the lookup details for the requesting practice, then for the sending practice.
transfers = transfers.merge(
    lookup, left_on='requesting_practice_asid', right_on='ASID', how='left'
).rename(
    columns={'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code'}
)
transfers = transfers.merge(
    lookup, left_on='sending_practice_asid', right_on='ASID', how='left'
).rename(
    columns={'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code'}
)

# Replace the long supplier names with the short labels.
transfers["sending_supplier"] = transfers["sending_supplier"].replace(
    supplier_renaming.keys(), supplier_renaming.values()
)
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(
    supplier_renaming.keys(), supplier_renaming.values()
)

In [None]:
import pandas as pd 
import numpy as np
# Using data generated from branch PRMT-1742-duplicates-analysis.
# This is needed to correctly handle duplicates.
# Once the upstream pipeline has a fix for duplicate EHRs, then we can go back to using the main output.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet",
]

transfer_input_files = [f"{transfer_file_location}{file_name}" for file_name in transfer_files]

# Read every monthly extract and stack them into one frame.
transfers_raw = pd.concat([pd.read_parquet(path) for path in transfer_input_files])

# In the data from the PRMT-1742-duplicates-analysis branch, these columns have been added, but contain only empty values.
transfers_raw = transfers_raw.drop(columns=["sending_supplier", "requesting_supplier"])


# Given the findings in PRMT-1742 - many duplicate EHR errors are misclassified, the below reclassifies the relevant data


def has_at_least_one_successful_integration_code(errors):
    """Return True when any ack code in ``errors`` is NaN or equal to 15."""
    return any(np.isnan(code) or code == 15 for code in errors)


successful_transfers_bool = transfers_raw['request_completed_ack_codes'].apply(
    has_at_least_one_successful_integration_code
)
transfers = transfers_raw.copy()
transfers.loc[successful_transfers_bool, "status"] = "INTEGRATED"

# Correctly interpret certain sender errors as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
pending_sender_error_codes = [6, 7, 10, 24, 30, 23, 14, 99]
transfers_with_pending_sender_code_bool = transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool = transfers['status'].eq('PENDING_WITH_ERROR')
transfers_which_need_pending_to_failure_change_bool = (
    transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
)
transfers.loc[transfers_which_need_pending_to_failure_change_bool, 'status'] = 'FAILED'

# Add integrated Late status: integrated transfers whose SLA duration exceeds 8 days.
eight_days_in_seconds = 8 * 24 * 60 * 60
transfers_after_sla_bool = transfers['sla_duration'] > eight_days_in_seconds
transfers_with_integrated_bool = transfers['status'] == 'INTEGRATED'
transfers_integrated_late_bool = transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool, 'status'] = 'INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfer's status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds = 28 * 24 * 60 * 60
transfers_after_month_bool = transfers['sla_duration'] > twenty_eight_days_in_seconds
transfers_pending_at_month_bool = transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool, 'status'] = 'PENDING'

# A transfer has an "early error" when it has a sender error code or at least one intermediate error code.
# NOTE: the previous expression `~series.apply(len) > 0` evaluated as `(~len) > 0`; bitwise-not of a
# non-negative length is always negative, so the intermediate-error condition could never be True.
transfers_with_sender_error_bool = transfers.loc[:, 'sender_error_code'].notna()
transfers_with_intermediate_error_bool = transfers.loc[:, 'intermediate_error_codes'].apply(len) > 0
transfers_with_early_error_bool = transfers_with_sender_error_bool | transfers_with_intermediate_error_bool
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool, 'status'] = 'PENDING_WITH_ERROR'

# Supplier name mapping: long supplier names from the ASID lookup -> short labels used in this notebook.
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)": "EMIS",
    "IN PRACTICE SYSTEMS LTD": "Vision",
    "MICROTEST LTD": "Microtest",
    "THE PHOENIX PARTNERSHIP": "TPP",
    None: "Unknown",
}

asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(asid_lookup_file)
lookup = asid_lookup[["ASID", "MName", "NACS", "OrgName"]]

# Attach the lookup details for the requesting practice, then for the sending practice.
transfers = transfers.merge(
    lookup, left_on='requesting_practice_asid', right_on='ASID', how='left'
).rename(
    columns={'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code'}
)
transfers = transfers.merge(
    lookup, left_on='sending_practice_asid', right_on='ASID', how='left'
).rename(
    columns={'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code'}
)

# Replace the long supplier names with the short labels.
transfers["sending_supplier"] = transfers["sending_supplier"].replace(
    supplier_renaming.keys(), supplier_renaming.values()
)
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(
    supplier_renaming.keys(), supplier_renaming.values()
)

In [None]:
# Keep only the pathway columns and replace the status with a boolean "is pending" flag.
is_pending_transfers = transfers["status"].eq("PENDING")
transfers_reduced_columns = transfers[
    ["requesting_practice_asid", "requesting_supplier", "sending_supplier"]
].copy()
transfers_reduced_columns["is_pending"] = is_pending_transfers
transfers_reduced_columns.head()

# For all Pending, what is the distribution by number of Practices

In [None]:
def transfers_quantile_status_table(transfers_df,status,quantiles=5):
    """Summarise how transfers with a given status are concentrated across practices.

    Practices (``requesting_practice_asid``) are ranked by their number of
    transfers with ``status``, highest first, and placed into groups according
    to their cumulative share of all transfers with that status.

    Args:
        transfers_df: DataFrame with ``requesting_practice_asid``, ``status``
            and ``conversation_id`` columns.
        status: The status value to rank practices by (e.g. ``"PENDING"``).
        quantiles: Number of cumulative-share groups to split practices into.

    Returns:
        DataFrame indexed by "Percentile Group" with the columns
        ``Total <status>``, ``Total Transfers`` and ``Total Practices``,
        followed by the same three columns prefixed with ``"% "`` holding each
        group's share of the column total, rounded to 2 decimal places.

    Raises:
        KeyError: If no transfer in ``transfers_df`` has the given ``status``.
    """
    practice_status_table = pd.pivot_table(
        transfers_df,
        index='requesting_practice_asid',
        columns='status',
        values='conversation_id',
        aggfunc='count',
    ).fillna(0)

    practice_status_table['TOTAL'] = practice_status_table.sum(axis=1)

    practice_profile_data = practice_status_table.sort_values(by=status, ascending=False)

    cumulative_percentage = practice_profile_data[status].cumsum() / practice_profile_data[status].sum()

    practice_profile_data['Percentile Group'] = (100 / quantiles) * np.ceil(cumulative_percentage * quantiles)

    grouped_practices = practice_profile_data.groupby('Percentile Group')
    practice_profile_data = grouped_practices.agg({status: 'sum', 'TOTAL': 'sum'})
    # Count practices with the group size rather than by counting a hard-coded 'INTEGRATED'
    # column: that column may not exist, and it collides with `status` when status == 'INTEGRATED'.
    practice_profile_data['Total Practices'] = grouped_practices.size()
    practice_profile_data = practice_profile_data.astype(int)
    practice_profile_data = practice_profile_data.rename(
        {status: 'Total ' + status, 'TOTAL': 'Total Transfers'}, axis=1
    )

    practice_profile_data_percentages = (100 * practice_profile_data / practice_profile_data.sum()).round(2)
    practice_profile_data_percentages.columns = "% " + practice_profile_data_percentages.columns

    return pd.concat([practice_profile_data, practice_profile_data_percentages], axis=1)

In [None]:
# Concentration of PENDING transfers across requesting practices (default number of groups).
transfers_quantile_status_table(transfers, status="PENDING")

In [None]:
import matplotlib.pyplot as plt
# Count transfers per requesting practice and status, ordered by number of PENDING transfers (highest first).
practice_status_table = pd.pivot_table(
    transfers,
    index='requesting_practice_asid',
    columns='status',
    values='conversation_id',
    aggfunc='count',
)
practice_status_table = practice_status_table.fillna(0).sort_values(by="PENDING", ascending=False)

# Plot the running total of pending transfers against the number of practices included.
ax = practice_status_table['PENDING'].cumsum().reset_index(drop=True).plot(figsize=(8, 5))
ax.set_ylabel('Number of Pending without error Transfers')
ax.set_xlabel('Number of GP Practices')
ax.set_title('Cumulative graph of total pending transfers for GP Practices')
plt.gcf().savefig('Cumulative_pending_transfers.jpg')

# For practices of each Supplier, does their "policy" towards pending transfers differ by the supplier they are requesting from?

In [None]:
# Suppliers compared against each other in the pathway analysis below.
suppliers_to_investigate = [
    "EMIS",
    "TPP",
    "Vision",
]

In [None]:
# Percentage of transfers that are pending for each requesting/sending supplier pair.
pending_by_supplier_pathway = pd.pivot_table(
    transfers_reduced_columns,
    index='requesting_supplier',
    columns='sending_supplier',
    values='is_pending',
    aggfunc='mean',
)
pending_by_supplier_pathway = pending_by_supplier_pathway.multiply(100).round(2)
# Restrict rows and columns to the suppliers under investigation.
pending_by_supplier_pathway = pending_by_supplier_pathway.loc[suppliers_to_investigate, suppliers_to_investigate]
pending_by_supplier_pathway

### Correlations of volume pending

In [None]:
# Number of pending transfers per requesting practice, split by sending supplier.
pending_transfers_supplier_pathways_pivot = pd.pivot_table(
    transfers_reduced_columns,
    index=["requesting_supplier", "requesting_practice_asid"],
    columns="sending_supplier",
    values="is_pending",
    aggfunc="sum",
).fillna(0)

# For each requesting supplier, correlate its practices' pending volumes between sending suppliers.
pending_transfers_as_list = [
    pending_transfers_supplier_pathways_pivot.loc[supplier].corr().stack().rename(supplier)
    for supplier in suppliers_to_investigate
]
pending_transfers_volume_between_suppliers_correlation = pd.concat(pending_transfers_as_list, axis=1)
pending_transfers_volume_between_suppliers_correlation.loc[
    [("EMIS", "TPP"), ("EMIS", "Vision"), ("TPP", "Vision")]
].round(2)

### Correlations of % pending

In [None]:
# Proportion of transfers pending per requesting practice, split by sending supplier.
pending_transfers_supplier_pathways_pivot = pd.pivot_table(
    transfers_reduced_columns,
    index=["requesting_supplier", "requesting_practice_asid"],
    columns="sending_supplier",
    values="is_pending",
    aggfunc="mean",
).fillna(0)

# For each requesting supplier, correlate its practices' pending proportions between sending suppliers.
pending_transfers_as_list = [
    pending_transfers_supplier_pathways_pivot.loc[supplier].corr().stack().rename(supplier)
    for supplier in suppliers_to_investigate
]
pending_transfers_percentage_between_suppliers_correlation = pd.concat(pending_transfers_as_list, axis=1)
pending_transfers_percentage_between_suppliers_correlation.loc[
    [("EMIS", "TPP"), ("EMIS", "Vision"), ("TPP", "Vision")]
].round(2)

# Create frame of all practices ranked by pending
Output to Excel

In [None]:
# Total volume Pending by practice
pending_transfers_supplier_pathways_pivot = pd.pivot_table(
    transfers_reduced_columns,
    index=["requesting_supplier", "requesting_practice_asid"],
    columns="sending_supplier",
    values="is_pending",
    aggfunc="sum",
).fillna(0)
pending_transfers_for_supplier_pathways = (
    pending_transfers_supplier_pathways_pivot.copy()
    .astype(int)
    .loc[:, ["EMIS", "TPP", "Vision", "Microtest", "Unknown"]]
)
pending_transfers_for_supplier_pathways.insert(
    0, "Total Pending Transfers", pending_transfers_for_supplier_pathways.sum(axis=1)
)

# Percentage Pending by practice
pending_transfers_supplier_pathways_pivot_percentage = pd.pivot_table(
    transfers_reduced_columns,
    index=["requesting_supplier", "requesting_practice_asid"],
    columns="sending_supplier",
    values="is_pending",
    aggfunc="mean",
).fillna(0)
pending_transfers_supplier_pathways_percentage = (
    pending_transfers_supplier_pathways_pivot_percentage.copy()
    .round(4)
    .multiply(100)
    .loc[:, ["EMIS", "TPP", "Vision", "Microtest", "Unknown"]]
)
pending_transfers_supplier_pathways_percentage.columns = (
    pending_transfers_supplier_pathways_percentage.columns + " %"
)

# Join the two, rank by total pending, then attach practice details from the ASID lookup.
complete_pending_transfers_for_supplier_pathways = pd.concat(
    [pending_transfers_for_supplier_pathways, pending_transfers_supplier_pathways_percentage],
    axis=1,
).sort_values(by="Total Pending Transfers", ascending=False)
complete_pending_transfers_for_supplier_pathways = asid_lookup[["ASID", "PostCode", "OrgName"]].merge(
    complete_pending_transfers_for_supplier_pathways.reset_index(),
    left_on="ASID",
    right_on="requesting_practice_asid",
    how="right",
)
complete_pending_transfers_for_supplier_pathways = (
    complete_pending_transfers_for_supplier_pathways
    .drop(columns='requesting_practice_asid')
    .set_index(['requesting_supplier', 'ASID'])
)

# Save to Excel
complete_pending_transfers_for_supplier_pathways.to_excel('PRMT-1984 Pending Transfers all practices.xlsx')

complete_pending_transfers_for_supplier_pathways.head(10)

In [None]:
# View the first ten rows for practices whose requesting supplier is EMIS
complete_pending_transfers_for_supplier_pathways.loc["EMIS"].iloc[:10]

In [None]:
# Pending transfers requested by practice ASID 926102461049 where the sending supplier is Vision.
gants_hill = transfers.loc[
    (transfers["requesting_practice_asid"] == "926102461049")
    & (transfers["status"] == "PENDING")
    & (transfers["sending_supplier"] == "Vision")
]

def create_pending_transfers_with_org_info_for_supplier(supplier,transfers_df):
    """Build a per-practice table of pending transfers for one requesting supplier.

    Args:
        supplier: Requesting supplier label used to select rows of the pivot.
        transfers_df: DataFrame with ``requesting_supplier``,
            ``requesting_practice_asid``, ``sending_supplier`` and
            ``is_pending`` columns.

    Returns:
        DataFrame with a ``Supplier`` column, the ``ASID``, ``PostCode`` and
        ``OrgName`` columns from the module-level ``asid_lookup``, the total
        and per-sending-supplier pending counts, and the per-sending-supplier
        pending percentages, ordered by total pending transfers (descending).
    """
    sending_suppliers = ["EMIS","TPP","Vision","Microtest","Unknown"]

    def pivot_for_supplier(aggfunc):
        # Pivot is_pending by practice and sending supplier, then keep this supplier's practices.
        pivot = pd.pivot_table(
            transfers_df,
            index=["requesting_supplier", "requesting_practice_asid"],
            columns="sending_supplier",
            values="is_pending",
            aggfunc=aggfunc,
        ).fillna(0)
        return pivot.loc[supplier].copy()

    # Pending counts per sending supplier, with the row total as the first column.
    pending_counts = pivot_for_supplier("sum").astype(int)
    pending_counts = pending_counts.loc[:, sending_suppliers]
    pending_counts.insert(0, "Total Pending Transfers", pending_counts.sum(axis=1))

    # Pending proportion per sending supplier, expressed as a percentage.
    pending_percentages = pivot_for_supplier("mean").round(4).multiply(100)
    pending_percentages = pending_percentages.loc[:, sending_suppliers]
    pending_percentages.columns = pending_percentages.columns + " %"

    combined = pd.concat([pending_counts, pending_percentages], axis=1)
    combined = combined.sort_values(by="Total Pending Transfers", ascending=False)

    # Attach practice details; a right join keeps every practice from the pivot.
    with_org_info = asid_lookup[["ASID", "PostCode", "OrgName"]].merge(
        combined, right_index=True, left_on="ASID", how="right"
    )
    with_org_info.insert(0,"Supplier",supplier)
    return with_org_info

# Pending-transfer table for practices whose requesting supplier is EMIS.
emis_pending_transfers_with_org_info = create_pending_transfers_with_org_info_for_supplier(
    supplier="EMIS",
    transfers_df=transfers_reduced_columns,
)
emis_pending_transfers_with_org_info.head(20)

In [None]:
# Build the per-practice pending table for each supplier under investigation and stack them.
all_suppliers = pd.concat(
    [
        create_pending_transfers_with_org_info_for_supplier(supplier, transfers_reduced_columns)
        for supplier in suppliers_to_investigate
    ],
    axis=0,
).fillna(0)
all_suppliers.sort_values("Total Pending Transfers", ascending=False).head(20)