# PRMT-2276 Run top level table with new categorisations using Sept-Jun data

## Context
Now that we have recategorised GP2GP transfers in a more accurate and granular way, we want to measure GP2GP success accurately over time so that we can see how supplier changes are impacting the failure rate. This means regenerating previous months' data with the new categories applied, so that we can make a more like-for-like comparison.

## Scope 
- Regenerate the high level table
- For each month from September 2020 to June 2021, using the 14-day cutoff
- Generate a separate table for each month

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Root S3 location of the monthly transfer files (14-day cutoff, v4)
transfer_file_location = "s3://prm-gp2gp-transfer-data-dev/14-day-cutoff/v4/"

# One parquet file per month, September 2020 through June 2021
transfer_files = [
    "2020/9/transfers.parquet",
    "2020/10/transfers.parquet",
    "2020/11/transfers.parquet",
    "2020/12/transfers.parquet",
    "2021/1/transfers.parquet",
    "2021/2/transfers.parquet",
    "2021/3/transfers.parquet",
    "2021/4/transfers.parquet",
    "2021/5/transfers.parquet",
    "2021/6/transfers.parquet",
]

# Build the full paths, read every month and stack the months into one frame
transfer_input_files = [transfer_file_location + relative_path for relative_path in transfer_files]
monthly_transfer_frames = [pd.read_parquet(input_file) for input_file in transfer_input_files]
transfers_raw = pd.concat(monthly_transfer_frames)

In [3]:
# Supplier data only became available from Feb/Mar 2021, so the sending and
# requesting supplier values are empty for every transfer before that.
# Drop both columns here; suppliers are attached from the ASID lookup files instead.
transfers_raw = transfers_raw.drop(columns=["sending_supplier", "requesting_supplier"])
transfers = transfers_raw.copy()

# Short display names for the supplier names found in the ASID lookup
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

# Build an ASID lookup that holds the most recent entry for every ASID encountered
asid_file_location = "s3://prm-gp2gp-asid-lookup-dev/"
asid_files = [
    "2020/11/asidLookup.csv.gz",
    "2020/12/asidLookup.csv.gz",
    "2021/1/asidLookup.csv.gz",
    "2021/2/asidLookup.csv.gz",
    "2021/3/asidLookup.csv.gz",
    "2021/4/asidLookup.csv.gz",
    "2021/5/asidLookup.csv.gz",
    "2021/6/asidLookup.csv.gz",
]
asid_lookup_files = [asid_file_location + relative_path for relative_path in asid_files]
monthly_asid_lookups = [pd.read_csv(lookup_file) for lookup_file in asid_lookup_files]
asid_lookup = pd.concat(monthly_asid_lookups)
# The files are concatenated oldest first, so last() picks up the latest values per ASID
deduplicated_asid_lookup = asid_lookup.drop_duplicates()
asid_lookup = deduplicated_asid_lookup.groupby("ASID").last().reset_index()
lookup = asid_lookup[["ASID", "MName"]]

# Attach the supplier name (MName) for the requesting practice, then for the sending practice
supplier_lookups = [
    ('requesting_practice_asid', {'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid'}),
    ('sending_practice_asid', {'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid'}),
]
for practice_asid_column, renamed_columns in supplier_lookups:
    transfers = transfers.merge(lookup, left_on=practice_asid_column, right_on='ASID', how='left')
    transfers = transfers.rename(renamed_columns, axis=1)

# Swap the long supplier names for their short display names
for supplier_column in ("sending_supplier", "requesting_supplier"):
    transfers[supplier_column] = transfers[supplier_column].replace(supplier_renaming.keys(), supplier_renaming.values())

# Make the status more human readable: underscores become spaces, words are title-cased
readable_status = transfers["status"].str.replace("_", " ").str.title()
transfers["status"] = readable_status

In [4]:
# GP2GP response code lookup; its ErrorCode and ResponseText columns are used
# later to describe the error codes in the high level table
error_code_lookup_url = "https://raw.githubusercontent.com/nhsconnect/prm-gp2gp-data-sandbox/master/data/gp2gp_response_codes.csv"
error_code_lookup_file = pd.read_csv(error_code_lookup_url)

In [5]:
# Count transfers for every (status, failure reason) pair. Missing values are
# labelled "N/A" first so that transfers without a failure reason still form a group.
outcome_groups = transfers.fillna("N/A").groupby(by=["status", "failure_reason"])
outcome_counts = outcome_groups.agg({"conversation_id": "count"})

# NOTE: "failure_reason" is an index level after the groupby, so only the
# "conversation_id" entry of this column rename has any effect.
outcome_column_labels = {"conversation_id": "Number of transfers", "failure_reason": "Failure Reason"}
outcome_counts = outcome_counts.rename(outcome_column_labels, axis=1)

# Share of all transfers that each outcome represents
total_outcome_transfers = outcome_counts["Number of transfers"].sum()
outcome_share = outcome_counts["Number of transfers"] / total_outcome_transfers
outcome_counts["% of transfers"] = outcome_share.multiply(100)
outcome_counts.round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of transfers,% of transfers
status,failure_reason,Unnamed: 2_level_1,Unnamed: 3_level_1
Integrated On Time,,2065109,87.61
Process Failure,Integrated Late,74797,3.17
Process Failure,"Transferred, not integrated",107495,4.56
Technical Failure,COPC(s) not Acknowledged,1361,0.06
Technical Failure,COPC(s) not sent,280,0.01
Technical Failure,Contains Fatal Sender Error,38093,1.62
Technical Failure,Core Extract not Sent,28642,1.22
Technical Failure,Final Error,28150,1.19
Technical Failure,Request not Acknowledged,9611,0.41
Unclassified Failure,Ambiguous COPC messages,2877,0.12


In [6]:
# Calendar month in which each transfer was requested, as a monthly period
requested_dates = transfers['date_requested']
transfers['month'] = requested_dates.dt.to_period('M')

In [7]:
def convert_error_list_to_tuple(error_code_list, error_code_type):
    """Pair every distinct, non-NaN error code with its error type label.

    Args:
        error_code_list: iterable of numeric error codes; may contain
            duplicates and NaN entries.
        error_code_type: label stored as the first element of every pair.

    Returns:
        A list of ``(error_code_type, error_code)`` tuples, one per distinct
        code that is not NaN. Duplicates are removed through a set, so the
        order of the pairs follows set iteration order.
    """
    labelled_codes = []
    for code in set(error_code_list):
        if np.isnan(code):
            # NaN marks a missing error code; leave it out
            continue
        labelled_codes.append((error_code_type, code))
    return labelled_codes

def combine_error_codes(row):
    """Collect all error codes of one transfer as ``(error type, code)`` pairs.

    Args:
        row: a transfer record providing ``sender_error_codes``,
            ``intermediate_error_codes`` and ``final_error_codes``.

    Returns:
        The pairs labelled "Sender", "COPC" and "Final" (in that order of
        source column), or the single placeholder pair
        ``("No Error Code", "No Error")`` when the transfer has no error code.
    """
    labelled_columns = (
        ("sender_error_codes", "Sender"),
        ("intermediate_error_codes", "COPC"),
        ("final_error_codes", "Final"),
    )
    combined_codes = []
    for column_name, error_type in labelled_columns:
        combined_codes.extend(convert_error_list_to_tuple(row[column_name], error_type))

    if combined_codes:
        return combined_codes
    # Placeholder so that transfers without errors still yield a row when exploded
    return [("No Error Code", "No Error")]
    
# Give every transfer the combined list of (error type, error code) pairs
combined_error_codes = transfers.apply(combine_error_codes, axis=1)
transfers["all_error_codes"] = combined_error_codes

In [8]:
def generate_high_level_table(transfers_sample, error_code_lookup=None):
    """Summarise transfers by supplier pathway, outcome and error code.

    The sample is exploded so that each (transfer, error code) pair becomes one
    row, then grouped by requesting supplier, sending supplier, status, failure
    reason and error code. A transfer with several distinct error codes is
    therefore counted once for each of them.

    Args:
        transfers_sample: DataFrame with the columns ``conversation_id``,
            ``requesting_supplier``, ``sending_supplier``, ``status``,
            ``failure_reason`` and ``all_error_codes`` (a list of
            ``(error type, error code)`` tuples per transfer).
        error_code_lookup: optional DataFrame with ``ErrorCode`` and
            ``ResponseText`` columns. Defaults to the notebook-level
            ``error_code_lookup_file``.

    Returns:
        DataFrame sorted by ``Number of Transfers`` (descending) with the
        grouping columns plus:

        * ``% of Transfers`` - relative to the number of transfers in the sample.
        * ``% Supplier Pathway Transfers`` - relative to the number of transfers
          with the same sending/requesting supplier pair.
        * ``% Paper Fallback`` - relative to the number of transfers that have a
          failure reason; only set when status is not "Integrated On Time".
        * ``% of error codes`` - relative to the total number of error codes;
          only set for rows that carry a real error code.
    """
    no_error = ('No Error Code', 'No Error')
    # Single placeholder for missing values so that every lookup below agrees
    missing_label = "N/A"
    supplier_columns = ["sending_supplier", "requesting_supplier"]

    if error_code_lookup is None:
        # Backward-compatible default: the lookup loaded at notebook level
        error_code_lookup = error_code_lookup_file

    # Break up lines by error code
    transfers_split_by_error_code = transfers_sample.explode("all_error_codes")

    # Create high level table; missing values are labelled first because
    # groupby would otherwise drop rows whose key is missing
    grouping_keys = ["requesting_supplier", "sending_supplier", "status", "failure_reason", "all_error_codes"]
    high_level_table = (
        transfers_split_by_error_code.fillna(missing_label)
        .groupby(grouping_keys)
        .agg({'conversation_id': 'count'})
        .rename({'conversation_id': 'Number of Transfers'}, axis=1)
        .reset_index()
    )
    transfer_counts = high_level_table['Number of Transfers']

    # Count % of transfers
    total_number_transfers = transfers_sample.shape[0]
    high_level_table['% of Transfers'] = (transfer_counts / total_number_transfers).multiply(100)

    # Count by supplier pathway. Uses the same placeholder as the table above,
    # so a transfer with a missing supplier can still be matched to its pathway.
    pathway_totals = (
        transfers_sample[supplier_columns]
        .fillna(missing_label)
        .groupby(supplier_columns)
        .size()
        .to_dict()
    )
    row_pathways = zip(high_level_table['sending_supplier'], high_level_table['requesting_supplier'])
    pathway_total_per_row = pd.Series(
        [pathway_totals[pathway] for pathway in row_pathways],
        index=high_level_table.index,
        dtype="float64",
    )
    high_level_table['% Supplier Pathway Transfers'] = (transfer_counts / pathway_total_per_row).multiply(100)

    # Add in Paper Fallback column (left empty for transfers integrated on time)
    total_fallback = transfers_sample["failure_reason"].dropna().shape[0]
    fallback_bool = high_level_table['status'] != 'Integrated On Time'
    high_level_table['% Paper Fallback'] = (transfer_counts / total_fallback).multiply(100).where(fallback_bool)

    # % of error codes column. The total is summed from the rows carrying a
    # real error code, so a sample in which every transfer has an error code
    # does not fail on a missing "No Error" entry.
    error_code_bool = high_level_table['all_error_codes'].map(lambda error_tuple: error_tuple != no_error).astype(bool)
    total_number_of_error_codes = transfer_counts[error_code_bool].sum()
    high_level_table['% of error codes'] = (transfer_counts / total_number_of_error_codes).multiply(100).where(error_code_bool)

    # Adding columns to describe errors
    high_level_table['error_type'] = high_level_table['all_error_codes'].apply(lambda error_tuple: error_tuple[0])
    high_level_table['error_code'] = high_level_table['all_error_codes'].apply(lambda error_tuple: error_tuple[1])
    high_level_table = high_level_table.merge(
        error_code_lookup[['ErrorCode', 'ResponseText']],
        left_on='error_code',
        right_on='ErrorCode',
        how='left',
    )

    # Select and re-order table
    grouping_columns_order = ['requesting_supplier', 'sending_supplier', 'status', 'failure_reason', 'error_type', 'ResponseText', 'error_code']
    counting_columns_order = ['Number of Transfers', '% of Transfers', '% Supplier Pathway Transfers', '% Paper Fallback', '% of error codes']
    high_level_table = high_level_table[grouping_columns_order + counting_columns_order].sort_values(by='Number of Transfers', ascending=False)

    return high_level_table

In [9]:
# Write a single workbook: an "All" sheet covering every month, then one sheet per month
with pd.ExcelWriter("High Level Tables 14 Day Cutoff PRMT-2276.xlsx") as writer:
    overall_table = generate_high_level_table(transfers.copy())
    overall_table.to_excel(writer, sheet_name="All", index=False)

    for month in transfers['month'].unique():
        monthly_transfers = transfers[transfers['month'] == month].copy()
        monthly_table = generate_high_level_table(monthly_transfers)
        monthly_table.to_excel(writer, sheet_name=str(month), index=False)