# PRMT-2116 Generate High level table with new transfer categorisation

We’ve completed the work on recategorising transfers, so we now want to regenerate the top level table of GP2GP transfers using the new categorisations, which will help us prioritise what to look at next. We also want to refresh the table with more recent data, as the current version only covers September - Feb 2020.

### Scope
Generate the top level problems table
- With new transfer categorisations
- With March-May data only (excluding three months prior)
- Generate an individual table for each month

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Location of the monthly transfer extracts and the list of files to load.
# NOTE(review): the scope section of this notebook says March-May only, yet the
# June file is listed here as well - confirm which is intended.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-sample-6/"
transfer_files = [
    "2021-3-transfers.parquet",
    "2021-4-transfers.parquet",
    "2021-5-transfers.parquet",
    "2021-6-transfers.parquet",
]
transfer_input_files = [f"{transfer_file_location}{file_name}" for file_name in transfer_files]

# Read each monthly parquet file and stack the results into one DataFrame.
transfers_raw = pd.concat(map(pd.read_parquet, transfer_input_files))

#### TODO: How do we deal with status at exactly 14 or 28 days, rather than at 14/28 days after the month ended?

In [3]:
# Lookup table of GP2GP response codes, read directly from the CSV in the
# prm-gp2gp-data-sandbox repository. Despite the "_file" suffix, this name is
# bound to the DataFrame returned by pd.read_csv, not to a path or file handle.
error_code_lookup_file = pd.read_csv("https://raw.githubusercontent.com/nhsconnect/prm-gp2gp-data-sandbox/master/data/gp2gp_response_codes.csv")

In [4]:
# Work on a copy so transfers_raw keeps the data exactly as loaded, and turn the
# snake_case status values into title-cased labels with spaces.
transfers = transfers_raw.assign(
    status=transfers_raw["status"].str.replace("_", " ").str.title()
)

In [5]:
# Count transfers for every (status, failure_reason) pair. Missing values are
# replaced with "N/A" before grouping - presumably so that transfers without a
# failure reason are still counted as their own group.
outcome_counts = (
    transfers.fillna("N/A")
    .groupby(by=["status", "failure_reason"])
    .agg({"conversation_id": "count"})
)

# Only the counted column is renamed. After the groupby, "failure_reason" is a
# level of the row index rather than a column, so the previous attempt to rename
# it through a column mapping never had any effect and has been removed.
outcome_counts = outcome_counts.rename({"conversation_id": "Number of transfers"}, axis=1)

# Share of all counted transfers that falls into each group, as a percentage.
outcome_counts["% of transfers"] = (
    outcome_counts["Number of transfers"] / outcome_counts["Number of transfers"].sum()
).multiply(100)
outcome_counts.round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of transfers,% of transfers
status,failure_reason,Unnamed: 2_level_1,Unnamed: 3_level_1
Integrated On Time,,893562,88.12
Process Failure,Integrated Late,52816,5.21
Process Failure,"Transferred, not integrated",26486,2.61
Technical Failure,COPC(s) not Acknowledged,507,0.05
Technical Failure,COPC(s) not sent,77,0.01
Technical Failure,Contains Fatal Sender Error,14661,1.45
Technical Failure,Core Extract not Sent,13493,1.33
Technical Failure,Final Error,8323,0.82
Technical Failure,Request not Acknowledged,2751,0.27
Unclassified Failure,Ambiguous COPC messages,879,0.09


In [6]:
# Derive a monthly period from each transfer's request date; it is used to split
# the output into one table per month.
transfers["month"] = transfers["date_requested"].dt.to_period("M")

In [7]:
def convert_error_list_to_tuple(error_code_list, error_code_type):
    """Return ``(error_code_type, code)`` pairs for the distinct codes in a list.

    Duplicate codes are collapsed (the list is passed through ``set``), and
    missing entries - NaN or None - are skipped. A missing list (None) is
    treated the same as an empty one. The order of the returned pairs follows
    set iteration order and should not be relied upon.

    Args:
        error_code_list: Iterable of error codes, or None.
        error_code_type: Label placed first in every returned tuple.

    Returns:
        A list of ``(error_code_type, error_code)`` tuples, possibly empty.
    """
    if error_code_list is None:
        return []
    # pd.isna recognises None as well as NaN, whereas np.isnan raises on None.
    return [
        (error_code_type, error_code)
        for error_code in set(error_code_list)
        if not pd.isna(error_code)
    ]
    
def convert_error_to_tuple(error_code, error_code_type):
    """Wrap a single error code in a one-element list of tagged tuples.

    Args:
        error_code: The error code, or a missing value (NaN or None).
        error_code_type: Label placed first in the returned tuple.

    Returns:
        ``[(error_code_type, error_code)]``, or an empty list when the code is
        missing.
    """
    # pd.isna recognises None as well as NaN, whereas np.isnan raises on None.
    if pd.isna(error_code):
        return []
    return [(error_code_type, error_code)]

def combine_error_codes(row):
    """Gather every error code recorded on a transfer row into one list.

    The sender error code is tagged "Sender", the intermediate codes "COPC" and
    the final codes "Final", in that order. When the row carries no error code
    at all, a single ("No Error Code", "No Error") placeholder is returned so
    the result is never empty.

    Args:
        row: A mapping-like row providing "sender_error_code",
            "intermediate_error_codes" and "final_error_codes".

    Returns:
        A non-empty list of ``(error_type, error_code)`` tuples.
    """
    tagged_codes = [
        *convert_error_to_tuple(row["sender_error_code"], "Sender"),
        *convert_error_list_to_tuple(row["intermediate_error_codes"], "COPC"),
        *convert_error_list_to_tuple(row["final_error_codes"], "Final"),
    ]
    if tagged_codes:
        return tagged_codes
    return [("No Error Code", "No Error")]
    
# Run combine_error_codes on every row (axis=1) and keep the resulting list of
# (error_type, error_code) tuples in a new "all_error_codes" column.
transfers["all_error_codes"] = transfers.apply(combine_error_codes, axis=1)

In [8]:
# We spotted a discrepancy - patches of investigative code here - to delete!!
# The next three expressions are not assigned to anything; they are exploratory
# look-ups only.
# Select the transfers whose failure reason is 'Contains Fatal Sender Error'.
transfers.loc[transfers['failure_reason']=='Contains Fatal Sender Error']
# Spot check of the helper with a single sender error code.
convert_error_to_tuple(14.0, "Sender")
# Tally the combined error-code values for those same transfers.
transfers.loc[transfers['failure_reason']=='Contains Fatal Sender Error','all_error_codes'].value_counts()
# Boolean mask: transfers labelled 'Contains Fatal Sender Error' that nevertheless
# have no sender error code recorded.
discrepency_bool=(transfers['failure_reason']=='Contains Fatal Sender Error') & (transfers['sender_error_code'].isna())


In [11]:
def generate_high_level_table(transfers_sample):
    """Summarise transfers by supplier pathway, outcome and error code.

    Every transfer is expanded to one row per entry of its ``all_error_codes``
    list, and the expanded rows are counted for each combination of requesting
    supplier, sending supplier, status, failure reason and error code. A
    transfer carrying several error codes is therefore counted once per code,
    while the percentage denominators are taken from the un-expanded sample.

    Reads the module-level ``error_code_lookup_file`` DataFrame (columns
    ``ErrorCode`` and ``ResponseText``) to attach a description to each code.

    Args:
        transfers_sample: DataFrame of transfers containing the columns
            ``conversation_id``, ``requesting_supplier``, ``sending_supplier``,
            ``status``, ``failure_reason`` and ``all_error_codes`` (a list of
            ``(error_type, error_code)`` tuples per row).

    Returns:
        A DataFrame sorted by ``Number of Transfers`` (descending) holding the
        grouping columns followed by:

        * ``Number of Transfers`` - expanded rows in the group.
        * ``% of Transfers`` - relative to all transfers in the sample.
        * ``% Supplier Pathway Transfers`` - relative to the transfers between
          the same sending and requesting supplier.
        * ``% Paper Fallback`` - relative to the transfers that have a failure
          reason; only filled where status is not 'Integrated On Time'.
        * ``% of error codes`` - relative to all real error codes in the
          sample; only filled for rows that carry a real error code.
    """
    # The same placeholder must be used everywhere missing values are filled:
    # the pathway lookup further down is keyed on the filled supplier names, so
    # a different placeholder there would raise a KeyError for missing suppliers.
    missing_label = "N/A"
    no_error = ("No Error Code", "No Error")

    # Break up lines by error code
    transfers_split_by_error_code = transfers_sample.explode("all_error_codes")

    # Create High level table
    high_level_table = (
        transfers_split_by_error_code.fillna(missing_label)
        .groupby(["requesting_supplier", "sending_supplier", "status", "failure_reason", "all_error_codes"])
        .agg({"conversation_id": "count"})
        .rename({"conversation_id": "Number of Transfers"}, axis=1)
        .reset_index()
    )

    # Count % of transfers
    total_number_transfers = transfers_sample.shape[0]
    high_level_table["% of Transfers"] = (
        high_level_table["Number of Transfers"] / total_number_transfers
    ).multiply(100)

    # Count by supplier pathway
    supplier_pathway_counts = (
        transfers_sample.fillna(missing_label)
        .groupby(by=["sending_supplier", "requesting_supplier"])
        .agg({"conversation_id": "count"})["conversation_id"]
    )
    high_level_table["% Supplier Pathway Transfers"] = high_level_table.apply(
        lambda row: row["Number of Transfers"]
        / supplier_pathway_counts.loc[(row["sending_supplier"], row["requesting_supplier"])],
        axis=1,
    ).multiply(100)

    # Add in Paper Fallback columns
    total_fallback = transfers_sample["failure_reason"].dropna().shape[0]
    fallback_bool = high_level_table["status"] != "Integrated On Time"
    high_level_table.loc[fallback_bool, "% Paper Fallback"] = (
        high_level_table["Number of Transfers"] / total_fallback
    ).multiply(100)

    # % of error codes column
    # errors="ignore" keeps this working for a sample in which every transfer
    # has at least one error code, i.e. the placeholder never occurs.
    total_number_of_error_codes = (
        transfers_split_by_error_code["all_error_codes"]
        .value_counts()
        .drop(no_error, errors="ignore")
        .sum()
    )
    error_code_bool = high_level_table["all_error_codes"] != no_error
    high_level_table.loc[error_code_bool, "% of error codes"] = (
        high_level_table["Number of Transfers"] / total_number_of_error_codes
    ).multiply(100)

    # Adding columns to describe errors
    high_level_table["error_type"] = high_level_table["all_error_codes"].apply(lambda error_tuple: error_tuple[0])
    high_level_table["error_code"] = high_level_table["all_error_codes"].apply(lambda error_tuple: error_tuple[1])
    high_level_table = high_level_table.merge(
        error_code_lookup_file[["ErrorCode", "ResponseText"]],
        left_on="error_code",
        right_on="ErrorCode",
        how="left",
    )

    # Select and re-order table
    grouping_columns_order = [
        "requesting_supplier",
        "sending_supplier",
        "status",
        "failure_reason",
        "error_type",
        "ResponseText",
        "error_code",
    ]
    counting_columns_order = [
        "Number of Transfers",
        "% of Transfers",
        "% Supplier Pathway Transfers",
        "% Paper Fallback",
        "% of error codes",
    ]
    high_level_table = high_level_table[grouping_columns_order + counting_columns_order].sort_values(
        by="Number of Transfers", ascending=False
    )

    return high_level_table

In [25]:
# Write the workbook: an "All" sheet covering every loaded transfer, followed by
# one sheet per month named after the month's period (e.g. "2021-03").
with pd.ExcelWriter("High Level Tables PRMT-2116.xlsx") as writer:
    generate_high_level_table(transfers.copy()).to_excel(writer, sheet_name="All", index=False)
    # A plain loop rather than a list comprehension: the sheets are written for
    # their side effect, so there is no list worth building.
    for month in transfers["month"].unique():
        monthly_transfers = transfers[transfers["month"] == month].copy()
        generate_high_level_table(monthly_transfers).to_excel(writer, sheet_name=str(month), index=False)