# PRMT-2324 Run top level table for first 2 weeks of August 2021

## Context
In our July data we saw a significant increase in GP2GP failures. We want to understand whether these were blips, perhaps caused by something that happened during July, or whether these failures are continuing. We don’t want to wait until we have all of the August data to identify this, as we are starting conversations with suppliers now.



In [16]:
import pandas as pd 
import numpy as np
from datetime import datetime

In [17]:
# Location of the transfers extract (parquet) that this analysis is based on.
# NOTE(review): reading an s3:// URL through pandas presumably needs s3fs and valid AWS credentials — confirm.
transfer_file_location = "s3://prm-gp2gp-notebook-data-prod/PRMT-2324-2-weeks-august-data/transfers/v4/2021/8/transfers.parquet"

# Unfiltered transfers; later cells derive filtered/enriched frames from this one
transfers_raw = pd.read_parquet(transfer_file_location)

In [18]:
# Preview the first rows to sanity-check the columns that were loaded
transfers_raw.head()

Unnamed: 0,conversation_id,sla_duration,requesting_practice_asid,sending_practice_asid,requesting_supplier,sending_supplier,sender_error_codes,final_error_codes,intermediate_error_codes,status,failure_reason,date_requested,date_completed
0,6F6982D2-EECD-47B8-8E66-1DF45C7B3CE1,,200000000467,937482173047,EMIS,EMIS,[nan],[],[],PROCESS_FAILURE,"Transferred, not integrated",2021-08-20 12:25:28.048,NaT
1,92775515-30CD-4F0C-9FD0-DAD724A03754,,773425693043,200000000983,EMIS,EMIS,[nan],[],[],PROCESS_FAILURE,"Transferred, not integrated",2021-08-20 12:26:27.022,NaT
2,55952762-5924-409C-9E32-CDE978365D99,,200000008789,200000014674,EMIS,EMIS,[nan],[],[],PROCESS_FAILURE,"Transferred, not integrated",2021-08-20 12:26:10.290,NaT
3,6DDF64B1-A3FD-45F1-A4CD-CBF40AE965BE,2151.0,200000010476,374837391043,EMIS,EMIS,[nan],[15.0],[],INTEGRATED_ON_TIME,,2021-08-20 12:23:59.152,2021-08-20 13:03:26.992
4,32E422DD-6FC9-4F70-8438-23B883FB0193,2880.0,888676243015,140691361012,EMIS,EMIS,[nan],[nan],[],INTEGRATED_ON_TIME,,2021-08-20 12:26:51.211,2021-08-20 13:15:42.715


In [19]:
# Keep only transfers requested before 16 August 2021 (the cut-off itself is excluded)
date_filter_bool = transfers_raw["date_requested"].lt(datetime(2021, 8, 16))
transfers_half_august = transfers_raw.loc[date_filter_bool]


In [20]:
# Supplier data was only available from Feb/Mar 2021. Sending and requesting supplier values for all transfers before that are empty
# Dropping these columns to merge supplier data from ASID lookup files.
# errors="ignore" keeps this cell re-runnable: on a second run the columns have already been
# removed from transfers_half_august and an unconditional drop would raise a KeyError.
transfers_half_august = transfers_half_august.drop(["sending_supplier", "requesting_supplier"], axis=1, errors="ignore")
transfers = transfers_half_august.copy()

# Supplier name mapping (long organisation name -> short display name; missing -> "Unknown")
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

# Generate ASID lookup that contains all the most recent entry for all ASIDs encountered
# NOTE(review): this is the June 2021 lookup from the preprod bucket while the transfers are
# August 2021 prod data — confirm this is the intended lookup version.
asid_file_location = "s3://prm-gp2gp-asid-lookup-preprod/2021/6/asidLookup.csv.gz"
asid_lookup = pd.read_csv(asid_file_location)

# NOTE(review): pandas documents GroupBy.last as the last *non-null* entry per column, which is
# not necessarily the last row for an ASID — confirm that is what is wanted here.
asid_lookup = asid_lookup.drop_duplicates().groupby("ASID").last().reset_index()
lookup = asid_lookup[["ASID", "MName"]]

# Attach the supplier for each side of the transfer. Requesting is merged before sending so the
# resulting column order is: requesting_supplier_asid, requesting_supplier,
# sending_supplier_asid, sending_supplier.
for practice_role in ("requesting", "sending"):
    transfers = transfers.merge(lookup, left_on=practice_role + "_practice_asid", right_on="ASID", how="left")
    transfers = transfers.rename(
        {"MName": practice_role + "_supplier", "ASID": practice_role + "_supplier_asid"},
        axis=1,
    )

# Shorten the supplier names; explicit lists are passed so replace() receives plain list-likes
supplier_long_names = list(supplier_renaming.keys())
supplier_short_names = list(supplier_renaming.values())
for supplier_column in ("sending_supplier", "requesting_supplier"):
    transfers[supplier_column] = transfers[supplier_column].replace(supplier_long_names, supplier_short_names)

# Making the status to be more human readable here
# regex=False: "_" is a literal character, not a pattern
transfers["status"] = transfers["status"].str.replace("_", " ", regex=False).str.title()

In [21]:
# Project-local helper modules (not part of the standard library or pandas).
# NOTE(review): `paths` is not referenced directly in the visible cells — presumably it is
# imported for a side effect (e.g. making `data` importable); confirm before removing or reordering.
import paths
import data
# Lookup table of GP2GP response codes. Despite the name this is a DataFrame, not a file handle;
# generate_high_level_table joins on its ErrorCode column to attach ResponseText to each error code.
error_code_lookup_file = pd.read_csv(data.gp2gp_response_codes.path)

In [22]:
# Count transfers per (status, failure reason) and express each count as a share of all transfers.
# Missing values are filled with "N/A" first so transfers without a failure reason still form a group.
outcome_counts = (
    transfers.fillna("N/A")
    .groupby(by=["status", "failure_reason"])
    .agg({"conversation_id": "count"})
    .rename(columns={"conversation_id": "Number of transfers"})
    # failure_reason is an index level after the groupby, so it has to be renamed through
    # rename_axis; a column rename silently leaves it unchanged.
    .rename_axis(index={"failure_reason": "Failure Reason"})
)
outcome_counts["% of transfers"] = (outcome_counts["Number of transfers"] / outcome_counts["Number of transfers"].sum()).multiply(100)
outcome_counts.round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of transfers,% of transfers
status,failure_reason,Unnamed: 2_level_1,Unnamed: 3_level_1
Integrated On Time,,87278,84.14
Process Failure,Integrated Late,2598,2.5
Process Failure,"Transferred, not integrated",7439,7.17
Technical Failure,COPC(s) not Acknowledged,987,0.95
Technical Failure,COPC(s) not sent,13,0.01
Technical Failure,Contains Fatal Sender Error,1584,1.53
Technical Failure,Core Extract not Sent,1509,1.45
Technical Failure,Final Error,1608,1.55
Technical Failure,Request not Acknowledged,445,0.43
Unclassified Failure,Ambiguous COPC messages,207,0.2


In [23]:
# Calendar month (pandas Period) of the request date; used further down to write one Excel sheet per month
transfers['month']=transfers['date_requested'].dt.to_period('M')

In [24]:
def convert_error_list_to_tuple(error_code_list, error_code_type):
    """Label each distinct error code in a list with the stage it came from.

    Parameters
    ----------
    error_code_list : iterable of numbers, or None
        Error codes for one transfer. NaN and None entries are treated as
        "no code" and skipped; a None list is treated as empty.
    error_code_type : str
        Label placed first in every returned tuple (e.g. "Sender").

    Returns
    -------
    list of (str, number)
        One ``(error_code_type, error_code)`` tuple per distinct code, in
        ascending code order so the result does not depend on set ordering.
    """
    # A null list cell would otherwise fail inside set()/iteration
    if error_code_list is None:
        return []
    # np.isnan raises on None, so None entries are filtered before the NaN check
    distinct_codes = {
        error_code
        for error_code in error_code_list
        if error_code is not None and not np.isnan(error_code)
    }
    return [(error_code_type, error_code) for error_code in sorted(distinct_codes)]

def combine_error_codes(row):
    """Collect every error code of one transfer into a single labelled list.

    Reads the sender, intermediate and final error code columns of ``row``
    (in that order) and labels their codes "Sender", "COPC" and "Final"
    respectively. If the transfer has no error code at all, a single
    ``("No Error Code", "No Error")`` placeholder is returned instead.
    """
    labelled_columns = (
        ("sender_error_codes", "Sender"),
        ("intermediate_error_codes", "COPC"),
        ("final_error_codes", "Final"),
    )
    combined_codes = []
    for column_name, stage_label in labelled_columns:
        combined_codes += convert_error_list_to_tuple(row[column_name], stage_label)
    # Placeholder keeps transfers without errors visible once the list is exploded
    return combined_codes if combined_codes else [("No Error Code", "No Error")]
    
# Row-wise: store each transfer's combined list of (stage, code) tuples in a new column
transfers["all_error_codes"] = transfers.apply(combine_error_codes, axis=1)

In [25]:
def generate_high_level_table(transfers_sample, error_code_lookup=None):
    """Summarise transfers by supplier pathway, outcome and error code.

    Parameters
    ----------
    transfers_sample : pandas.DataFrame
        Transfers to summarise. The columns read here are conversation_id,
        requesting_supplier, sending_supplier, status, failure_reason and
        all_error_codes (a list of ``(error_type, error_code)`` tuples per row).
    error_code_lookup : pandas.DataFrame, optional
        Table with ErrorCode and ResponseText columns used to describe each
        error code. Defaults to the notebook-level ``error_code_lookup_file``.

    Returns
    -------
    pandas.DataFrame
        One row per (requesting supplier, sending supplier, status, failure
        reason, error code) combination, sorted by 'Number of Transfers'
        descending, with these percentage columns:

        * '% of Transfers' - relative to all rows of ``transfers_sample``
        * '% Supplier Pathway Transfers' - relative to the transfers on the
          same sending/requesting supplier pair
        * '% Paper Fallback' - relative to transfers that have a failure
          reason; only filled where status is not 'Integrated On Time'
        * '% of error codes' - relative to all real error codes; only filled
          for rows that carry an error code

    Notes
    -----
    A transfer with several error codes contributes one row per code after
    the explode, while the denominators count each transfer once, so the
    percentages are not guaranteed to add up to 100.
    """
    no_error_code = ('No Error Code', 'No Error')
    # One label for missing values everywhere, so supplier names in the table always match
    # the keys of the pathway counts (differing labels made the pathway lookup raise KeyError).
    missing_value_label = "N/A"
    if error_code_lookup is None:
        # Fall back to the lookup loaded in an earlier notebook cell
        error_code_lookup = error_code_lookup_file

    # Break up lines by error code
    transfers_split_by_error_code=transfers_sample.explode("all_error_codes")

    # Create High level table
    high_level_table=transfers_split_by_error_code.fillna(missing_value_label).groupby(["requesting_supplier","sending_supplier","status","failure_reason","all_error_codes"]).agg({'conversation_id':'count'})
    high_level_table=high_level_table.rename({'conversation_id':'Number of Transfers'},axis=1).reset_index()

    # Count % of transfers
    total_number_transfers = transfers_sample.shape[0]
    high_level_table['% of Transfers']=(high_level_table['Number of Transfers']/total_number_transfers).multiply(100)

    # Count by supplier pathway
    supplier_pathway_counts = transfers_sample.fillna(missing_value_label).groupby(by=["sending_supplier", "requesting_supplier"]).agg({"conversation_id": "count"})['conversation_id']
    high_level_table['% Supplier Pathway Transfers']=high_level_table.apply(lambda row: row['Number of Transfers']/supplier_pathway_counts.loc[(row['sending_supplier'],row['requesting_supplier'])],axis=1).multiply(100)

    # Add in Paper Fallback columns
    total_fallback = transfers_sample["failure_reason"].dropna().shape[0]
    fallback_bool=high_level_table['status']!='Integrated On Time'
    high_level_table.loc[fallback_bool,'% Paper Fallback']=(high_level_table['Number of Transfers']/total_fallback).multiply(100)

    # % of error codes column
    # errors="ignore": a sample in which every transfer has an error code has no placeholder entry to drop
    total_number_of_error_codes=transfers_split_by_error_code['all_error_codes'].value_counts().drop(no_error_code, errors="ignore").sum()
    error_code_bool=high_level_table['all_error_codes']!=no_error_code
    high_level_table.loc[error_code_bool,'% of error codes']=(high_level_table['Number of Transfers']/total_number_of_error_codes).multiply(100)

    # Adding columns to describe errors
    high_level_table['error_type']=high_level_table['all_error_codes'].apply(lambda error_tuple: error_tuple[0])
    high_level_table['error_code']=high_level_table['all_error_codes'].apply(lambda error_tuple: error_tuple[1])
    high_level_table=high_level_table.merge(error_code_lookup[['ErrorCode','ResponseText']],left_on='error_code',right_on='ErrorCode',how='left')

    # Select and re-order table
    grouping_columns_order=['requesting_supplier','sending_supplier','status','failure_reason','error_type','ResponseText','error_code']
    counting_columns_order=['Number of Transfers','% of Transfers','% Supplier Pathway Transfers','% Paper Fallback','% of error codes']
    high_level_table=high_level_table[grouping_columns_order+counting_columns_order].sort_values(by='Number of Transfers',ascending=False)

    return high_level_table

In [26]:
# Write one workbook: an "All" sheet covering every transfer, plus one sheet per month of request date
with pd.ExcelWriter("High Level Table First 2 weeks of August PRMT-2324.xlsx") as writer:
    generate_high_level_table(transfers.copy()).to_excel(writer, sheet_name="All",index=False)
    # Plain loop instead of a list comprehension: the writes are side effects and no list is needed
    for month in transfers['month'].unique():
        monthly_transfers = transfers[transfers['month']==month].copy()
        generate_high_level_table(monthly_transfers).to_excel(writer, sheet_name=str(month),index=False)