# PRMT-2460 November - Produce transfer level report of failures for November

## Context
Produce a transfer-level report of failures using a week's worth of data
- Date range: November 18th - November 24th (7 days)
- No cutoff - but set overflow as late as possible (gives transfers earlier on more time to receive messages)

In [200]:
import pandas as pd 
import numpy as np
import paths, data
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import datetime
from data.practice_metadata import read_asid_metadata

In [201]:
# Series indexed by the "ErrorCode" column whose values are the "ErrorName" column
response_codes_csv_path = data.gp2gp_response_codes.path
error_code_lookup_file = pd.read_csv(response_codes_csv_path)
error_code_lookup = error_code_lookup_file.set_index("ErrorCode")["ErrorName"]

In [202]:
# Organisation metadata (joined below on the ASID columns) and the location of the transfer extract
asid_lookup = read_asid_metadata("prm-gp2gp-ods-metadata-prod", "v2/2021/11/organisationMetadata.json")
transfer_file_location = "s3://prm-gp2gp-notebook-data-prod/PRMT-2460-transfer-level-report-of-failures/transfer-data/v6/2021/11/2021-11-transfers.parquet"

transfers_raw = pd.read_parquet(transfer_file_location)

# Left-join the metadata twice: once prefixed for the requesting side, once for the sending side
requesting_practice_metadata = asid_lookup.add_prefix("requesting_")
sending_practice_metadata = asid_lookup.add_prefix("sending_")
transfers = (
    transfers_raw
    .join(requesting_practice_metadata, on="requesting_practice_asid", how="left")
    .join(sending_practice_metadata, on="sending_practice_asid", how="left")
)

# Underscores become spaces; the first letter is upper-cased and the rest lower-cased
transfers["status"] = transfers["status"].str.replace("_", " ").str.capitalize()

len(transfers)

63317

In [203]:
# Keep transfers requested from 18 November 2021 (inclusive) up to 25 November 2021 (exclusive)
first_half_date_filter_bool_start = transfers["date_requested"].ge(datetime(2021, 11, 18))
first_half_date_filter_bool_end = transfers["date_requested"].lt(datetime(2021, 11, 25))
transfers_within_date_range = transfers.loc[
    first_half_date_filter_bool_start & first_half_date_filter_bool_end
]
len(transfers_within_date_range)

53411

In [204]:
# Failures of interest: status is either "Technical failure" or "Unclassified failure"
failure_statuses = ["Technical failure", "Unclassified failure"]
failed_transfers_bool = transfers_within_date_range["status"].isin(failure_statuses)
failed_transfers = transfers_within_date_range.loc[failed_transfers_bool]
len(failed_transfers)

1878

In [205]:
# Remove Nan and duplicate occurrences in error codes
def filter_error_codes(error_codes_column):
    """Return a Series holding, for each row, the distinct non-NaN codes of that row's list."""
    def distinct_non_nan(codes):
        # set() collapses repeated codes; NaN entries are then dropped one by one
        return [code for code in set(codes) if not np.isnan(code)]

    return error_codes_column.apply(distinct_non_nan)

# Take an independent copy first; otherwise pandas complains that the column assignments
# below are being made on a filtered slice of the original dataset
failed_transfers = failed_transfers.copy()

for source_column, unique_column in [
    ("sender_error_codes", "unique_sender_error_codes"),
    ("final_error_codes", "unique_final_error_codes"),
    ("intermediate_error_codes", "unique_intermediate_error_codes"),
]:
    failed_transfers[unique_column] = filter_error_codes(failed_transfers[source_column])
failed_transfers.head(1)

Unnamed: 0,conversation_id,sla_duration,requesting_practice_asid,sending_practice_asid,requesting_supplier,sending_supplier,sender_error_codes,final_error_codes,intermediate_error_codes,status,...,requesting_practice_name,requesting_ccg_ods_code,requesting_ccg_name,sending_practice_ods_code,sending_practice_name,sending_ccg_ods_code,sending_ccg_name,unique_sender_error_codes,unique_final_error_codes,unique_intermediate_error_codes
88,B98BF520-4D37-11EC-9A41-E9231299C290,115818.0,760915831019,847520267018,SystmOne,EMIS,[nan],[31.0],[],Technical failure,...,MILTON ROAD SURGERY,07G,NHS THURROCK CCG,G85023,LEWISHAM MEDICAL CENTRE,72Q,NHS SOUTH EAST LONDON CCG,[],[31.0],[]


In [206]:
# NOTE(review): this cell repeats the import and the lookup construction done in earlier cells.
# Running it re-binds `data` to the imported module and rebuilds error_code_lookup
# (a Series indexed by "ErrorCode" with "ErrorName" values).
import paths, data
error_code_lookup_file = pd.read_csv(data.gp2gp_response_codes.path)
error_code_lookup = error_code_lookup_file.set_index("ErrorCode")["ErrorName"]

In [207]:
# Add descriptions from lookup to error code columns and ensure errors are sorted
def convert_error_list_to_formatted_string_list_with_error_lookup(error_code_list):
    """Format error codes as a sorted, comma-separated "<code> - <name>" string.

    Duplicates and NaN entries are dropped. Names are read from the module-level
    ``error_code_lookup``; a code that is absent from the lookup is labelled
    "Unknown error code" rather than raising KeyError and aborting the report.

    Parameters
    ----------
    error_code_list : iterable of numbers
        Error codes (numpy or plain Python numbers); may contain NaN and repeats.

    Returns
    -------
    str
        e.g. "12 - <name>, 30 - <name>", or "" when no codes remain.
    """
    # Drop NaN *before* sorting: NaN compares False against everything, so leaving it in
    # the collection makes the resulting order unreliable.
    unique_codes = sorted({error_code for error_code in error_code_list if not np.isnan(error_code)})
    # int() works for numpy scalars and plain Python numbers alike
    return ", ".join(
        f"{int(error_code)} - {error_code_lookup.get(error_code, 'Unknown error code')}"
        for error_code in unique_codes
    )

# For each unique-codes column, add a sibling column holding the formatted "<code> - <name>" text
for unique_column, described_column in [
    ("unique_sender_error_codes", "unique_sender_error_codes_with_description"),
    ("unique_final_error_codes", "unique_final_error_codes_with_description"),
    ("unique_intermediate_error_codes", "unique_intermediate_error_codes_with_description"),
]:
    failed_transfers[described_column] = failed_transfers[unique_column].apply(
        convert_error_list_to_formatted_string_list_with_error_lookup
    )

In [208]:
# Columns kept for the report, in output order, mapped to the heading each one gets
report_column_headings = {
    "sending_practice_ods_code": "Sending Practice ODS",
    "sending_supplier": "Sending Supplier",
    "requesting_practice_ods_code": "Requesting Practice ODS",
    "requesting_supplier": "Requesting Supplier",
    "conversation_id": "Conversation ID",
    "date_requested": "Date Requested",
    "status": "Status",
    "failure_reason": "Failure Reason",
    "unique_sender_error_codes_with_description": "Unique Sender Errors",
    "unique_final_error_codes_with_description": "Unique Final Errors",
    "unique_intermediate_error_codes_with_description": "Unique Intermediate Errors",
}
columns_to_keep = list(report_column_headings)

# Select first, then rename, so the frame only ever carries the report columns
failed_transfers_with_filtered_and_renamed_columns = (
    failed_transfers
    .loc[:, columns_to_keep]
    .rename(columns=report_column_headings)
)

failed_transfers_with_filtered_and_renamed_columns.head(2)

Unnamed: 0,Sending Practice ODS,Sending Supplier,Requesting Practice ODS,Requesting Supplier,Conversation ID,Date Requested,Status,Failure Reason,Unique Sender Errors,Unique Final Errors,Unique Intermediate Errors
88,G85023,EMIS,F81641,SystmOne,B98BF520-4D37-11EC-9A41-E9231299C290,2021-11-24 15:03:48.049,Technical failure,Final error,,31 - Missing LM,
169,F83652,EMIS,K81018,EMIS,E259788F-9362-42FD-9B04-736A04123B9C,2021-11-24 09:00:06.413,Technical failure,Final error,,11 - Failed to integrate,


In [209]:
# For each error code, count the failed transfers that list it among their unique sender,
# final and intermediate error codes respectively.
# The dict is deliberately NOT named `data`: that name is bound to the `data` module imported
# at the top of the notebook, and rebinding it breaks every later `data.<...>` access.
error_code_counts = {
    'unique_sender_error_codes_count': failed_transfers.explode("unique_sender_error_codes").value_counts("unique_sender_error_codes"),
    'unique_final_error_codes_count': failed_transfers.explode("unique_final_error_codes").value_counts("unique_final_error_codes"),
    'unique_intermediate_error_codes_count': failed_transfers.explode("unique_intermediate_error_codes").value_counts("unique_intermediate_error_codes")
    }
# Rows are error codes; a NaN cell means the code never occurred in that column
df = pd.DataFrame(data=error_code_counts)

# Label matches the date filter applied to transfers_within_date_range (18-24 November inclusive)
print("November 18-24 after fix [total transfers:"+str(len(transfers_within_date_range))+", total failed transfers (technical + unclassified): "+str(len(failed_transfers))+"]")
df.sort_values(by=["unique_sender_error_codes_count", "unique_final_error_codes_count", "unique_intermediate_error_codes_count"], ascending=False)

November 17-25 after fix [total transfers:53411, total failed transfers (technical + unclassified): 1878]


Unnamed: 0,unique_sender_error_codes_count,unique_final_error_codes_count,unique_intermediate_error_codes_count
20.0,506.0,,
30.0,400.0,41.0,1.0
10.0,127.0,,
6.0,42.0,,
7.0,8.0,,
99.0,3.0,56.0,
23.0,3.0,,
14.0,2.0,,
19.0,2.0,,
12.0,,133.0,


In [210]:
# Total of each count column, summed down the rows
df.sum(axis="index")

unique_sender_error_codes_count          1093.0
unique_final_error_codes_count            455.0
unique_intermediate_error_codes_count     179.0
dtype: float64

In [211]:
# Format the errors to be readable strings
# Take an explicit copy first: transfers_within_date_range is the result of a boolean filter,
# and adding columns to such a result makes pandas emit SettingWithCopyWarning.
transfers_within_date_range = transfers_within_date_range.copy()

error_code_columns = ("sender_error_codes", "final_error_codes", "intermediate_error_codes")

# First pass: de-duplicated, NaN-free code lists
for error_code_column in error_code_columns:
    transfers_within_date_range["unique_" + error_code_column] = filter_error_codes(
        transfers_within_date_range[error_code_column]
    )

# Second pass (kept separate so the column order stays: all unique_* lists, then all descriptions)
for error_code_column in error_code_columns:
    transfers_within_date_range["unique_" + error_code_column + "_with_description"] = (
        transfers_within_date_range["unique_" + error_code_column]
        .apply(convert_error_list_to_formatted_string_list_with_error_lookup)
    )
transfers_within_date_range.head(1)

Unnamed: 0,conversation_id,sla_duration,requesting_practice_asid,sending_practice_asid,requesting_supplier,sending_supplier,sender_error_codes,final_error_codes,intermediate_error_codes,status,...,sending_practice_name,sending_ccg_ods_code,sending_ccg_name,all_error_codes,unique_sender_error_codes,unique_final_error_codes,unique_intermediate_error_codes,unique_sender_error_codes_with_description,unique_final_error_codes_with_description,unique_intermediate_error_codes_with_description
0,949AFE9C-889B-481A-B19B-25F5C76D1C67,,669024222046,200000000124,EMIS,EMIS,[nan],[],[],Process failure,...,WHITE HORSE MEDICAL PRACTICE,10Q,NHS OXFORDSHIRE CCG,"((No Error Code, No Error, N/A),)",[],[],[],,,


In [212]:
def generate_high_level_table(transfers_sample):
    """Summarise transfers per supplier pathway, status, failure reason and error combination.

    Parameters
    ----------
    transfers_sample : pandas.DataFrame
        Must contain the columns conversation_id, requesting_supplier, sending_supplier,
        status, failure_reason, unique_final_error_codes_with_description,
        unique_sender_error_codes_with_description and
        unique_intermediate_error_codes_with_description.

    Returns
    -------
    pandas.DataFrame
        One row per observed combination of the grouping columns, sorted by
        "number of transfers" descending, with the columns "% of transfers",
        "% of technical failures" (NaN unless status is "Technical failure") and
        "% of supplier pathway". Missing grouping values appear as empty strings.
    """
    grouping_columns_order = ["requesting_supplier", "sending_supplier", "status", "failure_reason", "unique_final_error_codes_with_description", "unique_sender_error_codes_with_description", "unique_intermediate_error_codes_with_description"]
    counting_columns_order = ["number of transfers", "% of transfers", "% of technical failures", "% of supplier pathway"]

    # Create High level table. Missing values are filled with "N/A" so that groupby keeps
    # those rows instead of silently dropping them.
    high_level_table = (
        transfers_sample.fillna("N/A")
        .groupby(grouping_columns_order)
        .agg({"conversation_id": "count"})
        .rename({"conversation_id": "number of transfers"}, axis=1)
        .reset_index()
    )

    # Count % of transfers
    total_number_transfers = transfers_sample.shape[0]
    high_level_table["% of transfers"] = (high_level_table["number of transfers"] / total_number_transfers).multiply(100)

    # Count % of technical failures; rows with any other status are left as NaN
    total_number_technical_failed_transfers = (transfers_sample["status"] == "Technical failure").sum()
    technical_failed_transfers_table_bool = high_level_table["status"] == "Technical failure"
    high_level_table["% of technical failures"] = (
        (high_level_table["number of transfers"] / total_number_technical_failed_transfers)
        .multiply(100)
        .where(technical_failed_transfers_table_bool)
    )

    # Count by supplier pathway. The table rows partition the sample, so the pathway total
    # is the sum of the row counts sharing a (sending, requesting) pair. Deriving it from the
    # table itself guarantees the pathway keys use the same missing-value placeholder as the
    # rows; a separately filled lookup raises KeyError whenever a supplier is missing.
    supplier_pathway_counts = (
        high_level_table
        .groupby(["sending_supplier", "requesting_supplier"])["number of transfers"]
        .transform("sum")
    )
    high_level_table["% of supplier pathway"] = (high_level_table["number of transfers"] / supplier_pathway_counts).multiply(100)

    # Select and re-order table
    high_level_table = high_level_table[grouping_columns_order + counting_columns_order].sort_values(by="number of transfers", ascending=False)

    # Rename the columns
    high_level_table = high_level_table.rename({
        "sending_supplier": "sending supplier",
        "requesting_supplier": "requesting supplier",
        "status": "status",
        "failure_reason": "failure reason",
        "unique_final_error_codes_with_description": "unique final errors",
        "unique_sender_error_codes_with_description": "unique sender errors",
        "unique_intermediate_error_codes_with_description": "unique intermediate errors"
        }, axis=1)

    # Replace all N/A with empty strings primarily for error columns
    high_level_table = high_level_table.replace("N/A", "")

    return high_level_table

In [120]:
# Write the error code combination table for the date-filtered transfers to its own workbook
with pd.ExcelWriter("PRMT-2460-Error-code-combination-table-November.xlsx") as writer:
    error_code_combination_table = generate_high_level_table(transfers_within_date_range)
    error_code_combination_table.to_excel(
        writer,
        sheet_name="Error Code Combination Table",
        index=False,
    )

In [143]:
# Write the transfer-level failure report (one row per failed transfer) to its own workbook
with pd.ExcelWriter("PRMT-2460-Transfer-level-report-of-failures-for-November-18-to-November-24-2021.xlsx") as writer:
    failed_transfers_with_filtered_and_renamed_columns.to_excel(
        writer,
        sheet_name="Transfer level failures",
        index=False,
    )