# Hypothesis: Majority of Vision pending is technical failure
**We believe that** for Vision pending without error transfers (Vision is the sender)
the majority of these are technical failures (i.e. get stuck) 

**We will know this to be true when** we see for a sample of data that for pending transfers where Vision is a sender, more than 50% show an incomplete message pattern

### Scope

Group together the message patterns for a sample of pending transfers where Vision is the sender

Show the proportions and actuals of each group

In [1]:
import pandas as pd 
import numpy as np

In [8]:
# Import transfer files to extract whether message creator is sender or requester
# Using data generated from branch PRMT-1742-duplicates-analysis.
# This is needed to correctly handle duplicates.
# Once the upstream pipeline has a fix for duplicate EHRs, then we can go back to using the main output.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-sample-5/"
transfer_files = [
    "2020-9-transfers.parquet",
    "2020-10-transfers.parquet",
    "2020-11-transfers.parquet",
    "2020-12-transfers.parquet",
    "2021-1-transfers.parquet",
    "2021-2-transfers.parquet"
]

transfer_input_files = [transfer_file_location + f for f in transfer_files]
transfers_raw = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))

# In the data from the PRMT-1742-duplicates-analysis branch, these columns have been added , but contain only empty values.
transfers_raw = transfers_raw.drop(["sending_supplier", "requesting_supplier"], axis=1)
transfers = transfers_raw.copy()

# Correctly interpret certain sender errors as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
# Step Two: reclassifying the relevant transfers with pending sender error codes to FAILED DUE TO SENDER ERROR CODE status for comparison
pending_sender_error_codes=[6,7,10,24,30,23,14,99]
transfers_with_pending_sender_code_bool=transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool=transfers['status']=='PENDING_WITH_ERROR'
transfers_which_need_pending_to_failure_change_bool=transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
transfers.loc[transfers_which_need_pending_to_failure_change_bool,'status']='FAILED DUE TO SENDER ERROR CODE'

# Add integrated Late status
eight_days_in_seconds=8*24*60*60
transfers_after_sla_bool=transfers['sla_duration']>eight_days_in_seconds
transfers_with_integrated_bool=transfers['status']=='INTEGRATED'
transfers_integrated_late_bool=transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool,'status']='INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfers status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds=28*24*60*60
transfers_after_month_bool=transfers['sla_duration']>twenty_eight_days_in_seconds
transfers_pending_at_month_bool=transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool,'status']='PENDING'
transfers_with_early_error_bool=(~transfers.loc[:,'sender_error_code'].isna()) |(~transfers.loc[:,'intermediate_error_codes'].apply(len)>0)
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool,'status']='PENDING_WITH_ERROR'

# Supplier name mapping
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

# Generate ASID lookup that contains all the most recent entry for all ASIDs encountered
asid_file_location = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/"
asid_files = [
    "asidLookup-Nov-2020.csv.gz",
    "asidLookup-Dec-2020.csv.gz",
    "asidLookup-Jan-2021.csv.gz",
    "asidLookup-Feb-2021.csv.gz",
    "asidLookup-Mar-2021.csv.gz",
    "asidLookup-Apr-2021.csv.gz"
]
asid_lookup_files = [asid_file_location + f for f in asid_files]
asid_lookup = pd.concat((
    pd.read_csv(f)
    for f in asid_lookup_files
))
asid_lookup = asid_lookup.drop_duplicates().groupby("ASID").last().reset_index()
lookup = asid_lookup[["ASID", "MName", "NACS","OrgName"]]

transfers = transfers.merge(lookup, left_on='requesting_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code','OrgName':'requesting_practice_name'}, axis=1)
transfers = transfers.merge(lookup, left_on='sending_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code','OrgName':'sending_practice_name'}, axis=1)

transfers["sending_supplier"] = transfers["sending_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())

# Making the status to be more human readable here
transfers["status"] = transfers["status"].str.replace("_", " ").str.title()

In [9]:
COPC_tag = "-reduced-COPCs" 
conversations_extended_interaction_messages=pd.read_parquet(f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages{COPC_tag}.parquet')
# turning messages from list of list to tuple of tuples (since they are hasable)
conversations_extended_interaction_messages["messages"]=conversations_extended_interaction_messages["messages"].apply(lambda message_list: tuple([tuple(message) for message in message_list]))
# Attach this message list to the transfers dataframe
transfers_with_message_list = transfers.merge(conversations_extended_interaction_messages, left_on="conversation_id", right_index=True)

In [83]:
print(f"Total number of transfers in time range: {transfers_with_message_list.shape[0]}")
vision_sending_transfers=transfers_with_message_list.loc[transfers_with_message_list['sending_supplier']=='Vision']
print(f"Number of transfers with Vision as the Sender: {vision_sending_transfers.shape[0]}")

vision_sending_pending_transfers=vision_sending_transfers.loc[vision_sending_transfers['status']=='Pending']
print(f"Number of these Vision-Sender transfers which are Pending: {vision_sending_pending_transfers.shape[0]}")
print(f"Percentage of these Vision-Sender transfers which are Pending: {round(100*vision_sending_pending_transfers.shape[0]/vision_sending_transfers.shape[0],2)}%")


Total number of transfers in time range: 1343234
Number of transfers with Vision as the Sender: 29092
Number of these Vision-Sender transfers which are Pending: 12788
Percentage of these Vision-Sender transfers which are Pending: 43.96%


In [82]:
vision_sending_pending_with_core_transfer_bool=vision_sending_pending_transfers['messages'].apply(lambda messages: True in [message[1]=='req complete' for message in messages])

print(f"Number of Pending Vision-Sender transfers with a Request Completed Message (ie the Core Extract is sent): {vision_sending_pending_with_core_transfer_bool.sum()}")
print(f"Percentage of these Pending Vision-Sender with a Request Completed Message (ie the Core Extract is sent): {round(100*vision_sending_pending_with_core_transfer_bool.mean(),2)}%")




Number of Pending Vision-Sender transfers with a Request Completed Message (ie the Core Extract is sent): 680
Percentage of these Pending Vision-Sender with a Request Completed Message (ie the Core Extract is sent): 5.32%


In [34]:
vision_sending_pending_messages=pd.DataFrame(vision_sending_pending_transfers['messages'].value_counts(dropna=False).rename('Number of Pending Transfers'))
vision_sending_pending_messages['% of Pending Transfers']=100*vision_sending_pending_messages['Number of Pending Transfers']/vision_sending_pending_transfers.shape[0]
vision_sending_pending_messages.round(2)

Unnamed: 0,Number of Pending Transfers,% of Pending Transfers
"((requestor, req start, ), (sender, req start ack, ))",11495,89.89
"((requestor, req start, ), (sender, req start ack, ), (sender, req complete, ))",675,5.28
"((requestor, req start, ),)",599,4.68
"((requestor, req start, ), (sender, req start ack, ), (sender, req start ack, ))",9,0.07
"((requestor, req start, ), (sender, req start ack, ), (requestor, ack, ))",4,0.03
"((requestor, req start, ), (sender, req start ack, ), (sender, req complete, ), (requestor, req complete ack, 11), (requestor, req complete ack, 11))",2,0.02
"((requestor, req start, ), (sender, req start ack, ), (sender, req complete, ), (sender, req complete, ), (requestor, req complete ack, 12))",1,0.01
"((requestor, req start, ), (sender, req complete, ))",1,0.01
"((requestor, req start, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), (sender, req start ack, ), ...)",1,0.01
"((requestor, req start, ), (sender, req complete, ), (sender, req start ack, ))",1,0.01
