# PRMT-2124 - Can we consider transfers without a Sender Request Completed message as failed?

In [7]:
import pandas as pd
import numpy as np
import time

In [8]:
# Import transfer files to extract whether message creator is sender or requester
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-sample-5/"
transfer_files = [
    "2020-9-transfers.parquet",
    "2020-10-transfers.parquet",
    "2020-11-transfers.parquet",
    "2020-12-transfers.parquet",
    "2021-1-transfers.parquet",
    "2021-2-transfers.parquet",
    "2021-3-transfers.parquet"
]

transfer_input_files = [transfer_file_location + f for f in transfer_files]
transfers_raw = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))

# In the data from the PRMT-1742-duplicates-analysis branch, these columns have been added , but contain only empty values.
transfers_raw = transfers_raw.drop(["sending_supplier", "requesting_supplier"], axis=1)
transfers = transfers_raw.copy()

# Correctly interpret certain sender errors as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
# Step Two: reclassifying the relevant transfers with pending sender error codes to FAILED DUE TO SENDER ERROR CODE status for comparison
pending_sender_error_codes=[6,7,10,24,30,23,14,99]
transfers_with_pending_sender_code_bool=transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool=transfers['status']=='PENDING_WITH_ERROR'
transfers_which_need_pending_to_failure_change_bool=transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
transfers.loc[transfers_which_need_pending_to_failure_change_bool,'status']='FAILED DUE TO SENDER ERROR CODE'

# Add integrated Late status
eight_days_in_seconds=8*24*60*60
transfers_after_sla_bool=transfers['sla_duration']>eight_days_in_seconds
transfers_with_integrated_bool=transfers['status']=='INTEGRATED'
transfers_integrated_late_bool=transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool,'status']='INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfers status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds=28*24*60*60
transfers_after_month_bool=transfers['sla_duration']>twenty_eight_days_in_seconds
transfers_pending_at_month_bool=transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool,'status']='PENDING'
transfers_with_early_error_bool=(~transfers.loc[:,'sender_error_code'].isna()) |(~transfers.loc[:,'intermediate_error_codes'].apply(len)>0)
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool,'status']='PENDING_WITH_ERROR'

# Supplier name mapping
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

# Generate ASID lookup that contains all the most recent entry for all ASIDs encountered
asid_file_location = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/"
asid_files = [
    "asidLookup-Nov-2020.csv.gz",
    "asidLookup-Dec-2020.csv.gz",
    "asidLookup-Jan-2021.csv.gz",
    "asidLookup-Feb-2021.csv.gz",
    "asidLookup-Mar-2021.csv.gz"
]
asid_lookup_files = [asid_file_location + f for f in asid_files]
asid_lookup = pd.concat((
    pd.read_csv(f)
    for f in asid_lookup_files
))
asid_lookup = asid_lookup.drop_duplicates().groupby("ASID").last().reset_index()
lookup = asid_lookup[["ASID", "MName", "NACS","OrgName"]]

transfers = transfers.merge(lookup, left_on='requesting_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code','OrgName':'requesting_practice_name'}, axis=1)
transfers = transfers.merge(lookup, left_on='sending_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code','OrgName':'sending_practice_name'}, axis=1)

transfers["sending_supplier"] = transfers["sending_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())

# Making the status to be more human readable here
transfers["status"] = transfers["status"].str.replace("_", " ").str.title()


In [9]:
def add_messages_parquet_to_transfers(parquet_file_name, transfers_df):
    conversations_extended_interaction_messages=pd.read_parquet(parquet_file_name)
    # turning messages from list of list to tuple of tuples (since they are hasable)
    conversations_extended_interaction_messages["messages"]=conversations_extended_interaction_messages["messages"].apply(lambda message_list: tuple([tuple(message) for message in message_list]))
    # Attach this message list to the transfers dataframe
    transfers_with_message_list = transfers_df.merge(conversations_extended_interaction_messages, left_on="conversation_id", right_index=True)
    return transfers_with_message_list

## Part A

**We believe that** transfers which do not receive a “Request Completed” message from the Sender within 14 days of the request started

**can be considered** technical failures

**we will know this to be true when** we see that for a sample of data without this message in 14 days, the message does not arrive, nor does transfer completion occur, within 1-8 months

### Scope

1. Extract all conversation ids from September to March (7 months) parquet files
2. Identify all messages within these conversation IDs using raw Spine data from September to April
3. Extract conversations with no Request Completed Message in the first 14 days
4. Count how many of these contain a later Request Completed or successful transfer message

In [4]:
overwrite_files = True

In [5]:
# Generating a mapping of practice asid, and whether they were the sender or requestor in that conversation
requesting_supplier_type_map = transfers[["conversation_id", "requesting_practice_asid", "date_requested"]].drop_duplicates()
sending_supplier_type_map = transfers[["conversation_id", "sending_practice_asid", "date_requested"]].drop_duplicates()

requesting_supplier_type_map["supplier_type"] = "requestor"
sending_supplier_type_map["supplier_type"] = "sender"

requesting_supplier_type_map = requesting_supplier_type_map.rename({"requesting_practice_asid": "practice_asid"}, axis=1)
sending_supplier_type_map = sending_supplier_type_map.rename({"sending_practice_asid": "practice_asid"}, axis=1)

supplier_type_mapping = pd.concat([requesting_supplier_type_map, sending_supplier_type_map])
supplier_type_mapping["practice_asid"] = supplier_type_mapping["practice_asid"].astype(int)

conversation_ids_of_interest=transfers['conversation_id'].values

# Define a list of files to be loaded in
#folder="s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data-with-ack-codes-prmt-2059/"
folder="s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/"
files=["Sept-2020","Oct-2020","Nov-2020","Dec-2020","Jan-2021","Feb-2021","Mar-2021"]
full_filenames=[folder + file + ".csv.gz" for file in files]

# Rename message types to be human readable
interaction_name_mapping={"urn:nhs:names:services:gp2gp/RCMR_IN010000UK05":"req start",
"urn:nhs:names:services:gp2gp/RCMR_IN030000UK06":"req complete",
"urn:nhs:names:services:gp2gp/COPC_IN000001UK01":"COPC",
"urn:nhs:names:services:gp2gp/MCCI_IN010000UK13":" ack"}

#ackTypeCode_mapping={'AE':"Neg","AR":"Neg","ER":"Neg","IF":"Pos","NONE":"Pos"}


In [6]:
# This function will take a set of Spine data and for each message in the conversation we're interested in, it will output a dataframe
# with "conversation_id", "supplier_type", "interaction_name", "jdiEvent", "GUID", "messageRef" for each given message in the order
# they occur
def generate_single_frame(file):
    a=time.perf_counter()
    print("Now Processing " + file)
    all_messages_in_file=pd.read_csv(file, compression='gzip')

    # Only keep conversations from the conversations that we actually want to use
    monthly_relevant_messages=all_messages_in_file.loc[all_messages_in_file['conversationID'].isin(conversation_ids_of_interest)]
    monthly_relevant_messages=monthly_relevant_messages.sort_values(by='_time')
    monthly_relevant_messages = monthly_relevant_messages.merge(supplier_type_mapping, left_on=["conversationID", "messageSender"], right_on=["conversation_id", "practice_asid"], how="left")
    
    # Add time of messages occurring
    monthly_relevant_messages["time_of_message"] = (pd.to_datetime(monthly_relevant_messages["_time"]).dt.tz_localize(None) - monthly_relevant_messages["date_requested"]).dt.total_seconds().astype(str)
    monthly_messages = monthly_relevant_messages

    # map the message name to human readable form using supplier mapping
    monthly_messages['interaction_name']=monthly_messages['interactionID'].replace(interaction_name_mapping)
    monthly_messages["jdiEvent"] = monthly_messages["jdiEvent"].replace("NONE", "")
    
    monthly_messages=monthly_messages[["conversation_id","supplier_type","interaction_name","jdiEvent","GUID","messageRef","time_of_message" ]]
    print(time.perf_counter()-a)
    return monthly_messages



In [7]:
all_messages=[generate_single_frame(file) for file in full_filenames]

print('Now Concatenating all months of data')
a=time.perf_counter()
all_messages=pd.concat(all_messages,axis=0)
print(time.perf_counter()-a)

Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Sept-2020.csv.gz
59.076444541000456
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Oct-2020.csv.gz
52.314487456998904
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Nov-2020.csv.gz
48.13411116899988
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Dec-2020.csv.gz
44.745309570000245
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Jan-2021.csv.gz
50.18377329900068
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Feb-2021.csv.gz
52.99704719299916
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Mar-2021.csv.gz
68.1787397930002
Now Concatenating all months of data
4.794414346999474


In [8]:
print('Now constructing full interactions')
a=time.perf_counter()
all_messages_listed=all_messages.merge(all_messages[['GUID','interaction_name']].rename({'interaction_name':'interaction_response'},axis=1),left_on='messageRef',right_on='GUID',how='left')
all_messages_listed['interaction_response']=all_messages_listed['interaction_response'].fillna("")
all_messages_listed['interaction']=all_messages_listed['interaction_response']+all_messages_listed['interaction_name']
all_messages_listed["messages"] = list(zip(all_messages_listed["supplier_type"], all_messages_listed["interaction"], all_messages_listed["jdiEvent"], all_messages_listed["time_of_message"] ))
all_messages_listed["messages"] = all_messages_listed["messages"].apply(list)
all_messages_listed=all_messages_listed[["conversation_id", "messages"]]
all_messages_listed
print(time.perf_counter()-a)

print('Now Grouping by conversation')
a=time.perf_counter()
full_field_data=all_messages_listed.groupby('conversation_id')['messages'].apply(list)
print(time.perf_counter()-a)

if overwrite_files:
    print('Now Saving Data')
    pd.DataFrame(full_field_data).to_parquet(f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages_with_time.parquet')

Now constructing full interactions
146.62699596099992
Now Grouping by conversation
83.8268605309986
Now Saving Data


In [9]:
full_field_data["000009F9-DF15-4597-9218-4024D7A79145"]

[['requestor', 'req start', '', '0.0'],
 ['sender', 'req complete', '', '10.729000000000001'],
 ['sender', 'req start ack', '', '11.616000000000001'],
 ['requestor', 'req complete ack', '', '18637.791']]

In [10]:
conversations_extended_interaction_messages_with_time_file_name = 's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages_with_time.parquet'
transfers_with_time_messages = add_messages_parquet_to_transfers(conversations_extended_interaction_messages_with_time_file_name, transfers)

### How long does it take for a sender request complete message to arrive from request start message

In [11]:
# returns a time if sender request completed message
transfers_with_time_messages["Sender req complete times/s"] = transfers_with_time_messages["messages"].apply(lambda messages: [message[3] for message in messages if message[0:3] == ('sender', 'req complete', '')])

In [12]:
transfers_with_time_messages["Min sender req complete time/s"] = transfers_with_time_messages["Sender req complete times/s"].apply(lambda times: min(times) if len(times) > 0 else np.nan)

In [13]:
print("Hours")
(transfers_with_time_messages["Min sender req complete time/s"].dropna().astype(float).describe())/3600

Hours


count     443.742222
mean        1.992413
std        43.934522
min         0.000338
25%         0.002464
50%         0.004159
75%         0.008186
max      4395.096212
Name: Min sender req complete time/s, dtype: float64

In [14]:
fourteen_days_in_seconds = 14 * 24 * 60 * 60
percent_sender_req_complete_received_before_14_days = (transfers_with_time_messages["Min sender req complete time/s"].astype(float) <= fourteen_days_in_seconds).mean() * 100

In [15]:
print(f"There are {(100 - percent_sender_req_complete_received_before_14_days).round(2)}% transfers where a sender req complete message is received after 14 days")

There are 3.4% transfers where a sender req complete message is received after 14 days


### Is integration possible without sender request complete message

In [16]:
# -- anything that did integrate at some point
transfers_with_time_messages["integrated"] = transfers_with_time_messages["messages"].apply(lambda messages: True in [True for message in messages if (message[0:3] == ("requestor", "req complete ack", '')) | (message[0:3] == ("requestor", "req complete ack", "15"))])
transfers_with_time_messages["Contains sender req complete"] = transfers_with_time_messages["messages"].apply(lambda messages: True in [True for message in messages if message[0:3] == ('sender', 'req complete', '')])

In [17]:
transfers_with_time_messages.pivot_table(index="Contains sender req complete", columns="integrated", aggfunc="count", values="conversation_id")

integrated,False,True
Contains sender req complete,Unnamed: 1_level_1,Unnamed: 2_level_1
False,53490.0,
True,64271.0,1533201.0


The transfer can only integrate when there is sender req complete message

## Part B

**We also believe that** reclassifying these transfers as Failed, will classify the transfers with fatal Sender errors as failures, and therefore make the following redundant: 
**We will know this to be true when** we can see that any transfers that would be classified as Failures because they contain these errors, do not contain the Request Completed message. 

### Scope

1. Use 7 months of Spine Parquet files (Sept-2020 to Mar-2020 s3://prm-gp2gp-data-sandbox-dev/transfers-sample-5/)
2. Re-label transfers which would be considered failures due to sender error as “Failed due to Sender Error”
3. Add new status label for transfers not containing Request Completed Message
4. Compare change in status for all “Failed due to Sender Error Messages”

In [None]:
transfers["status"].value_counts()

## Step 3

In [None]:
conversations_extended_interaction_messages_file_name = 's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages.parquet'
transfers_with_message_list = add_messages_parquet_to_transfers(transfers_with_time_file_name, transfers)

In [None]:
sender_req_completed_message = ('sender', 'req complete', '')

In [None]:
transfers_with_sender_req_completed_bool = transfers_with_message_list["messages"].apply(lambda messages: sender_req_completed_message in messages)
transfers_with_message_list_and_new_status = transfers_with_message_list.copy()
transfers_with_message_list_and_new_status["New Status"] = transfers_with_message_list_and_new_status["status"]
transfers_with_message_list_and_new_status = transfers_with_message_list_and_new_status.rename({"status": "Old Status"}, axis=1)
transfers_with_message_list_and_new_status.loc[~transfers_with_sender_req_completed_bool, "New Status"] = "Failed"

In [None]:
transfers_with_message_list_and_new_status.groupby(by=["Old Status", "New Status"]).agg({"conversation_id":"count"})

In [None]:
pd.set_option('display.max_colwidth', None)
old_status_sender_error_failure_bool = transfers_with_message_list_and_new_status["Old Status"] == "Failed Due To Sender Error Code"
new_status_sender_error_failure_bool = transfers_with_message_list_and_new_status["New Status"] == "Failed Due To Sender Error Code"
transfers_with_message_list_and_new_status.loc[old_status_sender_error_failure_bool & new_status_sender_error_failure_bool, "messages"]