# PRMT-2059 Generate high-level table of message pattern prevalence
We believe that being able to break down transfers by the set of messages that occur between the sending and receiving supplier will give us a better understanding of what the actual status of the transfer is. 

In particular, Pending transfers may have a technical issue or may be awaiting practice integration - we may be able to distinguish between these. 
For 6 months of transfers (September 2020 to Feb 2021), we wish to be able to see the list of messages in the form:
- The message creator (sending or requesting practice)
- The message type (interaction name)
- Any associated code (JDI event)

We then wish to break down all transfers by:
- The supplier pathway
- The Status
- The message chain
We then order these in a table according to how common they are.


In [None]:
import pandas as pd
import numpy as np
import time

In [None]:
# When True, the generated parquet/CSV outputs are written back to S3.
overwrite_files = False

# When True, a transfer in which several COPC messages were each successfully
# acknowledged is reduced to its first COPC/acknowledgement pair only.
reduce_COPC_messages = True

# Suffix embedded in output file names to mark the reduced-COPC variant.
if reduce_COPC_messages:
    COPC_tag = "-reduced-COPCs"
else:
    COPC_tag = ""

## Importing transfer data

In [None]:
# Load the monthly transfer files; they tell us which practice was the sender
# and which was the requester in each conversation.
# The data comes from branch PRMT-1742-duplicates-analysis, which is needed to
# handle duplicates correctly. Once the upstream pipeline has a fix for
# duplicate EHRs we can go back to using the main output.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet",
]

transfer_input_files = [f"{transfer_file_location}{file_name}" for file_name in transfer_files]
monthly_transfer_frames = [pd.read_parquet(path) for path in transfer_input_files]
transfers_raw = pd.concat(monthly_transfer_frames)

# In the PRMT-1742-duplicates-analysis data these two columns exist but only
# hold empty values, so drop them (they are re-derived from the ASID lookup).
transfers_raw = transfers_raw.drop(columns=["sending_supplier", "requesting_supplier"])

# Given the findings in PRMT-1742 (many duplicate EHR errors are misclassified),
# reclassify a transfer as INTEGRATED when any of its request-completed ack
# codes is either NaN or 15.
def has_at_least_one_successful_integration_code(errors):
    """Return True if any ack code in *errors* is NaN or equal to 15."""
    return any(np.isnan(e) or e == 15 for e in errors)


successful_transfers_bool = transfers_raw['request_completed_ack_codes'].apply(has_at_least_one_successful_integration_code)
transfers = transfers_raw.copy()
transfers.loc[successful_transfers_bool, "status"] = "INTEGRATED"

# Correctly interpret certain sender errors as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
pending_sender_error_codes = [6, 7, 10, 24, 30, 23, 14, 99]
transfers_with_pending_sender_code_bool = transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool = transfers['status'] == 'PENDING_WITH_ERROR'
transfers_which_need_pending_to_failure_change_bool = transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
transfers.loc[transfers_which_need_pending_to_failure_change_bool, 'status'] = 'FAILED'

# Add the "INTEGRATED LATE" status: integrated, but after the 8 day SLA.
eight_days_in_seconds = 8 * 24 * 60 * 60
transfers_after_sla_bool = transfers['sla_duration'] > eight_days_in_seconds
transfers_with_integrated_bool = transfers['status'] == 'INTEGRATED'
transfers_integrated_late_bool = transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool, 'status'] = 'INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfer's
# status 28 days after it was made.
# TBD how this is handled upstream in the pipeline.
twenty_eight_days_in_seconds = 28 * 24 * 60 * 60
transfers_after_month_bool = transfers['sla_duration'] > twenty_eight_days_in_seconds
transfers_pending_at_month_bool = transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool, 'status'] = 'PENDING'

# A transfer has an "early" error when it carries a sender error code or at
# least one intermediate error code. The length is compared directly with 0:
# writing `~length > 0` would parse as `(~length) > 0`, which is never true for
# a non-negative length and would silently ignore intermediate errors.
has_sender_error_bool = transfers['sender_error_code'].notna()
has_intermediate_error_bool = transfers['intermediate_error_codes'].apply(len) > 0
transfers_with_early_error_bool = has_sender_error_bool | has_intermediate_error_bool
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool, 'status'] = 'PENDING_WITH_ERROR'

# Short display names for the suppliers found in the ASID lookup.
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)": "EMIS",
    "IN PRACTICE SYSTEMS LTD": "Vision",
    "MICROTEST LTD": "Microtest",
    "THE PHOENIX PARTNERSHIP": "TPP",
    None: "Unknown",
}

asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(asid_lookup_file)
lookup = asid_lookup[["ASID", "MName", "NACS", "OrgName"]]

# Attach supplier, ODS code and practice name for the requesting practice
# first and then for the sending practice, renaming the looked-up columns so
# that the two sets do not collide.
for practice_role in ("requesting", "sending"):
    role_column_names = {
        "MName": f"{practice_role}_supplier",
        "ASID": f"{practice_role}_supplier_asid",
        "NACS": f"{practice_role}_ods_code",
        "OrgName": f"{practice_role}_practice_name",
    }
    transfers = transfers.merge(lookup, left_on=f"{practice_role}_practice_asid", right_on="ASID", how="left")
    transfers = transfers.rename(columns=role_column_names)

# Swap the long supplier names for their short display names.
for supplier_column in ("sending_supplier", "requesting_supplier"):
    transfers[supplier_column] = transfers[supplier_column].replace(supplier_renaming.keys(), supplier_renaming.values())

## Stage 1
 
Using the raw Spine data for the transfers above, we want to generate the list of messages in the form:
- The message creator (sending or requesting practice)
- The message type (interaction name)
    - in the case of application acknowledgement message, we wish to see what type of message it is acknowledging
- Any associated code (JDI event)

In [None]:
# Build a lookup from (conversation, practice ASID) to the role that practice
# played in the conversation: "requestor" or "sender". The request date is
# carried along with each row.
requesting_supplier_type_map = (
    transfers[["conversation_id", "requesting_practice_asid", "date_requested"]]
    .drop_duplicates()
    .rename(columns={"requesting_practice_asid": "practice_asid"})
    .assign(supplier_type="requestor")
)
sending_supplier_type_map = (
    transfers[["conversation_id", "sending_practice_asid", "date_requested"]]
    .drop_duplicates()
    .rename(columns={"sending_practice_asid": "practice_asid"})
    .assign(supplier_type="sender")
)

supplier_type_mapping = pd.concat([requesting_supplier_type_map, sending_supplier_type_map])
# Cast to int so the ASIDs can be joined against the Spine data.
supplier_type_mapping["practice_asid"] = supplier_type_mapping["practice_asid"].astype(int)

In [None]:
# Conversation ids of every transfer loaded above; used to filter Spine messages.
conversation_ids_of_interest = transfers["conversation_id"].values

In [None]:
# Monthly Spine extracts to load. March 2021 is listed in addition to the six
# transfer months.
# Alternative source that includes ack codes:
#   s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data-with-ack-codes-prmt-2059/
folder = "s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/"
files = [
    "Sept-2020",
    "Oct-2020",
    "Nov-2020",
    "Dec-2020",
    "Jan-2021",
    "Feb-2021",
    "Mar-2021",
]
full_filenames = [f"{folder}{month_label}.csv.gz" for month_label in files]

In [None]:
# Human readable names for the Spine interaction ids.
# The leading space in " ack" is deliberate: further down, the name of the
# acknowledged message is prepended to it (e.g. "req start" + " ack").
interaction_name_mapping = {
    "urn:nhs:names:services:gp2gp/RCMR_IN010000UK05": "req start",
    "urn:nhs:names:services:gp2gp/RCMR_IN030000UK06": "req complete",
    "urn:nhs:names:services:gp2gp/COPC_IN000001UK01": "COPC",
    "urn:nhs:names:services:gp2gp/MCCI_IN010000UK13": " ack",
}

# Unused alternative mapping of acknowledgement type codes:
# ackTypeCode_mapping={'AE':"Neg","AR":"Neg","ER":"Neg","IF":"Pos","NONE":"Pos"}

In [None]:
def generate_single_frame(file):
    """Load one Spine extract and reduce it to the messages of interest.

    Reads the gzipped CSV at ``file``, keeps the messages that belong to a
    conversation in ``conversation_ids_of_interest``, labels the sender of each
    message as "requestor" or "sender" using ``supplier_type_mapping``, and
    drops messages sent more than 28 days after the transfer was requested.

    Args:
        file: Path or URL of a gzipped Spine CSV extract.

    Returns:
        A DataFrame with the columns "conversation_id", "supplier_type",
        "interaction_name", "jdiEvent", "GUID" and "messageRef", with the
        messages ordered by their ``_time`` value.
    """
    started_at = time.perf_counter()
    print("Now Processing " + file)
    all_messages_in_file = pd.read_csv(file, compression='gzip')

    # Only keep messages from the conversations that we actually want to use
    is_relevant_bool = all_messages_in_file['conversationID'].isin(conversation_ids_of_interest)
    monthly_relevant_messages = all_messages_in_file.loc[is_relevant_bool].sort_values(by='_time')
    monthly_relevant_messages = monthly_relevant_messages.merge(
        supplier_type_mapping,
        left_on=["conversationID", "messageSender"],
        right_on=["conversation_id", "practice_asid"],
        how="left",
    )

    # Filter out messages that took place more than 28 days after the date
    # requested. Rows without a match in supplier_type_mapping have no
    # date_requested, fail the comparison, and are dropped here as well.
    message_time = pd.to_datetime(monthly_relevant_messages["_time"]).dt.tz_localize(None)
    seconds_since_request = (message_time - monthly_relevant_messages["date_requested"]).dt.total_seconds()
    in_time_message_bool = seconds_since_request <= twenty_eight_days_in_seconds
    # Take an explicit copy so the column assignments below act on an
    # independent frame rather than on a slice of monthly_relevant_messages.
    monthly_messages = monthly_relevant_messages.loc[in_time_message_bool].copy()

    # Map the interaction id to its human readable name, and blank out the
    # "NONE" placeholder used when a message has no JDI event.
    monthly_messages['interaction_name'] = monthly_messages['interactionID'].replace(interaction_name_mapping)
    monthly_messages["jdiEvent"] = monthly_messages["jdiEvent"].replace("NONE", "")

    monthly_messages = monthly_messages[["conversation_id", "supplier_type", "interaction_name", "jdiEvent", "GUID", "messageRef"]]
    print(time.perf_counter() - started_at)
    return monthly_messages

In [None]:
# Process every monthly Spine extract, then stack the results into one frame.
monthly_message_frames = list(map(generate_single_frame, full_filenames))

print('Now Concatenating all months of data')
concat_started_at = time.perf_counter()
all_messages = pd.concat(monthly_message_frames, axis=0)
print(time.perf_counter() - concat_started_at)

In [None]:
# Only include the first pair of messages where multiple COPC messages with successful acknowledgements were sent in a single transfer
# The following code identifies the row indexes of the messages we want to remove from all_messages, and then removes them
if reduce_COPC_messages:
    # Give every message a unique positional index so that rows can be identified (and dropped) by it
    all_messages=all_messages.reset_index(drop=True)
    COPC_data=all_messages.copy()

    # Sender COPCs - keep each row's position in all_messages as 'Sender COPC index'
    COPCs_bool=(COPC_data['supplier_type']=='sender') & (COPC_data['interaction_name']=='COPC')
    COPC_data=COPC_data.loc[COPCs_bool].reset_index().rename({'index':'Sender COPC index'},axis=1)

    # Requestor COPC ack - pair each sender COPC with the messages whose messageRef points at its GUID
    # Columns present on both sides (messageRef, interaction_name, jdiEvent) get the _x (COPC) / _y (referencing message) suffixes
    # NOTE(review): the join itself does not check that the referencing message is an ack sent by the requestor - confirm this always holds
    COPC_data=COPC_data.merge(all_messages[['messageRef','interaction_name', 'jdiEvent']].reset_index().rename({'index':'Requestor COPC ack index'},axis=1),left_on='GUID',right_on='messageRef',how='inner')

    # Filter out anything with a negative acknowledgement (ie a JDI event)
    successful_responses_bool=COPC_data['jdiEvent_y']==""
    COPC_data=COPC_data.loc[successful_responses_bool]

    # One row per conversation, holding the lists of COPC indexes and of their acknowledgement indexes
    COPC_data=COPC_data[['conversation_id','Sender COPC index','Requestor COPC ack index']].groupby('conversation_id').agg(list)

    # Only conversations with more than one successfully acknowledged COPC need reducing
    multiple_COPC_message_conversations_bool=COPC_data['Sender COPC index'].apply(len)>1
    COPC_data=COPC_data.loc[multiple_COPC_message_conversations_bool]
    # Keep the first COPC/ack pair of each conversation and collect the indexes of all the others
    messages_to_remove=COPC_data.apply(lambda row: row['Sender COPC index'][1:]+row['Requestor COPC ack index'][1:],axis=1).explode().values
    all_messages=all_messages.drop(messages_to_remove,axis=0)

In [None]:
print('Now constructing full interactions')
a = time.perf_counter()

# For every message, look up the name of the message it refers to (if any),
# so that an acknowledgement reads as e.g. "req start ack".
referenced_message_names = all_messages[['GUID', 'interaction_name']].rename(
    columns={'interaction_name': 'interaction_response'}
)
all_messages_listed = all_messages.merge(
    referenced_message_names,
    left_on='messageRef',
    right_on='GUID',
    how='left',
)
all_messages_listed['interaction_response'] = all_messages_listed['interaction_response'].fillna("")
all_messages_listed['interaction'] = (
    all_messages_listed['interaction_response'] + all_messages_listed['interaction_name']
)

# Describe each message as [supplier type, interaction, JDI event].
message_triples = zip(
    all_messages_listed["supplier_type"],
    all_messages_listed["interaction"],
    all_messages_listed["jdiEvent"],
)
all_messages_listed["messages"] = list(message_triples)
all_messages_listed["messages"] = all_messages_listed["messages"].apply(list)
all_messages_listed = all_messages_listed[["conversation_id", "messages"]]
print(time.perf_counter() - a)

print('Now Grouping by conversation')
a = time.perf_counter()
# One row per conversation, holding the list of its messages.
full_field_data = all_messages_listed.groupby('conversation_id')['messages'].apply(list)
print(time.perf_counter() - a)

if overwrite_files:
    print('Now Saving Data')
    messages_output_path = f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages{COPC_tag}.parquet'
    full_field_data.to_frame().to_parquet(messages_output_path)

## Stage 2

Create csv files, which show the most common supplier pathway, status and message list combinations.

In [None]:
# Load the per-conversation message lists produced in Stage 1.
conversations_extended_interaction_messages = pd.read_parquet(
    f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages{COPC_tag}.parquet'
)
# Turn each message list into a tuple of tuples: tuples are hashable, which
# lets the message chain be used as a grouping key.
conversations_extended_interaction_messages["messages"] = conversations_extended_interaction_messages["messages"].apply(
    lambda message_list: tuple(tuple(message) for message in message_list)
)

In [None]:
# Attach each conversation's message chain to its transfer row. This uses the
# default inner join, so transfers without a message chain are left out.
transfers_with_message_list = pd.merge(
    transfers,
    conversations_extended_interaction_messages,
    left_on="conversation_id",
    right_index=True,
)

In [None]:
# Number of transfers for every combination of supplier pathway, status and
# message pattern, with the most common combinations first.
pattern_group_columns = ["sending_supplier", "requesting_supplier", "status", "messages"]
message_list_prevelance_table = (
    transfers_with_message_list
    .groupby(pattern_group_columns)
    .agg({"conversation_id": "count"})
    .rename(columns={"conversation_id": "Total Number of transfers"})
    .sort_values(by="Total Number of transfers", ascending=False)
)

In [None]:
# Share of all counted transfers that each combination represents, in percent.
total_transfer_count = message_list_prevelance_table["Total Number of transfers"].sum()
share_of_all_transfers = message_list_prevelance_table["Total Number of transfers"] / total_transfer_count
message_list_prevelance_table["% Transfers"] = (share_of_all_transfers * 100).round(2)

In [None]:
# Add a column with each pattern's share of the transfers that have the same
# supplier pathway and status.

# The columns we are aggregating on (i.e. supplier pathway and status)
column_interested_in = ["sending_supplier", "requesting_supplier", "status"]

# Number of transfers for each supplier pathway and status
pathway_and_status_counts = (
    transfers_with_message_list
    .groupby(column_interested_in)
    .agg({"conversation_id": "count"})
    .rename(columns={"conversation_id": "Pathway and status totals"})
)

# Look up the pathway/status total for every row of the prevalence table, in
# the prevalence table's own row order
flattened_prevalence_table = message_list_prevelance_table.reset_index()
order_of_indexes_needed = flattened_prevalence_table.set_index(column_interested_in).index
ordered_totals = pathway_and_status_counts.loc[order_of_indexes_needed]

# Divide row by row (positionally) to get the percentage
pattern_transfer_counts = message_list_prevelance_table['Total Number of transfers'].values
matching_group_totals = ordered_totals['Pathway and status totals'].values
message_list_prevelance_table["% Pathway and status transfers"] = pattern_transfer_counts / matching_group_totals
message_list_prevelance_table["% Pathway and status transfers"] = (
    message_list_prevelance_table["% Pathway and status transfers"].multiply(100).round(2)
)

In [None]:
# Cell output: (rows, columns) of the full prevalence table
message_list_prevelance_table.shape

In [None]:
# Keep only the combinations that occur in more than one transfer
reduced_message_list_prevelance_table_bool = message_list_prevelance_table["Total Number of transfers"].gt(1)
reduced_message_list_prevelance_table = message_list_prevelance_table[reduced_message_list_prevelance_table_bool]
# Cell output: (rows, columns) of the reduced table
reduced_message_list_prevelance_table.shape

In [None]:
# Report how many combinations were dropped: the difference in row count between the full and the reduced table
print(f"Note: there are {message_list_prevelance_table.shape[0] - reduced_message_list_prevelance_table.shape[0]} combinations of supplier pathway, status and message pattern with only 1 transfer associated, which we have filtered out")

In [None]:
# Write the reduced prevalence table to S3 when saving is enabled.
if overwrite_files:
    reduced_table_output_path = f's3://prm-gp2gp-data-sandbox-dev/notebook-outputs/36--PRMT-2059-high-level-table-of-message-patterns-reduced{COPC_tag}.csv'
    pd.DataFrame(reduced_message_list_prevelance_table).to_csv(reduced_table_output_path)

## Addendum - Closer inspection of pending message patterns

In [None]:
message_table = message_list_prevelance_table.reset_index()

# NOTE(review): the two reference patterns are picked by their row position in
# the sorted prevalence table (rows 35 and 18). These positions depend on the
# data and the sort order - confirm they still point at the intended single-
# and two-message pending patterns whenever the inputs change.
single_message_pending = message_table.loc[35, "messages"]
two_messages_pending = message_table.loc[18, "messages"]

# Rows whose message chain is one of the two reference patterns
reference_patterns = [single_message_pending, two_messages_pending]
missing_req_completed_pending_bool = message_table["messages"].isin(reference_patterns)

# Rows where both ends of the pathway are one of the main suppliers
main_suppliers = ["EMIS", "TPP", "Vision"]
sender_is_main_supplier_bool = message_table["sending_supplier"].isin(main_suppliers)
requester_is_main_supplier_bool = message_table["requesting_supplier"].isin(main_suppliers)
main_suppliers_bool = sender_is_main_supplier_bool & requester_is_main_supplier_bool

message_table = message_table[main_suppliers_bool & missing_req_completed_pending_bool]

In [None]:
# Flat copy of the prevalence table in which every pattern starts as 'Other'.
reduced_messages_table = message_list_prevelance_table.copy().reset_index()
reduced_messages_table['Message Pattern Type'] = 'Other'

# NOTE(review): the three-message reference pattern is picked by row position
# (row 14) in the sorted prevalence table - confirm it still points at the
# intended pattern whenever the inputs change.
three_messages_pending = message_list_prevelance_table.reset_index().loc[14, "messages"]
# The second message of that pattern is used as the marker message below.
sender_complete_message = three_messages_pending[1]

single_message_bool = reduced_messages_table["messages"].isin([single_message_pending])
two_messages_bool = reduced_messages_table["messages"].isin([two_messages_pending])
three_messages_bool = reduced_messages_table["messages"].isin([three_messages_pending])
# True for every chain that contains the marker message anywhere.
ready_for_integration_bool = reduced_messages_table['messages'].apply(
    lambda message_chain: sender_complete_message in message_chain
)

# Labels are applied in this order, so a later label overrides an earlier one
# on rows that match more than one condition.
pattern_labels = [
    (single_message_bool, 'Transfer Started (Single Message)'),
    (two_messages_bool, 'Transfer Acknowledged (Two Messages)'),
    (three_messages_bool, 'Awaiting Integration (Three Messages)'),
    (ready_for_integration_bool, 'Awaiting Integration (Sender Complete Message)'),
]
for pattern_bool, pattern_label in pattern_labels:
    reduced_messages_table.loc[pattern_bool, 'Message Pattern Type'] = pattern_label


In [None]:
# Cell output: the marker message taken from the three-message reference pattern
sender_complete_message

In [None]:
# Transfers per status (rows) and message pattern type (columns)
reduced_messages_table_summary = reduced_messages_table.pivot_table(
    index='status',
    columns='Message Pattern Type',
    values='Total Number of transfers',
    aggfunc='sum',
)
reduced_messages_table_summary = reduced_messages_table_summary.fillna(0).astype(int)

# The same table expressed as a percentage of each status' total
status_totals = reduced_messages_table_summary.sum(axis=1)
reduced_messages_table_summary_pc = reduced_messages_table_summary.div(status_totals, axis=0).multiply(100).round(2)
reduced_messages_table_summary_pc.columns = "% " + reduced_messages_table_summary_pc.columns

print('Proportions of Transfers that have certain message patterns')
# Cell output: counts and percentages side by side
pd.concat([reduced_messages_table_summary, reduced_messages_table_summary_pc], axis=1)

In [None]:
# Flag the patterns that contain the sender's marker message
reduced_messages_table['Transferred Successfully'] = ready_for_integration_bool

# Pending transfers per supplier pathway, split by that flag
is_pending_bool = reduced_messages_table['status'] == 'PENDING'
pending_transfers_table = reduced_messages_table.loc[is_pending_bool]
pending_transfers_by_pathway = pending_transfers_table.pivot_table(
    index=['sending_supplier', 'requesting_supplier'],
    columns='Transferred Successfully',
    values='Total Number of transfers',
    aggfunc='sum',
)
# Turn the boolean column labels into the strings 'True' / 'False'
pending_transfers_by_pathway.columns = pending_transfers_by_pathway.columns.astype(str)

# Percentage of each pathway's pending transfers that lack the marker message
pathway_pending_totals = pending_transfers_by_pathway.sum(axis=1)
failed_share = (pending_transfers_by_pathway['False'] / pathway_pending_totals).multiply(100)
transfer_fail_rate = failed_share.rename('% Failed to Transfer').to_frame()
transfer_fail_rate = transfer_fail_rate.reset_index().pivot_table(
    index='sending_supplier',
    columns='requesting_supplier',
    values='% Failed to Transfer',
).fillna(0)

print('What percentage of each pathway does not seem to contain a successful transfer message from the sender')
# Cell output: sender (rows) by requester (columns) for the main suppliers
transfer_fail_rate.loc[['EMIS', 'TPP', 'Vision'], ['EMIS', 'TPP', 'Vision']].round(1)

In [None]:
# Message patterns of PENDING transfers across all pathways, most common first
flat_prevalence_table = message_list_prevelance_table.reset_index()
pending_messages = flat_prevalence_table.loc[flat_prevalence_table["status"] == "PENDING"]
(
    pending_messages
    .groupby('messages')
    .agg({'Total Number of transfers': 'sum'})
    .sort_values(by='Total Number of transfers', ascending=False)
)


In [None]:
# Cell output: the three-message reference pattern
three_messages_pending

In [None]:
# Cell output: number of transfers per status (counts non-null conversation ids)
transfers.groupby('status').agg({'conversation_id':'count'})

In [None]:
# For a given supplier pathway, what is a percentage chance of the transfer having the given message patterns
supplier_pathway_missing_messages_table = message_table.pivot_table(index=["sending_supplier", "requesting_supplier"], columns="messages", values=["Total Number of transfers"], aggfunc="sum").fillna(0)
supplier_pathway_transfer_count = transfers.groupby(["sending_supplier", "requesting_supplier"]).agg({"conversation_id": "count"})
relevant_pathway_counts = supplier_pathway_transfer_count.loc[supplier_pathway_missing_messages_table.index]
supplier_pathway_missing_messages_table.div(relevant_pathway_counts.values, axis=0).multiply(100).round(2)

In [None]:
# For each supplier pathway, what proportion of their pending transfers have the given message patterns
pending_status_bool = message_table["status"] == "PENDING"
pending_message_table = message_table[pending_status_bool]

pending_message_table.pivot_table(index=["sending_supplier", "requesting_supplier"], columns="messages", values=["% Pathway and status transfers"], aggfunc="sum").fillna(0)