# PRMT-2059 Generate high level table of message patterns prevalence
We believe that being able to break down transfers by the set of messages that occur between the sending and receiving supplier will give us a better understanding of what the actual status of the transfer is. 

In particular, Pending transfers may have a technical issue or may be awaiting practice integration - we may be able to distinguish between these. 
For 6 months of transfers (September 2020 to Feb 2021), we wish to be able to see the list of messages in the form:
- The message creator (sending or requesting practice)
- The message type (interaction name)
- Any associated code (JDI event)

We then wish to break down all transfers by:
- The supplier pathway
- The Status
- The message chain
We then order these in a table according to how common they are.


In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# When True, the generated outputs are saved to S3 by the cells further down
overwrite_files = True

# Turn on the option to keep only the first pair of messages
# where multiple COPC messages with successful acknowledgements were sent in a single transfer
reduce_COPC_messages = True

# Suffix used in the output file names, so that the reduced and unreduced outputs get different names
if reduce_COPC_messages:
    COPC_tag = "-reduced-COPCs"
else:
    COPC_tag = ""

## Importing transfer data

In [3]:
# Load the monthly transfer files, used later to work out whether a message creator is the sender or the requester.
# The data was generated from branch PRMT-1742-duplicates-analysis, which is needed to correctly handle duplicates.
# Once the upstream pipeline has a fix for duplicate EHRs, we can go back to using the main output.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet",
]

transfer_input_files = [transfer_file_location + file_name for file_name in transfer_files]
transfers_raw = pd.concat([pd.read_parquet(input_file) for input_file in transfer_input_files])

# In the data from the PRMT-1742-duplicates-analysis branch these columns have been added, but contain only empty values.
transfers_raw = transfers_raw.drop(columns=["sending_supplier", "requesting_supplier"])

# PRMT-1742 found that many duplicate EHR errors are misclassified; the code below reclassifies the relevant data
def has_at_least_one_successful_integration_code(errors):
    """Return True if any code in `errors` is NaN or equal to 15."""
    for error_code in errors:
        if np.isnan(error_code) or error_code == 15:
            return True
    return False


transfers = transfers_raw.copy()
successful_transfers_bool = transfers_raw['request_completed_ack_codes'].apply(
    has_at_least_one_successful_integration_code
)
transfers.loc[successful_transfers_bool, "status"] = "INTEGRATED"

# Treat transfers that are pending with error and carry one of these sender error codes as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
pending_sender_error_codes = [6, 7, 10, 24, 30, 23, 14, 99]
transfers_with_pending_with_error_bool = transfers['status'].eq('PENDING_WITH_ERROR')
transfers_with_pending_sender_code_bool = transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_which_need_pending_to_failure_change_bool = (
    transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
)
transfers.loc[transfers_which_need_pending_to_failure_change_bool, 'status'] = 'FAILED'

# Give integrated transfers whose sla_duration exceeds eight days the "INTEGRATED LATE" status
eight_days_in_seconds = 8 * 24 * 60 * 60
transfers_with_integrated_bool = transfers['status'].eq('INTEGRATED')
transfers_after_sla_bool = transfers['sla_duration'].gt(eight_days_in_seconds)
transfers_integrated_late_bool = transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool, 'status'] = 'INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfer's status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds = 28 * 24 * 60 * 60
transfers_after_month_bool = transfers['sla_duration'] > twenty_eight_days_in_seconds
transfers_pending_at_month_bool = transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool, 'status'] = 'PENDING'

# Of those transfers, the ones with a sender error code or at least one intermediate error code
# become pending with error.
# The length check is parenthesised and deliberately not negated: `~series.apply(len) > 0` applies
# bitwise NOT to the lengths before comparing, which is always False and would silently
# ignore every intermediate error code.
transfers_with_early_error_bool = (
    transfers['sender_error_code'].notna()
    | (transfers['intermediate_error_codes'].apply(len) > 0)
)
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool, 'status'] = 'PENDING_WITH_ERROR'

# Mapping from the supplier names found in the ASID lookup to short display names
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)": "EMIS",
    "IN PRACTICE SYSTEMS LTD": "Vision",
    "MICROTEST LTD": "Microtest",
    "THE PHOENIX PARTNERSHIP": "TPP",
    None: "Unknown",
}

asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(asid_lookup_file)
lookup = asid_lookup[["ASID", "MName", "NACS", "OrgName"]]

# Attach the lookup details of the requesting practice (matched on its ASID)
transfers = transfers.merge(
    lookup, left_on='requesting_practice_asid', right_on='ASID', how='left'
).rename(
    columns={
        'MName': 'requesting_supplier',
        'ASID': 'requesting_supplier_asid',
        'NACS': 'requesting_ods_code',
        'OrgName': 'requesting_practice_name',
    }
)

# Attach the lookup details of the sending practice (matched on its ASID)
transfers = transfers.merge(
    lookup, left_on='sending_practice_asid', right_on='ASID', how='left'
).rename(
    columns={
        'MName': 'sending_supplier',
        'ASID': 'sending_supplier_asid',
        'NACS': 'sending_ods_code',
        'OrgName': 'sending_practice_name',
    }
)

# Swap the supplier names for their short display names
for supplier_column in ("sending_supplier", "requesting_supplier"):
    transfers[supplier_column] = transfers[supplier_column].replace(supplier_renaming.keys(), supplier_renaming.values())

## Stage 1
 
Using the raw Spine data for the transfers above, we want to generate the list of messages in the form:
- The message creator (sending or requesting practice)
- The message type (interaction name)
    - in the case of an application acknowledgement message, we wish to see what type of message it is acknowledging
- Any associated code (JDI event)

In [4]:
# Build a mapping from (conversation, practice ASID) to whether that practice was the
# sender or the requestor in that conversation, keeping the date the transfer was requested
requesting_supplier_type_map = (
    transfers[["conversation_id", "requesting_practice_asid", "date_requested"]]
    .drop_duplicates()
    .rename(columns={"requesting_practice_asid": "practice_asid"})
)
requesting_supplier_type_map["supplier_type"] = "requestor"

sending_supplier_type_map = (
    transfers[["conversation_id", "sending_practice_asid", "date_requested"]]
    .drop_duplicates()
    .rename(columns={"sending_practice_asid": "practice_asid"})
)
sending_supplier_type_map["supplier_type"] = "sender"

# Stack both roles into a single lookup table and store the ASIDs as integers
supplier_type_mapping = pd.concat([requesting_supplier_type_map, sending_supplier_type_map])
supplier_type_mapping["practice_asid"] = supplier_type_mapping["practice_asid"].astype(int)

In [5]:
# Conversation IDs of all the transfers loaded above; used to filter the raw Spine messages
conversation_ids_of_interest=transfers['conversation_id'].values

In [6]:
# Build the list of monthly Spine files to load
#folder="s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data-with-ack-codes-prmt-2059/"
folder = "s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/"
files = [
    "Sept-2020",
    "Oct-2020",
    "Nov-2020",
    "Dec-2020",
    "Jan-2021",
    "Feb-2021",
    "Mar-2021",
]
full_filenames = [f"{folder}{month_name}.csv.gz" for month_name in files]

In [7]:
# Human readable names for the message types (interaction IDs).
# The acknowledgement name starts with a space because it gets appended to the name of
# the message it acknowledges when the full interaction is built.
interaction_name_mapping = {
    "urn:nhs:names:services:gp2gp/RCMR_IN010000UK05": "req start",
    "urn:nhs:names:services:gp2gp/RCMR_IN030000UK06": "req complete",
    "urn:nhs:names:services:gp2gp/COPC_IN000001UK01": "COPC",
    "urn:nhs:names:services:gp2gp/MCCI_IN010000UK13": " ack",
}

#ackTypeCode_mapping={'AE':"Neg","AR":"Neg","ER":"Neg","IF":"Pos","NONE":"Pos"}

In [8]:
def generate_single_frame(file):
    """Load one file of Spine data and return the messages of the conversations we are interested in.

    Relies on the notebook-level `conversation_ids_of_interest`, `supplier_type_mapping`,
    `twenty_eight_days_in_seconds` and `interaction_name_mapping`.
    Prints the file being processed and the number of seconds taken.

    Args:
        file: location of a gzip-compressed CSV file of Spine messages.

    Returns:
        A dataframe with the columns "conversation_id", "supplier_type", "interaction_name",
        "jdiEvent", "GUID" and "messageRef", one row per message, ordered by the message "_time".
    """
    started_at = time.perf_counter()
    print("Now Processing " + file)
    # NOTE(review): error_bad_lines is deprecated in newer pandas releases in favour of
    # on_bad_lines - confirm against the pandas version this notebook runs on.
    spine_messages = pd.read_csv(file, compression='gzip', error_bad_lines=False)

    # Keep only the conversations we actually want to use, in time order
    wanted_conversation_bool = spine_messages['conversationID'].isin(conversation_ids_of_interest)
    spine_messages = spine_messages.loc[wanted_conversation_bool].sort_values(by='_time')

    # Label each message with whether its sender was the requestor or the sender of the transfer
    spine_messages = spine_messages.merge(
        supplier_type_mapping,
        left_on=["conversationID", "messageSender"],
        right_on=["conversation_id", "practice_asid"],
        how="left",
    )

    # Drop messages that took place more than 28 days after the date requested.
    # NOTE(review): messages with no match in supplier_type_mapping have no date_requested
    # and so presumably fail this comparison and are dropped too - confirm this is intended.
    message_times = pd.to_datetime(spine_messages["_time"]).dt.tz_localize(None)
    seconds_since_request = (message_times - spine_messages["date_requested"]).dt.total_seconds()
    spine_messages = spine_messages.loc[seconds_since_request <= twenty_eight_days_in_seconds]

    # Map the interaction IDs to their human readable names and blank out "NONE" JDI events
    spine_messages['interaction_name'] = spine_messages['interactionID'].replace(interaction_name_mapping)
    spine_messages["jdiEvent"] = spine_messages["jdiEvent"].replace("NONE", "")

    output_columns = ["conversation_id", "supplier_type", "interaction_name", "jdiEvent", "GUID", "messageRef"]
    spine_messages = spine_messages[output_columns]
    print(time.perf_counter() - started_at)
    return spine_messages

In [9]:
# Process each monthly Spine file in turn, then stack the results into a single dataframe
df = list(map(generate_single_frame, full_filenames))

print('Now Concatenating all months of data')
a = time.perf_counter()
df = pd.concat(df, axis=0)
print(time.perf_counter() - a)

Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Sept-2020.csv.gz
62.93144888099778
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Oct-2020.csv.gz
54.177169143000356
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Nov-2020.csv.gz
49.25512907700249
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Dec-2020.csv.gz
46.31936260099974
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Jan-2021.csv.gz
51.915243125000416
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Feb-2021.csv.gz
54.136961001997406
Now Processing s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/Mar-2021.csv.gz
45.43095088300106
Now Concatenating all months of data
3.5775846779979474


In [10]:
# Only include the first pair of messages where multiple COPC messages with successful acknowledgements were sent in a single transfer
# The following code identifies indexes of messages we want to remove in df, and then removes them
if reduce_COPC_messages:
    # Give every message a unique row label, so the labels collected below identify single rows of df
    df=df.reset_index(drop=True)
    COPC_data=df.copy()

    # Sender COPCs: keep the COPC messages created by the sender, remembering each one's row label in df
    COPCs_bool=(COPC_data['supplier_type']=='sender') & (COPC_data['interaction_name']=='COPC')
    COPC_data=COPC_data.loc[COPCs_bool].reset_index().rename({'index':'Sender COPC index'},axis=1)

    # Requestor COPC ack: pair each sender COPC with the messages whose messageRef is that COPC's GUID,
    # remembering the row label in df of the matching message. Column names present on both sides of
    # the merge get _x (COPC) and _y (matching message) suffixes.
    # NOTE(review): the join only matches on messageRef; it does not check that the matching
    # message was created by the requestor or that it is an acknowledgement - confirm this is safe.
    COPC_data=COPC_data.merge(df[['messageRef','interaction_name', 'jdiEvent']].reset_index().rename({'index':'Requestor COPC ack index'},axis=1),left_on='GUID',right_on='messageRef',how='inner')

    # Filter out anything with a negative acknowledgement (ie a JDI event)
    successful_responses_bool=COPC_data['jdiEvent_y']==""
    COPC_data=COPC_data.loc[successful_responses_bool]

    # For each conversation, collect the row labels of its COPCs and of their matching messages into lists
    COPC_data=COPC_data[['conversation_id','Sender COPC index','Requestor COPC ack index']].groupby('conversation_id').agg(list)

    # Only conversations with more than one remaining COPC pair have anything to remove
    multiple_COPC_message_conversations_bool=COPC_data['Sender COPC index'].apply(len)>1
    COPC_data=COPC_data.loc[multiple_COPC_message_conversations_bool]
    # Keep the first entry of each list and mark every later entry for removal
    # NOTE(review): if no conversation has multiple pairs, COPC_data is empty here and the
    # apply/explode chain may not behave as expected - confirm.
    messages_to_remove=COPC_data.apply(lambda row: row['Sender COPC index'][1:]+row['Requestor COPC ack index'][1:],axis=1).explode().values
    df=df.drop(messages_to_remove,axis=0)

In [11]:
print('Now constructing full interactions')
a = time.perf_counter()
# Look up, for every message, the name of the message it refers to (via messageRef -> GUID)
df2 = df.merge(
    df[['GUID', 'interaction_name']].rename(columns={'interaction_name': 'interaction_response'}),
    left_on='messageRef',
    right_on='GUID',
    how='left',
)
# Messages that do not refer to another message get an empty prefix
df2['interaction_response'] = df2['interaction_response'].fillna("")
# The full interaction is the referenced message's name followed by this message's own name
df2['interaction'] = df2['interaction_response'] + df2['interaction_name']
# Describe each message as a list of [supplier type, interaction, JDI event]
df2["messages"] = list(zip(df2["supplier_type"], df2["interaction"], df2["jdiEvent"]))
df2["messages"] = df2["messages"].map(list)
df2 = df2[["conversation_id", "messages"]]
# Bare expression: it is not the last statement of the cell, so nothing is displayed
df2
print(time.perf_counter() - a)

print('Now Grouping by conversation')
a = time.perf_counter()
# One row per conversation, holding the list of its messages
full_field_data = df2.groupby('conversation_id')['messages'].apply(list)
print(time.perf_counter() - a)

if overwrite_files:
    print('Now Saving Data')
    full_field_data.to_frame().to_parquet(f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages{COPC_tag}.parquet')

Now constructing full interactions
31.483020542000304
Now Grouping by conversation
42.34794190899993
Now Saving Data


## Stage 2

Create CSV files that show the most common combinations of supplier pathway, status and message list.

In [12]:
# Load the per-conversation message lists saved in Stage 1
conversations_extended_interaction_messages = pd.read_parquet(
    f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages{COPC_tag}.parquet'
)
# Turn the messages from a list of lists into a tuple of tuples (tuples are hashable, so they can be grouped on)
conversations_extended_interaction_messages["messages"] = conversations_extended_interaction_messages["messages"].apply(
    lambda message_list: tuple(tuple(message) for message in message_list)
)

In [13]:
# Add each conversation's message list to the transfers dataframe
# (the message lists are indexed by conversation ID)
transfers_with_message_list = pd.merge(
    transfers,
    conversations_extended_interaction_messages,
    left_on="conversation_id",
    right_index=True,
)

In [14]:
# Number of transfers for each combination of supplier pathway, status and message pattern,
# with the most common combinations first
message_list_prevelance_table = (
    transfers_with_message_list
    .groupby(["sending_supplier", "requesting_supplier", "status", "messages"])
    .agg({"conversation_id": "count"})
    .rename(columns={"conversation_id": "Total Number of transfers"})
    .sort_values(by="Total Number of transfers", ascending=False)
)

In [15]:
# Each combination's share of all the transfers in the table, as a percentage to 2 decimal places
total_transfer_count = message_list_prevelance_table["Total Number of transfers"].sum()
message_list_prevelance_table["% Transfers"] = (
    message_list_prevelance_table["Total Number of transfers"]
    .div(total_transfer_count)
    .mul(100)
    .round(2)
)

In [16]:
## Add a column with each row's percentage of the transfers for its combination of supplier pathway and status

# The columns we are aggregating on (i.e. supplier pathway and status)
column_interested_in = ["sending_supplier", "requesting_supplier", "status"]

# Count of transfers for each supplier pathway and status
pathway_and_status_counts = (
    transfers_with_message_list
    .groupby(column_interested_in)
    .agg({"conversation_id": "count"})
    .rename(columns={"conversation_id": "Pathway and status totals"})
)

# For every row of the prevalence table, in that table's row order, look up the total for the row's pathway and status
order_of_indexes_needed = (
    message_list_prevelance_table
    .reset_index()
    .set_index(column_interested_in)
    .index
)
ordered_totals = pathway_and_status_counts.loc[order_of_indexes_needed]

# Divide the transfer counts by these totals (position by position) to get the percentage
message_list_prevelance_table["% Pathway and status transfers"] = (
    message_list_prevelance_table['Total Number of transfers'].values
    / ordered_totals['Pathway and status totals'].values
)
message_list_prevelance_table["% Pathway and status transfers"] = (
    message_list_prevelance_table["% Pathway and status transfers"].mul(100).round(2)
)

In [17]:
# Display the (rows, columns) size of the full prevalence table
message_list_prevelance_table.shape

(3944, 3)

In [18]:
# Keep only the combinations that have more than one transfer associated
reduced_message_list_prevelance_table_bool = message_list_prevelance_table["Total Number of transfers"].gt(1)
reduced_message_list_prevelance_table = message_list_prevelance_table.loc[reduced_message_list_prevelance_table_bool]
reduced_message_list_prevelance_table.shape

(1635, 3)

In [19]:
# Report how many combinations were removed by the filter above
print(f"Note: there are {len(message_list_prevelance_table) - len(reduced_message_list_prevelance_table)} combinations of supplier pathway, status and message pattern with only 1 transfer associated, which we have filtered out")

Note: there are 2309 combinations of supplier pathway, status and message pattern with only 1 transfer associated, which we have filtered out


In [20]:
# Save the reduced prevalence table as a CSV file when saving is switched on
if overwrite_files:
    reduced_message_list_prevelance_table.to_csv(
        f's3://prm-gp2gp-data-sandbox-dev/notebook-outputs/36--PRMT-2059-high-level-table-of-message-patterns-reduced{COPC_tag}.csv'
    )