# PRMT-2124 - Can we consider transfers without a Sender Request Completed message as failed?

In [1]:
import pandas as pd
import numpy as np
import time
import datetime

In [2]:
# Import transfer files to extract whether message creator is sender or requester
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-sample-5/"
transfer_files = [
    "2020-9-transfers.parquet",
    "2020-10-transfers.parquet",
    "2020-11-transfers.parquet",
    "2020-12-transfers.parquet",
    "2021-1-transfers.parquet",
    "2021-2-transfers.parquet",
    "2021-3-transfers.parquet",
    "2021-4-transfers.parquet",
]

transfer_input_files = [transfer_file_location + f for f in transfer_files]
transfers_raw = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))

# In the data from the PRMT-1742-duplicates-analysis branch, these columns have been added , but contain only empty values.
transfers_raw = transfers_raw.drop(["sending_supplier", "requesting_supplier"], axis=1)
transfers = transfers_raw.copy()

# Correctly interpret certain sender errors as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
# Step Two: reclassifying the relevant transfers with pending sender error codes to FAILED DUE TO SENDER ERROR CODE status for comparison
pending_sender_error_codes=[6,7,10,24,30,23,14,99]
transfers_with_pending_sender_code_bool=transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool=transfers['status']=='PENDING_WITH_ERROR'
transfers_which_need_pending_to_failure_change_bool=transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
transfers.loc[transfers_which_need_pending_to_failure_change_bool,'status']='FAILED DUE TO SENDER ERROR CODE'

# Add integrated Late status
eight_days_in_seconds=8*24*60*60
transfers_after_sla_bool=transfers['sla_duration']>eight_days_in_seconds
transfers_with_integrated_bool=transfers['status']=='INTEGRATED'
transfers_integrated_late_bool=transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool,'status']='INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfers status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds=28*24*60*60
transfers_after_month_bool=transfers['sla_duration']>twenty_eight_days_in_seconds
transfers_pending_at_month_bool=transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool,'status']='PENDING'
transfers_with_early_error_bool=(~transfers.loc[:,'sender_error_code'].isna()) |(~transfers.loc[:,'intermediate_error_codes'].apply(len)>0)
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool,'status']='PENDING_WITH_ERROR'

# Supplier name mapping
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

# Generate ASID lookup that contains all the most recent entry for all ASIDs encountered
asid_file_location = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/"
asid_files = [
    "asidLookup-Nov-2020.csv.gz",
    "asidLookup-Dec-2020.csv.gz",
    "asidLookup-Jan-2021.csv.gz",
    "asidLookup-Feb-2021.csv.gz",
    "asidLookup-Mar-2021.csv.gz",
    "asidLookup-Apr-2021.csv.gz"
]
asid_lookup_files = [asid_file_location + f for f in asid_files]
asid_lookup = pd.concat((
    pd.read_csv(f)
    for f in asid_lookup_files
))
asid_lookup = asid_lookup.drop_duplicates().groupby("ASID").last().reset_index()
lookup = asid_lookup[["ASID", "MName", "NACS","OrgName"]]

transfers = transfers.merge(lookup, left_on='requesting_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code','OrgName':'requesting_practice_name'}, axis=1)
transfers = transfers.merge(lookup, left_on='sending_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code','OrgName':'sending_practice_name'}, axis=1)

transfers["sending_supplier"] = transfers["sending_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())

# Making the status to be more human readable here
transfers["status"] = transfers["status"].str.replace("_", " ").str.title()


In [3]:
def add_messages_parquet_to_transfers(parquet_file_name, transfers_df):
    conversations_extended_interaction_messages=pd.read_parquet(parquet_file_name)
    # turning messages from list of list to tuple of tuples (since they are hashable)
    conversations_extended_interaction_messages["messages"]=conversations_extended_interaction_messages["messages"].apply(lambda message_list: tuple([tuple(message) for message in message_list]))
    # Attach this message list to the transfers dataframe
    transfers_with_message_list = transfers_df.merge(conversations_extended_interaction_messages, left_on="conversation_id", right_index=True,how='inner')
    return transfers_with_message_list

## Part A

**We believe that** transfers which do not receive a “Request Completed” message from the Sender within 14 days of the request started

**can be considered** technical failures

**we will know this to be true when** we see that for a sample of data without this message in 14 days, the message does not arrive, nor does transfer completion occur, within 1-8 months

### Scope

1. Extract all conversation ids from September to April (8 months) parquet files
2. Identify all messages within these conversation IDs using raw Spine data from September to May
3. Extract conversations with no Request Completed Message in the first 14 days
4. Count how many of these contain a later Request Completed or successful transfer message

In [4]:
regenerate_time_messages_parquet = False

In [5]:
# Generating a mapping of practice asid, and whether they were the sender or requestor in that conversation
requesting_supplier_type_map = transfers[["conversation_id", "requesting_practice_asid", "date_requested"]].drop_duplicates()
sending_supplier_type_map = transfers[["conversation_id", "sending_practice_asid", "date_requested"]].drop_duplicates()

requesting_supplier_type_map["supplier_type"] = "requestor"
sending_supplier_type_map["supplier_type"] = "sender"

requesting_supplier_type_map = requesting_supplier_type_map.rename({"requesting_practice_asid": "practice_asid"}, axis=1)
sending_supplier_type_map = sending_supplier_type_map.rename({"sending_practice_asid": "practice_asid"}, axis=1)

supplier_type_mapping = pd.concat([requesting_supplier_type_map, sending_supplier_type_map])
supplier_type_mapping["practice_asid"] = supplier_type_mapping["practice_asid"].astype(int)

conversation_ids_of_interest=transfers['conversation_id'].values

# Define a list of files to be loaded in
folder="s3://prm-gp2gp-data-sandbox-dev/spine-gp2gp-data/"
files=["Sept-2020","Oct-2020","Nov-2020","Dec-2020","Jan-2021","Feb-2021","Mar-2021","Apr-2021","May-2021-part-1-of-2","May-2021-part-2-of-2"]
full_filenames=[folder + file + ".csv.gz" for file in files]

# Rename message types to be human readable
interaction_name_mapping={"urn:nhs:names:services:gp2gp/RCMR_IN010000UK05":"req start",
"urn:nhs:names:services:gp2gp/RCMR_IN030000UK06":"req complete",
"urn:nhs:names:services:gp2gp/COPC_IN000001UK01":"COPC",
"urn:nhs:names:services:gp2gp/MCCI_IN010000UK13":" ack"}

In [6]:
# This function will take a set of Spine data and for each message in the conversation we're interested in, it will output a dataframe
# with "conversation_id", "supplier_type", "interaction_name", "jdiEvent", "GUID", "messageRef" for each given message in the order
# they occur
def generate_single_frame(file):
    a=time.perf_counter()
    print("Now Processing " + file)
    all_messages_in_file=pd.read_csv(file, compression='gzip')

    # Only keep conversations from the conversations that we actually want to use
    monthly_relevant_messages=all_messages_in_file.loc[all_messages_in_file['conversationID'].isin(conversation_ids_of_interest)]
    monthly_relevant_messages=monthly_relevant_messages.sort_values(by='_time')
    monthly_relevant_messages = monthly_relevant_messages.merge(supplier_type_mapping, left_on=["conversationID", "messageSender"], right_on=["conversation_id", "practice_asid"], how="left")
    
    # Add time of messages occurring
    monthly_relevant_messages["time_of_message"] = (pd.to_datetime(monthly_relevant_messages["_time"]).dt.tz_localize(None) - monthly_relevant_messages["date_requested"]).dt.total_seconds().astype(str)
    monthly_messages = monthly_relevant_messages

    # map the message name to human readable form using supplier mapping
    monthly_messages['interaction_name']=monthly_messages['interactionID'].replace(interaction_name_mapping)
    monthly_messages["jdiEvent"] = monthly_messages["jdiEvent"].replace("NONE", "")
    
    monthly_messages=monthly_messages[["conversation_id","supplier_type","interaction_name","jdiEvent","GUID","messageRef","time_of_message" ]]
    print(time.perf_counter()-a)
    return monthly_messages



In [7]:
if regenerate_time_messages_parquet:
    all_messages=pd.concat([generate_single_frame(file) for file in full_filenames], axis=0)

#     print('Now Concatenating all months of data')
#     a=time.perf_counter()
#     all_messages=pd.concat(all_messages,axis=0)
#     print(time.perf_counter()-a)

    print('Now constructing full interactions')
    a=time.perf_counter()
    all_messages = all_messages.merge(all_messages[['GUID','interaction_name']].rename({'interaction_name':'interaction_response'},axis=1),left_on='messageRef',right_on='GUID',how='left').drop(["GUID_x", "GUID_y"], axis=1)
    all_messages['interaction_response']=all_messages['interaction_response'].fillna("")
    all_messages['interaction']=all_messages['interaction_response']+all_messages['interaction_name']
    all_messages.drop(["interaction_response", "interaction_name"], axis=1, inplace=True)
    all_messages["messages"] = list(zip(all_messages["supplier_type"], all_messages["interaction"], all_messages["jdiEvent"], all_messages["time_of_message"] ))
    all_messages["messages"] = all_messages["messages"].apply(list)
    columns_to_drop = [column for column in all_messages.columns if column not in ["conversation_id", "messages"]]    
    all_messages.drop(columns_to_drop, axis=1, inplace=True)
#     all_messages=all_messages[["conversation_id", "messages"]]
    print(time.perf_counter()-a)

    print('Now Grouping by conversation')
    a=time.perf_counter()
    all_messages=all_messages.groupby('conversation_id')['messages'].apply(list)
    print(time.perf_counter()-a)

    print('Now Saving Data')
    pd.DataFrame(all_messages).to_parquet(f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Apr_21_conversations_extended_interaction_messages_with_time.parquet')

In [8]:
conversations_extended_interaction_messages_with_time_file_name = 's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Apr_21_conversations_extended_interaction_messages_with_time.parquet'
transfers_with_time_messages = add_messages_parquet_to_transfers(conversations_extended_interaction_messages_with_time_file_name, transfers)

In [9]:
transfers_with_time_messages["messages"]=transfers_with_time_messages["messages"].apply(lambda messages: tuple([message[0:3] + (float(message[3]),) for message in messages]) )

### What volume of transfers do not successfully transfer
NB Successful transfer does not necessarily mean successful integration

In [10]:
transfers_with_time_messages["Contains sender req complete"] = transfers_with_time_messages["messages"].apply(lambda messages: True in [True for message in messages if message[0:3] == ('sender', 'req complete', '')])
pc_never_transfer=(~transfers_with_time_messages["Contains sender req complete"]).mean()*100
print(f"{round(pc_never_transfer,2)}% of transfers do not receive a Request Completed Message from the sender")

3.19% of transfers do not receive a Request Completed Message from the sender


### Is integration possible without sender request complete message

In [11]:
# -- anything that did integrate at some point
transfers_with_time_messages["integrated"] = transfers_with_time_messages["messages"].apply(lambda messages: True in [True for message in messages if (message[0:3] == ("requestor", "req complete ack", '')) | (message[0:3] == ("requestor", "req complete ack", "15"))])
transfers_with_time_messages.pivot_table(index="Contains sender req complete", columns="integrated", aggfunc="count", values="conversation_id").fillna(0).astype(int)

integrated,False,True
Contains sender req complete,Unnamed: 1_level_1,Unnamed: 2_level_1
False,60280,0
True,43256,1784761


### How long does it take for a sender request complete message to arrive from request start message

In [12]:
# returns a time if sender request completed message
transfers_with_time_messages["Sender req complete times/s"] = transfers_with_time_messages["messages"].apply(lambda messages: [message[3] for message in messages if message[0:3] == ('sender', 'req complete', '')])
transfers_with_time_messages["Min sender req complete time/s"] = transfers_with_time_messages["Sender req complete times/s"].apply(lambda times: min(times) if len(times) > 0 else np.nan)
print("Hours")
(transfers_with_time_messages["Min sender req complete time/s"].dropna()/3600).describe().round(2)

Hours


count    1828017.00
mean           1.36
std           47.29
min            0.00
25%            0.00
50%            0.00
75%            0.01
max         6240.73
Name: Min sender req complete time/s, dtype: float64

In [46]:
slow_transfer_threshold_in_days= 8
slow_transfer_in_seconds=24*60*60*slow_transfer_threshold_in_days
transfers_with_time_messages['slow transfer']=(transfers_with_time_messages["Min sender req complete time/s"].fillna(0) >= slow_transfer_in_seconds)

pc_slow_transfers=transfers_with_time_messages['slow transfer'].mean()*100
total_slow_transfers=transfers_with_time_messages['slow transfer'].sum()
print(f"There are {round(pc_slow_transfers,2)}% ({total_slow_transfers}) transfers where a sender request complete message is received after {slow_transfer_threshold_in_days} days")

pc_slow_which_integrate=transfers_with_time_messages.loc[transfers_with_time_messages['slow transfer'],'integrated'].mean()*100
print(f"{round(pc_slow_which_integrate,2)}% of these transfers eventually integrate")

pc_transfers_slow_and_integrate=(transfers_with_time_messages['integrated'] & transfers_with_time_messages['slow transfer']).mean()*100
print(f"{round(pc_transfers_slow_and_integrate,2)}% of all transfers are transfered after {slow_transfer_threshold_in_days} days and then integrate")

There are 0.13% (2398) transfers where a sender request complete message is received after 8 days
85.95% of these transfers eventually integrate
0.11% of all transfers are transfered after 8 days and then integrate


In [14]:
print(f'% of transfers completing transfer after {slow_transfer_threshold_in_days} days')
transfers_with_time_messages.pivot_table(index='sending_supplier',columns='requesting_supplier',values='slow transfer',aggfunc='mean').multiply(100)

% of transfers completing transfer after 14 days


requesting_supplier,EMIS,Microtest,TPP,Unknown,Vision
sending_supplier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
EMIS,0.061777,0.0,0.061861,0.0,0.022509
Microtest,0.0,0.0,0.0,0.0,0.0
TPP,0.033625,0.0,0.0,0.0,0.0
Unknown,0.0,0.0,0.0,0.0,0.0
Vision,1.914795,0.0,1.688613,0.0,1.632873


In [40]:
print(f'Number of transfers completing transfer after {slow_transfer_threshold_in_days} days')
slow_transfers_bool=transfers_with_time_messages['slow transfer']
transfers_with_time_messages.loc[slow_transfers_bool].pivot_table(index='sending_supplier',columns='requesting_supplier',values='conversation_id',aggfunc='count').fillna(0).astype(int)

Number of transfers completing transfer after 14 days


requesting_supplier,EMIS,TPP,Vision
sending_supplier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMIS,694,226,5
TPP,111,0,0
Vision,498,109,91


### Investigation of transferred with caveats conversations (Vision practice??)

These are likely April transfer requests so this is delayed until April data is ready

In [18]:
conversations_of_interest=["D8C6FFBA-2BC3-4E35-9F3F-9BE661413D05",
"93EDC8FA-EC8D-4634-B16B-CB81C48E35E7","D72D3153-6841-44E2-9318-6067581269D7",
"308ADDF4-9B19-47B1-B54C-1A28E70D26DF","2D5FA7FF-B5D9-4F06-8772-8BBEEE7E2EFD",
"FD7576C0-FC0B-4EF2-96EE-7AE4AD799892","25E28A19-C610-4F19-A161-E594B902732B",
"B85B2A9A-8F77-448D-96E6-9CBA777372F0","1CEB01D0-607B-4D4B-97FF-54410526A17D",
"B3C02083-FA21-47D3-9A49-3599474FB2E6","0729466B-4D12-4D5F-B98C-F61D320D3B09",
"7462310C-FED5-4C0F-9FD0-5BA1835F59CF","6CD11744-F59D-4925-BF54-2E39EA002EF3",
"79FCEF52-03C8-4602-904C-DA2639D8196D","2BE44C84-30D4-45A5-BCAD-0E95C2ECFA83",
"C5E3E09E-BC61-4C62-A4DE-0F605E736CE4","2C70174E-5EA9-4672-B17B-7638CBB8EDAC",
"C13674DF-9BE8-4959-8E6E-4E419F60ECAA","98C809BE-5977-41DC-9293-CB304F63B739",
"FFAB9987-F353-4A07-9ED5-9526B0EB0A27","1077C9AA-B699-48B9-9D85-6B073431B4BC",
"9234139D-5180-45C6-9C7A-E357C02D68EC"]
available_conversations=set(transfers_with_time_messages['conversation_id'].values).intersection(set(conversations_of_interest))

output_file="s3://prm-gp2gp-data-sandbox-dev/notebook-outputs/41-PRMT-2124-Vision-ack-with-caveats-transfer-messages.csv"
transfers_with_time_messages.set_index('conversation_id').loc[available_conversations,"messages"].to_csv(output_file)

### What is the spread of transfer times?

In [19]:
req_complete_times=transfers_with_time_messages["Min sender req complete time/s"]
quantiles=[i/10 for i in list(range(1,10))]
transfer_proportions=pd.DataFrame(req_complete_times.fillna(req_complete_times.max()).quantile(quantiles).rename('Time to transfer/s'))
transfer_proportions.index=(transfer_proportions.index*100).astype(int).rename('% Transfers Completed')
transfer_proportions.round(1)

Unnamed: 0_level_0,Time to transfer/s
% Transfers Completed,Unnamed: 1_level_1
10,6.0
20,8.0
30,10.1
40,12.5
50,15.5
60,19.8
70,26.7
80,40.7
90,86.4


In [20]:
message_final_time=datetime.datetime(2021, 6, 1)
transfers_with_time_messages['Total Time of Transfer Tracking/s']=(message_final_time-transfers_with_time_messages['date_requested']).dt.total_seconds()
day_in_seconds=24*60*60
time_points_in_days=[pow(2,i) for i in range(-5,8)]
time_points_in_seconds=[day_point*day_in_seconds for day_point in time_points_in_days]
time_point_labels=[str(day_point*24) + ' hours' if day_point<=1 else str(day_point) + ' days' for day_point in time_points_in_days]
df=transfers_with_time_messages.copy()
message_received_rate=[(df.loc[df['Total Time of Transfer Tracking/s']>=time_point,'Min sender req complete time/s'].fillna(time_point+1)<=time_point).mean() for time_point in time_points_in_seconds]
Transfer_rate_time_table=pd.DataFrame(index=time_point_labels,data=message_received_rate).rename({0:'% Transfers Completed'},axis=1).multiply(100)
Transfer_rate_time_table['Increase (% Total Transfers)']=Transfer_rate_time_table['% Transfers Completed'].diff()
Transfer_rate_time_table.round(2)

Unnamed: 0,% Transfers Completed,Increase (% Total Transfers)
0.75 hours,96.13,
1.5 hours,96.19,0.06
3.0 hours,96.38,0.19
6.0 hours,96.43,0.05
12.0 hours,96.45,0.02
24 hours,96.53,0.08
2 days,96.57,0.04
4 days,96.62,0.05
8 days,96.68,0.06
16 days,96.73,0.05


#### Do slightly slower transfer times increase the chance of a late integration?

In [21]:
transfers_with_time_messages['Transfer Time Decile']=pd.qcut(req_complete_times.fillna(req_complete_times.max()),100,duplicates='drop')
transfers_with_time_messages['Integrated in Time']=transfers_with_time_messages['status']=='Integrated'
transfers_with_time_messages.groupby('Transfer Time Decile').agg({'Integrated in Time':'mean'}).tail(15)

Unnamed: 0_level_0,Integrated in Time
Transfer Time Decile,Unnamed: 1_level_1
"(45.45, 48.366]",0.906458
"(48.366, 51.628]",0.905219
"(51.628, 55.333]",0.908932
"(55.333, 59.628]",0.907863
"(59.628, 64.597]",0.906654
"(64.597, 70.557]",0.908975
"(70.557, 77.585]",0.904936
"(77.585, 86.424]",0.902881
"(86.424, 97.417]",0.901488
"(97.417, 112.263]",0.894402


In [44]:
transfers_with_time_messages["Min sender req complete time/m"]=transfers_with_time_messages["Min sender req complete time/s"].astype(float)/60
transfers_minute_effect=transfers_with_time_messages.copy()
transfers_minute_effect["Min sender req complete time/m"]=np.ceil(transfers_minute_effect["Min sender req complete time/m"])
transfers_minute_effect=transfers_minute_effect.groupby("Min sender req complete time/m").agg({'Integrated in Time':['mean','count']})['Integrated in Time']
transfers_minute_effect['%']=(100*transfers_minute_effect['count']/transfers_minute_effect['count'].sum()).round(1)
transfers_minute_effect['mean']=transfers_minute_effect['mean'].multiply(100).round(1)
transfers_minute_effect.head(10).rename({'mean':'% of transfers which integrate in time','count':'Number of transfers in bucket','%':'% of transfers in bucket'},axis=1)

Unnamed: 0_level_0,% of transfers which integrate in time,Number of transfers in bucket,% of transfers in bucket
Min sender req complete time/m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,90.8,1625470,88.9
2.0,90.3,119337,6.5
3.0,88.2,31701,1.7
4.0,86.2,11854,0.6
5.0,84.1,6479,0.4
6.0,84.3,4709,0.3
7.0,83.4,3868,0.2
8.0,84.0,3357,0.2
9.0,82.2,1863,0.1
10.0,82.0,982,0.1


### Chance of eventual integration for transfers with only 2 vs 1 message

In [23]:
_120_days_in_seconds = 120 * 24 * 60 * 60
transfers_lasting_more_120_days_bool = transfers_with_time_messages["Total Time of Transfer Tracking/s"] >= _120_days_in_seconds
transfers_lasting_more_120_days = transfers_with_time_messages.copy().loc[transfers_lasting_more_120_days_bool]

In [24]:
integration_messages = [("requestor", "req complete ack", ''), ("requestor", "req complete ack", "15")]
def extract_integration_time(messages):
    integration_times = [message[3] for message in messages if (message[0:3] in integration_messages)]
    if len(integration_times) > 0:
        return float(min(integration_times))
    else:
        return np.nan

transfers_lasting_more_120_days["Integrated within 120 days"] = transfers_lasting_more_120_days["messages"].apply(lambda messages: extract_integration_time(messages) < _120_days_in_seconds)

In [25]:
transfers_lasting_more_120_days["Integrated within 120 days"].value_counts(dropna=False)

True     1041961
False      66716
Name: Integrated within 120 days, dtype: int64

In [30]:
fourteen_days_in_seconds=14 * 24 * 60 *60
def pending_message_pattern(messages):
    messages_within_14_days = tuple(message[0:3] for message in messages if float(message[3]) <= fourteen_days_in_seconds)
    if messages_within_14_days == (('requestor', 'req start', ''),):
        return "Single message pending pattern"
    elif messages_within_14_days == (('requestor', 'req start', ''), ('sender', 'req start ack', ''),):
        return "Two message pending pattern"
    return "Other pattern"

transfers_lasting_more_120_days["Message pattern"] = transfers_lasting_more_120_days["messages"].apply(pending_message_pattern)

In [31]:
transfers_message_pattern_integration_within_120_days = transfers_lasting_more_120_days.groupby("Message pattern").agg({"Integrated within 120 days": ["mean", "count"]})
transfers_message_pattern_integration_within_120_days = transfers_message_pattern_integration_within_120_days["Integrated within 120 days"]
transfers_message_pattern_integration_within_120_days = transfers_message_pattern_integration_within_120_days.rename({"mean": "% integrated within 120 days", "count": "Total number transfers"}, axis=1)
transfers_message_pattern_integration_within_120_days["% integrated within 120 days"] = transfers_message_pattern_integration_within_120_days["% integrated within 120 days"].multiply(100)
transfers_message_pattern_integration_within_120_days.round(2)

Unnamed: 0_level_0,% integrated within 120 days,Total number transfers
Message pattern,Unnamed: 1_level_1,Unnamed: 2_level_1
Other pattern,95.27,1092820
Single message pending pattern,6.26,5668
Two message pending pattern,4.36,10189


## Part B

**We also believe that** reclassifying these transfers as Failed, will classify the transfers with fatal Sender errors as failures, and therefore make the following redundant: 
**We will know this to be true when** we can see that any transfers that would be classified as Failures because they contain these errors, do not contain the Request Completed message. 

### Scope

1. Use 6 months of Spine Parquet files (Sept-2020 to Feb-2021 s3://prm-gp2gp-data-sandbox-dev/transfers-sample-5/)
2. Re-label transfers which would be considered failures due to sender error as “Failed due to Sender Error”
3. Add new status label for transfers not containing Request Completed Message
4. Compare change in status for all “Failed due to Sender Error Messages”

In [32]:
transfers["status"].value_counts()

Integrated                         1654865
Integrated Late                     102662
Pending                              66882
Failed Due To Sender Error Code      31217
Failed                               25617
Pending With Error                    7054
Name: status, dtype: int64

## Step 3

In [33]:
conversations_extended_interaction_messages_file_name = 's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages.parquet'
transfers_with_message_list = add_messages_parquet_to_transfers(conversations_extended_interaction_messages_file_name, transfers)

In [34]:
sender_req_completed_message = ('sender', 'req complete', '')

In [35]:
transfers_with_sender_req_completed_bool = transfers_with_message_list["messages"].apply(lambda messages: sender_req_completed_message in messages)
transfers_with_message_list_and_new_status = transfers_with_message_list.copy()
transfers_with_message_list_and_new_status["New Status"] = transfers_with_message_list_and_new_status["status"]
transfers_with_message_list_and_new_status = transfers_with_message_list_and_new_status.rename({"status": "Old Status"}, axis=1)
transfers_with_message_list_and_new_status.loc[~transfers_with_sender_req_completed_bool, "New Status"] = "Failed"

In [36]:
transfers_with_message_list_and_new_status.groupby(by=["Old Status", "New Status"]).agg({"conversation_id":"count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,conversation_id
Old Status,New Status,Unnamed: 2_level_1
Failed,Failed,20793
Failed Due To Sender Error Code,Failed,23363
Failed Due To Sender Error Code,Failed Due To Sender Error Code,12
Integrated,Integrated,1174129
Integrated Late,Integrated Late,72639
Pending,Failed,17695
Pending,Pending,31378
Pending With Error,Failed,2844
Pending With Error,Pending With Error,381


In [37]:
#pd.set_option('display.max_colwidth', None)
old_status_sender_error_failure_bool = transfers_with_message_list_and_new_status["Old Status"] == "Failed Due To Sender Error Code"
new_status_sender_error_failure_bool = transfers_with_message_list_and_new_status["New Status"] == "Failed Due To Sender Error Code"
transfers_with_message_list_and_new_status.loc[old_status_sender_error_failure_bool & new_status_sender_error_failure_bool, "messages"]

294226     ((requestor, req start, ), (sender, req start ...
440570     ((requestor, req start, ), (sender, req start ...
543923     ((requestor, req start, ), (sender, req start ...
562441     ((requestor, req start, ), (sender, req start ...
739378     ((requestor, req start, ), (sender, req start ...
922527     ((requestor, req start, ), (sender, req comple...
1001057    ((requestor, req start, ), (sender, req start ...
1011256    ((requestor, req start, ), (sender, req start ...
1053281    ((requestor, req start, ), (sender, req start ...
1059123    ((requestor, req start, ), (sender, req start ...
1075010    ((requestor, req start, ), (sender, req start ...
1101613    ((requestor, req start, ), (sender, req start ...
Name: messages, dtype: object

In [38]:
transfers_with_message_list_and_new_status

Unnamed: 0,conversation_id,sla_duration,requesting_practice_asid,sending_practice_asid,sender_error_code,final_error_codes,intermediate_error_codes,Old Status,date_requested,date_completed,requesting_supplier_asid,requesting_supplier,requesting_ods_code,requesting_practice_name,sending_supplier_asid,sending_supplier,sending_ods_code,sending_practice_name,messages,New Status
0,E80D906D-E1CA-47E4-9689-4417FB75A0E3,56020.0,323917613040,386279053048,,[nan],[],Integrated,2020-09-30 17:00:30.074,2020-10-01 08:36:56.218,323917613040,EMIS,M81083,HOLLYOAKS MEDICAL CENTRE,386279053048,EMIS,M81064,HOLLYWOOD MEDICAL CENTRE,"((requestor, req start, ), (sender, req comple...",Integrated
1,3F7FD0BC-32C6-4C4F-81E6-8AB7FB70DFF3,399759.0,792911523019,200000001557,,[nan],[],Integrated,2020-09-30 15:03:21.906,2020-10-05 08:06:24.209,792911523019,EMIS,M83670,KEELE PRACTICE,200000001557,EMIS,F82011,ST EDWARDS MEDICAL CENTRE,"((requestor, req start, ), (sender, req comple...",Integrated
2,60F35991-C3AE-4AFC-94FD-B1EE14AB183B,163059.0,981416634047,736940363012,,[nan],[],Integrated,2020-09-30 17:02:16.126,2020-10-02 14:21:23.556,981416634047,EMIS,A83011,CLAYPATH & UNIVERSITY MEDICAL GROUP,736940363012,EMIS,B86110,LEEDS STUDENT MEDICAL PRACTICE,"((requestor, req start, ), (sender, req comple...",Integrated
3,E6A19016-1E80-4F12-B127-90C3DC09A7ED,1442.0,310097028016,097881534040,,[nan],[],Integrated,2020-09-30 17:01:50.754,2020-09-30 17:27:47.132,310097028016,EMIS,P81710,TARLETON GROUP PRACTICE,097881534040,EMIS,P81185,RIVERSIDE MEDICAL CENTRE,"((requestor, req start, ), (sender, req comple...",Integrated
4,DF01B9A0-033E-11EB-AE71-C563C6B51281,154337.0,200000001906,907503130011,,[nan],[],Integrated,2020-09-30 17:03:33.451,2020-10-02 11:56:02.535,200000001906,TPP,P92648,SLAG LANE MC,907503130011,EMIS,B82080,MY HEALTH GROUP,"((requestor, req start, ), (sender, req comple...",Integrated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1343229,75173BD9-37A6-4E15-856E-F173317D01E6,,994133653042,950140416013,,[],[],Pending,2021-02-01 10:38:41.327,NaT,994133653042,EMIS,F86008,GANTS HILL MEDICAL CENTRE,950140416013,TPP,F81110,TILBURY HEALTH CENTRE,"((requestor, req start, ),)",Failed
1343230,A2EB628E-8074-4943-B6E9-E59F6A94C792,615.0,345196753049,181445352014,,[nan],[],Integrated,2021-02-01 09:14:17.209,2021-02-01 09:24:44.125,345196753049,EMIS,G84011,EDEN PARK SURGERY,181445352014,EMIS,G84018,CORNERWAYS SURGERY,"((requestor, req start, ), (sender, req comple...",Integrated
1343231,3A0FD1DF-7A30-4B7C-B845-1A353DA58F0B,,200000001410,227784357013,,[],[],Pending,2021-02-01 08:59:25.769,NaT,200000001410,EMIS,F85666,DR ME SILVER'S PRACTICE,227784357013,EMIS,F85645,MYDDLETON ROAD SURGERY,"((requestor, req start, ),)",Failed
1343232,AEE4F29B-9A97-4BDF-9E40-E96D171034DE,4706.0,200000000572,272774518018,,[nan],[],Integrated,2021-02-01 09:04:53.466,2021-02-01 10:23:29.529,200000000572,EMIS,K81020,CLAREMONT HOLYPORT SURGERY,272774518018,EMIS,K81630,SOUTH MEADOW SURGERY,"((requestor, req start, ), (sender, req comple...",Integrated


## Addendum - How many transfers contain COPC messages?

In [45]:
transfers_with_time_messages['Contains COPC']=transfers_with_time_messages['messages'].apply(lambda messages: True in [message[1]=='COPC' for message in messages])
COPC_table=transfers_with_time_messages.pivot_table(index='status',columns='Contains COPC',values='conversation_id',aggfunc='count').astype(int)
COPC_table.loc['Total']=COPC_table.sum()
COPC_table['% of status which contain COPC']=(COPC_table[True]/COPC_table.sum(axis=1)).multiply(100).round(1)
COPC_table

Contains COPC,False,True,% of status which contain COPC
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Failed,6691,18926,73.9
Failed Due To Sender Error Code,31211,6,0.0
Integrated,952292,702573,42.5
Integrated Late,60848,41814,40.7
Pending,49187,17695,26.5
Pending With Error,6744,310,4.4
Total,1106973,781324,41.4
