In [1]:
import pandas as pd 
import numpy as np

# Import transfer files to extract whether message creator is sender or requester
# Using data generated from branch PRMT-1742-duplicates-analysis.
# This is needed to correctly handle duplicates.
# Once the upstream pipeline has a fix for duplicate EHRs, then we can go back to using the main output.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-sample-5/"
transfer_files = [
    "2020-9-transfers.parquet",
    "2020-10-transfers.parquet",
    "2020-11-transfers.parquet",
    "2020-12-transfers.parquet",
    "2021-1-transfers.parquet",
    "2021-2-transfers.parquet"
]

transfer_input_files = [transfer_file_location + f for f in transfer_files]
transfers_raw = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))

# In the data from the PRMT-1742-duplicates-analysis branch, these columns have been added , but contain only empty values.
transfers_raw = transfers_raw.drop(["sending_supplier", "requesting_supplier"], axis=1)
transfers = transfers_raw.copy()

# Correctly interpret certain sender errors as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
# Step Two: reclassifying the relevant transfers with pending sender error codes to FAILED DUE TO SENDER ERROR CODE status for comparison
pending_sender_error_codes=[6,7,10,24,30,23,14,99]
transfers_with_pending_sender_code_bool=transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool=transfers['status']=='PENDING_WITH_ERROR'
transfers_which_need_pending_to_failure_change_bool=transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
transfers.loc[transfers_which_need_pending_to_failure_change_bool,'status']='FAILED DUE TO SENDER ERROR CODE'

# Add integrated Late status
eight_days_in_seconds=8*24*60*60
transfers_after_sla_bool=transfers['sla_duration']>eight_days_in_seconds
transfers_with_integrated_bool=transfers['status']=='INTEGRATED'
transfers_integrated_late_bool=transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool,'status']='INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfers status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds=28*24*60*60
transfers_after_month_bool=transfers['sla_duration']>twenty_eight_days_in_seconds
transfers_pending_at_month_bool=transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool,'status']='PENDING'
transfers_with_early_error_bool=(~transfers.loc[:,'sender_error_code'].isna()) |(~transfers.loc[:,'intermediate_error_codes'].apply(len)>0)
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool,'status']='PENDING_WITH_ERROR'

# Supplier name mapping
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

# Generate ASID lookup that contains all the most recent entry for all ASIDs encountered
asid_file_location = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/"
asid_files = [
    "asidLookup-Nov-2020.csv.gz",
    "asidLookup-Dec-2020.csv.gz",
    "asidLookup-Jan-2021.csv.gz",
    "asidLookup-Feb-2021.csv.gz",
    "asidLookup-Mar-2021.csv.gz",
    "asidLookup-Apr-2021.csv.gz"
]
asid_lookup_files = [asid_file_location + f for f in asid_files]
asid_lookup = pd.concat((
    pd.read_csv(f)
    for f in asid_lookup_files
))
asid_lookup = asid_lookup.drop_duplicates().groupby("ASID").last().reset_index()
lookup = asid_lookup[["ASID", "MName", "NACS","OrgName"]]

transfers = transfers.merge(lookup, left_on='requesting_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code','OrgName':'requesting_practice_name'}, axis=1)
transfers = transfers.merge(lookup, left_on='sending_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code','OrgName':'sending_practice_name'}, axis=1)

transfers["sending_supplier"] = transfers["sending_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())

# Making the status to be more human readable here
transfers["status"] = transfers["status"].str.replace("_", " ").str.title()

COPC_tag = "-reduced-COPCs" 
conversations_extended_interaction_messages=pd.read_parquet(f's3://prm-gp2gp-data-sandbox-dev/extra-fields-data-from-splunk/Sept_20_Feb_21_conversations_extended_interaction_messages{COPC_tag}.parquet')
# turning messages from list of list to tuple of tuples (since they are hasable)
conversations_extended_interaction_messages["messages"]=conversations_extended_interaction_messages["messages"].apply(lambda message_list: tuple([tuple(message) for message in message_list]))
# Attach this message list to the transfers dataframe
transfers_with_message_list = transfers.merge(conversations_extended_interaction_messages, left_on="conversation_id", right_index=True)

### Part 1: Duplicate transfers with final acknowledgment
What proportion of transfers end in just error code 12 and, therefore, the point of failure may be ambiguous?

In [5]:
duplicates_only_list=transfers['final_error_codes'].apply(lambda error_list: set(error_list)==set([12]))

In [27]:
duplicates_only_list.value_counts().sum()

1343234

In [9]:
duplicates_only_list.mean()*100

0.14733099370623437

### Investigation 1.1: What do known duplicate behaviours look like?

In [22]:
transfers_duplicates_investigation=transfers_with_message_list.copy()
transfers_duplicates_investigation['Contains Duplicate Error']=transfers_duplicates_investigation['final_error_codes'].apply(lambda error_list: 12 in error_list)


In [26]:
transfers_duplicates_investigation['Number of req start messages']=transfers_duplicates_investigation['messages'].apply(lambda message_list: sum([message[1]=='req start' for message in message_list]))
transfers_duplicates_investigation.pivot_table(index='Contains Duplicate Error',columns='Number of req start messages',aggfunc='count',values='conversation_id')

Number of req start messages,1,2
Contains Duplicate Error,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1322497.0,58.0
True,20679.0,


In [49]:
transfers_duplicates_investigation['Number of req complete messages']=transfers_duplicates_investigation['messages'].apply(lambda message_list: sum([message[1]=='req complete' for message in message_list]))
transfers_duplicates_investigation.pivot_table(index=['status','Contains Duplicate Error'],columns='Number of req complete messages',aggfunc='count',values='conversation_id')

Unnamed: 0_level_0,Number of req complete messages,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,23,25,34,36,38
status,Contains Duplicate Error,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Failed,False,16.0,19737.0,543.0,122.0,32.0,13.0,3.0,6.0,1.0,1.0,...,,,1.0,,,,,,,
Failed,True,1.0,16.0,175.0,65.0,26.0,13.0,9.0,2.0,1.0,2.0,...,,,,,,,,,1.0,
Failed Due To Sender Error Code,False,23374.0,1.0,,,,,,,,,...,,,,,,,,,,
Integrated,False,165.0,1158629.0,484.0,8.0,1.0,1.0,,,,,...,,,,,,,,,,
Integrated,True,3.0,128.0,12172.0,1818.0,438.0,143.0,55.0,35.0,17.0,7.0,...,1.0,1.0,,1.0,1.0,2.0,,,,
Integrated Late,False,6.0,69506.0,27.0,,,,,,,,...,,,,,,,,,,
Integrated Late,True,,19.0,2511.0,399.0,112.0,42.0,9.0,5.0,1.0,,...,,,,,,,,,,
Pending,False,18098.0,28405.0,94.0,29.0,10.0,6.0,4.0,1.0,2.0,1.0,...,,,,,,,,,,1.0
Pending,True,,203.0,1692.0,327.0,104.0,35.0,25.0,12.0,3.0,7.0,...,,,,1.0,,,1.0,1.0,,
Pending With Error,False,2881.0,336.0,6.0,,,,,,,,...,,,,,,,,,,


In [50]:
transfers_duplicates_investigation['Number of req start ack messages']=transfers_duplicates_investigation['messages'].apply(lambda message_list: sum([message[1]=='req start ack' for message in message_list]))
transfers_duplicates_investigation.pivot_table(index=['status','Contains Duplicate Error'],columns='Number of req start ack messages',aggfunc='count',values='conversation_id')

Unnamed: 0_level_0,Number of req start ack messages,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,23,25,34,37,38,5959
status,Contains Duplicate Error,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Failed,False,117.0,19207.0,1004.0,114.0,18.0,7.0,1.0,3.0,2.0,1.0,...,,1.0,,,,,,,,
Failed,True,,22.0,158.0,64.0,32.0,20.0,5.0,5.0,1.0,2.0,...,,,,,,,,1.0,,
Failed Due To Sender Error Code,False,9.0,23145.0,208.0,9.0,4.0,,,,,,...,,,,,,,,,,
Integrated,False,1049.0,1156797.0,1268.0,155.0,18.0,1.0,,,,,...,,,,,,,,,,
Integrated,True,1.0,301.0,11948.0,1854.0,448.0,147.0,57.0,36.0,17.0,7.0,...,1.0,,1.0,1.0,2.0,,,,,
Integrated Late,False,65.0,69328.0,137.0,8.0,1.0,,,,,,...,,,,,,,,,,
Integrated Late,True,,104.0,2422.0,400.0,115.0,42.0,9.0,5.0,,1.0,...,,,,,,,,,,
Pending,False,6212.0,40263.0,121.0,29.0,10.0,6.0,4.0,1.0,2.0,1.0,...,,,,,,,,,1.0,1.0
Pending,True,,310.0,1595.0,318.0,103.0,33.0,27.0,12.0,3.0,7.0,...,,,1.0,,,1.0,1.0,,,
Pending With Error,False,19.0,3123.0,75.0,5.0,1.0,,,,,,...,,,,,,,,,,


### Part 2: Possible Duplicates without any final acknowledgment
What about transfers which never reach the point of final acknowledgment - can we detect any duplicate behaviour in them? 
- To what degree is this an issue? 

In [25]:
transfers_duplicates_investigation

Unnamed: 0,conversation_id,sla_duration,requesting_practice_asid,sending_practice_asid,sender_error_code,final_error_codes,intermediate_error_codes,status,date_requested,date_completed,...,requesting_supplier,requesting_ods_code,requesting_practice_name,sending_supplier_asid,sending_supplier,sending_ods_code,sending_practice_name,messages,Contains Duplicate Error,Number of req start messages
0,E80D906D-E1CA-47E4-9689-4417FB75A0E3,56020.0,323917613040,386279053048,,[nan],[],Integrated,2020-09-30 17:00:30.074,2020-10-01 08:36:56.218,...,EMIS,M81083,HOLLYOAKS MEDICAL CENTRE,386279053048,EMIS,M81064,HOLLYWOOD MEDICAL CENTRE,"((requestor, req start, ), (sender, req comple...",False,1
1,3F7FD0BC-32C6-4C4F-81E6-8AB7FB70DFF3,399759.0,792911523019,200000001557,,[nan],[],Integrated,2020-09-30 15:03:21.906,2020-10-05 08:06:24.209,...,EMIS,M83670,KEELE PRACTICE,200000001557,EMIS,F82011,ST EDWARDS MEDICAL CENTRE,"((requestor, req start, ), (sender, req comple...",False,1
2,60F35991-C3AE-4AFC-94FD-B1EE14AB183B,163059.0,981416634047,736940363012,,[nan],[],Integrated,2020-09-30 17:02:16.126,2020-10-02 14:21:23.556,...,EMIS,A83011,CLAYPATH & UNIVERSITY MEDICAL GROUP,736940363012,EMIS,B86110,LEEDS STUDENT MEDICAL PRACTICE,"((requestor, req start, ), (sender, req comple...",False,1
3,E6A19016-1E80-4F12-B127-90C3DC09A7ED,1442.0,310097028016,097881534040,,[nan],[],Integrated,2020-09-30 17:01:50.754,2020-09-30 17:27:47.132,...,EMIS,P81710,TARLETON GROUP PRACTICE,097881534040,EMIS,P81185,RIVERSIDE MEDICAL CENTRE,"((requestor, req start, ), (sender, req comple...",False,1
4,DF01B9A0-033E-11EB-AE71-C563C6B51281,154337.0,200000001906,907503130011,,[nan],[],Integrated,2020-09-30 17:03:33.451,2020-10-02 11:56:02.535,...,TPP,P92648,SLAG LANE MC,907503130011,EMIS,B82080,MY HEALTH GROUP,"((requestor, req start, ), (sender, req comple...",False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1343229,75173BD9-37A6-4E15-856E-F173317D01E6,,994133653042,950140416013,,[],[],Pending,2021-02-01 10:38:41.327,NaT,...,EMIS,F86008,GANTS HILL MEDICAL CENTRE,950140416013,TPP,F81110,TILBURY HEALTH CENTRE,"((requestor, req start, ),)",False,1
1343230,A2EB628E-8074-4943-B6E9-E59F6A94C792,615.0,345196753049,181445352014,,[nan],[],Integrated,2021-02-01 09:14:17.209,2021-02-01 09:24:44.125,...,EMIS,G84011,EDEN PARK SURGERY,181445352014,EMIS,G84018,CORNERWAYS SURGERY,"((requestor, req start, ), (sender, req comple...",False,1
1343231,3A0FD1DF-7A30-4B7C-B845-1A353DA58F0B,,200000001410,227784357013,,[],[],Pending,2021-02-01 08:59:25.769,NaT,...,EMIS,F85666,DR ME SILVER'S PRACTICE,227784357013,EMIS,F85645,MYDDLETON ROAD SURGERY,"((requestor, req start, ),)",False,1
1343232,AEE4F29B-9A97-4BDF-9E40-E96D171034DE,4706.0,200000000572,272774518018,,[nan],[],Integrated,2021-02-01 09:04:53.466,2021-02-01 10:23:29.529,...,EMIS,K81020,CLAREMONT HOLYPORT SURGERY,272774518018,EMIS,K81630,SOUTH MEADOW SURGERY,"((requestor, req start, ), (sender, req comple...",False,1


## Are duplicates always a response to Core EHR?

In [45]:
list_duplicate_responses=transfers_duplicates_investigation['messages'].apply(lambda message_list: [message[1] for message in message_list if message[2]=='12'])
list_duplicate_responses[list_duplicate_responses.apply(len)>0].explode().value_counts()

req complete ack    26357
 ack                    6
Name: messages, dtype: int64

In [46]:
list_responses=transfers_duplicates_investigation['messages'].apply(lambda message_list: [message[1] for message in message_list if len(message[2])>0])
list_responses[list_duplicate_responses.apply(len)>0].explode().value_counts()

req complete ack    27872
req start ack         272
COPC ack              104
 ack                   40
Name: messages, dtype: int64

## Do transfers with duplicates tend to have more COPCs?

In [54]:
transfers_duplicates_investigation['Number of COPC messages']=transfers_duplicates_investigation['messages'].apply(lambda message_list: sum([message[1]=='COPC' for message in message_list]))
transfers_duplicates_investigation.groupby(by=['status','Contains Duplicate Error']).agg({'Number of COPC messages':['mean','std']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of COPC messages,Number of COPC messages
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
status,Contains Duplicate Error,Unnamed: 2_level_2,Unnamed: 3_level_2
Failed,False,1.624426,1.203649
Failed,True,2.571429,3.361994
Failed Due To Sender Error Code,False,0.0,0.0
Integrated,False,0.863238,1.018553
Integrated,True,1.235766,1.650044
Integrated Late,False,0.827449,1.001455
Integrated Late,True,1.074839,1.462739
Pending,False,0.538584,1.708014
Pending,True,1.071871,2.98519
Pending With Error,False,0.217499,4.4501


In [58]:
# Could this just be because transfers that require COPC are more likely to duplicate
transfers_duplicates_investigation['Contains COPC']=transfers_duplicates_investigation['Number of COPC messages']>0

transfers_duplicates_investigation.groupby(by=['status','Contains Duplicate Error']).agg({'Contains COPC':'mean'})*100

Unnamed: 0_level_0,Unnamed: 1_level_0,Contains COPC
status,Contains Duplicate Error,Unnamed: 2_level_1
Failed,False,75.832601
Failed,True,69.84127
Failed Due To Sender Error Code,False,0.0
Integrated,False,42.886323
Integrated,True,39.997305
Integrated Late,False,41.112182
Integrated Late,True,38.354839
Pending,False,25.542313
Pending,True,37.050805
Pending With Error,False,4.623022


In [57]:
# Just those with COPC
transfers_duplicates_investigation[transfers_duplicates_investigation['Contains COPC']].groupby(by=['status','Contains Duplicate Error']).agg({'Number of COPC messages':['mean','std']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of COPC messages,Number of COPC messages
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
status,Contains Duplicate Error,Unnamed: 2_level_2,Unnamed: 3_level_2
Failed,False,2.142121,0.895253
Failed,True,3.681818,3.478392
Integrated,False,2.012853,0.324138
Integrated,True,3.089623,1.038786
Integrated Late,False,2.012662,0.232354
Integrated Late,True,2.802355,0.857995
Pending,False,2.108593,2.848054
Pending,True,2.892977,4.335244
Pending With Error,False,4.704698,20.245288
Pending With Error,True,7.0,


Looking at just integrated on time transfers:
- Transfers with Error Code 12 have,on average, 1.24 COPC messages per transfer compared with 0.86 for transfers without Error Code 12
- This is despite the fact that slight fewer transfers with Error Code 12 (40.0%) have any COPC messages at all compared with transfers without Error Code 12 (42.89%)
- Even if we only look at transfers which contain any COPC messages, we see that transfers with Error Code 12 have, on average, 3.09 COPC messages per transfer, compared to just 2.01 for transfers without Error Code 12

**This suggests that where the duplicate issue does occur, duplicate COPC messages are produced.**

The alternative (assuming no methodolgy error(!)) is that the chance of duplication is positively correlated to number of COPC messages where COPC is required, yet, paradoxically, is still slightly more likely to occur when there is no COPC required at all. Given the complexity of GP2GP, this may be possible...

Note 
- this analysis does not remove the initial COPC message (where it's first requested by the receiving practice), which affects the values but should not affect the outcome.
- This only looks at Integrated on Time for the sake of clarity but other status can be inspected though do not seem to affect the outcome. 