# PRMT-2040 Investigate EMIS placeholder data in MI

In [1]:
import pandas as pd
import numpy as np

# Import Transfers Data

In [2]:
# Import transfer files to extract whether message creator is sender or requester
# Using data generated from branch PRMT-1742-duplicates-analysis.
# This is needed to correctly handle duplicates.
# Once the upstream pipeline has a fix for duplicate EHRs, then we can go back to using the main output.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet"
]

transfer_input_files = [transfer_file_location + f for f in transfer_files]
transfers_raw = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))

# In the data from the PRMT-1742-duplicates-analysis branch, these columns have been added , but contain only empty values.
transfers_raw = transfers_raw.drop(["sending_supplier", "requesting_supplier"], axis=1)

# Given the findings in PRMT-1742 - many duplicate EHR errors are misclassified, the below reclassifies the relevant data
has_at_least_one_successful_integration_code = lambda errors: any((np.isnan(e) or e==15 for e in errors))
successful_transfers_bool = transfers_raw['request_completed_ack_codes'].apply(has_at_least_one_successful_integration_code)
transfers = transfers_raw.copy()
transfers.loc[successful_transfers_bool, "status"] = "INTEGRATED"

# Correctly interpret certain sender errors as failed.
# This is explained in PRMT-1974. Eventually this will be fixed upstream in the pipeline.
pending_sender_error_codes=[6,7,10,24,30,23,14,99]
transfers_with_pending_sender_code_bool=transfers['sender_error_code'].isin(pending_sender_error_codes)
transfers_with_pending_with_error_bool=transfers['status']=='PENDING_WITH_ERROR'
transfers_which_need_pending_to_failure_change_bool=transfers_with_pending_sender_code_bool & transfers_with_pending_with_error_bool
transfers.loc[transfers_which_need_pending_to_failure_change_bool,'status']='FAILED'

# Add integrated Late status
eight_days_in_seconds=8*24*60*60
transfers_after_sla_bool=transfers['sla_duration']>eight_days_in_seconds
transfers_with_integrated_bool=transfers['status']=='INTEGRATED'
transfers_integrated_late_bool=transfers_after_sla_bool & transfers_with_integrated_bool
transfers.loc[transfers_integrated_late_bool,'status']='INTEGRATED LATE'

# If the record integrated after 28 days, change the status back to pending.
# This is to handle each month consistently and to always reflect a transfers status 28 days after it was made.
# TBD how this is handled upstream in the pipeline
twenty_eight_days_in_seconds=28*24*60*60
transfers_after_month_bool=transfers['sla_duration']>twenty_eight_days_in_seconds
transfers_pending_at_month_bool=transfers_after_month_bool & transfers_integrated_late_bool
transfers.loc[transfers_pending_at_month_bool,'status']='PENDING'
transfers_with_early_error_bool=(~transfers.loc[:,'sender_error_code'].isna()) |(~transfers.loc[:,'intermediate_error_codes'].apply(len)>0)
transfers.loc[transfers_with_early_error_bool & transfers_pending_at_month_bool,'status']='PENDING_WITH_ERROR'

# Supplier name mapping
supplier_renaming = {
    "EGTON MEDICAL INFORMATION SYSTEMS LTD (EMIS)":"EMIS",
    "IN PRACTICE SYSTEMS LTD":"Vision",
    "MICROTEST LTD":"Microtest",
    "THE PHOENIX PARTNERSHIP":"TPP",
    None: "Unknown"
}

asid_lookup_file = "s3://prm-gp2gp-data-sandbox-dev/asid-lookup/asidLookup-Mar-2021.csv.gz"
asid_lookup = pd.read_csv(asid_lookup_file)
lookup = asid_lookup[["ASID", "MName", "NACS","OrgName"]]

transfers = transfers.merge(lookup, left_on='requesting_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'requesting_supplier', 'ASID': 'requesting_supplier_asid', 'NACS': 'requesting_ods_code','OrgName':'requesting_practice_name'}, axis=1)
transfers = transfers.merge(lookup, left_on='sending_practice_asid',right_on='ASID',how='left')
transfers = transfers.rename({'MName': 'sending_supplier', 'ASID': 'sending_supplier_asid', 'NACS': 'sending_ods_code','OrgName':'sending_practice_name'}, axis=1)

transfers["sending_supplier"] = transfers["sending_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())
transfers["requesting_supplier"] = transfers["requesting_supplier"].replace(supplier_renaming.keys(), supplier_renaming.values())

# Import MI SR Data

In [3]:
MI_Data=pd.read_csv("s3://prm-gp2gp-data-sandbox-dev/MI_athena_outputs/MI_SR_Sept_20_Feb_21.csv")
MI_Data['conversation_id']=MI_Data['ConversationID'].str.upper()
MI_Data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(1502251, 34)

### To what degree does our MI set match our transfers (Spine) set?

In [4]:
MI_conversations=set(MI_Data['conversation_id'].values)
spine_conversations=set(transfers['conversation_id'].values)
overlap_conversations=MI_conversations.intersection(spine_conversations)
print(f'Number of MI conversations from Athena: {len(MI_conversations)}')
print(f'Number of transfer conversations from Spine: {len(spine_conversations)}')
print(f'Number of overlapping conversations between Athena and Spine: {len(overlap_conversations)}')
print(f'How many conversations are present in Athena MI but missing from Spine Data: {len(MI_conversations)-len(overlap_conversations)}')

print('Senders of conversations which are present in Spine Data but missing from Athena MI')
non_mi_conversations_bool=~transfers['conversation_id'].isin(MI_conversations)
transfers.loc[non_mi_conversations_bool].groupby(['sending_supplier']).agg({'conversation_id':'count'})

Number of MI conversations from Athena: 1310479
Number of transfer conversations from Spine: 1343234
Number of overlapping conversations between Athena and Spine: 1309920
How many conversations are present in Athena MI but missing from Spine Data: 559
Senders of conversations which are present in Spine Data but missing from Athena MI


Unnamed: 0_level_0,conversation_id
sending_supplier,Unnamed: 1_level_1
EMIS,1320
Microtest,1029
TPP,386
Unknown,1487
Vision,29092


### Does the Athena MI pull match an equivalent pull from Splunk?
We found a discrepency between Splunk and Athena. Splunk was 1,045,674 results, while Athena was 1,502,251.
We investigate why below

In [5]:
# Investigation of whether all the same source files were used
file_names_athena = pd.read_csv("file-names-athena.csv")
file_names_splunk = pd.read_csv("file-names.csv")
print(f'Number of Splunk source files: {file_names_splunk.shape[0]}')
print(f'Number of Athena source files: {file_names_athena.shape[0]}')
file_names_splunk_set = set(file_names_splunk["source"].values)
file_names_athena_set = set(file_names_athena["path"].values)
print(f'Intersection of Files: {len(file_names_athena_set.intersection(file_names_splunk_set))}')

Number of Splunk source files: 8888
Number of Athena source files: 55249
Intersection of Files: 8888


In [6]:
comparison_frame=file_names_athena.copy()
comparison_frame['Is in Splunk']=comparison_frame['path'].isin(file_names_splunk_set)
comparison_frame['Contains Whitespace']=comparison_frame['path'].str.contains(" ")
comparison_frame.pivot_table(index='Is in Splunk',columns='Contains Whitespace',aggfunc='count',values='_col1').fillna(0).astype(int)

Contains Whitespace,False,True
Is in Splunk,Unnamed: 1_level_1,Unnamed: 2_level_1
False,160,46201
True,8888,0


Further investigation of the same queries but just for 1st March to 1st May showed they both produced the same number of events (ie rows).
So the fix that took place in February (replacing whitespaces in filenames with underscores) has corrected this issue but has not been applied retroactively. 
The 160 files without whitespaces remain a mystery and are listed below

In [7]:
not_in_splunk_no_whitespace_bool=(~comparison_frame['Is in Splunk']) & (~comparison_frame['Contains Whitespace'])
comparison_frame.loc[not_in_splunk_no_whitespace_bool,"path"]

1545     s3://prm-gp2gp-mi-data-prod-v2/2020/12/06/2eba...
1973     s3://prm-gp2gp-mi-data-prod-v2/2021/01/03/9343...
2052     s3://prm-gp2gp-mi-data-prod-v2/2020/12/06/c90f...
2408     s3://prm-gp2gp-mi-data-prod-v2/2020/12/20/779a...
2683     s3://prm-gp2gp-mi-data-prod-v2/2020/12/27/9ff4...
                               ...                        
53424    s3://prm-gp2gp-mi-data-prod-v2/2021/01/03/8e83...
53542    s3://prm-gp2gp-mi-data-prod-v2/2020/12/13/43a7...
53956    s3://prm-gp2gp-mi-data-prod-v2/2021/01/03/f0d6...
54181    s3://prm-gp2gp-mi-data-prod-v2/2021/01/03/15bb...
55173    s3://prm-gp2gp-mi-data-prod-v2/2021/01/10/e314...
Name: path, Length: 160, dtype: object

### Does the Athena MI Data have repeating conversations?

In [8]:
# How often do conversations repeat but with different values?
conversation_repeat_table=pd.DataFrame(MI_Data.drop_duplicates()['conversation_id'].value_counts().value_counts().rename('Number of Transfers'))
print(f'% of conversations with different versions: {round(100*conversation_repeat_table.drop(1).sum()/conversation_repeat_table.sum(),2)}%')
conversation_repeat_table

% of conversations with different versions: Number of Transfers    8.52
dtype: float64%


Unnamed: 0,Number of Transfers
1,1198815
2,109746
3,1786
4,85
5,26
6,9
7,5
8,3
10,2
9,2


In [9]:
# For just the placeholder data, how often do conversations repeat with different placeholder data?
placeholder_columns=['PlaceholdersFileTypeUnsupported', 'PlaceholdersFileDeleted',
       'PlaceholdersFileNotFound', 'PlaceholdersFileLocked',
       'PlaceholdersUndeterminedReason']
data_of_interest=MI_Data.loc[:,placeholder_columns +['conversation_id']].drop_duplicates()
relevant_conversation_repeat_table=pd.DataFrame(data_of_interest['conversation_id'].value_counts().value_counts().rename('Number of Transfers'))
print(f'% of conversations with different versions: {round(100*relevant_conversation_repeat_table.drop(1).sum()/relevant_conversation_repeat_table.sum(),3)}%')

relevant_conversation_repeat_table

% of conversations with different versions: Number of Transfers    0.005
dtype: float64%


Unnamed: 0,Number of Transfers
1,1310418
2,61


# Parse out EMIS placeholders

In [10]:
placeholder_data=MI_Data.loc[:,placeholder_columns +['conversation_id']].drop_duplicates()
placeholder_data=placeholder_data.fillna(0).groupby('conversation_id').agg('max')

EMIS_sender_conversation_ids=transfers.loc[transfers['sending_supplier']=='EMIS','conversation_id'].values

EMIS_sender_MI_bool=placeholder_data.index.isin(EMIS_sender_conversation_ids)
EMIS_sender_MI_data=placeholder_data[EMIS_sender_MI_bool]

quantity_EMIS_transfers=EMIS_sender_MI_data.shape[0]

print(f'Number of EMIS conversations with MI placeholder data: {quantity_EMIS_transfers}')
print(f'% of EMIS conversations with placeholder files: {round((EMIS_sender_MI_data.sum(axis=1)>0).mean()*100,1)}%')
print(f'Average Number of Placeholder files per transfer: {round(EMIS_sender_MI_data.sum(axis=1).mean(),1)}')
print(f'Maximum Number of Placeholder files for a  transfer: {round(EMIS_sender_MI_data.sum(axis=1).max())}')

Number of EMIS conversations with MI placeholder data: 1063578
% of EMIS conversations with placeholder files: 16.7%
Average Number of Placeholder files per transfer: 2.1
Maximum Number of Placeholder files for a  transfer: 1465


In [11]:
pd.concat([pd.DataFrame(EMIS_sender_MI_data[pc_column].value_counts(dropna=False)) for pc_column in placeholder_columns],axis=1).fillna(0).div(quantity_EMIS_transfers).multiply(100).round(2).head(5)

Unnamed: 0,PlaceholdersFileTypeUnsupported,PlaceholdersFileDeleted,PlaceholdersFileNotFound,PlaceholdersFileLocked,PlaceholdersUndeterminedReason
0.0,86.97,100.0,97.32,100.0,98.55
1.0,2.1,0.0,1.36,0.0,0.73
2.0,1.32,0.0,0.49,0.0,0.25
3.0,1.01,0.0,0.24,0.0,0.12
4.0,0.78,0.0,0.15,0.0,0.08


In [12]:
placeholder_counts_table=pd.DataFrame(EMIS_sender_MI_data.sum(axis=1).value_counts().rename('Number of Transfers'))
placeholder_counts_table=placeholder_counts_table.reset_index().rename({'index':'Number of Placeholder Files'},axis=1)
placeholder_counts_table=placeholder_counts_table.astype(int).sort_values(by='Number of Placeholder Files')
placeholder_counts_table['% of Transfers']=(placeholder_counts_table['Number of Transfers']/quantity_EMIS_transfers).multiply(100)
placeholder_counts_table['Total Files']=placeholder_counts_table['Number of Placeholder Files']*placeholder_counts_table['Number of Transfers']

In [13]:
label_list=[]
def add_labels(min_number,label):
    placeholder_counts_table.loc[placeholder_counts_table['Number of Placeholder Files']>=min_number,'Placeholder Files']=label
    label_list.append(label)
add_labels(0,'No Files')
add_labels(1,'1 File')
add_labels(2,'2-3 Files')
add_labels(4,'4-10 Files')
add_labels(11,'11-50 Files')
add_labels(51,'>50 Files')
placeholder_distribution_table=placeholder_counts_table.groupby('Placeholder Files').agg('sum').drop('Number of Placeholder Files',axis=1)
placeholder_distribution_table=placeholder_distribution_table.loc[label_list]

no_transfers_with_placeholders=placeholder_distribution_table.drop('No Files').sum()['Number of Transfers']
placeholder_distribution_table['% of Placeholder Transfers']=(placeholder_distribution_table['Number of Transfers']/no_transfers_with_placeholders).multiply(100)
placeholder_distribution_table.loc['No Files','% of Placeholder Transfers']=np.nan

total_files=placeholder_distribution_table['Total Files'].sum()
placeholder_distribution_table['% of Total Files']=placeholder_distribution_table['Total Files'].multiply(100/total_files)

placeholder_distribution_table.round(2)

Unnamed: 0_level_0,Number of Transfers,% of Transfers,Total Files,% of Placeholder Transfers,% of Total Files
Placeholder Files,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No Files,886327,83.33,0,,0.0
1 File,40151,3.78,40151,22.65,1.81
2-3 Files,35543,3.34,85483,20.05,3.86
4-10 Files,46942,4.41,299111,26.48,13.5
11-50 Files,45931,4.32,1032903,25.91,46.6
>50 Files,8684,0.82,758805,4.9,34.24


## Placeholders for transfers with different statuses

In [14]:
transfer_statuses = transfers[["conversation_id", "status"]]

In [15]:
placeholder_total = EMIS_sender_MI_data.copy()
placeholder_total = placeholder_total.sum(axis=1)
placeholder_total = pd.DataFrame(placeholder_total).rename({0: "Total placeholders"}, axis=1)
placeholder_statuses_table = placeholder_total.merge(transfer_statuses, left_index=True, right_on="conversation_id", how="inner")
placeholder_statuses_table["Has placeholder"] = placeholder_statuses_table["Total placeholders"] > 0

In [19]:
placeholder_stats_table = placeholder_statuses_table.groupby(by="status").agg({"Has placeholder":["mean","sum","count"]})["Has placeholder"]
placeholder_stats_table = placeholder_stats_table.rename({"mean":"Percentage of transfers with placeholders", "count":"Total transfers", "sum" : "Total transfers with placeholders"}, axis=1)
placeholder_stats_table["Percentage of transfers with placeholders"] = placeholder_stats_table["Percentage of transfers with placeholders"].multiply(100).round(2)
placeholder_stats_table


Unnamed: 0_level_0,Percentage of transfers with placeholders,Total transfers with placeholders,Total transfers
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FAILED,26.46,5447,20588
INTEGRATED,16.57,158671,957484
INTEGRATED LATE,15.68,8976,57257
PENDING,14.35,3645,25392
PENDING_WITH_ERROR,17.92,512,2857


In [17]:
has_placeholder = placeholder_statuses_table["Has placeholder"]
transfers_with_placeholders = placeholder_statuses_table.loc[has_placeholder]
transfers_with_placeholders.groupby(by="status").agg({"Total placeholders":["mean", "median", "max"]}).rename({"Total placeholders": "Number of placeholder files for transfers with placeholders"}, axis=1)

Unnamed: 0_level_0,Number of placeholder files for transfers with placeholders,Number of placeholder files for transfers with placeholders,Number of placeholder files for transfers with placeholders
Unnamed: 0_level_1,mean,median,max
status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
FAILED,12.814393,3.0,358.0
INTEGRATED,12.522087,5.0,1465.0
INTEGRATED LATE,11.7666,4.0,556.0
PENDING,12.449931,4.0,597.0
PENDING_WITH_ERROR,17.117188,4.0,232.0
