# [HYPOTHESIS] All error codes can be classified as completely fatal or completely non fatal

## Hypothesis

**We believe** all error codes can be classified as completely fatal or completely non fatal

**We will know this to be true** when we can attribute every failed transfer to an error code which is not present in any successfully integrated transfer

 

## Approach/Scope

- Take 6 months of data - Sept 2020 to Feb 2021

- Clearly label each transfer as integrated or failed 

  - Correct duplicate transfers 

  - Remove pending transfers [REMOVED!]

- Merge intermediate and final error codes

- Identify % failure for each error code and designate anything with 100% error code as fatal 

- Ensure that all failures contain one of these fatal error codes

In [1]:
import pandas as pd
import numpy as np

In [2]:
def Series_of_lists_value_counts(Series):
    # Replace any nan values in list
    Series=Series.apply(lambda row: ['None' if np.isnan(x) else x for x in row])
    # Convert this into a dataframe of list items in order
    journey_frame=pd.DataFrame.from_records(Series.tolist())
    # To ensure grouping of different list lengths, fill gaps
    journey_frame=journey_frame.fillna('n/a')
    # Store index for grouping
    grouping_index=list(journey_frame.columns)
    # Add column to aggreate on for group
    journey_frame['Total Occurences']=1

    # Now do the actual aggregate
    journey_frame=journey_frame.groupby(grouping_index).agg('count').sort_values(by='Total Occurences',ascending=False)
    
    return journey_frame.reset_index().replace({'n/a':np.nan})

## Take 6 months of data

In [3]:
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-duplicates-hypothesis/"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet"
]
transfer_input_files = [transfer_file_location + f for f in transfer_files]
transfers = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))


## Clearly label each transfer as integrated or failed


### Correct duplicate transfers

In [4]:
successful_transfers_bool = transfers['request_completed_ack_codes'].apply(lambda x: True in [(np.isnan(i) or i==15) for i in x])
transfers_without_integrated_status_bool = transfers['status'] != 'INTEGRATED'
successful_transfers_without_integrated_status = transfers[(successful_transfers_bool & transfers_without_integrated_status_bool)]
successful_transfers_without_integrated_status['status'].value_counts()

FAILED                4493
PENDING                 19
PENDING_WITH_ERROR       1
Name: status, dtype: int64

In [5]:
Series_of_lists_value_counts(successful_transfers_without_integrated_status['request_completed_ack_codes'].apply(set))

Unnamed: 0,0,1,2,Total Occurences
0,,12.0,,4212
1,,11.0,,178
2,,11.0,12.0,28
3,12.0,15.0,,23
4,,,,17
5,,31.0,,17
6,,12.0,31.0,14
7,11.0,15.0,,6
8,,25.0,,5
9,,25.0,12.0,3


In [6]:
transfers_with_final_outcome = transfers.copy()
transfers_with_final_outcome.loc[successful_transfers_bool, 'status'] = 'INTEGRATED'
transfers=transfers_with_final_outcome.copy()

### Remove pending transfers [Removed!]

In [7]:
#transfers_with_final_outcome = transfers_with_final_outcome.loc[(transfers_with_final_outcome['status'] == 'INTEGRATED') | (transfers_with_final_outcome['status'] == 'FAILED')]


## Merge sender, intermediate and final error codes

In [8]:
transfers['all_error_codes'] = transfers.apply(lambda x: [*[x['sender_error_code']],*x['intermediate_error_codes'], *x['request_completed_ack_codes']],axis=1)

# Remove the "None" tag that we used as a flag for a successfully integrated transfer
transfers['all_error_codes'] = transfers['all_error_codes'].apply(lambda x: [i for i in x if np.isfinite(i)])


## Identify % failure for each error code and designate anything with 100% error code as fatal 


In [9]:
def error_code_failure_rates(transfers_df):
    reduced_transfers = transfers_df[['status', 'all_error_codes','conversation_id']]
    has_errors = reduced_transfers["all_error_codes"].apply(len) > 0
    transfers_exploded = reduced_transfers[has_errors].explode("all_error_codes")
    error_code_status_counts=pd.pivot_table(transfers_exploded,index='all_error_codes',columns='status',values='conversation_id',aggfunc='count')
    error_code_status_counts=error_code_status_counts.fillna(0)
    error_code_summary=error_code_status_counts.copy()
    error_code_summary['Volume']=error_code_summary.sum(axis=1)
    error_code_summary['% Integrated']=100*(error_code_summary['INTEGRATED']/error_code_summary['Volume'])
    
    return error_code_summary.sort_values(by=['% Integrated','Volume'])
    

In [10]:
error_code_table=error_code_failure_rates(transfers)
error_code_table

status,FAILED,INTEGRATED,PENDING,PENDING_WITH_ERROR,Volume,% Integrated
all_error_codes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
205.0,0.0,0.0,0.0,1.0,1.0,0.0
9.0,6.0,0.0,0.0,0.0,6.0,0.0
26.0,71.0,0.0,0.0,0.0,71.0,0.0
24.0,1.0,0.0,0.0,134.0,135.0,0.0
21.0,170.0,0.0,0.0,2.0,172.0,0.0
7.0,0.0,0.0,0.0,727.0,727.0,0.0
6.0,0.0,0.0,0.0,898.0,898.0,0.0
10.0,0.0,0.0,0.0,3520.0,3520.0,0.0
14.0,0.0,3.0,0.0,10194.0,10197.0,0.02942
30.0,2422.0,10.0,0.0,7682.0,10114.0,0.098873


### Allocate fatal error codes and see what happens if we remove them from the data

In [11]:
def fatal_error_implementation_effect(transfers,integration_pc_threshold):
    fatal_codes=list(error_code_table.loc[(error_code_table['% Integrated']<=integration_pc_threshold)].index)

    # Find which conversations contain one of these fatal codes and see what number of integrations would contain these
    transfers_fatal_error_flagged=transfers.copy()
    transfers_fatal_error_flagged['Contains Fatal Error']='Yes'
    non_fatal_conversations_bool=transfers_fatal_error_flagged['all_error_codes'].apply(lambda x: len(set(fatal_codes).intersection(set(x)))==0)
    transfers_fatal_error_flagged.loc[non_fatal_conversations_bool,'Contains Fatal Error']='No'

    print("Fatal Error Code Threshold (Max % Integration rate Allowed: " +str(integration_pc_threshold) +"%")
    print("Fatal Error Codes:")
    print(fatal_codes)
    return pd.pivot_table(transfers_fatal_error_flagged,index='status',columns='Contains Fatal Error',aggfunc='count',values='conversation_id').fillna(0).astype(int)

In [12]:
fatal_error_implementation_effect(transfers,0)

Fatal Error Code Threshold (Max % Integration rate Allowed: 0%
Fatal Error Codes:
[205.0, 9.0, 26.0, 24.0, 21.0, 7.0, 6.0, 10.0]


Contains Fatal Error,No,Yes
status,Unnamed: 1_level_1,Unnamed: 2_level_1
FAILED,22523,248
INTEGRATED,1254802,0
PENDING,39087,0
PENDING_WITH_ERROR,21292,5282


In [13]:
fatal_error_implementation_effect(transfers,0.1)

Fatal Error Code Threshold (Max % Integration rate Allowed: 0.1%
Fatal Error Codes:
[205.0, 9.0, 26.0, 24.0, 21.0, 7.0, 6.0, 10.0, 14.0, 30.0]


Contains Fatal Error,No,Yes
status,Unnamed: 1_level_1,Unnamed: 2_level_1
FAILED,20303,2468
INTEGRATED,1254789,13
PENDING,39087,0
PENDING_WITH_ERROR,3701,22873


In [14]:
fatal_error_implementation_effect(transfers,1)

Fatal Error Code Threshold (Max % Integration rate Allowed: 1%
Fatal Error Codes:
[205.0, 9.0, 26.0, 24.0, 21.0, 7.0, 6.0, 10.0, 14.0, 30.0, 99.0, 23.0]


Contains Fatal Error,No,Yes
status,Unnamed: 1_level_1,Unnamed: 2_level_1
FAILED,6750,16021
INTEGRATED,1254713,89
PENDING,39087,0
PENDING_WITH_ERROR,3187,23387


### Hmm.. interesting; let's also look at the success rate for each 'combination' of error codes

In [15]:
def single_status_error_count(transfers_df,status):
    status_bool=transfers_df['status']==status
    status_count=Series_of_lists_value_counts(transfers_df.loc[status_bool,'set_error_codes'])
    status_count=status_count.rename({'Total Occurences':status},axis=1).fillna(0)
    
    for i in range(4):
        if i not in status_count.columns:
            status_count[i]=0

    return status_count

In [16]:
# Convert the list of error codes into the unique set of error codes
transfers_with_error_code_set=transfers.copy()
transfers_with_error_code_set['set_error_codes']=transfers_with_error_code_set['all_error_codes'].apply(set)

# For each status, generate a count of which set of error codes occurred
status_error_code_counts=dict()
unique_statuses=transfers_with_error_code_set['status'].value_counts().index
for status in unique_statuses:
    status_error_code_counts[status]=single_status_error_count(transfers_with_error_code_set,status)
    
# Merge this data together into one frame
error_combination_counts=status_error_code_counts['INTEGRATED'].merge(status_error_code_counts['FAILED'],left_on=[0,1,2,3],right_on=[0,1,2,3],how='outer').fillna(0)
error_combination_counts=error_combination_counts.merge(status_error_code_counts['PENDING'],left_on=[0,1,2,3],right_on=[0,1,2,3],how='outer').fillna(0)
error_combination_counts=error_combination_counts.merge(status_error_code_counts['PENDING_WITH_ERROR'],left_on=[0,1,2,3],right_on=[0,1,2,3],how='outer').fillna(0)

# Calculate the success rate
error_combination_counts['Volume']=error_combination_counts[unique_statuses].sum(axis=1)
error_combination_counts['% Integrated']=100*(error_combination_counts['INTEGRATED']/error_combination_counts['Volume'])
error_combination_counts.sort_values(by='Volume',ascending=False).set_index(list(range(4)))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,INTEGRATED,FAILED,PENDING,PENDING_WITH_ERROR,Volume,% Integrated
0,1,2,3,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.0,0.0,0.0,0.0,1159469.0,0.0,39086.0,0.0,1198555.0,96.738906
15.0,0.0,0.0,0.0,0.0,73742.0,0.0,0.0,29.0,73771.0,99.960689
12.0,0.0,0.0,0.0,0.0,17184.0,1976.0,0.0,5.0,19165.0,89.663449
99.0,0.0,0.0,0.0,0.0,55.0,13470.0,0.0,95.0,13620.0,0.403818
14.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,10194.0,10196.0,0.019616
...,...,...,...,...,...,...,...,...,...,...
19.0,12.0,29.0,31.0,0.0,1.0,0.0,0.0,0.0,1.0,100.000000
25.0,12.0,29.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,100.000000
11.0,20.0,29.0,31.0,0.0,1.0,0.0,0.0,0.0,1.0,100.000000
99.0,11.0,19.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,100.000000
