# [HYPOTHESIS] All error codes can be classified as completely fatal or completely non fatal

## Hypothesis

**We believe** every error code can be classified as either completely fatal or completely non-fatal

**We will know this to be true** when we can attribute every failed transfer to an error code which is not present in any successfully integrated transfer

 

## Approach/Scope

- Take 6 months of data - Sept 2020 to Feb 2021

- Clearly label each transfer as integrated or failed 

  - Correct duplicate transfers 

  - Remove pending transfers

- Merge intermediate and final error codes

- Identify the % failure for each error code and designate any code with a 100% failure rate as fatal

- Ensure that all failures contain one of these fatal error codes

In [104]:
import pandas as pd
import numpy as np

In [105]:
def Series_of_lists_value_counts(Series):
    """Count how often each distinct sequence of items occurs in a Series.

    Parameters
    ----------
    Series : pandas.Series
        Each element is an iterable (list, set, ...) of scalar items. The
        iterables may differ in length and may contain missing items
        (NaN or None).

    Returns
    -------
    pandas.DataFrame
        One row per distinct sequence, most frequent first. Columns
        0..k-1 hold the items by position, where k is the length of the
        longest sequence; positions beyond the end of a shorter sequence
        are NaN, and an item that was itself missing is shown as the
        string 'None'. 'Total Occurences' holds the number of times the
        sequence occurred. When no sequence contains any item, the frame
        has only the 'Total Occurences' column (a single row holding the
        number of sequences, or no rows for an empty Series).
    """
    # Give missing items an explicit label so they are kept apart from the
    # padding added below. pd.isna also accepts None and non-numeric items,
    # which np.isnan rejects with a TypeError.
    rows=[['None' if pd.isna(x) else x for x in row] for row in Series]

    # With no items at all there are no positional columns to group on, so
    # report the single "empty" combination directly.
    if not any(len(row) for row in rows):
        occurrences=[len(rows)] if rows else []
        return pd.DataFrame({'Total Occurences':pd.Series(occurrences,dtype='int64')})

    # Convert this into a dataframe of list items in order
    journey_frame=pd.DataFrame.from_records(rows)
    # To ensure grouping of different list lengths, fill gaps
    journey_frame=journey_frame.fillna('n/a')
    # Store index for grouping
    grouping_index=list(journey_frame.columns)
    # Add column to aggregate on for group
    journey_frame['Total Occurences']=1

    # Now do the actual aggregate
    journey_frame=journey_frame.groupby(grouping_index).agg('count').sort_values(by='Total Occurences',ascending=False)

    # Turn the padding marker back into NaN for the caller
    return journey_frame.reset_index().replace({'n/a':np.nan})

## Take 6 months of data

In [106]:
# Monthly transfer extracts covering Sept 2020 to Feb 2021.
transfer_file_location = "s3://<bucket-name>"
transfer_files = [
    "9-2020-transfers.parquet",
    "10-2020-transfers.parquet",
    "11-2020-transfers.parquet",
    "12-2020-transfers.parquet",
    "1-2021-transfers.parquet",
    "2-2021-transfers.parquet"
]
# Join location and file name with exactly one "/" so the path is valid
# whether or not the configured location ends with a slash.
transfer_input_files = [transfer_file_location.rstrip("/") + "/" + f for f in transfer_files]
# NOTE(review): the monthly frames are stacked with their original index
# labels, so labels may repeat across months - confirm nothing relies on
# a unique index.
transfers = pd.concat((
    pd.read_parquet(f)
    for f in transfer_input_files
))


## Clearly label each transfer as integrated or failed


### Correct duplicate transfers

In [107]:
# A transfer is treated as successful when at least one of its final
# acknowledgement codes is NaN or equal to 15.
successful_transfers_bool = transfers['request_completed_ack_codes'].apply(
    lambda ack_codes: any(np.isnan(code) or code == 15 for code in ack_codes)
)
transfers_without_integrated_status_bool = transfers['status'].ne('INTEGRATED')

# Successful by acknowledgement, yet not recorded as INTEGRATED.
successful_transfers_without_integrated_status = transfers.loc[
    successful_transfers_bool & transfers_without_integrated_status_bool
]
successful_transfers_without_integrated_status['status'].value_counts()

FAILED                4493
PENDING                 19
PENDING_WITH_ERROR       1
Name: status, dtype: int64

In [108]:
# Which distinct sets of final acknowledgement codes do these transfers carry?
ack_code_sets = successful_transfers_without_integrated_status['request_completed_ack_codes'].apply(set)
Series_of_lists_value_counts(ack_code_sets)

Unnamed: 0,0,1,2,Total Occurences
0,,12.0,,4212
1,,11.0,,178
2,,11.0,12.0,28
3,12.0,15.0,,23
4,,,,17
5,,31.0,,17
6,,12.0,31.0,14
7,11.0,15.0,,6
8,,25.0,,5
9,,25.0,12.0,3


In [109]:
# Work on a new frame and overwrite the status of every transfer flagged as
# successful; all other statuses are left as they were.
transfers_with_final_outcome = transfers.assign(
    status=transfers['status'].mask(successful_transfers_bool, 'INTEGRATED')
)


### Remove pending transfers [Removed!]

In [110]:
# Filter left disabled (see the "[Removed!]" heading): transfers whose status is
# neither INTEGRATED nor FAILED stay in the data used by the cells below.
#transfers_with_final_outcome = transfers_with_final_outcome.loc[(transfers_with_final_outcome['status'] == 'INTEGRATED') | (transfers_with_final_outcome['status'] == 'FAILED')]


## Merge intermediate and final error codes

In [111]:
# Concatenate each transfer's intermediate and final codes (intermediate first)
# and keep only the finite values: the non-finite entries are the flag used for
# a successfully integrated transfer, not real error codes.
transfers_with_final_outcome['all_error_codes'] = pd.Series(
    [
        [code for code in (*intermediate_codes, *final_codes) if np.isfinite(code)]
        for intermediate_codes, final_codes in zip(
            transfers_with_final_outcome['intermediate_error_codes'],
            transfers_with_final_outcome['request_completed_ack_codes'],
        )
    ],
    index=transfers_with_final_outcome.index,
)


## Identify % failure for each error code and designate any code with a 100% failure rate as fatal


In [112]:
def error_code_failure_rates(transfers_df):
    """Summarise, per error code, how the transfers carrying it ended up.

    Parameters
    ----------
    transfers_df : pandas.DataFrame
        Must provide the columns 'status', 'all_error_codes' (a list of codes
        per transfer) and 'conversation_id'.

    Returns
    -------
    pandas.DataFrame
        Indexed by error code, with one count column per status present in
        the data, plus 'Volume' (row total) and '% Integrated'
        (100 * INTEGRATED / Volume). Rows are sorted ascending by
        '% Integrated', then 'Volume'. When no row has the status
        'INTEGRATED', '% Integrated' is 0 for every code.

    Notes
    -----
    Counts are per occurrence: a code that appears several times in one
    transfer's list contributes once for each appearance.
    """
    coded_transfers = transfers_df[['status', 'all_error_codes', 'conversation_id']]
    # Transfers with an empty code list have nothing to attribute.
    has_errors = coded_transfers['all_error_codes'].apply(len) > 0
    # One row per (transfer, code) pair.
    exploded = coded_transfers[has_errors].explode('all_error_codes')
    error_code_status_counts = pd.pivot_table(
        exploded,
        index='all_error_codes',
        columns='status',
        values='conversation_id',
        aggfunc='count',
    )
    # A code never seen with a given status has a count of zero, not NaN.
    error_code_status_counts = error_code_status_counts.fillna(0)

    error_code_summary = error_code_status_counts.copy()
    error_code_summary['Volume'] = error_code_summary.sum(axis=1)
    # A subset without any integrated transfer has no 'INTEGRATED' column;
    # treat that as zero integrations instead of failing with a KeyError.
    if 'INTEGRATED' in error_code_status_counts.columns:
        integrated = error_code_status_counts['INTEGRATED']
    else:
        integrated = 0
    error_code_summary['% Integrated'] = 100 * (integrated / error_code_summary['Volume'])

    return error_code_summary.sort_values(by=['% Integrated', 'Volume'])
    

In [113]:
# Per-code outcome breakdown over every transfer, before any codes are excluded.
error_code_failure_rates(transfers_with_final_outcome)

status,FAILED,INTEGRATED,PENDING,PENDING_WITH_ERROR,Volume,% Integrated
all_error_codes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9.0,6.0,0.0,0.0,0.0,6.0,0.0
26.0,71.0,0.0,0.0,0.0,71.0,0.0
21.0,170.0,0.0,0.0,0.0,170.0,0.0
30.0,2422.0,10.0,0.0,289.0,2721.0,0.367512
99.0,14231.0,83.0,0.0,5.0,14319.0,0.579649
17.0,824.0,13.0,0.0,0.0,837.0,1.553166
28.0,907.0,19.0,0.0,0.0,926.0,2.051836
25.0,954.0,58.0,1.0,0.0,1013.0,5.725568
20.0,53.0,10.0,0.0,0.0,63.0,15.873016
31.0,1450.0,443.0,0.0,0.0,1893.0,23.402007


### Allocate fatal error codes and see what happens if we remove them from the data

In [114]:
# Codes designated as fatal for this experiment.
fatal_codes=[9,26,21,30,99]
fatal_code_set=set(fatal_codes)

# True for transfers whose code list shares no value with the fatal codes.
non_fatal_conversations_bool=transfers_with_final_outcome['all_error_codes'].apply(
    lambda codes: fatal_code_set.isdisjoint(codes)
)
# Share of transfers without a fatal code; not displayed because it is not the
# last expression of the cell.
non_fatal_conversations_bool.mean()
transfers_without_fatal_codes=transfers_with_final_outcome.loc[non_fatal_conversations_bool]

In [115]:
# Same breakdown, restricted to transfers containing none of the codes in fatal_codes.
error_code_failure_rates(transfers_without_fatal_codes)

status,FAILED,INTEGRATED,PENDING,PENDING_WITH_ERROR,Volume,% Integrated
all_error_codes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
17.0,822.0,13.0,0.0,0.0,835.0,1.556886
28.0,906.0,19.0,0.0,0.0,925.0,2.054054
25.0,954.0,57.0,1.0,0.0,1012.0,5.632411
20.0,48.0,10.0,0.0,0.0,58.0,17.241379
31.0,1404.0,433.0,0.0,0.0,1837.0,23.57104
29.0,1300.0,488.0,0.0,45.0,1833.0,26.623022
11.0,1256.0,2297.0,0.0,0.0,3553.0,64.649592
12.0,3638.0,23315.0,0.0,6.0,26959.0,86.483178
15.0,0.0,75471.0,0.0,30.0,75501.0,99.960265


### Hmm.. that wasn't that helpful; let's look at the success rate for each 'combination' of error codes

In [116]:
def single_status_error_count(transfers_df,status):
    """Count the error-code combinations of the transfers that have one status.

    Parameters
    ----------
    transfers_df : pandas.DataFrame
        Must provide the columns 'status' and 'set_error_codes'.
    status : str
        Status value to select; also used as the name of the count column.

    Returns
    -------
    pandas.DataFrame
        The result of Series_of_lists_value_counts for the selected rows, with
        'Total Occurences' renamed to `status`, NaN replaced by 0, and the
        positional columns 0, 1, 2 and 3 always present (added as 0 when the
        selected rows did not produce them).
    """
    codes_for_status=transfers_df.loc[transfers_df['status']==status,'set_error_codes']
    combination_counts=(
        Series_of_lists_value_counts(codes_for_status)
        .rename(columns={'Total Occurences':status})
        .fillna(0)
    )

    # Pad to a fixed set of positional columns so frames for different
    # statuses share the same key columns.
    absent_positions=[position for position in range(4) if position not in combination_counts.columns]
    for position in absent_positions:
        combination_counts[position]=0

    return combination_counts

In [117]:
# Convert the list of error codes into the unique set of error codes
transfers_with_error_code_set=transfers_with_final_outcome.copy()
transfers_with_error_code_set['set_error_codes']=transfers_with_error_code_set['all_error_codes'].apply(set)

# For each status, generate a count of which set of error codes occurred
status_error_code_counts=dict()
unique_statuses=transfers_with_error_code_set['status'].value_counts().index
for status in unique_statuses:
    status_error_code_counts[status]=single_status_error_count(transfers_with_error_code_set,status)
    
# Merge this data together into one frame
error_combination_counts=status_error_code_counts['INTEGRATED'].merge(status_error_code_counts['FAILED'],left_on=[0,1,2,3],right_on=[0,1,2,3],how='outer').fillna(0)
error_combination_counts=error_combination_counts.merge(status_error_code_counts['PENDING'],left_on=[0,1,2,3],right_on=[0,1,2,3],how='outer').fillna(0)
error_combination_counts=error_combination_counts.merge(status_error_code_counts['PENDING_WITH_ERROR'],left_on=[0,1,2,3],right_on=[0,1,2,3],how='outer').fillna(0)

# Calculate the success rate
error_combination_counts['Volume']=error_combination_counts[unique_statuses].sum(axis=1)
error_combination_counts['% Integrated']=100*(error_combination_counts['INTEGRATED']/error_combination_counts['Volume'])
error_combination_counts.sort_values(by='Volume',ascending=False).set_index(list(range(4)))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,INTEGRATED,FAILED,PENDING,PENDING_WITH_ERROR,Volume,% Integrated
0,1,2,3,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,0.0,0.0,0.0,1160415.0,0.0,39086.0,26496.0,1225997.0,94.650721
15.0,0.0,0.0,0.0,73803.0,0.0,0.0,29.0,73832.0,99.960722
12.0,0.0,0.0,0.0,17228.0,1978.0,0.0,5.0,19211.0,89.677789
99.0,0.0,0.0,0.0,55.0,13501.0,0.0,5.0,13561.0,0.405575
30.0,0.0,0.0,0.0,0.0,2157.0,0.0,4.0,2161.0,0.0
11.0,0.0,0.0,0.0,1068.0,691.0,0.0,0.0,1759.0,60.716316
29.0,31.0,0.0,0.0,289.0,983.0,0.0,0.0,1272.0,22.720126
12.0,15.0,0.0,0.0,1005.0,0.0,0.0,1.0,1006.0,99.900596
25.0,0.0,0.0,0.0,41.0,844.0,1.0,0.0,886.0,4.62754
28.0,0.0,0.0,0.0,12.0,794.0,0.0,0.0,806.0,1.488834
