# PRMT-2269 Look at patterns across transfers that contain multiple error codes
Overall questions: Do certain errors often occur together? Are any error code combinations supplier-specific?

**Output:**
Create a table (similar to the existing table of problems) that shows the full transfer outcome in each row, per supplier pathway - i.e. a transfer containing multiple errors is shown in a single row.

Use March - June 2021 data with a 14-day overflow.

In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Bucket prefix and file names of the monthly transfer extracts to load.
transfer_file_location = "s3://prm-gp2gp-data-sandbox-dev/transfers-sample-6/"
transfer_files = [
    "2021-3-transfers.parquet",
    "2021-4-transfers.parquet",
    "2021-5-transfers.parquet",
    "2021-6-transfers.parquet",
]
transfer_input_files = [
    transfer_file_location + file_name for file_name in transfer_files
]

# Read every monthly parquet file and stack the results into one DataFrame.
transfers_raw = pd.concat(map(pd.read_parquet, transfer_input_files))

In [3]:
# Work on a copy so the raw load stays untouched, and turn the snake_case
# status values into title-cased words (underscores become spaces).
transfers = transfers_raw.copy()
transfers["status"] = (
    transfers["status"]
    .str.replace("_", " ")
    .str.title()
)

In [4]:
import paths, data
# Load the GP2GP response code reference file and build a Series that maps
# each "ErrorCode" to its "ErrorName".
error_code_lookup_file = pd.read_csv(
    filepath_or_buffer=data.gp2gp_response_codes.path
)
error_code_lookup = (
    error_code_lookup_file
    .set_index(keys="ErrorCode")
    .loc[:, "ErrorName"]
)

In [5]:
def convert_error_list_to_tuple(error_code_list, error_code_type, lookup=None):
    """Turn a collection of error codes into labelled ``(type, code, name)`` tuples.

    Duplicate codes are collapsed and missing values (``None`` / ``NaN``) are
    skipped. The result is sorted by code so that the same combination of
    codes always yields the same sequence, whatever order the codes arrived
    in; this matters because the result is later used as a grouping key.

    Args:
        error_code_list: Iterable of error codes, or ``None`` for "no codes".
        error_code_type: Label stored as the first element of every tuple.
        lookup: Optional mapping from error code to error name. Defaults to
            the module-level ``error_code_lookup``.

    Returns:
        A list of ``(error_code_type, error_code, error_name)`` tuples.

    Raises:
        KeyError: If a code is not present in the lookup.
    """
    if error_code_list is None:
        return []
    names = error_code_lookup if lookup is None else lookup
    # pd.isna copes with None as well as NaN, unlike np.isnan.
    unique_codes = sorted({code for code in error_code_list if not pd.isna(code)})
    return [(error_code_type, code, names[code]) for code in unique_codes]
    
def convert_error_to_tuple(error_code, error_code_type, lookup=None):
    """Wrap a single error code in a one-element list of ``(type, code, name)``.

    Args:
        error_code: The error code, or a missing value (``None`` / ``NaN``).
        error_code_type: Label stored as the first element of the tuple.
        lookup: Optional mapping from error code to error name. Defaults to
            the module-level ``error_code_lookup``.

    Returns:
        ``[]`` when the code is missing, otherwise a list holding one
        ``(error_code_type, error_code, error_name)`` tuple.

    Raises:
        KeyError: If the code is not present in the lookup.
    """
    # pd.isna copes with None as well as NaN, unlike np.isnan.
    if pd.isna(error_code):
        return []
    names = error_code_lookup if lookup is None else lookup
    return [(error_code_type, error_code, names[error_code])]

def combine_error_codes(row):
    """Collect every error code of one transfer row into a single tuple.

    The sender error code (labelled "Sender"), the intermediate error codes
    (labelled "COPC") and the final error codes (labelled "Final") are
    concatenated in that order. When the row carries no error code at all, a
    one-element placeholder tuple is returned instead.
    """
    collected = [
        *convert_error_to_tuple(row["sender_error_code"], "Sender"),
        *convert_error_list_to_tuple(row["intermediate_error_codes"], "COPC"),
        *convert_error_list_to_tuple(row["final_error_codes"], "Final"),
    ]
    if collected:
        return tuple(collected)
    return (("No Error Code", "No Error", "N/A"),)
    
# Build the combined error code tuple for every transfer (one call per row).
transfers["all_error_codes"] = transfers.apply(
    func=combine_error_codes,
    axis="columns",
)

In [6]:
def binarized_error_codes(table_sample):
    """Append one indicator column per error code found in ``all_error_codes``.

    Each entry of ``table_sample["all_error_codes"]`` is a sequence of tuples
    whose second element is the error code. String codes (such as the
    "No Error" placeholder) are ignored; every remaining distinct code becomes
    a column produced by ``MultiLabelBinarizer``, so rows can be filtered by
    the codes they contain.

    Args:
        table_sample: DataFrame with an ``all_error_codes`` column. It is not
            modified.

    Returns:
        A new DataFrame: ``table_sample`` followed by the indicator columns.
    """
    # Consolidate on the code value alone rather than on the code combined
    # with its error code type. Kept in a local Series so the caller's frame
    # does not gain a temporary column.
    error_code_lists = table_sample["all_error_codes"].apply(
        lambda error_tuples: [
            error_tuple[1]
            for error_tuple in error_tuples
            if not isinstance(error_tuple[1], str)
        ]
    )

    # Split out the error codes so we can use them to filter.
    mlb = MultiLabelBinarizer()
    binarized = mlb.fit_transform(error_code_lists)
    binarized_error_occurrences = pd.DataFrame(
        data=binarized, columns=mlb.classes_, index=table_sample.index
    )

    return pd.concat([table_sample, binarized_error_occurrences], axis=1)
    

In [7]:
def generate_high_level_table(transfers_sample):
    """Summarise transfers by supplier pathway, status, failure reason and error codes.

    Args:
        transfers_sample: DataFrame of transfers with at least the columns
            ``requesting_supplier``, ``sending_supplier``, ``status``,
            ``failure_reason``, ``all_error_codes`` and ``conversation_id``.
            It is not modified.

    Returns:
        A DataFrame with one row per distinct combination of the grouping
        columns, sorted by "Number of Transfers" (descending), holding:

        * "Number of Transfers" - rows in the combination;
        * "% of Transfers" - share of all rows in ``transfers_sample``;
        * "% Supplier Pathway Transfers" - share of the rows with the same
          sending/requesting supplier pair;
        * "% Paper Fallback" - share of the rows that have a failure reason,
          filled only where status is not "Integrated On Time";
        * "% of error codes" - share of the rows that carry any error code,
          filled only for combinations that carry an error code;

        followed by the indicator columns added by ``binarized_error_codes``.
    """
    grouping_columns_order = [
        "requesting_supplier",
        "sending_supplier",
        "status",
        "failure_reason",
        "all_error_codes",
    ]
    counting_columns_order = [
        "Number of Transfers",
        "% of Transfers",
        "% Supplier Pathway Transfers",
        "% Paper Fallback",
        "% of error codes",
    ]

    # Create the high level table. Missing values are replaced first so that
    # rows with a null grouping value still form a group.
    high_level_table = (
        transfers_sample.fillna("N/A")
        .groupby(grouping_columns_order)
        .agg({"conversation_id": "count"})
        .rename({"conversation_id": "Number of Transfers"}, axis=1)
        .reset_index()
    )
    number_of_transfers = high_level_table["Number of Transfers"]

    # % of all transfers in the sample.
    total_number_transfers = transfers_sample.shape[0]
    high_level_table["% of Transfers"] = (
        number_of_transfers / total_number_transfers
    ).multiply(100)

    # % within the supplier pathway. The pathway totals are derived from the
    # table itself so they use the same fill value as the grouping above; a
    # separately filled lookup would raise KeyError for null suppliers.
    supplier_pathway_counts = high_level_table.groupby(
        ["sending_supplier", "requesting_supplier"]
    )["Number of Transfers"].transform("sum")
    high_level_table["% Supplier Pathway Transfers"] = (
        number_of_transfers / supplier_pathway_counts
    ).multiply(100)

    # % of paper fallback: left empty for "Integrated On Time" rows.
    total_fallback = transfers_sample["failure_reason"].dropna().shape[0]
    fallback_bool = high_level_table["status"] != "Integrated On Time"
    high_level_table["% Paper Fallback"] = (
        (number_of_transfers / total_fallback).multiply(100).where(fallback_bool)
    )

    # % of error codes: left empty for the "no error" combinations.
    no_error_tuple = (("No Error Code", "No Error", "N/A"),)
    error_code_bool = transfers_sample["all_error_codes"] != no_error_tuple
    total_number_of_error_code_combinations = error_code_bool.sum()
    table_error_code_bool = high_level_table["all_error_codes"] != no_error_tuple
    high_level_table["% of error codes"] = (
        (number_of_transfers / total_number_of_error_code_combinations)
        .multiply(100)
        .where(table_error_code_bool)
    )

    # Select and re-order the columns, largest combinations first.
    high_level_table = high_level_table[
        grouping_columns_order + counting_columns_order
    ].sort_values(by="Number of Transfers", ascending=False)

    return binarized_error_codes(high_level_table)

In [8]:
# Tag every transfer with the calendar month in which it was requested.
transfers["month"] = transfers["date_requested"].dt.to_period(freq="M")

In [9]:
# Write the summary tables to a single workbook: one sheet covering every
# transfer, then one sheet per requested month.
with pd.ExcelWriter("Error Code Combinations Tables PRMT-2269.xlsx") as writer:
    generate_high_level_table(transfers.copy()).to_excel(
        writer, sheet_name="All", index=False
    )
    # Missing months are skipped: no row can match them, so they would only
    # produce an empty table.
    for month in transfers["month"].dropna().unique():
        monthly_transfers = transfers[transfers["month"] == month].copy()
        generate_high_level_table(monthly_transfers).to_excel(
            writer, sheet_name=str(month), index=False
        )