# Impact of cut off points on transfer outcomes

We need to decide when we should consistently cut off the data to show a full picture of a month. The purpose of this analysis is to assess the impact of different cut-off points. For example, if we cut off the data two weeks after the month end, we might have 1% of transfers that are shown as pending but will ultimately be successful, compared with cutting off three weeks after the month end, where we might have 0.5% of transfers that are pending but ultimately successful.

In [None]:
import paths
from datetime import datetime, date
from dateutil.tz import tzutc
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
from gp2gp.date.range import DateTimeRange
from gp2gp.pipeline.dashboard.main import read_spine_csv_gz_files
from gp2gp.service.transformers import derive_transfers
from scripts.gp2gp_spine_outcomes import parse_conversations

In [None]:
# Gzipped monthly CSV extracts, addressed relative to this notebook's directory.
# NOTE(review): only the July file is read by the cells visible here; the other
# three are presumably used for later cut-off comparisons - confirm.
july_data_file_name = "../data/months/July-2020.csv.gz"
august_data_file_name = "../data/months/Aug-2020.csv.gz"
september_data_file_name = "../data/months/Sept-2020.csv.gz"
october_data_file_name = "../data/months/Oct-2020.csv.gz"

In [None]:
# Time window used to select conversations: from midnight UTC on 1 July 2020
# to midnight UTC on 1 August 2020.
# NOTE(review): whether the end bound is inclusive or exclusive is decided by
# DateTimeRange - confirm against its implementation.
july_time_range = DateTimeRange(
    datetime(2020, 7, 1, tzinfo=tzutc()),
    datetime(2020, 8, 1, tzinfo=tzutc()),
)

In [None]:
  # Read the July extract, parse its messages into conversations restricted to
  # july_time_range, then derive transfers from those conversations.
  # NOTE(review): this cell is indented by two spaces. IPython strips a cell's
  # leading indentation, but the same lines would raise an IndentationError in
  # a plain .py script - consider dedenting.
  spine_messages = read_spine_csv_gz_files([july_data_file_name])
  conversations = parse_conversations(spine_messages, time_range=july_time_range)
  transfers = derive_transfers(conversations)

In [None]:
# Tabulate the derived transfers, keeping only the columns this analysis uses.
transfers_df = pd.DataFrame(transfers)[
    [
        "final_error_code",
        "intermediate_error_codes",
        "status",
        "date_completed",
    ]
]
# Bare expression so the notebook renders the table as the cell output.
transfers_df

## Investigation into the final outcome of transfers with intermediate errors

In [None]:
# Denominator for the headline percentages: every transfer in the frame.
total_number_transfers = transfers_df.shape[0]
# Number of intermediate error codes recorded against each transfer.
transfer_error_code_count = transfers_df["intermediate_error_codes"].map(len)
# Keep only the transfers that recorded at least one intermediate error.
transfers_with_intermediate_errors = transfers_df.loc[transfer_error_code_count > 0]

In [None]:
# Re-wrap the filtered rows as their own frame with the four analysis columns.
transfers_with_intermediate_errors_df = pd.DataFrame(transfers_with_intermediate_errors)[
    [
        "final_error_code",
        "intermediate_error_codes",
        "status",
        "date_completed",
    ]
]
# Bare expression so the notebook renders the table as the cell output.
transfers_with_intermediate_errors_df

In [None]:
# Report the overall transfer count that the later percentages are measured against.
print(
    f"Total number of transfers: {total_number_transfers}"
)

In [None]:
def calculate_percentage(subset, total):
    """Return ``subset`` as a percentage of ``total``, rounded to 2 decimal places.

    A ``total`` of zero is not handled here: with plain Python numbers the
    division raises ``ZeroDivisionError``.
    """
    fraction = subset / total
    return round(fraction * 100, 2)

In [None]:
# Row count of the intermediate-error subset; denominator for the breakdowns that follow.
total_number_transfers_with_intermediate_error_codes = transfers_with_intermediate_errors_df.shape[0]

# NOTE(review): the message mentions a cut off "up until October", but the cells
# visible here load only the July file - confirm which extract this was run on.
print(f"Total number of transfers with intermediate errors: {total_number_transfers_with_intermediate_error_codes}, out of a total transfer count of {total_number_transfers} ({calculate_percentage(total_number_transfers_with_intermediate_error_codes, total_number_transfers)}%) for transfers that began in the month of July (and cut off point being up until October)")

In [None]:
# A transfer is counted as eventually integrated when its date_completed is populated.
eventually_integrated_count = int(
    transfers_with_intermediate_errors_df["date_completed"].notna().sum()
)
percentage_eventually_integrated = calculate_percentage(
    eventually_integrated_count,
    total_number_transfers_with_intermediate_error_codes,
)

print(f"Transfers with intermediate errors that eventually integrate: {eventually_integrated_count} out of {total_number_transfers_with_intermediate_error_codes} ({percentage_eventually_integrated}%)")

In [None]:
# A transfer is counted as eventually failed when it has a final error code.
# Series.count() gives the number of non-null entries as an integer. Reading it
# from describe()["count"] computed a whole summary just for this one figure
# and, for a numeric column, produced a float that printed as e.g. "12.0".
eventually_fails_count = int(transfers_with_intermediate_errors_df["final_error_code"].count())
percentage_eventually_failed = calculate_percentage(eventually_fails_count, total_number_transfers_with_intermediate_error_codes)

print(f"Transfers with intermediate error codes that eventually failed: {eventually_fails_count} out of {total_number_transfers_with_intermediate_error_codes} ({percentage_eventually_failed}%)")

In [None]:
# Whatever is neither failed nor integrated is treated as still pending.
unresolved_count = total_number_transfers_with_intermediate_error_codes - (eventually_fails_count + eventually_integrated_count)
percentage_unresolved = calculate_percentage(unresolved_count, total_number_transfers_with_intermediate_error_codes)

# The percent sign belongs inside the parentheses, matching the "integrated"
# and "failed" summary lines; it was previously printed as "(x)%".
print(f"Transfers with intermediate error codes that are still pending: {unresolved_count} out of {total_number_transfers_with_intermediate_error_codes} ({percentage_unresolved}%)")

# Transfers completed over time

In [None]:
# Day offsets are measured from the first day of the month, not from each
# transfer's own start, so "days_to_complete" means "days after 1 July 2020".
start_date = date(2020, 7, 1)
# Transfers without a completion date map to None and so carry no day offset.
transfers_date_series = transfers_df["date_completed"].apply(
    lambda dt: None if pd.isnull(dt) else (dt.date() - start_date).days
)
transfers_date_df = transfers_date_series.to_frame(name="days_to_complete")
# One row per distinct day offset with the number of transfers completed on it;
# groupby drops missing keys by default, so uncompleted transfers are excluded.
days_to_complete_df = (
    transfers_date_df.groupby("days_to_complete")
    .size()
    .reset_index(name='counts')
)

In [None]:
# Running share of all transfers completed by each day offset, as a percentage.
# The denominator is every transfer, including those never completed, so the
# final value may stay below 100.
days_to_complete_df["percent_complete"] = (
    days_to_complete_df["counts"].cumsum().div(total_number_transfers).mul(100)
)
# Bare expression so the notebook renders the first rows as the cell output.
days_to_complete_df.head()