In [1]:
# Import modules
import sys

In [2]:
# Import custom modules
# Add the parent directory to the import path; the `src` package imported
# below is presumably located there (TODO confirm the notebook's location).
sys.path.append('../')

# NOTE(review): this star import is presumably what brings `pd`, `argparse`
# and `read_params` into scope -- later cells use those names although the
# notebook never imports them explicitly. Confirm against the module itself.
from src.models.upload_data import *

In [3]:
# Set notebook attributes
# Display settings applied to every DataFrame rendered in this notebook:
# wide text cells, no cap on the number of columns shown, up to 300 rows,
# and floats formatted with two decimals.
display_options = {
    'display.max_colwidth': 500,
    'display.max_columns': None,
    'display.max_rows': 300,
    'display.float_format': lambda value: '%.2f' % value,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [4]:
# Set magic attributes
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and enable mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Configuration
# The parser exists only to resolve the location of the params file. It is
# given an explicit empty argument list, so no command-line arguments are
# parsed and `--config` always falls back to its default.
args = argparse.ArgumentParser()
args.add_argument("--config", default="../params.yaml")
parsed_args = vars(args.parse_args([]))
config_path = parsed_args["config"]

In [6]:
# Load the project parameters and pull out the locations used by this notebook.
config = read_params(config_path)
project_dir = config["project_dir"]

# Sections of the params file that hold the data set paths.
processed_data_config = config["processed_data_config"]
interim_data_config = config["interim_data_config"]

# Paths of the individual parquet files; they are appended to `project_dir`
# when the files are read in the cells below.
merged_data_path = processed_data_config["merged_data_parquet"]
sr_rein_last_limits_clean_data_path = processed_data_config["sr_rein_last_limits_clean_data_parquet"]
sr_rein_cohort_data_path = interim_data_config["sr_rein_cohort_data_parquet"]
ftd_clean_data_path = processed_data_config["ftd_clean_data_parquet"]

### Merged Dataset

In [7]:
%%time
# Merge data sets
df_merge = pd.read_parquet(project_dir + merged_data_path)
df_merge.sample(2)

CPU times: total: 297 ms
Wall time: 214 ms


Unnamed: 0,previous_is_iprs_validated,store_number,most_recent_trx_date_past_30_days,last_trx_date,actual_trx_days,approx_30_days_trx_val,expected_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,is_iprs_validated,mobile_number,idm_recommendation,idm_limit,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,end_rollover_date_fixed,expected_dpd90,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,max_days_past_due,loan_count_past_3_months,count_7_day_loans,avg_loan_tenure,count_7_day_loans_paid_upto_rollover,good_loans_repayment_ratio(7_day_loans),minimum_7_day_principal_disbursed,total_sum_7_day_principal_disbursed,total_sum_1_day_principal_disbursed,avg_7_day_principal_disbursed,days_past_end_rollover,any_bloom2_1day,total_outstanding_sum,safaricom_loan_balance_sum,loan_balance,total_final_21_limit,total_final_1_limit,total_final_7_limit,previous_21_limit,previous_7_limit,previous_1_limit,model_630_21_limit,model_630_7_limit,model_630_1_limit,repayments_by_dd_vs_principal_mean,due_date_rm_ge_rm_add_back_old,due_date_rm_ge_rm_1d_old,repayments_by_erd_vs_principal_mean,rllvr_date_rm_ge_rm_add_back_old,never_borrowed_flag_old,repayments_by_dd_vs_total_expected_repayment_by_dd_mean,due_date_rm_ge_rm_add_back_new,repayments_by_erd_vs_total_expected_repayment_by_erd_mean,rllvr_date_rm_ge_rm_add_back_new,rllvr_date_rm_ge_rm_1d_new,never_borrowed_flag_new,due_date_rm_ge_rm_add_back,rllvr_date_rm_ge_rm_add_back,due_date_rm_ge_rm_1d,never_borrowed_flag,update_flag,reinstatement_reason,rein_7_limit
110991,,7244904,NaT,NaT,0,10.0,0,0.0,No_rules_relaxed,,0,0.0,34909853,True,254718769448,Approve,0.0,,0,,,,,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,NaT,,,,,,,,,,,,,,,,,,,,0,0.0,0.0,0,0,0,,,,,,,,,1,,,,,,1,0,0,0,1,0.0,,0.0
28219,,889144,2022-11-15,2022-12-14,29,12636.67,30,0.97,relax_rules,105.0,No,0.0,13626101,True,254722861142,Approve,4000.0,254722861142.0,22,600.0,21.0,5000.0,5000.0,2022-08-25,2022-09-15,2022-09-15,2022-09-15,2022-09-20,2022-12-19,-1.0,2.0,closed_early_repayment,460.0,20000.0,2019-06-14,22.0,1.0,216.0,1.0,2.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,-6.0,False,0.0,4474.0,4474.0,182500,82100.0,76400.0,0,0,0,5500.0,3000.0,3000.0,1.08,1.0,1.0,1.0,1.0,0,1.0,1.0,1.0,1.0,0.0,0,1,1,0,0,0.0,,0.0


In [8]:
# Schema of the merged data set: column names, non-null counts and dtypes
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118921 entries, 0 to 118920
Data columns (total 82 columns):
 #   Column                                                     Non-Null Count   Dtype         
---  ------                                                     --------------   -----         
 0   previous_is_iprs_validated                                 0 non-null       object        
 1   store_number                                               118921 non-null  object        
 2   most_recent_trx_date_past_30_days                          102191 non-null  datetime64[ns]
 3   last_trx_date                                              102191 non-null  datetime64[ns]
 4   actual_trx_days                                            118921 non-null  int32         
 5   approx_30_days_trx_val                                     118921 non-null  float64       
 6   expected_trx_days                                          118921 non-null  int32         
 7   page_active_days    

### Reinstatement Dataset

In [9]:
# Reinstatement data set
# Load the cohort and cast both identifier columns to strings in one step.
# The columns are named explicitly in the mapping instead of being assigned
# through attribute access, so a missing column fails loudly with a KeyError.
rein_cohort_details = pd.read_parquet(project_dir + sr_rein_cohort_data_path).astype(
    {'loan_mifos_id': 'str', 'store_number': 'str'}
)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
rein_cohort_details.sample(2, random_state=42)

Unnamed: 0,id,client_id,loan_status_id,term_frequency,partner,product,disbursedon_date,expected_maturedon_date,DPD,principal_disbursed_derived,total_repayment_derived,total_expected_repayment_derived,principal_repaid_derived,interest_charged_derived,interest_repaid_derived,fee_charges_charged_derived,fee_charges_repaid_derived,penalty_charges_charged_derived,penalty_charges_repaid_derived,total_outstanding_derived,loan_count,repayments_ratio_by_due_date,traction,rollover_period,dpd_from_erd,loan_mifos_id,store_number,is_iprs_validated,mobile_number
118,41720,36584,300,21,Safaricom Bloom 2.0,Bloom 21-day,13/03/2022 00:00,03/04/2022 00:00,355,84800,24952.8,125180.22,0.0,6444.81,0.0,4561.81,0.0,29373.6,24952.8,100227.42,1,,0.29,5,350,41720,578417,True,254723000000.0
5,4324,17340,300,21,Safaricom Bloom 2.0,Bloom 21-day,19/11/2021 00:00,10/12/2021 00:00,469,44200,14320.0,68704.81,0.0,3842.49,0.0,2142.27,0.0,18520.05,14320.0,54384.81,1,,0.32,5,464,4324,828269,True,254723000000.0


In [10]:
# Schema of the reinstatement cohort: column names, non-null counts and dtypes
rein_cohort_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396 entries, 0 to 395
Data columns (total 29 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                396 non-null    int64  
 1   client_id                         396 non-null    int64  
 2   loan_status_id                    396 non-null    int64  
 3   term_frequency                    396 non-null    int64  
 4   partner                           396 non-null    object 
 5   product                           396 non-null    object 
 6   disbursedon_date                  396 non-null    object 
 7   expected_maturedon_date           396 non-null    object 
 8   DPD                               396 non-null    int64  
 9   principal_disbursed_derived       396 non-null    int64  
 10  total_repayment_derived           396 non-null    float64
 11  total_expected_repayment_derived  396 non-null    float64
 12  principa

### First-Time Defaulters (FTD)

In [11]:
# FTD data set
# Load the cleaned FTD data from parquet.
df_ftd_clean = pd.read_parquet(project_dir + ftd_clean_data_path)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_ftd_clean.sample(2, random_state=42)

Unnamed: 0,store_number,update_flag,reinstatement_reason
18,7783143,1,first time defaulters during election risk mitigation measures
29,7309656,1,first time defaulters during election risk mitigation measures


In [12]:
# Schema of the FTD data set: column names, non-null counts and dtypes
df_ftd_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   store_number          39 non-null     object
 1   update_flag           39 non-null     int64 
 2   reinstatement_reason  39 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.0+ KB


### Stores in Both Reinstatement and Till Activity

In [13]:
# Sample
# Keep the rows of the merged data set that have a strictly positive
# `rein_7_limit`; rows where the limit is missing are excluded explicitly.
rein_7_limit = df_merge["rein_7_limit"]
df_till = df_merge[rein_7_limit.notna() & (rein_7_limit > 0)]
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_till.sample(2, random_state=42)

Unnamed: 0,previous_is_iprs_validated,store_number,most_recent_trx_date_past_30_days,last_trx_date,actual_trx_days,approx_30_days_trx_val,expected_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,is_iprs_validated,mobile_number,idm_recommendation,idm_limit,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,end_rollover_date_fixed,expected_dpd90,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,max_days_past_due,loan_count_past_3_months,count_7_day_loans,avg_loan_tenure,count_7_day_loans_paid_upto_rollover,good_loans_repayment_ratio(7_day_loans),minimum_7_day_principal_disbursed,total_sum_7_day_principal_disbursed,total_sum_1_day_principal_disbursed,avg_7_day_principal_disbursed,days_past_end_rollover,any_bloom2_1day,total_outstanding_sum,safaricom_loan_balance_sum,loan_balance,total_final_21_limit,total_final_1_limit,total_final_7_limit,previous_21_limit,previous_7_limit,previous_1_limit,model_630_21_limit,model_630_7_limit,model_630_1_limit,repayments_by_dd_vs_principal_mean,due_date_rm_ge_rm_add_back_old,due_date_rm_ge_rm_1d_old,repayments_by_erd_vs_principal_mean,rllvr_date_rm_ge_rm_add_back_old,never_borrowed_flag_old,repayments_by_dd_vs_total_expected_repayment_by_dd_mean,due_date_rm_ge_rm_add_back_new,repayments_by_erd_vs_total_expected_repayment_by_erd_mean,rllvr_date_rm_ge_rm_add_back_new,rllvr_date_rm_ge_rm_1d_new,never_borrowed_flag_new,due_date_rm_ge_rm_add_back,rllvr_date_rm_ge_rm_add_back,due_date_rm_ge_rm_1d,never_borrowed_flag,update_flag,reinstatement_reason,rein_7_limit
24782,,204741,2022-11-15,2022-12-14,29,58832.72,30,0.97,No_rules_relaxed,105.0,No,0.0,10893055,True,254722554132,Approve,4500.0,254722554132,118,300.0,21.0,116500.0,0.0,2022-07-24,2022-08-14,NaT,2022-08-14,2022-08-19,2022-11-17,227.0,2.0,active_default,459.0,116500.0,2022-07-24,110.0,0.93,248.0,0.0,227.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,222.0,False,149644.21,231812.25,231812.25,1774200,678500.0,719300.0,0,0,0,116500.0,63600.0,63600.0,0.97,0.0,0.0,0.95,0.0,0,0.95,0.0,0.97,0.0,0.0,0,0,0,0,0,0.0,,63600.0
18303,,7154750,2022-11-15,2022-12-14,29,144724.31,30,0.97,No_rules_relaxed,105.0,No,0.0,25444046,True,254711446956,Reject,0.0,254711446956,14,300.0,21.0,71800.0,0.0,2022-05-30,2022-06-20,NaT,2022-06-20,2022-06-25,2022-09-23,282.0,2.0,active_default,438.0,71800.0,2022-05-30,13.0,0.93,303.0,0.0,282.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,277.0,False,88149.09,139765.87,139765.87,665200,464300.0,476300.0,0,0,0,35800.0,19700.0,19700.0,0.95,0.0,0.0,0.7,0.0,0,0.93,0.0,0.93,0.0,0.0,0,0,0,0,0,0.0,,39900.0


In [14]:
# Count: (rows, columns) of the rows kept in df_till
df_till.shape

(300, 82)

### Stores in Both Till Activity and FTD

In [15]:
# Sample
# First two rows of df_till whose store number also appears in the FTD data set.
ftd_store_numbers = df_ftd_clean["store_number"]
df_till.loc[df_till["store_number"].isin(ftd_store_numbers)].head(2)

Unnamed: 0,previous_is_iprs_validated,store_number,most_recent_trx_date_past_30_days,last_trx_date,actual_trx_days,approx_30_days_trx_val,expected_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,is_iprs_validated,mobile_number,idm_recommendation,idm_limit,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,end_rollover_date_fixed,expected_dpd90,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,max_days_past_due,loan_count_past_3_months,count_7_day_loans,avg_loan_tenure,count_7_day_loans_paid_upto_rollover,good_loans_repayment_ratio(7_day_loans),minimum_7_day_principal_disbursed,total_sum_7_day_principal_disbursed,total_sum_1_day_principal_disbursed,avg_7_day_principal_disbursed,days_past_end_rollover,any_bloom2_1day,total_outstanding_sum,safaricom_loan_balance_sum,loan_balance,total_final_21_limit,total_final_1_limit,total_final_7_limit,previous_21_limit,previous_7_limit,previous_1_limit,model_630_21_limit,model_630_7_limit,model_630_1_limit,repayments_by_dd_vs_principal_mean,due_date_rm_ge_rm_add_back_old,due_date_rm_ge_rm_1d_old,repayments_by_erd_vs_principal_mean,rllvr_date_rm_ge_rm_add_back_old,never_borrowed_flag_old,repayments_by_dd_vs_total_expected_repayment_by_dd_mean,due_date_rm_ge_rm_add_back_new,repayments_by_erd_vs_total_expected_repayment_by_erd_mean,rllvr_date_rm_ge_rm_add_back_new,rllvr_date_rm_ge_rm_1d_new,never_borrowed_flag_new,due_date_rm_ge_rm_add_back,rllvr_date_rm_ge_rm_add_back,due_date_rm_ge_rm_1d,never_borrowed_flag,update_flag,reinstatement_reason,rein_7_limit
25790,,945539,2022-11-15,2022-12-14,29,859270.6,30,0.97,No_rules_relaxed,105.0,No,0.0,22751266,True,254728410933,Approve,0.0,254728410933,17,300.0,21.0,200000.0,142925.85,2022-07-31,2022-08-21,NaT,2022-08-21,2022-08-26,2022-11-24,220.0,2.0,active_default,486.0,200000.0,2022-07-31,15.0,0.88,241.0,0.0,220.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,215.0,False,57074.15,70068.37,70068.37,2400000,2110400.0,2161500.0,200000,154700,154700,200000.0,154700.0,154700.0,1.01,0.0,1.0,0.75,0.0,0,0.94,0.0,0.94,0.0,0.0,0,0,0,0,0,0.0,,123500.0


In [16]:
# Count
# (rows, columns) of the df_till rows whose store number also appears in the
# FTD data set.
is_ftd_store = df_till["store_number"].isin(df_ftd_clean["store_number"])
df_till.loc[is_ftd_store].shape

(1, 82)