In [1]:
# Import modules
import sys

In [2]:
# Import custom modules
# Put the parent directory on the import path so the `src` package resolves
# when the notebook is run from its own sub-directory.
sys.path.append('../')

# NOTE(review): later cells use `pd`, `argparse` and `read_params` without any
# visible import, so they presumably arrive through this star import —
# confirm, and consider importing those names explicitly.
from src.models.upload_data import *

In [3]:
# Pandas display settings applied for the rest of the notebook session
notebook_display_options = {
    'display.max_colwidth': 500,  # characters shown per cell before truncation
    'display.max_columns': None,  # no cap on the number of displayed columns
    'display.max_rows': 300,
    'display.float_format': lambda value: '%.2f' % value,  # two-decimal floats
}
for option_name, option_value in notebook_display_options.items():
    pd.set_option(option_name, option_value)

In [4]:
# Set magic attributes
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline
# Load the autoreload extension and select mode 2, which reloads imported
# modules before each cell execution so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Configuration
# A parser gives the params file location a single default that can be overridden.
args = argparse.ArgumentParser()
args.add_argument("--config", default="../params.yaml")
# Parse an explicit, empty argument list so the parser never looks at sys.argv
# (inside a notebook that holds the kernel's launch flags, not ours). A list is
# used rather than "" because parse_args expects a sequence of argument strings;
# the empty string only worked because iterating it yields nothing.
parsed_args = vars(args.parse_args([]))
config_path = parsed_args['config']

In [6]:
# Load configurations
config = read_params(config_path)
project_dir = config["project_dir"]

# Processed-data locations
processed_data_config = config["processed_data_config"]
scored_limits_risk_review_data_path_excel_parquet = processed_data_config["scored_limits_risk_review_data_excel_parquet"]

# Interim-data locations
interim_data_config = config["interim_data_config"]
before_21d_graduation_limits_data_path = interim_data_config["before_21d_graduation_limits_data_parquet"]
after_rmdd_ge95_data_path = interim_data_config["after_rmdd_ge95_data_parquet"]
after_rmdd_ge96_data_path = interim_data_config["after_rmdd_ge96_data_parquet"]

### Limit Investigation

#### Legacy

In [7]:
# Columns dropped from the legacy frame before comparison
df_columns_not_in_scope = [
    'sum_3_months_trx_val',
    'approx_30_days_trx_val',
    'model_version',
    'created_at',
    'record_added_to_warehouse_on_timestamp',
    'old_src_crdt_score',
]
# Alternative kept for reference: the same list without 'model_version' and 'created_at'
# df_columns_not_in_scope = ['sum_3_months_trx_val', 'approx_30_days_trx_val', 'record_added_to_warehouse_on_timestamp', 'old_src_crdt_score']

# Final limit columns that the "Limits" cells total
limit_cols = [
    'final_21_limit',
    'final_7_limit',
    'final_1_limit',
]

In [8]:
# Legacy system refresh run
# Disabled one-off conversion of the Excel export into the parquet file read below:
# df = pd.read_excel('../data/external/07_JN_Limits_refresh_summary_20221006_multiple_products_rec_new_2.xlsx')
# df.to_parquet('../data/external/07_JN_Limits_refresh_summary_20221006_multiple_products_rec_new_2.parquet', index=False)
# NOTE(review): the sampled rows displayed under this cell include mobile numbers
# and national IDs — clear or redact outputs before sharing the notebook.
df_raw = (
    pd.read_parquet('../data/external/07_JN_Limits_refresh_summary_20221006_multiple_products_rec_new_2.parquet')
    .astype({'store_number': 'str'})  # store numbers are handled as strings
)
# Drop the out-of-scope columns first, then reuse the freed name
# 'approx_30_days_trx_val' for the 3-month average column.
df = (
    df_raw
    .drop(columns=df_columns_not_in_scope)
    .rename(columns={'avg_3_months_trx_val': 'approx_30_days_trx_val'})
)
df.sample(2)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
20975,254713662215,7008186,123734.33,2022-09-09,2022-10-04,26,9,0.35,No_rules_relaxed,2.0,Yes,1.0,28262287.0,Reject,0.0,True,254713662215.0,6,300.0,21.0,171500.0,0.0,2022-06-20,2022-07-11,NaT,2022-07-11,86.0,2.0,active_default,430.0,171500.0,2022-06-20,5.0,0.83,107.0,0,6,0.35,0.12,0.12,0.7,0.71,0.71,Band 2,Band 4,0.05,0.1,0.1,0,0,2,0.5,0.02,0.04,0.04,2165.35,4367.09,4367.09,0.0,0.0,0.0,0,0,0,1,376900,0,0,0
6537,254722838106,965396,19751.81,2022-09-06,2022-10-05,30,26,0.87,relax_rules,1.0,Yes,1.0,5180428.0,Approve,3000.0,True,,24,600.0,30.0,10000.0,10000.0,2020-03-29,2020-04-28,2020-04-28,2020-04-28,0.0,1.0,closed_on_time,,40000.0,2019-11-04,24.0,1.0,920.0,1,0,0.5,0.17,0.17,1.0,1.0,1.0,Band 6,Band 8,0.5,0.28,0.28,1,1,1,1.0,0.5,0.28,0.28,9875.91,5431.75,5431.75,0.0,5431.75,5431.75,0,7200,7200,0,175000,0,7200,7200


In [9]:
# Risk columns in scope: the legacy frame's columns after the drop/rename above.
# A later cell reuses `df.columns` to select the same columns from another frame.
df.columns

Index(['mobile_number', 'store_number', 'approx_30_days_trx_val',
       'most_recent_trx_date_past_30_days', 'last_trx_date',
       'expected_trx_days', 'actual_trx_days', 'page_active_days',
       'inference_col', 'days_since_last_trx', 'transacted_last_5_days',
       'weight_till_recency', 'national_id', 'idm_recommendation', 'idm_limit',
       'is_iprs_validated', 'client_mobile_number', 'loan_count',
       'loan_status', 'term_frequency', 'principal_disbursed',
       'principal_repaid', 'disbursed_on_date', 'expected_matured_on_date',
       'closed_on_date', 'due_date_fixed', 'days_past_due', 'bloom_version',
       'loan_repayment_status', 'src_crdt_score', 'max_principal_amount',
       'max_loan_disbursement_date', 'count_good_loans',
       'good_loans_repayment_ratio', 'num_days_since_last_disbursement',
       'weight_dpd', 'adjusted_loan_count', 'limit_factor_21',
       'limit_factor_7', 'limit_factor_1', 'idm_factor_21', 'idm_factor_7',
       'idm_factor_1', 'trad

#### Baseline

In [10]:
# Baseline refresh run
baseline_parquet_path = '../data/processed/Limits_refresh_summary_20221115_multiple_products.parquet'
df_baseline = pd.read_parquet(baseline_parquet_path)
# Disabled: restrict the baseline to the legacy frame's columns
# df_baseline = df_baseline[df.columns]
df_baseline.sample(2)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,due_date_rm_ge_rm_1d,due_date_rm_ge_rm_add_back,rllvr_date_rm_ge_rm_add_back,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit,model_version,created_at
38889,254702630591,7224320,60839.8,2022-09-20,2022-10-20,31,30,0.97,No_rules_relaxed,25.0,No,0.0,3342485,Reject,0.0,True,254702630591,2,300.0,7.0,2900.0,0.0,2022-10-20,2022-10-27,NaT,2022-10-27,18.0,2.0,active_default,438.0,2900.0,2022-10-20,1.0,0.5,25.0,0,0,1,1,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 7,Band 2,0.25,0.15,0.15,0.0,1.0,0,0.25,0.04,0.03,0.03,2661.74,1610.47,1610.47,0.0,0.0,0.0,0,0,0,0,0,0,2900,2900,"2022-004[2022-05-14, 2022-11-15]",2022-11-15 15:53:24
32908,254728015252,7326848,24148.83,2022-09-21,2022-10-20,30,20,0.67,No_rules_relaxed,25.0,No,0.0,30103503,Reject,0.0,True,254728015252,16,300.0,1.0,4000.0,283.93,2022-10-03,2022-10-04,NaT,2022-10-04,41.0,2.0,active_default,425.0,7000.0,2022-07-11,15.0,0.94,42.0,0,16,1,1,1,0.35,0.12,0.12,0.7,0.71,0.71,Band 4,Band 8,0.35,0.23,0.23,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,67400,0,8000,8000,"2022-004[2022-05-14, 2022-11-15]",2022-11-15 15:53:24


In [11]:
# Data set dimension: (rows, columns) of the baseline frame
df_baseline.shape

(39261, 74)

In [12]:
# Data set schema: per-column dtypes, transposed into a single wide row
# (one displayed column per field) so they line up with the sample above
df_baseline.dtypes.to_frame().T

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,due_date_rm_ge_rm_1d,due_date_rm_ge_rm_add_back,rllvr_date_rm_ge_rm_add_back,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit,model_version,created_at
0,object,object,float64,datetime64[ns],datetime64[ns],int32,int32,float64,object,float64,object,float64,object,object,float64,object,object,int32,float64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,object,float64,float64,datetime64[ns],float64,float64,float64,int32,int32,int32,int32,int32,float64,float64,float64,float64,float64,float64,object,object,float64,float64,float64,float64,float64,int32,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int32,int32,int32,int32,int32,int32,int32,int32,object,datetime64[ns]


In [13]:
# Limits: column-wise totals of the final limit columns named in `limit_cols`
df_baseline[limit_cols].sum()

final_21_limit    17225700
final_7_limit     18194800
final_1_limit     16377300
dtype: int64

In [14]:
%%time
# Export excel output (disabled; the cell currently only times an empty body)
# NOTE(review): the target file name carries the date 20221019 while the baseline
# above is read from the 20221115 parquet — confirm the intended name before re-enabling.
# df_baseline.to_excel('../data/processed/Limits_refresh_summary_20221019_multiple_products.xlsx', index=False)

CPU times: total: 0 ns
Wall time: 0 ns


#### Impact assessment 1 and 2

In [15]:
# Impact assessment 1 and 2 refresh run
ia_1_2_parquet_path = '../data/processed/Limits_refresh_summary_20221019_multiple_products_ia_01_02.parquet'
# Keep only the columns of the legacy frame `df`, in the legacy column order,
# so both runs can be compared side by side.
df_ia_1_2 = pd.read_parquet(ia_1_2_parquet_path)[df.columns]
df_ia_1_2.sample(2)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
56641,254795416901,7737682,15374.58,2022-09-18,2022-10-18,31,30,0.97,No_rules_relaxed,0.0,Yes,1.0,37235159,Reject,0.0,True,,0,,,,,NaT,NaT,NaT,NaT,1.0,,,,,NaT,,1.0,,0,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 7,Band 1,0.0,0.12,0.12,1.0,1.0,1,0.75,0.0,0.07,0.07,0.0,1017.44,1017.44,0.0,1017.44,1017.44,0,0,800,0,0,0,0,800
12652,254711707054,7556387,91035.09,2022-09-18,2022-10-18,31,31,1.0,relax_rules,0.0,Yes,1.0,24921154,Reject,0.0,True,,10,300.0,7.0,12000.0,0.0,2022-03-15,2022-03-22,NaT,2022-03-22,210.0,2.0,active_default,401.0,12000.0,2022-03-15,9.0,0.9,217.0,0,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 7,Band 6,0.45,0.25,0.25,0.0,1.0,1,0.5,0.16,0.09,0.09,14338.03,8032.51,8032.51,0.0,8032.51,8032.51,0,0,0,1,569900,0,0,0


In [16]:
# Data set dimension: (rows, columns) of the impact assessment frame
df_ia_1_2.shape

(62849, 69)

In [17]:
# Data set schema: per-column dtypes, transposed into a single wide row
# (one displayed column per field) so they line up with the sample above
df_ia_1_2.dtypes.to_frame().T

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
0,object,object,float64,datetime64[ns],datetime64[ns],int32,int32,float64,object,float64,object,float64,object,object,float64,object,object,int32,float64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,object,float64,float64,datetime64[ns],float64,float64,float64,int32,int32,float64,float64,float64,float64,float64,float64,object,object,float64,float64,float64,float64,float64,int32,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int32,int32,int32,int32,int32,int32,int32,int32


In [18]:
# Limits: column-wise totals of the final limit columns named in `limit_cols`
df_ia_1_2[limit_cols].sum()

final_21_limit    110810700
final_7_limit     431382800
final_1_limit     433585900
dtype: int64

In [19]:
%%time
# Export excel output (disabled; the cell currently only times an empty body)
# NOTE(review): this disabled line previously exported `df_baseline` into the
# ia_01_02 file, which looks like a copy-paste slip from the Baseline section;
# it now references the impact assessment frame — confirm before re-enabling.
# df_ia_1_2.to_excel('../data/processed/Limits_refresh_summary_20221019_multiple_products_ia_01_02.xlsx', index=False)

CPU times: total: 0 ns
Wall time: 0 ns
