In [1]:
# Import modules
import sys

In [2]:
# Import custom modules
# Put the parent directory on the import path so the `src` package can be resolved.
sys.path.append('../')

# NOTE(review): this wildcard import is the only visible source for `pd`,
# `argparse`, `dt`, `read_params`, `refresh_date` and `extract_end_date`, all of
# which later cells use without importing or defining them — presumably they
# are re-exported by `src.models.upload_data`; confirm, and prefer explicit
# imports so each name's origin is visible.
from src.models.upload_data import *

In [3]:
# Set notebook attributes
# pandas display options applied to every frame rendered in this notebook:
# wide cell text, every column shown, up to 300 rows, floats with 2 decimals.
_display_options = {
    'display.max_colwidth': 500,
    'display.max_columns': None,
    'display.max_rows': 300,
    'display.float_format': lambda x: '%.2f' % x,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [4]:
# Set magic attributes
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and switch it to mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Configuration
# Resolve the location of the parameters file through argparse so the default
# lives in one place. An explicit empty argument list is parsed instead of
# sys.argv, which inside a notebook holds the kernel's own launch arguments;
# the previous `parse_args("")` only worked because an empty string iterates
# to no arguments at all.
args = argparse.ArgumentParser()
args.add_argument("--config", default="../params.yaml")
parsed_args = vars(args.parse_args([]))
config_path = parsed_args['config']

In [6]:
# Load configurations
# Read the parameters file, then pull out the project directory and the data
# set paths this notebook refers to.
config = read_params(config_path)
project_dir = config["project_dir"]

# Config sections the individual paths are read from
_processed_data_cfg = config["processed_data_config"]
_interim_data_cfg = config["interim_data_config"]

scored_limits_risk_review_data_path_excel_parquet = _processed_data_cfg["scored_limits_risk_review_data_excel_parquet"]
before_21d_graduation_limits_data_path = _interim_data_cfg["before_21d_graduation_limits_data_parquet"]
after_rmdd_ge95_data_path = _interim_data_cfg["after_rmdd_ge95_data_parquet"]
after_rmdd_ge96_data_path = _interim_data_cfg["after_rmdd_ge96_data_parquet"]

### Limit Investigation

#### Legacy

In [7]:
# Columns not in scope
# These are dropped from the legacy frame before it is compared.
df_columns_not_in_scope = [
    'sum_3_months_trx_val',
    'approx_30_days_trx_val',
    'model_version',
    'created_at',
    'record_added_to_warehouse_on_timestamp',
    'old_src_crdt_score',
]
# Final limit columns: final_21_limit, final_7_limit, final_1_limit
limit_cols = [f'final_{variant}_limit' for variant in (21, 7, 1)]

In [8]:
# Legacy system refresh run
# The parquet file read below was produced once from the Excel export of the
# same name; the conversion is kept, commented out, for provenance:
# df = pd.read_excel('../data/external/07_JN_Limits_refresh_summary_20221006_multiple_products_rec_new_2.xlsx')
# df.to_parquet('../data/external/07_JN_Limits_refresh_summary_20221006_multiple_products_rec_new_2.parquet', index=False)
df_raw = pd.read_parquet('../data/external/07_JN_Limits_refresh_summary_20221006_multiple_products_rec_new_2.parquet')
# store_number is cast to string; later cells match it with `isin` against the
# Ubuntu 2.0 store_number column (assumes that column holds strings — TODO confirm).
df_raw['store_number'] = df_raw['store_number'].astype('str')
# Drop the out-of-scope columns first, then expose avg_3_months_trx_val under
# the name approx_30_days_trx_val (the original column of that name is among
# the dropped ones). Chained instead of `inplace=True`, so `df` is built in a
# single expression.
df = (
    df_raw
    .drop(columns=df_columns_not_in_scope)
    .rename(columns={'avg_3_months_trx_val': 'approx_30_days_trx_val'})
)
# Seeded so the two displayed rows are the same on every run.
# NOTE(review): the displayed rows include mobile_number and national_id —
# clear or redact the outputs before sharing the notebook.
df.sample(2, random_state=42)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
34780,254791390899,148371,162262.08,2022-09-06,2022-10-05,30,26,0.87,relax_rules,1.0,Yes,1.0,28451681.0,Reject,0.0,True,254791390899.0,13,300.0,7.0,50400.0,0.0,2022-09-28,2022-10-05,NaT,2022-10-05,0.0,2.0,current_active,423.0,50400.0,2022-09-28,13.0,1.0,7.0,1,13,0.35,0.12,0.12,0.7,0.71,0.71,Band 6,Band 8,0.5,0.28,0.28,1,1,1,1.0,0.35,0.19,0.19,56791.73,31497.93,31497.93,56791.73,31497.93,31497.93,56600,50400,50400,0,472000,56600,50400,50400
13593,254722595191,7054254,140128.85,2022-09-06,2022-10-05,30,30,1.0,No_rules_relaxed,1.0,Yes,1.0,21669837.0,Reject,0.0,True,,5,300.0,7.0,28500.0,0.0,2022-03-30,2022-04-06,NaT,2022-04-06,182.0,2.0,active_default,,51900.0,2022-03-09,4.0,0.8,189.0,0,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 7,Band 4,0.35,0.2,0.2,0,1,1,0.5,0.12,0.07,0.07,17165.78,9891.45,9891.45,0.0,9891.45,9891.45,0,0,0,1,186800,0,0,0


In [9]:
# Data set dimension: (rows, columns) of the legacy frame after the column drop and rename
df.shape

(60977, 69)

In [10]:
# Data set schema: dtype of every legacy column, transposed into a single wide row
df.dtypes.to_frame().T

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
0,object,object,float64,datetime64[ns],datetime64[ns],int64,int64,float64,object,float64,object,float64,float64,object,float64,bool,float64,int64,float64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,object,float64,float64,datetime64[ns],float64,float64,float64,int64,int64,float64,float64,float64,float64,float64,float64,object,object,float64,float64,float64,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,int64,int64,int64,int64,int64,int64,int64


#### Ubuntu 2.0

In [11]:
# Ubuntu 2.0 refresh run
# NOTE(review): `refresh_date` is not defined in any visible cell — presumably
# it is supplied by the wildcard import; confirm.
# The configured path contains a placeholder that is filled with the refresh
# date stripped of its dashes.
_scored_limits_path = project_dir + scored_limits_risk_review_data_path_excel_parquet.format(refresh_date.replace("-", ""))
df_scoring_stabilisation_raw = pd.read_parquet(_scored_limits_path)
# Keep only the legacy frame's columns, in the legacy order, so both frames
# can be compared column for column.
df_scoring_stabilisation = df_scoring_stabilisation_raw[df.columns]
# Seeded so the two displayed rows are the same on every run.
df_scoring_stabilisation.sample(2, random_state=42)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
20888,254723459568,270710,165789.6,2022-09-20,2022-10-15,26,16,0.62,No_rules_relaxed,32.0,No,0.0,7121798,Approve,0.0,True,254723459568.0,14,300.0,7.0,32000.0,21543.86,2022-06-19,2022-06-26,NaT,2022-06-26,143.0,2.0,active_default,451.0,32000.0,2022-06-19,11.0,0.79,150.0,0,14,0.5,0.17,0.17,1.0,1.0,1.0,Band 4,Band 8,0.35,0.23,0.23,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,801800,0,0,0
4306,254706419485,237572,193.33,2022-09-23,2022-10-08,16,2,0.12,No_rules_relaxed,39.0,No,0.0,20021983,Reject,0.0,True,,3,300.0,30.0,5000.0,0.0,2019-11-22,2019-12-22,NaT,2019-12-22,1060.0,1.0,active_default,,5000.0,2019-11-22,2.0,0.67,1090.0,0,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 1,Band 3,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0


In [12]:
# Data set dimension: (rows, columns) of the Ubuntu 2.0 frame restricted to the legacy columns
df_scoring_stabilisation.shape

(39261, 69)

In [13]:
# Data set schema: dtype of every Ubuntu 2.0 column, transposed into a single wide row
df_scoring_stabilisation.dtypes.to_frame().T

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
0,object,object,float64,datetime64[ns],datetime64[ns],int32,int32,float64,object,float64,object,float64,object,object,float64,object,object,int32,float64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,object,float64,float64,datetime64[ns],float64,float64,float64,int32,int32,float64,float64,float64,float64,float64,float64,object,object,float64,float64,float64,float64,float64,int32,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int32,int32,int32,int32,int32,int32,int32,int32


#### Comparison

In [14]:
# In legacy and not in Ubuntu 2.0
# Flag the legacy rows whose store_number also occurs in the Ubuntu 2.0 frame,
# then keep the rows that are NOT flagged.
_legacy_found_in_ubuntu = df['store_number'].isin(df_scoring_stabilisation['store_number'])
df_diff_in_legacy = df.loc[~_legacy_found_in_ubuntu]
df_diff_in_legacy.shape

(22348, 69)

In [15]:
# In Ubuntu 2.0 and not in legacy
# Flag the Ubuntu 2.0 rows whose store_number also occurs in the legacy frame,
# then keep the rows that are NOT flagged.
_ubuntu_found_in_legacy = df_scoring_stabilisation['store_number'].isin(df['store_number'])
df_diff_in_ubuntu = df_scoring_stabilisation.loc[~_ubuntu_found_in_legacy]
df_diff_in_ubuntu.shape

(632, 69)

In [16]:
# In legacy and in Ubuntu 2.0
# Legacy rows whose store_number also occurs in the Ubuntu 2.0 frame.
_store_in_both = df['store_number'].isin(df_scoring_stabilisation['store_number'])
df_similar = df.loc[_store_in_both]
df_similar.shape

(38629, 69)

#### Samples

In [17]:
# Sample similar in legacy
# Hand-picked stores that are displayed from the legacy frame in this cell
_stores_to_inspect = ['7361329', '943478', '7252207']

# Legacy rows for the stores present in both systems
df_similar_l = df[df['store_number'].isin(df_similar['store_number'])]
# Random pair of shared stores; the Ubuntu 2.0 sample cell looks the same two
# stores up again, so the draw is seeded to be identical on every run.
df_similar_ls = df_similar_l.sample(2, random_state=42).sort_values(by=['store_number'])
df_similar_l[df_similar_l['store_number'].isin(_stores_to_inspect)].sort_values(by=['store_number'])

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
35217,254722874310,7252207,24679.9,2022-09-06,2022-10-05,30,24,0.8,relax_rules,1.0,Yes,1.0,23681415.0,Reject,0.0,True,254722874310.0,4,300.0,7.0,1100.0,0.0,2022-09-29,2022-10-06,NaT,2022-10-06,-1.0,2.0,current_active,425.0,1100.0,2022-09-03,4.0,1.0,6.0,1,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 6,Band 3,0.25,0.15,0.15,1,0,1,0.75,0.13,0.08,0.08,3239.24,1959.87,1959.87,0.0,1959.87,1959.87,0,2000,1100,0,0,0,0,1100
15416,254724915820,7361329,10268.91,2022-09-06,2022-10-05,30,29,0.97,No_rules_relaxed,1.0,Yes,1.0,23584522.0,Reject,0.0,True,,14,300.0,7.0,2000.0,0.0,2022-04-23,2022-04-30,NaT,2022-04-30,158.0,2.0,active_default,415.0,2800.0,2022-03-19,10.0,0.71,165.0,0,14,0.35,0.12,0.12,0.7,0.71,0.71,Band 7,Band 8,0.55,0.3,0.3,0,1,1,0.5,0.19,0.11,0.11,1976.77,1087.3,1087.3,0.0,0.0,0.0,0,0,0,1,122700,0,0,0
7573,254722527886,943478,130.0,2022-09-19,2022-09-29,11,2,0.18,relax_rules,7.0,Yes,0.7,8485166.0,Reject,0.0,True,,17,300.0,30.0,40000.0,0.0,2021-08-25,2021-09-24,NaT,2021-09-24,376.0,1.0,active_default,,150000.0,2018-04-11,16.0,0.94,406.0,0,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 1,Band 8,0.2,0.15,0.15,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,11000,0,0,0


In [18]:
# Sample similar in Ubuntu 2.0
# Ubuntu 2.0 rows for the stores present in both systems
_ubuntu_in_overlap = df_scoring_stabilisation['store_number'].isin(df_similar_l['store_number'])
df_similar_u = df_scoring_stabilisation.loc[_ubuntu_in_overlap]
# Ubuntu 2.0 rows for the stores drawn at random in the legacy sample cell
_drawn_in_legacy = df_similar_u['store_number'].isin(df_similar_ls['store_number'])
df_similar_us = df_similar_u.loc[_drawn_in_legacy]
# Display the same three hand-picked stores that the legacy sample cell shows
_hand_picked = df_similar_u['store_number'].isin(['7361329', '943478', '7252207'])
df_similar_u.loc[_hand_picked].sort_values(by=['store_number'])

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit
37267,254722874310,7252207,29192.74,2022-09-20,2022-10-20,31,25,0.81,No_rules_relaxed,27.0,No,0.0,23681415,Approve,0.0,True,254722874310.0,6,300.0,7.0,2000.0,0.0,2022-10-17,2022-10-24,NaT,2022-10-24,23.0,2.0,active_default,425.0,2000.0,2022-10-17,5.0,0.83,30.0,0,6,0.5,0.17,0.17,1.0,1.0,1.0,Band 6,Band 4,0.3,0.17,0.17,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,2000,1100
15563,254724915820,7361329,12800.68,2022-09-20,2022-10-20,31,29,0.94,No_rules_relaxed,27.0,No,0.0,23584522,Reject,0.0,True,,14,300.0,7.0,2000.0,0.0,2022-04-23,2022-04-30,NaT,2022-04-30,200.0,2.0,active_default,415.0,2800.0,2022-03-19,10.0,0.71,207.0,0,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 7,Band 8,0.55,0.3,0.3,0.0,1.0,0,0.25,0.1,0.05,0.05,1232.07,677.68,677.68,0.0,0.0,0.0,0,0,0,1,122700,0,0,0
7699,254722527886,943478,175.0,2022-09-29,2022-10-16,18,3,0.17,No_rules_relaxed,31.0,No,0.0,8485166,Reject,0.0,True,,17,300.0,30.0,40000.0,0.0,2021-08-25,2021-09-24,NaT,2021-09-24,418.0,1.0,active_default,,150000.0,2018-04-11,16.0,0.94,448.0,0,0,0.35,0.12,0.12,0.7,0.71,0.71,Band 1,Band 8,0.2,0.15,0.15,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,11000,0,0,0


#### Limits

In [19]:
# Legacy limits: column-wise totals of the final limit columns listed in limit_cols
df[limit_cols].sum()

final_21_limit    209283300
final_7_limit     507871200
final_1_limit     511063900
dtype: int64

In [20]:
# Ubuntu 2.0 limits: column-wise totals of the final limit columns listed in limit_cols
df_scoring_stabilisation[limit_cols].sum()

final_21_limit    17065000
final_7_limit     18091300
final_1_limit     16272400
dtype: int64

#### Diagnostics

##### 01. approx_30_days_trx_val

##### 02. most_recent_trx_date_past_30_days

In [21]:
# Legacy: span between the earliest and the latest last_trx_date
# NOTE(review): the enclosing section is titled most_recent_trx_date_past_30_days,
# but the column measured here is last_trx_date — confirm which was intended.
_legacy_last_trx = df['last_trx_date']
_legacy_last_trx.max() - _legacy_last_trx.min()

Timedelta('29 days 00:00:00')

In [22]:
# Ubuntu 2.0: span between the earliest and the latest last_trx_date
_ubuntu_last_trx = df_scoring_stabilisation['last_trx_date']
_ubuntu_last_trx.max() - _ubuntu_last_trx.min()

Timedelta('30 days 00:00:00')

##### 03. days_since_last_trx

In [23]:
# Legacy: distinct days_since_last_trx values for rows whose last_trx_date is
# exactly 5 days before extract_end_date.
# NOTE(review): `extract_end_date` and `dt` are not defined in any visible
# cell — presumably they are supplied by the wildcard import; confirm, and
# check that extract_end_date belongs to the legacy run being filtered.
_five_days_before_extract = extract_end_date - dt.timedelta(days=5)
_last_trx_on_that_day = df['last_trx_date'] == _five_days_before_extract
df.loc[_last_trx_on_that_day, 'days_since_last_trx'].unique()

array([], dtype=float64)

In [24]:
# Ubuntu 2.0: first rows whose days_since_last_trx equals 5
_five_days_since_trx = df_scoring_stabilisation['days_since_last_trx'] == 5
df_scoring_stabilisation.loc[_five_days_since_trx].head()

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit


##### 04. good_loans_repayment_ratio

##### 05. max_loan_disbursement_date

##### 06. loan_id_product_concat

##### 07. weight_recency

##### 08. old_src_crdt_score