In [1]:
# Import modules
import sys

In [2]:
# Import custom modules
# Append the parent directory to the module search path so the `src` package
# can be imported; the relative entry is resolved against the kernel's working
# directory.
sys.path.append('../')

# NOTE(review): wildcard import. Later cells use `pd`, `argparse`, `pull_data`,
# `lftsv_sql`, `clean_dataset`, `lftsv_feature_engineering`,
# `lftsv_agg_summaries` and `extract_end_date` without any other visible import
# or assignment, so they presumably all arrive through this line -- confirm,
# and consider importing those names explicitly.
from src.models.upload_data import *

In [3]:
# Notebook display settings, applied in one pass from a lookup table.
#   max_colwidth -> 500 characters per cell before truncation
#   max_columns  -> None (no limit on displayed columns)
#   max_rows     -> 300 rows before truncation
#   float_format -> floats rendered with two decimal places
_DISPLAY_OPTIONS = {
    'display.max_colwidth': 500,
    'display.max_columns': None,
    'display.max_rows': 300,
    'display.float_format': lambda x: '%.2f' % x,
}
for option_name, option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [4]:
# Set magic attributes
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension, then select mode 2: reload all modules
# (except those excluded via %aimport) before executing each cell, so edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Configuration
# Resolve the path to the parameter file. An ArgumentParser supplies the
# default for --config; `config_path` is what the later pipeline calls receive.
args = argparse.ArgumentParser()
args.add_argument("--config", default="../params.yaml")
# Parse an explicit empty argument list so the kernel's own sys.argv is never
# consulted. argparse expects a sequence of argument strings: the previous ""
# only worked because an empty string is an empty iterable, and any non-empty
# string would have been split into single characters.
parsed_args = vars(args.parse_args([]))
config_path = parsed_args['config']

### LFTSV

In [6]:
%%time
# Data ingestion: pull the raw LFTSV data set into `df_lftsv_raw`.
# `%%time` reports CPU and wall-clock time for the whole cell.
# NOTE(review): the meaning of the positional arguments
# ('DWH', 'lftsv', False, 'raw') is defined by pull_data, which is not visible
# here -- presumably source, data set name, a flag and a stage label; confirm
# against the module and prefer keyword arguments if the signature allows.
df_lftsv_raw = pull_data(config_path, lftsv_sql(), 'DWH', 'lftsv', False, 'raw')

Currently loading lftsv data set ...
Time taken is 0 seconds ...


Unnamed: 0,client_mifos_id,client_mobile_number,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,safaricom_loan_balance,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score,due_date_fixed,end_rollover_date_fixed,expected_dpd90
261373,37756,,300,74352,7,6300.0,0.0,163.81,0.0,94.5,0.0,1954.28,151.24,8512.59,151.24,8361.35,,2022-04-22,2022-04-29,,7237855,2,,2022-04-29,2022-05-02,2022-07-31
177310,25330,,600,50270,7,18000.0,18000.0,360.0,360.0,0.0,0.0,0.0,0.0,18360.0,18360.0,0.0,,2019-01-01,2019-01-08,2019-01-07,698026,1,,2019-01-08,2019-01-11,2019-04-11


---------------------------------------------------------------------------------------------------------------------------------------
CPU times: user 532 ms, sys: 234 ms, total: 766 ms
Wall time: 380 ms


In [7]:
%%time
# Data cleaning/wrangling: produce the cleaned LFTSV frame `df_lftsv_clean`.
# NOTE(review): clean_dataset is called with only the config path and the data
# set name, so it presumably reads its input from a location named in the
# config rather than from `df_lftsv_raw` -- confirm.
# NOTE(review): the recorded output of this cell displays
# client_mobile_number values; clear or redact outputs before sharing.
df_lftsv_clean = clean_dataset(config_path, 'lftsv')


Currently cleaning lftsv data set ...
Analysis start date 2017-12-10 00:00:00
Analysis latest date 2023-07-04 00:00:00


Unnamed: 0,client_mifos_id,client_mobile_number,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score,due_date_fixed,end_rollover_date_fixed,expected_dpd90,safaricom_loan_balance
122137,21030,,600,8339,30,25000.0,25000.0,1750.0,1750.0,0.0,0.0,0.0,0.0,26750.0,26750.0,0.0,2018-03-27,2018-04-26,2018-04-16,932020,1.0,,2018-04-26,NaT,NaT,
271059,52036,254720390987.0,600,101573,7,12000.0,12000.0,312.01,312.01,0.0,0.0,0.0,0.0,12312.01,12312.01,0.0,2022-05-20,2022-05-27,2022-05-27,7357594,2.0,,2022-05-27,2022-05-30,2022-08-28,0.0


---------------------------------------------------------------------------------------------------------------------------------------
CPU times: user 4.5 s, sys: 536 ms, total: 5.04 s
Wall time: 4.73 s


In [8]:
%%time
# Feature engineering: build `df_lftsv_features`.
# NOTE(review): `extract_end_date` is not assigned in any cell shown in this
# notebook; presumably it is provided by the wildcard import from
# src.models.upload_data -- confirm, so the cell survives Restart & Run All.
# NOTE(review): the recorded sample rows include client_mobile_number values;
# clear or redact outputs before sharing.
df_lftsv_features = lftsv_feature_engineering(config_path, extract_end_date)


Any bloom 2 one day feature sample:
       loan_id_product_concat  any_bloom2_1day
108105             308912-2.0            False
---------------------------------------------------------------------------------------------------------------------------------------

Days past due feature sample:
       loan_id_product_concat  loan_status due_date_fixed closed_on_date   
108105             308912-2.0          600     2023-05-10     2023-05-10  \

       max_transaction_date  days_past_due  
108105           2023-05-10           0.00  
---------------------------------------------------------------------------------------------------------------------------------------

Days past end rollover feature sample:
       loan_id_product_concat  loan_status end_rollover_date_fixed   
108105             308912-2.0          600              2023-05-13  \

       closed_on_date max_transaction_date  days_past_end_rollover  
108105     2023-05-10           2023-05-10                   -3.00  
----

Unnamed: 0,client_mifos_id,client_mobile_number,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score,due_date_fixed,end_rollover_date_fixed,expected_dpd90,safaricom_loan_balance,loan_id_product_concat,loan_count,loan_rank,total_repayment_vs_principal_amount,any_bloom2_1day,max_transaction_date,days_past_due,days_past_end_rollover,loan_repayment_status,days_diff_maturity_max_trans
77131,53224,254721134218,600,312504,7,20000.0,20000.0,624.0,624.0,0.0,0.0,0.0,0.0,20624.0,20624.0,0.0,2023-06-09,2023-06-16,2023-06-15,7188183,2.0,463.0,2023-06-16,2023-06-19,2023-09-17,0.0,312504-2.0,21,18.0,1.03,False,2023-06-15,-1.0,-4.0,closed_early_repayment,-1.0
131501,106886,254726677860,600,299796,21,3500.0,3500.0,319.2,319.2,0.0,0.0,0.0,0.0,3819.2,3819.2,0.0,2023-01-29,2023-02-19,2023-02-19,7786293,2.0,475.0,2023-02-19,2023-02-24,2023-05-25,0.0,299796-2.0,17,11.0,1.09,False,2023-02-19,0.0,-5.0,closed_on_time,0.0


---------------------------------------------------------------------------------------------------------------------------------------
CPU times: user 6.22 s, sys: 571 ms, total: 6.79 s
Wall time: 6.27 s


In [9]:
%%time
# Data aggregation: build the aggregated frame `df_lftsv_aggregate`
# (inspected with .info() in the next cell).
# NOTE(review): `extract_end_date` is not assigned in any cell shown in this
# notebook; presumably it is provided by the wildcard import from
# src.models.upload_data -- confirm.
df_lftsv_aggregate = lftsv_agg_summaries(config_path, extract_end_date)


All loans:



Unnamed: 0,store_number,loan_rank,principal_disbursed,disbursed_on_date,safaricom_loan_balance,total_outstanding
39344,7491730,5.5,12000.0,2022-10-16,0.0,0.0
39345,7491730,5.5,81000.0,2022-10-16,115363.53,115254.37
39346,7491730,4.0,12900.0,2022-08-13,0.0,0.0
39347,7491730,3.0,12900.0,2022-08-06,0.0,0.0
39348,7491730,2.0,12900.0,2022-07-30,0.0,0.0
39349,7491730,1.0,12000.0,2022-07-25,12312.0,0.0


---------------------------------------------------------------------------------------------------------------------------------------

Loan snapshots:



Unnamed: 0,index,client_mifos_id,client_mobile_number,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score,due_date_fixed,end_rollover_date_fixed,expected_dpd90,safaricom_loan_balance,loan_id_product_concat,loan_count,loan_rank,total_repayment_vs_principal_amount,any_bloom2_1day,max_transaction_date,days_past_due,days_past_end_rollover,loan_repayment_status,days_diff_maturity_max_trans,latest_21_loan,latest_7_loan,latest_1_loan,snapshot_3m_21_loan,snapshot_3m_7_loan,snapshot_3m_1_loan
19570,3811,55813,254700165018,600,288999,21,10500.0,10500.0,957.6,957.6,0.0,0.0,0.0,0.0,11457.6,11457.6,0.0,2022-12-23,2023-01-13,2023-01-10,7105817,2.0,450.0,2023-01-13,2023-01-18,2023-04-18,0.0,288999-2.0,11,11.0,1.09,False,2023-01-10,-3.0,-8.0,closed_early_repayment,-3.0,10500.0,0.0,0.0,10500.0,0.0,0.0


---------------------------------------------------------------------------------------------------------------------------------------

Latest loan:



Unnamed: 0,store_number,loan_rank,principal_disbursed,safaricom_loan_balance,total_outstanding
34331,7491730,5.5,81000.0,115363.53,115254.37


---------------------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,client_mobile_number,store_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,end_rollover_date_fixed,expected_dpd90,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,inference_col,weight_dpd,max_days_past_due,loan_count_past_3_months,count_7_day_loans,avg_loan_tenure,count_7_day_loans_paid_upto_rollover,good_loans_repayment_ratio(7_day_loans),minimum_7_day_principal_disbursed,total_sum_7_day_principal_disbursed,total_sum_1_day_principal_disbursed,avg_7_day_principal_disbursed,days_past_end_rollover,any_bloom2_1day,total_outstanding_sum,safaricom_loan_balance_sum,loan_balance,latest_21_loan,latest_7_loan,latest_1_loan,snapshot_3m_21_loan,snapshot_3m_7_loan,snapshot_3m_1_loan
54471,,987640,8.0,600.0,30.0,10000.0,10000.0,2019-01-29,2019-02-28,2019-03-25,2019-02-28,NaT,NaT,25.0,1.0,closed_default,,70000.0,2018-09-24,6.0,0.75,1617.0,No_rules_relaxed,0.0,25.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,False,0.0,0.0,0.0,0.0,5000.0,0.0,0.0,5000.0,0.0
24938,,724493,4.0,600.0,30.0,5000.0,5000.0,2019-11-25,2019-12-25,2019-12-25,2019-12-25,NaT,NaT,0.0,1.0,closed_on_time,,5000.0,2019-11-25,4.0,1.0,1317.0,relax_rules,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---------------------------------------------------------------------------------------------------------------------------------------
CPU times: user 17.8 s, sys: 670 ms, total: 18.5 s
Wall time: 17.5 s


In [10]:
# Final dataset schema
# Print the index range, column names, non-null counts and dtypes of the
# aggregated frame.
df_lftsv_aggregate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54882 entries, 0 to 54881
Data columns (total 45 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   client_mobile_number                     54860 non-null  object        
 1   store_number                             54882 non-null  object        
 2   loan_count                               54860 non-null  float64       
 3   loan_status                              54860 non-null  float64       
 4   term_frequency                           54860 non-null  float64       
 5   principal_disbursed                      54860 non-null  float64       
 6   principal_repaid                         54860 non-null  float64       
 7   disbursed_on_date                        54860 non-null  datetime64[ns]
 8   expected_matured_on_date                 54860 non-null  datetime64[ns]
 9   closed_on_date                         