# **Impact Assessment**

### Import processing and prepping libraries

In [1]:
#import data processing libraries
import pandas as pd
import numpy as np
import psycopg2
import pymysql
import datetime as dt
from datetime import timedelta
import os
import math as math
from scipy import stats

#import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, normalize, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.manifold import TSNE
import xgboost as xgb
from sklearn.svm import SVR

import joblib
import pyarrow as pa
import pickle
from umap import UMAP

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Load the Data

In [3]:
# repayments = pd.read_csv("repayments_by_different_milestones.csv")
# repayments.to_parquet('repayments_by_different_milestones_2.parquet', index=False)

In [4]:
repayments = pd.read_parquet('repayments_by_different_milestones_2.parquet')

repayments['store_number'] = repayments['store_number'].astype(str)

repayments.head(3)

Unnamed: 0,loan_surrogate_id,loan_mifos_id,store_number,principal_disbursed,term_frequency,repayment_amount_by_due_date,repayment_amount_by_rllvr_date,repayment_amount_by_dpd30
0,5475396,315040,7032770,19500.0,7,,20711.65,20711.65
1,5475513,60991,7850981,55000.0,7,,,
2,5471575,62008,7258160,10000.0,7,,10410.0,10410.0


In [5]:
repayments = repayments[(repayments['term_frequency'] == 7) | (repayments['term_frequency'] == 21)]

In [6]:
repayments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 260922 entries, 0 to 286376
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   loan_surrogate_id               260922 non-null  int64  
 1   loan_mifos_id                   260922 non-null  int64  
 2   store_number                    260922 non-null  object 
 3   principal_disbursed             260922 non-null  float64
 4   term_frequency                  260922 non-null  int64  
 5   repayment_amount_by_due_date    212051 non-null  float64
 6   repayment_amount_by_rllvr_date  234608 non-null  float64
 7   repayment_amount_by_dpd30       244299 non-null  float64
dtypes: float64(4), int64(3), object(1)
memory usage: 17.9+ MB


In [7]:
repayments['repayment_amount_by_due_date'] = repayments['repayment_amount_by_due_date'].fillna(0)
repayments['repayment_amount_by_rllvr_date'] = repayments['repayment_amount_by_rllvr_date'].fillna(0)
repayments['repayment_amount_by_dpd30'] = repayments['repayment_amount_by_dpd30'].fillna(0)

In [8]:
repayments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 260922 entries, 0 to 286376
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   loan_surrogate_id               260922 non-null  int64  
 1   loan_mifos_id                   260922 non-null  int64  
 2   store_number                    260922 non-null  object 
 3   principal_disbursed             260922 non-null  float64
 4   term_frequency                  260922 non-null  int64  
 5   repayment_amount_by_due_date    260922 non-null  float64
 6   repayment_amount_by_rllvr_date  260922 non-null  float64
 7   repayment_amount_by_dpd30       260922 non-null  float64
dtypes: float64(4), int64(3), object(1)
memory usage: 17.9+ MB


In [9]:
repayments['repayment_rate_by_due_date'] = round(repayments['repayment_amount_by_due_date'] / repayments['principal_disbursed'], 4)
repayments['repayment_rate_by_rllvr_date'] = round(repayments['repayment_amount_by_rllvr_date'] / repayments['principal_disbursed'], 4)
repayments['repayment_rate_by_dpd30'] = round(repayments['repayment_amount_by_dpd30'] / repayments['principal_disbursed'], 4)

repayments.head(3)

Unnamed: 0,loan_surrogate_id,loan_mifos_id,store_number,principal_disbursed,term_frequency,repayment_amount_by_due_date,repayment_amount_by_rllvr_date,repayment_amount_by_dpd30,repayment_rate_by_due_date,repayment_rate_by_rllvr_date,repayment_rate_by_dpd30
0,5475396,315040,7032770,19500.0,7,0.0,20711.65,20711.65,0.0,1.062,1.062
1,5475513,60991,7850981,55000.0,7,0.0,0.0,0.0,0.0,0.0,0.0
2,5471575,62008,7258160,10000.0,7,0.0,10410.0,10410.0,0.0,1.041,1.041


In [10]:
hurdle_rate_by_due_date = repayments.groupby(['store_number'], as_index=False)['repayment_rate_by_due_date'].mean()
hurdle_rate_by_due_date = hurdle_rate_by_due_date.rename(columns={'repayment_rate_by_due_date':'hurdle_rate_by_due_date_mean'})

hurdle_rate_by_rllvr_date = repayments.groupby(['store_number'], as_index=False)['repayment_rate_by_rllvr_date'].mean()
hurdle_rate_by_rllvr_date = hurdle_rate_by_rllvr_date.rename(columns={'repayment_rate_by_rllvr_date':'hurdle_rate_by_end_rollover_date_mean'})

hurdle_rate_by_dpd30 = repayments.groupby(['store_number'], as_index=False)['repayment_rate_by_dpd30'].mean()
hurdle_rate_by_dpd30 = hurdle_rate_by_dpd30.rename(columns={'repayment_rate_by_dpd30':'hurdle_rate_by_dpd30_mean'})

hurdle_rates_df = pd.merge(hurdle_rate_by_due_date, hurdle_rate_by_rllvr_date, on='store_number', how='outer')
hurdle_rates_df = pd.merge(hurdle_rates_df, hurdle_rate_by_dpd30, on='store_number', how='outer')

hurdle_rates_df = hurdle_rates_df.fillna(0)

hurdle_rates_df.head(3)

Unnamed: 0,store_number,hurdle_rate_by_due_date_mean,hurdle_rate_by_end_rollover_date_mean,hurdle_rate_by_dpd30_mean
0,105295,1.087,1.087,1.087
1,105570,0.0,0.0,0.0
2,105652,0.842,0.842,0.969


In [11]:
# current_refresh = pd.read_excel("Limits_refresh_summary_20230817_multiple_products.xlsx")
# current_refresh.drop(columns=['Unnamed: 0'], inplace=True)
# current_refresh.to_parquet("Limits_refresh_summary_20230817_multiple_products.parquet", index=False)

In [12]:
current_refresh = pd.read_parquet("Limits_refresh_summary_20230914_multiple_products.parquet")

del current_refresh['hurdle_rate_by_due_date_mean']
del current_refresh['hurdle_rate_by_end_rollover_date_mean']
del current_refresh['hurdle_rate_by_dpd30_mean']

current_refresh['store_number'] = current_refresh['store_number'].astype(str)

current_refresh.head(2)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,is_location_mapped,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,repayments_by_dd_vs_principal_mean,due_date_rm_ge_rm_1d,due_date_rm_ge_rm_add_back,repayments_by_erd_vs_principal_mean,rllvr_date_rm_ge_rm_add_back,21_day_graduation_flag,max_global_limit,previous_max_global_limit,max_limit_diff,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit,opt_in_date,model_version,created_at
0,254723778344,7606765,373123.95,2023-08-15,2023-09-13,30,30,1.0,relax_rules,0.0,Yes,1.0,23908954,Approve,,True,True,254723778344,1,600.0,7.0,30000.0,30000.0,2023-09-04,2023-09-11,2023-09-11,2023-09-11,0.0,2.0,closed_on_time,458.0,30000.0,2023-09-04,1.0,1.0,9.0,1,1,1.031,1,1,,1,pass,93300,30000,63300,0.5,0.17,0.17,1.0,1.0,1.0,Band 7,Band 2,0.25,0.15,0.15,1.0,1.0,1,1.0,0.25,0.15,0.15,93280.988,55968.592,55968.592,93280.988,55968.592,55968.592,93300,45000,0,0,0,0,30000,0,2022-07-07,"2023-014[2023-09-01, 2023-09-14]",2023-09-14 10:03:35
1,254723635919,7051927,51450.0,2023-08-16,2023-09-13,29,28,0.97,relax_rules,0.0,Yes,1.0,23607603,Approve,0.0,True,True,254723635919,3,300.0,7.0,6000.0,3812.8,2023-09-08,2023-09-15,NaT,2023-09-15,-2.0,2.0,current_active,458.0,6000.0,2023-09-08,3.0,1.0,5.0,1,3,1.031,1,1,,1,pass,15500,6200,9300,0.5,0.17,0.17,1.0,1.0,1.0,Band 7,Band 3,0.3,0.175,0.175,1.0,1.0,1,1.0,0.3,0.175,0.15,15435.0,9003.75,7717.5,15435.0,9003.75,7717.5,15500,7800,0,0,0,0,6200,0,2022-08-25,"2023-014[2023-09-01, 2023-09-14]",2023-09-14 10:03:35


In [13]:
current_refresh = pd.merge(current_refresh, hurdle_rates_df, on='store_number', how='left')

current_refresh.head(2)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id,idm_recommendation,idm_limit,is_iprs_validated,is_location_mapped,client_mobile_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,weight_dpd,adjusted_loan_count,repayments_by_dd_vs_principal_mean,due_date_rm_ge_rm_1d,due_date_rm_ge_rm_add_back,repayments_by_erd_vs_principal_mean,rllvr_date_rm_ge_rm_add_back,21_day_graduation_flag,max_global_limit,previous_max_global_limit,max_limit_diff,limit_factor_21,limit_factor_7,limit_factor_1,idm_factor_21,idm_factor_7,idm_factor_1,trading_consistency_bands,loan_count_bands,new_limit_factor_21,new_limit_factor_7,new_limit_factor_1,weight_good_loans_repayment_ratio,weight_consistency,weight_recency,risk_rules_factor,ultimate_factor_21,ultimate_factor_7,ultimate_factor_1,limit_21_day,limit_7_day,limit_1_day,adjusted_21_limit,adjusted_7_limit,adjusted_1_limit,final_21_limit,final_7_limit,final_1_limit,blacklist_flag,total_final_21_limit,previous_21_limit,previous_7_limit,previous_1_limit,opt_in_date,model_version,created_at,hurdle_rate_by_due_date_mean,hurdle_rate_by_end_rollover_date_mean,hurdle_rate_by_dpd30_mean
0,254723778344,7606765,373123.95,2023-08-15,2023-09-13,30,30,1.0,relax_rules,0.0,Yes,1.0,23908954,Approve,,True,True,254723778344,1,600.0,7.0,30000.0,30000.0,2023-09-04,2023-09-11,2023-09-11,2023-09-11,0.0,2.0,closed_on_time,458.0,30000.0,2023-09-04,1.0,1.0,9.0,1,1,1.031,1,1,,1,pass,93300,30000,63300,0.5,0.17,0.17,1.0,1.0,1.0,Band 7,Band 2,0.25,0.15,0.15,1.0,1.0,1,1.0,0.25,0.15,0.15,93280.988,55968.592,55968.592,93280.988,55968.592,55968.592,93300,45000,0,0,0,0,30000,0,2022-07-07,"2023-014[2023-09-01, 2023-09-14]",2023-09-14 10:03:35,,,
1,254723635919,7051927,51450.0,2023-08-16,2023-09-13,29,28,0.97,relax_rules,0.0,Yes,1.0,23607603,Approve,0.0,True,True,254723635919,3,300.0,7.0,6000.0,3812.8,2023-09-08,2023-09-15,NaT,2023-09-15,-2.0,2.0,current_active,458.0,6000.0,2023-09-08,3.0,1.0,5.0,1,3,1.031,1,1,,1,pass,15500,6200,9300,0.5,0.17,0.17,1.0,1.0,1.0,Band 7,Band 3,0.3,0.175,0.175,1.0,1.0,1,1.0,0.3,0.175,0.15,15435.0,9003.75,7717.5,15435.0,9003.75,7717.5,15500,7800,0,0,0,0,6200,0,2022-08-25,"2023-014[2023-09-01, 2023-09-14]",2023-09-14 10:03:35,1.031,1.031,1.031


In [14]:
##############################
current_refresh['days_since_last_trx'] = current_refresh['days_since_last_trx'].fillna(31)

In [15]:
current_refresh[['final_21_limit', 'final_7_limit', 'final_1_limit']].sum()

final_21_limit    224807200
final_7_limit     189116700
final_1_limit             0
dtype: int64

In [16]:
# selected_columns = current_refresh[["store_number", "approx_30_days_trx_val", "expected_trx_days", "actual_trx_days", "page_active_days", "days_since_last_trx", "weight_consistency", "weight_recency", "hurdle_rate_by_due_date_mean", "hurdle_rate_by_end_rollover_date_mean", "hurdle_rate_by_dpd30_mean"]]
selected_columns = current_refresh[["store_number", "approx_30_days_trx_val", "expected_trx_days", "actual_trx_days", "page_active_days", "days_since_last_trx", "weight_consistency", "weight_recency", "hurdle_rate_by_dpd30_mean"]]

selected_columns.head(2)

Unnamed: 0,store_number,approx_30_days_trx_val,expected_trx_days,actual_trx_days,page_active_days,days_since_last_trx,weight_consistency,weight_recency,hurdle_rate_by_dpd30_mean
0,7606765,373123.95,30,30,1.0,0.0,1.0,1,
1,7051927,51450.0,29,28,0.97,0.0,1.0,1,1.031


In [17]:
selected_columns.shape

(66244, 9)

In [18]:
without_performance_data = selected_columns[selected_columns['hurdle_rate_by_dpd30_mean'].isnull()].reset_index(drop=True)
print(without_performance_data.shape)
with_performance_data = selected_columns[selected_columns['hurdle_rate_by_dpd30_mean'].notnull()].reset_index(drop=True)
print(with_performance_data.shape)

(34162, 9)
(32082, 9)


In [19]:
with_performance_data[with_performance_data['hurdle_rate_by_dpd30_mean'] >= 1.03].shape

(11717, 9)

In [20]:
(11704/32082) * 100

36.481516114955426

In [21]:
summaries = with_performance_data.copy()

In [22]:
# Assuming 'X' contains your independent variables (excluding the unique identifier) and 'y' contains the target variable
X_with_performance_data = with_performance_data.drop(['hurdle_rate_by_dpd30_mean', 'store_number'], axis=1)
y_with_performance_data = with_performance_data['hurdle_rate_by_dpd30_mean']

In [23]:
# Split the dataset into a training set and a test set (adjust the test_size as needed)
X_train_with_performance_data, X_test_with_performance_data, y_train_with_performance_data, y_test_with_performance_data = train_test_split(X_with_performance_data, y_with_performance_data, test_size=0.2, random_state=42)

In [35]:
X_train_with_performance_data.columns

Index(['approx_30_days_trx_val', 'expected_trx_days', 'actual_trx_days', 'page_active_days', 'days_since_last_trx', 'weight_consistency', 'weight_recency'], dtype='object')

In [24]:
# Create a StandardScaler to scale the features
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and test data
X_train_scaled_with_performance_data = scaler.fit_transform(X_train_with_performance_data)
X_test_scaled_with_performance_data = scaler.transform(X_test_with_performance_data)

In [25]:
with open("scaler_model.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [26]:
# Create a Linear Regression model
# model = LinearRegression()

# Create a Random Forest Regression model
# model = RandomForestRegressor(n_estimators=500, max_depth=3, random_state=42)

# # Create an XGBoost Regressor model
model = xgb.XGBRegressor(
    n_estimators=500,          # Number of boosting rounds (trees)
    learning_rate=0.1,         # Step size shrinkage to prevent overfitting
    max_depth=7,               # Maximum depth of individual trees
    random_state=42
)

# Create an SVR model
# model = SVR(kernel='rbf', C=20)

# Fit the model on the training data
model.fit(X_train_scaled_with_performance_data, y_train_with_performance_data)

In [27]:
with open("regression_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [28]:
# Make predictions on the test data
y_pred_with_performance_data = model.predict(X_test_scaled_with_performance_data)

In [29]:
# Evaluate the model
mse = mean_squared_error(y_test_with_performance_data, y_pred_with_performance_data)
r2 = r2_score(y_test_with_performance_data, y_pred_with_performance_data)
mae = mean_absolute_error(y_test_with_performance_data, y_pred_with_performance_data)

# Calculate Adjusted R-squared
n = X_test_scaled_with_performance_data.shape[0]  # Number of samples
p = X_test_scaled_with_performance_data.shape[1]  # Number of predictors (features)
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Adjusted R-squared:", adjusted_r2)
print("Mean Absolute Error:", mae)

Mean Squared Error: 0.13400684282255024
R-squared: -0.036550564638229366
Adjusted R-squared: -0.037682699753296856
Mean Absolute Error: 0.28005485749764575


In [30]:
# Assuming 'X' contains your independent variables (excluding the unique identifier) for 'without_performance_data'
X_without_performance_data = without_performance_data.drop(['hurdle_rate_by_dpd30_mean', 'store_number'], axis=1)

# Scale the features using the same scaler used for the 'with_performance_data' dataset
X_scaled_without_performance_data = scaler.transform(X_without_performance_data)

# Make predictions on the 'without_performance_data' dataset
y_pred_without_performance_data = model.predict(X_scaled_without_performance_data)

# Add the predictions to the 'without_performance_data' DataFrame
without_performance_data['hurdle_rate_by_dpd30_mean'] = y_pred_without_performance_data

In [31]:
without_performance_data.head()

Unnamed: 0,store_number,approx_30_days_trx_val,expected_trx_days,actual_trx_days,page_active_days,days_since_last_trx,weight_consistency,weight_recency,hurdle_rate_by_dpd30_mean
0,7606765,373123.95,30,30,1.0,0.0,1.0,1,0.941
1,7095897,309150.0,30,30,1.0,0.0,1.0,1,0.947
2,7143211,0.0,1,1,1.0,15.0,1.0,0,0.767
3,7904971,192403.95,30,30,1.0,0.0,1.0,1,0.856
4,670522,0.0,27,16,0.59,2.0,0.0,1,0.832


In [32]:
without_performance_data.shape

(34162, 9)

In [33]:
without_performance_data['hurdle_rate_by_dpd30_mean'].describe()

count   34162.000
mean        0.859
std         0.109
min        -0.050
25%         0.823
50%         0.877
75%         0.920
max         1.393
Name: hurdle_rate_by_dpd30_mean, dtype: float64

In [34]:
without_performance_data[without_performance_data['hurdle_rate_by_dpd30_mean'] >= 1.03].shape

(653, 9)