In [None]:
pip install pymysql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymysql
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.0.2


In [None]:
#import data processing libraries

import os
import pandas as pd
import numpy as np
import math as math
import datetime as dt
from scipy.stats import mode
import psycopg2 #reading data from Postgres DB
import pymysql

import sys

In [None]:
# Notebook-wide pandas display settings: wide cells, every column, up to 300 rows
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = None
pd.options.display.max_rows = 300

# Show floats with two fixed decimals instead of scientific notation
pd.options.display.float_format = lambda value: '%.2f' % value

In [None]:
#import visualization libraries

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

In [None]:
# Connection parameters
# The password must never be stored in the notebook: it is read from the
# BLOOM_DB_PASSWORD environment variable (None when unset, in which case the
# connection attempt fails with an authentication error).
# Host, database and user can be overridden the same way and otherwise keep
# their previous values.
param_dic = {
    "host"      : os.environ.get("BLOOM_DB_HOST", "157.245.248.249"),
    "database"  : os.environ.get("BLOOM_DB_NAME", "ubuntu"),
    "user"      : os.environ.get("BLOOM_DB_USER", "jacklinengenia"),
    "password"  : os.environ.get("BLOOM_DB_PASSWORD"),
}

def connect(params_dic):
    """
    Open a connection to the PostgreSQL server.

    params_dic is unpacked as keyword arguments into psycopg2.connect.
    On any failure the error is printed and the process exits with status 1;
    on success the open connection is returned.
    """
    try:
        print('Connecting to the PostgreSQL database...')
        connection = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as exc:
        print(exc)
        sys.exit(1)
    print("Connection successful")
    return connection

In [None]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    Transform a SELECT query into a pandas dataframe.

    Parameters:
    conn         -- open DB-API connection; only conn.cursor() is used
    select_query -- SQL text passed unchanged to cursor.execute
    column_names -- labels for the resulting dataframe columns, one per selected column

    Returns:
    A DataFrame with one row per fetched tuple. If executing the query fails,
    the error is printed and the integer 1 is returned instead (kept for
    backward compatibility, so callers should check the result type).

    The cursor is always closed, including when fetching the rows raises.
    """
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(select_query)
        except Exception as error:
            # psycopg2.DatabaseError derives from Exception, so this covers it too
            print("Error: %s" % error)
            return 1

        # fetchall yields a list of tuples, one per row
        rows = cursor.fetchall()
    finally:
        # release the cursor on every path (success, query error, fetch error)
        cursor.close()

    # Convert the list of tuples into a pandas dataframe
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Connect to the database and load the scoring data bloomlive table
conn = connect(param_dic)

columns = "client_mifos_id,client_mobile_number,loan_status,loan_mifos_id,term_frequency,principal_disbursed,\
principal_repaid,interest_charged,interest_repaid,\
fee_charges_charged,fee_charges_repaid,\
penalty_charges_charged,penalty_charges_repaid,\
total_expected_repayment, total_repayment,\
total_outstanding,\
disbursed_on_date,expected_matured_on_date,\
closed_on_date,store_number, bloom_version, src_crdt_score"

# Strip every name: the string above has spaces after some commas, which would
# otherwise end up inside the dataframe headers (e.g. " total_repayment").
column_names = [name.strip() for name in columns.split(",")]

# Build the SELECT from the same list so the query and the headers cannot drift apart
select_query = "select {} from bloomlive.loans_fact_table_summary_view".format(", ".join(column_names))

try:
    df = postgresql_to_dataframe(conn, select_query, column_names)
finally:
    # release the connection once the rows are in memory
    conn.close()

Connecting to the PostgreSQL database...
Connection successful


In [None]:
# Preview the first three rows of the loaded loans dataframe
df.head(3)

Unnamed: 0,client_mifos_id,client_mobile_number,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score
0,14269,,600,16296,7,4300.0,4300.0,146.21,146.21,0.01,0.01,0.05,0.05,4446.27,4446.27,0.0,2022-01-20,2022-01-27,2022-01-27,7529969,2.0,399.0
1,2439,,600,16357,21,56000.0,56000.0,4872.01,4872.01,0.0,0.0,0.0,0.0,60872.01,60872.01,0.0,2022-01-21,2022-02-11,2022-02-11,840549,2.0,
2,7520,,600,16386,7,1400.0,1400.0,47.61,47.61,0.01,0.01,0.05,0.05,1447.67,1447.67,0.0,2022-01-21,2022-01-28,2022-01-28,7216519,2.0,


In [None]:
#normalise missing values: every None/NA cell in df becomes np.nan

df = df.fillna(np.nan)

In [None]:
#convert date columns to datetime; values that cannot be parsed become NaT (errors='coerce')
datetime_cols = ["disbursed_on_date", "expected_matured_on_date", "closed_on_date"]

df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime, errors='coerce')


#convert specific int/float columns to string type
int_to_string_cols = ["client_mifos_id", "client_mobile_number"]

df[int_to_string_cols] = df[int_to_string_cols].astype(str)


#convert every remaining column to numeric, leaving out the non-targeted columns listed below;
#values that cannot be parsed become NaN (errors='coerce').
#NOTE: client_mobile_number is not in the exclusion list, so the string conversion above is
#undone for that column here and it ends up numeric again.
string_to_float_cols = df.columns.drop(["client_mifos_id", "loan_status", "term_frequency",
                                        "disbursed_on_date", "expected_matured_on_date",
                                        "closed_on_date"])

df[string_to_float_cols] = df[string_to_float_cols].apply(pd.to_numeric, errors='coerce')


#make all column headers lower case and remove spaces from them
#(spaces are deleted, not replaced with underscores)
df.columns = df.columns.str.lower().str.replace(' ','')

In [None]:
#build a "<loan id>-<bloom version>" key so the same loan id coming from different bloom versions stays distinct

df["loan_id_product_concat"] = df["loan_mifos_id"].astype("str").str.cat(df["bloom_version"].astype("str"), sep="-")

In [None]:
#remove loan records whose status key is one of 0, 100, 400 or 500 (irrelevant or never disbursed)
df = df.loc[~df["loan_status"].isin([0, 100, 400, 500])]


#keep only rows that have a loan id
df = df.loc[df["loan_mifos_id"].notna()]


#keep the first occurrence of each loan key and discard later duplicates
df = df.loc[~df["loan_id_product_concat"].duplicated(keep="first")]


#keep only rows that have a disbursement date
df = df.loc[df["disbursed_on_date"].notna()]

In [None]:
#clean up mobile number column: keep only the text before the first "." (e.g. drops a trailing ".0").
#Taking the first split element works whether or not a value contains a ".", whereas assigning an
#expanded split to exactly two columns raises ValueError unless the split yields exactly two columns.
mobile_numbers = df["client_mobile_number"].astype("str").str.split(".", n=1).str[0]

#re-attach the cleaned values as the last column, which is where the previous drop/rename left them
df = df.drop(columns=["client_mobile_number"])
df["client_mobile_number"] = mobile_numbers

In [None]:
#order rows by mobile number (ascending), newest disbursement first within each number
df = df.sort_values(by=["client_mobile_number", "disbursed_on_date"], ascending=[True, False])


#loan_count: number of loan rows recorded against each store_number
df["loan_count"] = df.groupby("store_number")["store_number"].transform("size")

#loan_rank: position of each loan within its store_number, ranked by disbursement date (earliest ranks lowest)
df["loan_rank"] = df.groupby("store_number")["disbursed_on_date"].rank(ascending=True)


##sort out minor Mifos errors relating to loans with status 700:
##where such a loan has no closed date, fall back to its expected maturity date
status_700_without_close = df["loan_status"].eq(700) & df["closed_on_date"].isna()
df.loc[status_700_without_close, "closed_on_date"] = df["expected_matured_on_date"]

## Notes ==> fix issue where loan rank gets messed up because of different Mifos instance

In [None]:
#due_date_fixed = disbursement date + term_frequency days, so the due date always tallies with the loan term

df = df.assign(
    due_date_fixed=df["disbursed_on_date"] + pd.to_timedelta(df["term_frequency"], unit='d')
)

In [None]:
#work on an independent (deep) copy of df from here on

all_loans = df.copy(deep=True)

In [None]:
#analysis period: earliest and latest disbursement dates in all_loans

first_disbursement = all_loans['disbursed_on_date'].min()
latest_disbursement = all_loans['disbursed_on_date'].max()

print('analysis begin date {}'.format(first_disbursement))
print('analysis latest date {}'.format(latest_disbursement))

analysis begin date 2017-12-10 00:00:00
analysis latest date 2022-09-22 00:00:00


In [None]:
def calc_days_past_due(df, as_of=None):
    """
    Calculate days past due for each loan record.

    The calculation is vectorized with np.select: each entry of `conditions`
    is paired with the entry of `choices` at the same position, so the two
    lists must stay the same length and order.

    Inputs (columns read from df):
    1) loan_status     -- the current status of a loan as captured on corebanking (Mifos),
    2) due_date_fixed  -- the loan's due date,
    3) closed_on_date  -- the date the loan was closed.

    Parameters:
    df    -- dataframe holding the three columns above
    as_of -- evaluation date used for loans with status 300. Anything accepted
             by pd.Timestamp. Defaults to yesterday at midnight, because Mifos
             is Time-1, i.e. one day behind. Pass an explicit date to make a
             run reproducible.

    Outputs:
    A numpy array aligned with df's rows:
    - status 300            -> as_of - due date, in days
    - status 600, 601, 700  -> closed date - due date, in days (NaN when the closed date is missing)
    - any other status      -> 0 (the np.select default)
    """
    if as_of is None:
        #subtract one day from today to stay in line with Mifos, which is one day behind
        as_of = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)
    else:
        as_of = pd.Timestamp(as_of)

    loan_status = df["loan_status"]
    due_date = df["due_date_fixed"]
    closed_on_date = df["closed_on_date"]

    days_until_as_of = (as_of - due_date).dt.days
    days_until_closed = (closed_on_date - due_date).dt.days

    #the loan status id decides which date difference applies to a row
    conditions = [
        loan_status.eq(300),
        loan_status.eq(600),
        loan_status.eq(601),
        loan_status.eq(700),
    ]

    choices = [
        days_until_as_of,
        days_until_closed,
        days_until_closed,
        days_until_closed,
    ]

    return np.select(conditions, choices)


#add the days_past_due column computed by calc_days_past_due
all_loans = all_loans.assign(days_past_due=calc_days_past_due(all_loans))

In [None]:
def set_loan_status_labels(df):
    """
    Set the loan repayment status label of each loan in df.

    Inputs (columns read from df):
    1) loan_status     -- the current status of a loan as captured on corebanking (Mifos),
    2) days_past_due   -- number of days past due for each loan,
    3) term_frequency  -- term of each loan,
    4) bloom_version   -- the version of Bloom tied to the loan record

    Outputs:
    A numpy array of string labels aligned with df's rows. Rules are evaluated
    in order and the first match wins:
    - status 601                              -> "written-off_default"
    - status 300 and days_past_due <= 0       -> "current_active"
    - status 600 / 700 and days_past_due < 0  -> "closed_early_repayment" / "closed_early_repayment_overpaid"
    - status 600 / 700 and days_past_due == 0 -> "closed_on_time" / "closed_on_time_overpaid"
    - days_past_due within the product's rollover window
          -> "active_rollover" (300), "closed_rollover" (600), "closed_rollover_overpaid" (700)
    - days_past_due beyond the rollover window
          -> "active_default" (300), "closed_default" (600), "closed_default_overpaid" (700)
    - anything else -> the string "0". That is an error/edge case no rule
      covers: a missing days_past_due, a term_frequency without a rollover
      window, or a 7-day loan whose bloom_version is neither 1 nor 2.
    """
    #rollover window, in days past due, for each product
    thirty_day_product_rollover = 7 #Bloom 1.0
    twenty_one_day_product_rollover = 5 #Bloom 2.0 only
    seven_day_product_rollover_bloom1 = 7 #Bloom 2.0 adjusted to 5 days from 7 days in Bloom 1.0
    seven_day_product_rollover_bloom2 = 5
    three_day_product_rollover = 2
    one_day_product_rollover = 1 #Bloom 2.0, may be adjusted to 5 days as well

    #read from the dataframe that was passed in, not from a notebook global
    loan_status = df["loan_status"]
    days_past_due = df["days_past_due"]
    term_frequency = df["term_frequency"]
    bloom_version = df["bloom_version"]

    #rollover window per row; NaN where the product has no window defined,
    #which makes both comparisons below False so the row falls through to "0"
    rollover_days = pd.Series(
        np.select(
            [
                term_frequency.eq(1),
                term_frequency.eq(3),
                term_frequency.eq(7) & bloom_version.eq(1),
                term_frequency.eq(7) & bloom_version.eq(2),
                term_frequency.eq(21),
                term_frequency.eq(30),
            ],
            [
                one_day_product_rollover,
                three_day_product_rollover,
                seven_day_product_rollover_bloom1,
                seven_day_product_rollover_bloom2,
                twenty_one_day_product_rollover,
                thirty_day_product_rollover,
            ],
            default=np.nan,
        ),
        index=df.index,
    )
    within_rollover = days_past_due.le(rollover_days)
    beyond_rollover = days_past_due.gt(rollover_days)

    is_active = loan_status.eq(300)
    is_closed = loan_status.eq(600)
    is_overpaid = loan_status.eq(700)

    #(condition, label) pairs in priority order
    rules = [
        #written off loans
        (loan_status.eq(601), "written-off_default"),

        #currently active OR loans closed in tenure
        (is_active & days_past_due.le(0), "current_active"),
        (is_closed & days_past_due.lt(0), "closed_early_repayment"),
        (is_overpaid & days_past_due.lt(0), "closed_early_repayment_overpaid"),
        (is_closed & days_past_due.eq(0), "closed_on_time"),
        (is_overpaid & days_past_due.eq(0), "closed_on_time_overpaid"),

        #loans in, or closed during, the rollover window
        (is_active & within_rollover, "active_rollover"),
        (is_closed & within_rollover, "closed_rollover"),
        (is_overpaid & within_rollover, "closed_rollover_overpaid"),

        #loans in, or closed after reaching, default
        (is_active & beyond_rollover, "active_default"),
        (is_closed & beyond_rollover, "closed_default"),
        (is_overpaid & beyond_rollover, "closed_default_overpaid"),
    ]
    conditions = [condition for condition, _ in rules]
    choices = [label for _, label in rules]

    #a string default keeps the result dtype consistent with the string labels
    loan_labels = np.select(conditions, choices, default="0")

    return loan_labels


#add the loan_repayment_status column computed by set_loan_status_labels
all_loans = all_loans.assign(loan_repayment_status=set_loan_status_labels(all_loans))

In [None]:
#return df with two most recent loans for each client id

#temp_df = all_loans.groupby("client_id").head(2).reset_index()

In [None]:
# Preview the first three rows of all_loans, including the derived columns
all_loans.head(3)

Unnamed: 0,client_mifos_id,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score,loan_id_product_concat,client_mobile_number,loan_count,loan_rank,due_date_fixed,days_past_due,loan_repayment_status
83198,53322,600,184191,7,1500.0,1500.0,39.01,39.01,0.0,0.0,0.0,0.0,1539.01,1539.01,0.0,2022-08-03,2022-08-10,NaT,7761393.0,2.0,438.0,184191-2.0,254110007123,2.0,2.0,2022-08-10,,0
82392,53322,600,161253,7,1500.0,1500.0,39.01,39.01,0.0,0.0,0.0,0.0,1539.01,1539.01,0.0,2022-07-19,2022-07-26,2022-07-26,7761393.0,2.0,438.0,161253-2.0,254110007123,2.0,1.0,2022-07-26,0.0,closed_on_time
50748,11011,300,118239,21,14400.0,0.0,1094.41,0.0,237.6,0.0,6349.07,0.0,22081.08,0.0,22081.08,2022-06-09,2022-06-30,NaT,7767671.0,2.0,425.0,118239-2.0,254110013557,6.0,6.0,2022-06-30,83.0,active_default


In [None]:
# Connect to the database and load the bloomlive transactions table
conn = connect(param_dic)

columns = "mifos_loan_id,is_reversed,transaction_type_enum,transaction_date,bloom_version"

column_names = [name.strip() for name in columns.split(",")]

# Build the SELECT from the same list so the query and the headers cannot drift apart;
# only non-reversed transactions with transaction_type_enum = 2 are fetched
select_query = (
    "select {} from bloomlive.transactions_dimension "
    "where is_reversed is false and transaction_type_enum = 2"
).format(", ".join(column_names))

try:
    df_transactions = postgresql_to_dataframe(conn, select_query, column_names)
finally:
    # release the connection once the rows are in memory
    conn.close()

Connecting to the PostgreSQL database...
Connection successful


In [None]:
# store bloom_version as float
df_transactions = df_transactions.assign(bloom_version=df_transactions["bloom_version"].astype("float"))

In [None]:
# Preview the first rows of the transactions dataframe
df_transactions.head()

Unnamed: 0,mifos_loan_id,is_reversed,transaction_type_enum,transaction_date,bloom_version
0,4316,False,2,2022-02-28,2.0
1,38252,False,2,2022-03-04,2.0
2,38266,False,2,2022-03-04,2.0
3,38259,False,2,2022-03-04,2.0
4,38257,False,2,2022-03-05,2.0


In [None]:
# Show column dtypes, non-null counts and memory usage of df_transactions
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736485 entries, 0 to 736484
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   mifos_loan_id          736485 non-null  object 
 1   is_reversed            736485 non-null  bool   
 2   transaction_type_enum  736485 non-null  int64  
 3   transaction_date       736485 non-null  object 
 4   bloom_version          736485 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 23.2+ MB


In [None]:
#build the same "<loan id>-<bloom version>" key as on the loans side so the two frames can be joined

df_transactions["loan_id_product_concat"] = (
    df_transactions["mifos_loan_id"].astype("str")
    .str.cat(df_transactions["bloom_version"].astype("str"), sep="-")
)

#parse transaction dates; unparseable values become NaT
df_transactions['transaction_date'] = pd.to_datetime(df_transactions['transaction_date'], errors='coerce')

# latest transaction date per loan key
transactions = (
    df_transactions
    .groupby('loan_id_product_concat')['transaction_date']
    .max()
    .rename('max_transaction_date')
    .reset_index()
)

transactions.head()

Unnamed: 0,loan_id_product_concat,max_transaction_date
0,10000-1.0,2018-05-10
1,10000-2.0,2022-01-14
2,100000-2.0,2022-06-08
3,100001-2.0,2022-05-28
4,100002-2.0,2022-05-28


In [None]:
# attach each loan's latest transaction date (loans without transactions get NaT)
all_loans = all_loans.merge(transactions, how='left', on='loan_id_product_concat')

all_loans['max_transaction_date'] = pd.to_datetime(all_loans['max_transaction_date'], errors='coerce')

all_loans = all_loans.assign(
    # days between the due date and the latest transaction (negative = transaction before the due date)
    days_diff_maturity_max_trans=lambda loans: (loans['max_transaction_date'] - loans['due_date_fixed']).dt.days,
    # total amount repaid as a multiple of the principal disbursed
    total_repayment_vs_principal_amount=lambda loans: loans['total_repayment'] / loans['principal_disbursed'],
)

all_loans.head(10)

Unnamed: 0,client_mifos_id,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score,loan_id_product_concat,client_mobile_number,loan_count,loan_rank,due_date_fixed,days_past_due,loan_repayment_status,max_transaction_date,days_diff_maturity_max_trans,total_repayment_vs_principal_amount
0,53322,600,184191,7,1500.0,1500.0,39.01,39.01,0.0,0.0,0.0,0.0,1539.01,1539.01,0.0,2022-08-03,2022-08-10,NaT,7761393.0,2.0,438.0,184191-2.0,254110007123,2.0,2.0,2022-08-10,,0,2022-08-11,1.0,1.03
1,53322,600,161253,7,1500.0,1500.0,39.01,39.01,0.0,0.0,0.0,0.0,1539.01,1539.01,0.0,2022-07-19,2022-07-26,2022-07-26,7761393.0,2.0,438.0,161253-2.0,254110007123,2.0,1.0,2022-07-26,0.0,closed_on_time,2022-07-23,-3.0,1.03
2,11011,300,118239,21,14400.0,0.0,1094.41,0.0,237.6,0.0,6349.07,0.0,22081.08,0.0,22081.08,2022-06-09,2022-06-30,NaT,7767671.0,2.0,425.0,118239-2.0,254110013557,6.0,6.0,2022-06-30,83.0,active_default,NaT,,0.0
3,11011,600,116171,21,14400.0,14400.0,1094.41,1094.41,0.0,0.0,0.0,0.0,15494.41,15494.41,0.0,2022-06-07,2022-06-28,2022-06-28,7767671.0,2.0,425.0,116171-2.0,254110013557,6.0,5.0,2022-06-28,0.0,closed_on_time,2022-06-09,-19.0,1.08
4,11011,600,108030,21,14000.0,14000.0,1064.01,1064.01,0.0,0.0,0.0,0.0,15064.01,15064.01,0.0,2022-05-27,2022-06-17,NaT,7767671.0,2.0,425.0,108030-2.0,254110013557,6.0,4.0,2022-06-17,,0,2022-06-06,-11.0,1.08
5,11011,600,101438,7,10000.0,10000.0,260.01,260.01,0.0,0.0,0.0,0.0,10260.01,10260.01,0.0,2022-05-20,2022-05-27,2022-05-27,7767671.0,2.0,,101438-2.0,254110013557,6.0,3.0,2022-05-27,0.0,closed_on_time,2022-05-27,0.0,1.03
6,39154,300,204152,7,2300.0,0.0,71.76,0.0,71.16,0.0,144.36,0.0,2587.28,0.0,2587.28,2022-09-04,2022-09-11,NaT,7904403.0,2.0,,204152-2.0,254110023683,1.0,1.0,2022-09-11,10.0,active_default,NaT,,0.0
7,59479,600,187965,1,600.0,600.0,2.4,2.4,12.15,12.15,37.27,37.27,651.82,651.82,0.0,2022-08-08,2022-08-09,NaT,7414322.0,2.0,441.0,187965-2.0,254110032116,11.0,11.0,2022-08-09,,0,2022-08-27,18.0,1.09
8,59479,600,183273,1,600.0,600.0,2.4,2.4,7.26,7.26,0.0,0.0,609.66,609.66,0.0,2022-08-02,2022-08-03,NaT,7414322.0,2.0,441.0,183273-2.0,254110032116,11.0,10.0,2022-08-03,,0,2022-08-07,4.0,1.02
9,59479,600,192916,1,600.0,600.0,2.4,2.4,0.0,0.0,10.92,10.92,613.32,613.32,0.0,2022-07-30,2022-07-31,2022-07-31,7414322.0,2.0,441.0,192916-2.0,254110032116,11.0,9.0,2022-07-31,0.0,closed_on_time,2022-08-02,2.0,1.02


In [None]:
# Preview the first ten rows of all_loans
all_loans.head(10)

Unnamed: 0,client_mifos_id,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_expected_repayment,total_repayment,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,src_crdt_score,loan_id_product_concat,client_mobile_number,loan_count,loan_rank,due_date_fixed,days_past_due,loan_repayment_status,max_transaction_date,days_diff_maturity_max_trans,total_repayment_vs_principal_amount
0,53322,600,184191,7,1500.0,1500.0,39.01,39.01,0.0,0.0,0.0,0.0,1539.01,1539.01,0.0,2022-08-03,2022-08-10,NaT,7761393.0,2.0,438.0,184191-2.0,254110007123,2.0,2.0,2022-08-10,,0,2022-08-11,1.0,1.03
1,53322,600,161253,7,1500.0,1500.0,39.01,39.01,0.0,0.0,0.0,0.0,1539.01,1539.01,0.0,2022-07-19,2022-07-26,2022-07-26,7761393.0,2.0,438.0,161253-2.0,254110007123,2.0,1.0,2022-07-26,0.0,closed_on_time,2022-07-23,-3.0,1.03
2,11011,300,118239,21,14400.0,0.0,1094.41,0.0,237.6,0.0,6349.07,0.0,22081.08,0.0,22081.08,2022-06-09,2022-06-30,NaT,7767671.0,2.0,425.0,118239-2.0,254110013557,6.0,6.0,2022-06-30,83.0,active_default,NaT,,0.0
3,11011,600,116171,21,14400.0,14400.0,1094.41,1094.41,0.0,0.0,0.0,0.0,15494.41,15494.41,0.0,2022-06-07,2022-06-28,2022-06-28,7767671.0,2.0,425.0,116171-2.0,254110013557,6.0,5.0,2022-06-28,0.0,closed_on_time,2022-06-09,-19.0,1.08
4,11011,600,108030,21,14000.0,14000.0,1064.01,1064.01,0.0,0.0,0.0,0.0,15064.01,15064.01,0.0,2022-05-27,2022-06-17,NaT,7767671.0,2.0,425.0,108030-2.0,254110013557,6.0,4.0,2022-06-17,,0,2022-06-06,-11.0,1.08
5,11011,600,101438,7,10000.0,10000.0,260.01,260.01,0.0,0.0,0.0,0.0,10260.01,10260.01,0.0,2022-05-20,2022-05-27,2022-05-27,7767671.0,2.0,,101438-2.0,254110013557,6.0,3.0,2022-05-27,0.0,closed_on_time,2022-05-27,0.0,1.03
6,39154,300,204152,7,2300.0,0.0,71.76,0.0,71.16,0.0,144.36,0.0,2587.28,0.0,2587.28,2022-09-04,2022-09-11,NaT,7904403.0,2.0,,204152-2.0,254110023683,1.0,1.0,2022-09-11,10.0,active_default,NaT,,0.0
7,59479,600,187965,1,600.0,600.0,2.4,2.4,12.15,12.15,37.27,37.27,651.82,651.82,0.0,2022-08-08,2022-08-09,NaT,7414322.0,2.0,441.0,187965-2.0,254110032116,11.0,11.0,2022-08-09,,0,2022-08-27,18.0,1.09
8,59479,600,183273,1,600.0,600.0,2.4,2.4,7.26,7.26,0.0,0.0,609.66,609.66,0.0,2022-08-02,2022-08-03,NaT,7414322.0,2.0,441.0,183273-2.0,254110032116,11.0,10.0,2022-08-03,,0,2022-08-07,4.0,1.02
9,59479,600,192916,1,600.0,600.0,2.4,2.4,0.0,0.0,10.92,10.92,613.32,613.32,0.0,2022-07-30,2022-07-31,2022-07-31,7414322.0,2.0,441.0,192916-2.0,254110032116,11.0,9.0,2022-07-31,0.0,closed_on_time,2022-08-02,2.0,1.02


In [None]:
# Show column dtypes and non-null counts of all_loans
all_loans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 361069 entries, 0 to 361068
Data columns (total 31 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   client_mifos_id                      361069 non-null  object        
 1   loan_status                          361069 non-null  int64         
 2   loan_mifos_id                        361069 non-null  int64         
 3   term_frequency                       361069 non-null  int64         
 4   principal_disbursed                  361069 non-null  float64       
 5   principal_repaid                     361069 non-null  float64       
 6   interest_charged                     361069 non-null  float64       
 7   interest_repaid                      361069 non-null  float64       
 8   fee_charges_charged                  361069 non-null  float64       
 9   fee_charges_repaid                   361069 non-null  float64       
 

---
#### Generate aggregate summaries

In [None]:
#one row per store_number: the loan with the highest loan_rank, i.e. the most recent disbursement

latest_loan_index = all_loans.groupby('store_number')['loan_rank'].idxmax()
agg_summary = all_loans.loc[latest_loan_index].reset_index()

In [None]:
#keep only the columns needed for the summary
target_cols = [
    "client_mobile_number",
    "store_number",
    "loan_count",
    "loan_status",
    "term_frequency",
    "principal_disbursed",
    "principal_repaid",
    "disbursed_on_date",
    "expected_matured_on_date",
    "closed_on_date",
    "due_date_fixed",
    "days_past_due",
    "bloom_version",
    "loan_repayment_status",
    "src_crdt_score",
]

agg_summary = agg_summary.loc[:, target_cols]

In [None]:
#largest principal ever disbursed per store_number, joined onto the summary as max_principal_amount

max_principal_per_store = (
    all_loans
    .groupby("store_number")["principal_disbursed"]
    .max()
    .rename("max_principal_amount")
    .reset_index()
)
agg_summary = agg_summary.merge(max_principal_per_store, on="store_number")

In [None]:
#get df for when a customer got their max loan principal:
#sort loans by principal (largest first), then take the first entry per store_number.
#NOTE(review): when several loans of a store share the maximum principal, which one comes
#first depends on the sort's tie order - confirm whether earliest or latest date is wanted.
#NOTE(review): groupby(...).first() is taken per column and presumably skips nulls, so the
#values of one output row may not all come from the same loan - verify against pandas docs.

max_principal_dates = all_loans.sort_values("principal_disbursed", ascending=False).groupby("store_number").first().reset_index()

In [None]:
#keep the store key and its disbursement date, with the date renamed to say what it represents

max_principal_dates = (
    max_principal_dates[["store_number", "disbursed_on_date"]]
    .rename(columns={"disbursed_on_date": "max_loan_disbursement_date"})
)

#join the max-loan disbursement date onto the summary

agg_summary = agg_summary.merge(max_principal_dates, on="store_number")

In [None]:
# (rows, columns) of the per-store summary
agg_summary.shape

(48100, 17)

In [None]:
# (rows, columns) of the loan-level dataframe
all_loans.shape

(361069, 31)

In [None]:
#count "good" loans per store_number. A loan is good when either
# - more than the principal was repaid AND the last transaction came at most 15 days after the due date, or
# - its repayment status is 'current_active'

repaid_in_good_time = (
    (all_loans['total_repayment_vs_principal_amount'] > 1)
    & (all_loans['days_diff_maturity_max_trans'] <= 15)
)
currently_active = all_loans['loan_repayment_status'] == 'current_active'

agg_good_loans = (
    all_loans.loc[repaid_in_good_time | currently_active]
    .groupby("store_number")["loan_id_product_concat"]
    .count()
    .rename("count_good_loans")
    .reset_index()
)

In [None]:
#outer-join the good-loan counts onto the summary; stores without a count get NaN

agg_summary = agg_summary.merge(agg_good_loans, how="outer", on="store_number")

In [None]:
# Preview the first rows of the per-store summary
agg_summary.head()

Unnamed: 0,client_mobile_number,store_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans
0,254726604388.0,6.0,8.0,300,7,200000.0,92180.64,2022-07-30,2022-08-06,NaT,2022-08-06,46.0,2.0,active_default,491.0,200000.0,2022-07-30,5.0
1,,11.0,3.0,600,30,5000.0,5000.0,2019-12-01,2019-12-31,2019-12-30,2019-12-31,-1.0,1.0,closed_early_repayment,,5000.0,2019-09-24,3.0
2,254714319251.0,30.0,16.0,300,7,12200.0,0.0,2022-09-19,2022-09-26,NaT,2022-09-26,-5.0,2.0,current_active,432.0,12200.0,2022-09-19,16.0
3,,63.0,1.0,300,21,42900.0,0.0,2021-12-04,2021-12-25,NaT,2021-12-25,270.0,2.0,active_default,,42900.0,2021-12-04,
4,,68.0,3.0,601,30,15000.0,0.0,2019-07-03,2019-08-02,2021-08-24,2019-08-02,753.0,1.0,written-off_default,,30000.0,2019-06-06,2.0


In [None]:
#fill rest of missing values with zeros

cols_fillna = ['count_good_loans']
# replace 'NaN' with zero in these columns.
# Assign the result back instead of calling fillna(inplace=True) on a selected column:
# that chained in-place form works on a temporary object under pandas Copy-on-Write
# and leaves agg_summary unchanged.
agg_summary[cols_fillna] = agg_summary[cols_fillna].fillna(0)

In [None]:
#good repayment ratio per borrower = count of good loans / total loan count, rounded to 2 decimal places

agg_summary["good_loans_repayment_ratio"] = (
    agg_summary["count_good_loans"]
    .div(agg_summary["loan_count"])
    .round(2)
)

In [None]:
#calculate num days since last disbursement
#the evaluation date is yesterday at midnight, to stay in line with Mifos which is Time-1 i.e one day behind
#(the variable is still called `today` although it holds yesterday's date)

today = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)

agg_summary["num_days_since_last_disbursement"] = pd.to_numeric(
    (today - agg_summary["disbursed_on_date"]).dt.days,
    downcast='integer',
)

In [None]:
#delete test accounts i.e. rows whose max_principal_amount is below 200
#(rows with a missing max_principal_amount fail the comparison and are dropped as well)
#.copy() makes the filtered frame independent of the original, so the column
#assignments in the following cells do not raise SettingWithCopyWarning

agg_summary = agg_summary[agg_summary["max_principal_amount"]>=200].copy()

In [None]:
# add column to label inference column

def assign_inference_label(df, good_loans_repayment_ratio_threshold=0.9):
    """
    Function to label each customer according to their good loans repayment ratio,
    i.e. whether the rules can be relaxed for them or not

    Inputs:
    1) df: dataframe with a "good_loans_repayment_ratio" column i.e. ratio of num of good loans vs total num loans taken
    2) good_loans_repayment_ratio_threshold: minimum ratio (inclusive) needed to relax the rules, 0.9 by default

    Outputs:
    A numpy array of labels, one per row of df:
    "relax_rules" when the ratio is at or above the threshold, "No_rules_relaxed" otherwise
    (rows with a missing ratio are labelled "No_rules_relaxed" as well)
    """

    target_col = df["good_loans_repayment_ratio"]

    conditions = [
        target_col.ge(good_loans_repayment_ratio_threshold),
        target_col.lt(good_loans_repayment_ratio_threshold),
    ]

    choices = [
        "relax_rules",
        "No_rules_relaxed",
    ]

    # np.select uses `default` for rows where no condition holds, which happens when the ratio is NaN.
    # The implicit default is the integer 0, which does not mix with string choices (it ends up as the
    # string "0" on older NumPy and is rejected with a TypeError on newer releases), so an explicit
    # label is supplied instead.
    inference_col = np.select(conditions, choices, default="No_rules_relaxed")

    return inference_col

#apply function: store the per-row label derived from good_loans_repayment_ratio in a new inference_col column
agg_summary["inference_col"] = assign_inference_label(agg_summary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
def weight_dpd(df):
    """
    Compute the days-past-due weight for customers assessed for limit stabilization.

    Inputs:
    df: dataframe holding
        1) "inference_col" - label telling whether rules are relaxed ("relax_rules") or not
        2) "days_past_due" - num of days past due

    Outputs:
    A numpy array with one weight per row:
        rules relaxed and dpd < 30         -> 1
        rules relaxed and 30 <= dpd < 35   -> 0.9
        rules relaxed and 35 <= dpd < 38   -> 0.8
        rules relaxed and 38 <= dpd < 41   -> 0.7
        everything else                    -> 0
    """
    days_past_due = df["days_past_due"]
    # Evaluate the label match once and reuse it for every band
    rules_relaxed = df["inference_col"].str.match("relax_rules")

    # (mask, weight) pairs in priority order: np.select picks the first mask that is True
    weight_bands = [
        (rules_relaxed & days_past_due.lt(30), 1),
        (rules_relaxed & days_past_due.ge(30) & days_past_due.lt(35), 0.9),
        (rules_relaxed & days_past_due.ge(35) & days_past_due.lt(38), 0.8),
        (rules_relaxed & days_past_due.ge(38) & days_past_due.lt(41), 0.7),
        # more than 41 days past due scores 0 whatever the label
        (days_past_due.gt(41), 0),
    ]
    masks, weights = zip(*weight_bands)

    # rows matching no band (e.g. rules not relaxed, or exactly 41 days) get np.select's default of 0
    return np.select(list(masks), list(weights))

#apply function: store the days-past-due weight computed from inference_col and days_past_due in a new weight_dpd column
agg_summary["weight_dpd"] = weight_dpd(agg_summary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Spot-check the newly added ratio, days-since-disbursement, inference and weight columns.
agg_summary.head(3)

Unnamed: 0,client_mobile_number,store_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,src_crdt_score,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,inference_col,weight_dpd
0,254726604388.0,6.0,8.0,300,7,200000.0,92180.64,2022-07-30,2022-08-06,NaT,2022-08-06,46.0,2.0,active_default,491.0,200000.0,2022-07-30,5.0,0.62,53,No_rules_relaxed,0.0
1,,11.0,3.0,600,30,5000.0,5000.0,2019-12-01,2019-12-31,2019-12-30,2019-12-31,-1.0,1.0,closed_early_repayment,,5000.0,2019-09-24,3.0,1.0,1025,relax_rules,1.0
2,254714319251.0,30.0,16.0,300,7,12200.0,0.0,2022-09-19,2022-09-26,NaT,2022-09-26,-5.0,2.0,current_active,432.0,12200.0,2022-09-19,16.0,1.0,2,relax_rules,1.0


In [None]:
# Row/column count after dropping rows with max_principal_amount below 200 and adding the derived columns.
agg_summary.shape

(48065, 22)

In [None]:
# Number of distinct stores; it equals the row count when there is exactly one summary row per store.
agg_summary['store_number'].nunique()

48065

In [None]:
# Column dtypes and non-null counts of the summary before it is saved.
agg_summary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48065 entries, 0 to 48099
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   client_mobile_number              48065 non-null  object        
 1   store_number                      48065 non-null  float64       
 2   loan_count                        48065 non-null  float64       
 3   loan_status                       48065 non-null  int64         
 4   term_frequency                    48065 non-null  int64         
 5   principal_disbursed               48065 non-null  float64       
 6   principal_repaid                  48065 non-null  float64       
 7   disbursed_on_date                 48065 non-null  datetime64[ns]
 8   expected_matured_on_date          48065 non-null  datetime64[ns]
 9   closed_on_date                    17511 non-null  datetime64[ns]
 10  due_date_fixed                    48065 non-nu

---
#### Save agg_summaries

In [None]:
# Export the full per-store summary to Excel.
# NOTE(review): the destination is a hard-coded absolute Windows path with the run date (20220922)
# embedded in both the folder and the file name - update it before running on another date or machine.
agg_summary.to_excel("C:\\Project_summaries\\Bloom\\Bloom all_loans\\20220922\\Analysis_summaries\\Bloom_clients_loans_summary_20220922.xlsx")

In [None]:
# Export only store_number and inference_col, restricted to rows that have a store_number.
(
    agg_summary
    .loc[agg_summary["store_number"].notna(), ["store_number", "inference_col"]]
    .to_excel("C:\\Project_summaries\\Bloom\\Bloom all_loans\\20220922\\Analysis_summaries\\Bloom_clients_inference_summary_20220922.xlsx")
)