In [1]:
#import data processing libraries

import os
import pandas as pd
import numpy as np
import math as math
import datetime as dt
from scipy.stats import mode
import psycopg2 #reading data from Postgres DB
import sys

In [2]:
# Notebook-wide pandas display settings: wide cells, every column, up to 300 rows.
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = None
pd.options.display.max_rows = 300

# Render floats with exactly two decimals instead of scientific notation.
pd.options.display.float_format = lambda value: '%.2f' % value

In [3]:
#import visualization libraries

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

In [4]:
# Connection parameters.
# Host, database, user and password are read from the standard libpq
# environment variables so that no secret is stored in the notebook.
# A variable that is not set yields None for that key.
# NOTE(review): psycopg2 is expected to ignore None-valued keyword arguments and
# let libpq apply its own defaults - confirm against the installed version.
# NOTE(review): the password that used to be committed in this cell should be rotated.
param_dic = {
    "host"      : os.environ.get("PGHOST"),
    "database"  : os.environ.get("PGDATABASE"),
    "user"      : os.environ.get("PGUSER"),
    "password"  : os.environ.get("PGPASSWORD"),
}

def connect(params_dic):
    """
    Open a connection to the PostgreSQL server.

    Inputs:
    1) params_dic: keyword arguments forwarded unchanged to psycopg2.connect

    Outputs:
    The open connection object. If the connection attempt raises, the error is
    printed and the interpreter is asked to exit with status 1.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        connection = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        # report the failure and stop; nothing downstream can run without a connection
        print(error)
        sys.exit(1)

    print("Connection successful")
    return connection

In [5]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    Transform a SELECT query into a pandas dataframe.

    Inputs:
    1) conn: an open DB-API connection (must provide cursor() and rollback()),
    2) select_query: the SQL text to execute,
    3) column_names: labels for the resulting dataframe columns, in SELECT order

    Outputs:
    A dataframe with one row per fetched record. If executing the query fails,
    the error is printed, the transaction is rolled back and the integer 1 is
    returned instead (callers should check for that sentinel).

    The cursor is always closed, including when fetching the rows raises.
    """
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(select_query)
        except Exception as error:
            print("Error: %s" % error)
            # a failed statement leaves the transaction aborted; roll back so the
            # connection can be reused for the next query
            try:
                conn.rollback()
            except Exception as rollback_error:
                print("Error: %s" % rollback_error)
            return 1

        # fetchall() returns a list of tuples, one per row
        rows = cursor.fetchall()
    finally:
        cursor.close()

    # Convert the list of tuples into a pandas dataframe
    return pd.DataFrame(rows, columns=column_names)

In [6]:
# Connect to the database and load the scoring data from the bloomlive loans summary view
conn = connect(param_dic)

# Single source of truth for the selected columns: the same list builds the
# SELECT statement and labels the dataframe, so the two cannot drift apart and
# no column name carries stray whitespace.
column_names = [
    "client_mifos_id",
    "client_mobile_number",
    "loan_status",
    "loan_mifos_id",
    "term_frequency",
    "principal_disbursed",
    "principal_repaid",
    "interest_charged",
    "interest_repaid",
    "fee_charges_charged",
    "fee_charges_repaid",
    "penalty_charges_charged",
    "penalty_charges_repaid",
    "total_outstanding",
    "disbursed_on_date",
    "expected_matured_on_date",
    "closed_on_date",
    "store_number",
    "bloom_version",
]

# comma-separated form of the same list, kept under its original name
columns = ",".join(column_names)

# Execute the "SELECT cols" query
df = postgresql_to_dataframe(
    conn,
    "select {} from bloomlive.loans_fact_table_summary_view".format(columns),
    column_names,
)

Connecting to the PostgreSQL database...
Connection successful


In [7]:
df.head(3)

Unnamed: 0,client_mifos_id,client_mobile_number,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version
0,60422,254717560267,600,150401,7,60000.0,60000.0,1560.0,1560.0,0.0,0.0,0.0,0.0,0.0,2022-07-09,2022-07-16,,7163929,2.0
1,50110,254791819706,600,150425,7,21000.0,21000.0,546.0,546.0,0.0,0.0,0.0,0.0,0.0,2022-07-09,2022-07-16,,7867323,2.0
2,40650,254745756396,600,151392,7,500.0,500.0,13.0,13.0,0.0,0.0,0.0,0.0,0.0,2022-07-11,2022-07-18,2022-07-12,7997987,2.0


In [8]:
#normalise missing values: every None/missing entry in df becomes NaN

df = df.fillna(np.nan)

In [9]:
#convert the three date columns to datetime; values that cannot be parsed become NaT
datetime_cols = ["disbursed_on_date", "expected_matured_on_date", "closed_on_date"]

df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime, errors='coerce')


#cast the client identifier columns to string type
#NOTE(review): client_mobile_number is not excluded from the numeric conversion below,
#so it is converted back to a number there; only client_mifos_id stays a string
int_to_string_cols = ["client_mifos_id", "client_mobile_number"]

df[int_to_string_cols] = df[int_to_string_cols].astype(str)


#convert every remaining column to numeric, leaving out the listed identifier/status/date columns;
#values that cannot be parsed become NaN
string_to_float_cols = df.columns.drop(["client_mifos_id", "loan_status", "term_frequency",
                                        "disbursed_on_date", "expected_matured_on_date",
                                        "closed_on_date"])

df[string_to_float_cols] = df[string_to_float_cols].apply(pd.to_numeric, errors='coerce')


#lower-case all column headers and remove any spaces from them (spaces are deleted, not replaced)
df.columns = df.columns.str.lower().str.replace(' ','')

In [10]:
#composite key "<loan id>-<bloom version>" so the same loan id coming from a different mifos instance is not treated as a duplicate

df["loan_id_product_concat"] = (
    df["loan_mifos_id"].astype("str")
    .str.cat(df["bloom_version"].astype("str"), sep="-")
    .astype("str")
)

In [11]:
#drop all loan records that have irrelevant status keys OR were never disbursed
#a boolean mask is used instead of dropping by index label, so only the matching
#rows are removed even if the index ever contains repeated labels
excluded_loan_statuses = [0, 100, 400, 500]
df = df.loc[~df["loan_status"].isin(excluded_loan_statuses)]


#drop all rows where loan id is blank
df = df.loc[df["loan_mifos_id"].notnull()]


#drop duplicate loans: keep only the first row for each loan id / bloom version key
df = df.loc[~df["loan_id_product_concat"].duplicated()]


#drop all rows where disbursement date is blank
df = df.loc[df["disbursed_on_date"].notnull()]

In [12]:
#clean up mobile number column: keep only the text before the first "." (removes a trailing ".0")
#taking element 0 of the split works whether or not a value contains a ".", whereas
#expanding the split into exactly two columns raises when the split does not yield exactly two columns
#the cleaned column replaces the original and is placed last, as before
df = (
    df.assign(client_mobile_number_2=df["client_mobile_number"].astype("str").str.split(".", n=1).str[0])
      .drop(columns=["client_mobile_number"])
      .rename(columns={"client_mobile_number_2": "client_mobile_number"})
)

In [13]:
#sort by mobile number (ascending), then by disbursement date (most recent first)
df.sort_values(["client_mobile_number","disbursed_on_date"], ascending=[True, False], inplace=True)


#loan count: number of loan rows that share the same store_number
#NOTE(review): grouping is by store_number, while the sort above is by client_mobile_number - confirm which one identifies a customer
df["loan_count"] = df.groupby("store_number")["store_number"].transform('size')

#loan rank: position of each loan within its store_number group, ordered by disbursement date (oldest = lowest rank)
#rank() is called without a tie-breaking method, so loans disbursed on the same date share a fractional rank
#(values such as 53.5 are visible in the displayed output further down)
#earlier variant, ranking by loan id instead of date:
#df["loan_rank"] = df.groupby("store_number")["loan_mifos_id"].rank(ascending=True)
df["loan_rank"] = df.groupby("store_number")["disbursed_on_date"].rank(ascending=True)


##sort out minor Mifos errors relating to loans with status 700:
##where such a loan has no closed_on_date, use its expected maturity date instead
df.loc[(df["loan_status"] == 700) & (df["closed_on_date"].isnull()), "closed_on_date"] = df["expected_matured_on_date"]

## TODO ==> fix issue where loan rank gets messed up because of different Mifos instance

In [14]:
#due_date_fixed = disbursement date + term_frequency days, so the due date always tallies with the loan term

df = df.assign(
    due_date_fixed=df["disbursed_on_date"] + pd.to_timedelta(df["term_frequency"], unit='d')
)

In [15]:
#work on an independent (deep) copy so the cleaned df stays untouched

all_loans = df.copy(deep=True)

In [16]:
#analysis period: earliest and latest disbursement dates present in all_loans
disbursement_dates = all_loans['disbursed_on_date']

print('analysis begin date {}'.format(disbursement_dates.min()))
print('analysis latest date {}'.format(disbursement_dates.max()))

analysis begin date 2017-12-10 00:00:00
analysis latest date 2022-07-25 00:00:00


In [17]:
def calc_days_past_due(df, as_of=None):
    """
    Function to calculate days past due for each loan record.
    The calculation is vectorized over whole columns (no Python-level loop).

    Inputs:
    1) df: dataframe with the columns
       - "loan_status": the current status of a loan as captured on corebanking (Mifos),
       - "due_date_fixed": the loan's due date,
       - "closed_on_date": the date the loan was closed (may be missing),
    2) as_of: optional evaluation date (anything pd.Timestamp accepts).
       When omitted, yesterday at midnight is used, to stay in line with Mifos
       which is Time-1 i.e. one day behind.

    Outputs:
    A numpy array with one value per row:
    - status 300: number of days between as_of and the due date,
    - status 600, 601, 700: number of days between the closed date and the due date,
    - any other status: the numeric 0 (no rule declared for that status).
    Negative values mean the reference date falls before the due date.
    """
    if as_of is None:
        # midnight of today, minus one day
        as_of = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)
    else:
        as_of = pd.Timestamp(as_of)

    loan_status = df["loan_status"]
    due_date = df["due_date_fixed"]
    closed_on_date = df["closed_on_date"]

    # open loans are measured against the evaluation date,
    # closed/written-off loans against the date they were closed
    days_late_if_open = (as_of - due_date).dt.days
    days_late_if_closed = (closed_on_date - due_date).dt.days

    conditions = [
        loan_status.eq(300),
        loan_status.isin([600, 601, 700]),
    ]
    choices = [
        days_late_if_open,
        days_late_if_closed,
    ]

    return np.select(conditions, choices, default=0)


#apply the function to the df to create the days_past_due_column
all_loans["days_past_due"] = calc_days_past_due(all_loans)

In [18]:
def set_loan_status_labels(df):
    """
    Function to set the loan repayment status of a loan.

    Inputs (columns read from the dataframe passed in):
    1) "loan_status": the current status of a loan as captured on corebanking (Mifos),
    2) "days_past_due": number of days past due for each loan,
    3) "term_frequency": term frequency for each loan,
    4) "bloom_version": the version of Bloom tied to the loan record

    Outputs:
    A numpy array of string labels, one per row, noting the loan repayment status:
    - status 601                          -> "written-off_default"
    - status 300, days past due <= 0      -> "current_active"
    - status 600 / 700, days past due < 0 -> "closed_early_repayment" / "closed_early_repayment_overpaid"
    - status 600 / 700, days past due = 0 -> "closed_on_time" / "closed_on_time_overpaid"
    - status 300 / 600 / 700, late but within the product's rollover allowance
                                          -> "active_rollover" / "closed_rollover" / "closed_rollover_overpaid"
    - status 300 / 600 / 700, later than the rollover allowance
                                          -> "active_default" / "closed_default" / "closed_default_overpaid"
    - anything else                       -> the string "0", an edge case that no rule covers
      (e.g. an unknown status, an unknown term, a 7 day loan with an unknown
      bloom version, or missing days past due).
    """
    loan_status = df["loan_status"]
    days_past_due = df["days_past_due"]
    term_frequency = df["term_frequency"]
    bloom_version = df["bloom_version"]

    # rollover allowance in days, keyed by term frequency
    # NOTE(review): the original remarks tied the 30 day allowance to Bloom 1.0 and the
    # 21 day allowance to Bloom 2.0, but the version was never checked for them
    rollover_days_by_term = {
        1: 1,    # may be adjusted to 5 days as well
        3: 2,
        21: 5,
        30: 7,
    }
    # the 7 day product is the only one whose allowance depends on the bloom version
    seven_day_rollover_by_version = {
        1: 7,
        2: 5,    # adjusted to 5 days from 7 days in Bloom 1.0
    }

    # per-row allowance; stays NaN where no rule exists, so both comparisons
    # below are False for those rows and they fall through to the "0" label
    rollover_days = term_frequency.map(rollover_days_by_term)
    is_seven_day = term_frequency.eq(7)
    for version, allowance in seven_day_rollover_by_version.items():
        rollover_days = rollover_days.mask(is_seven_day & bloom_version.eq(version), allowance)

    within_rollover = days_past_due.le(rollover_days)
    beyond_rollover = days_past_due.gt(rollover_days)

    # the first matching condition wins, so the rollover rows only ever see
    # loans that are already late (days past due > 0)
    conditions = [
        #written off loans
        loan_status.eq(601),

        #currently active OR loans closed in tenure
        loan_status.eq(300) & days_past_due.le(0),
        loan_status.eq(600) & days_past_due.lt(0),
        loan_status.eq(700) & days_past_due.lt(0),
        loan_status.eq(600) & days_past_due.eq(0),
        loan_status.eq(700) & days_past_due.eq(0),

        #late, but still within the rollover allowance
        loan_status.eq(300) & within_rollover,
        loan_status.eq(600) & within_rollover,
        loan_status.eq(700) & within_rollover,

        #late beyond the rollover allowance
        loan_status.eq(300) & beyond_rollover,
        loan_status.eq(600) & beyond_rollover,
        loan_status.eq(700) & beyond_rollover,
    ]

    choices = [
        "written-off_default",

        "current_active",
        "closed_early_repayment",
        "closed_early_repayment_overpaid",
        "closed_on_time",
        "closed_on_time_overpaid",

        "active_rollover",
        "closed_rollover",
        "closed_rollover_overpaid",

        "active_default",
        "closed_default",
        "closed_default_overpaid",
    ]

    # an explicit string default keeps the "0" edge-case marker and avoids mixing
    # an integer default with string choices
    loan_labels = np.select(conditions, choices, default="0")

    return loan_labels


#apply the function to the df to create the loan_repayment_status column
all_loans["loan_repayment_status"] = set_loan_status_labels(all_loans)

In [19]:
#return df with two most recent loans for each client id

#temp_df = all_loans.groupby("client_id").head(2).reset_index()

In [20]:
all_loans.head(3)

Unnamed: 0,client_mifos_id,loan_status,loan_mifos_id,term_frequency,principal_disbursed,principal_repaid,interest_charged,interest_repaid,fee_charges_charged,fee_charges_repaid,penalty_charges_charged,penalty_charges_repaid,total_outstanding,disbursed_on_date,expected_matured_on_date,closed_on_date,store_number,bloom_version,loan_id_product_concat,client_mobile_number,loan_count,loan_rank,due_date_fixed,days_past_due,loan_repayment_status
302122,119640,300,173183,21,29300.0,0.0,2226.8,0.0,0.0,0.0,0.0,0.0,31526.8,2022-07-22,2022-08-12,NaT,,2.0,173183-2.0,-1,,53.5,2022-08-12,-19.0,current_active
302188,119640,300,173141,7,15700.0,0.0,408.2,0.0,0.0,0.0,0.0,0.0,16108.2,2022-07-22,2022-07-29,NaT,,2.0,173141-2.0,-1,,53.5,2022-07-29,-5.0,current_active
317286,119640,300,173144,7,11200.0,0.0,291.2,0.0,0.0,0.0,0.0,0.0,11491.2,2022-07-22,2022-07-29,NaT,,2.0,173144-2.0,-1,,53.5,2022-07-29,-5.0,current_active


In [21]:
all_loans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322537 entries, 302122 to 99866
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   client_mifos_id           322537 non-null  object        
 1   loan_status               322537 non-null  int64         
 2   loan_mifos_id             322537 non-null  int64         
 3   term_frequency            322537 non-null  int64         
 4   principal_disbursed       322537 non-null  float64       
 5   principal_repaid          322537 non-null  float64       
 6   interest_charged          322537 non-null  float64       
 7   interest_repaid           322537 non-null  float64       
 8   fee_charges_charged       322537 non-null  float64       
 9   fee_charges_repaid        322537 non-null  float64       
 10  penalty_charges_charged   322537 non-null  float64       
 11  penalty_charges_repaid    322537 non-null  float64       
 12

---
#### Generate aggregate summaries

In [22]:
#keep one row per store_number: the loan holding the highest loan_rank (i.e. the most recent disbursement)

latest_loan_index = all_loans.groupby('store_number')['loan_rank'].idxmax()
agg_summary = all_loans.loc[latest_loan_index].reset_index()

In [23]:
#keep only the columns that are needed in the summary, in this order
target_cols = [
    "client_mobile_number",
    "store_number",
    "loan_count",
    "loan_status",
    "term_frequency",
    "principal_disbursed",
    "principal_repaid",
    "disbursed_on_date",
    "expected_matured_on_date",
    "closed_on_date",
    "due_date_fixed",
    "days_past_due",
    "bloom_version",
    "loan_repayment_status",
]

agg_summary = agg_summary.loc[:, target_cols]

In [24]:
#largest principal ever disbursed per store_number, attached to the summary as max_principal_amount

max_principal_per_store = (
    all_loans.groupby("store_number")["principal_disbursed"]
    .max()
    .rename("max_principal_amount")
    .reset_index()
)
agg_summary = agg_summary.merge(max_principal_per_store, on="store_number")

In [25]:
#get df for when a customer got their max loan principal
#rows are ordered by principal (largest first) and the first row per store_number is kept
#a stable sort makes ties on the principal deterministic: among equally large loans
#the one that comes first in all_loans is picked, instead of an arbitrary one

max_principal_dates = (
    all_loans.sort_values("principal_disbursed", ascending=False, kind="stable")
    .groupby("store_number")
    .first()
    .reset_index()
)

In [26]:
#keep just the store and the disbursement date, and give the date a clearer name
max_principal_dates = (
    max_principal_dates[["store_number","disbursed_on_date"]]
    .rename(columns={"disbursed_on_date": "max_loan_disbursement_date"})
)

#attach the date of the largest loan to the summary
agg_summary = agg_summary.merge(max_principal_dates, on="store_number")

In [27]:
#count, per store_number, the loans that were paid early / on time or are currently in good standing

good_repayment_statuses = [
    "closed_early_repayment",
    "closed_early_repayment_overpaid",
    "closed_on_time",
    "closed_on_time_overpaid",
    "current_active",
]

agg_good_loans = (
    all_loans.loc[all_loans["loan_repayment_status"].isin(good_repayment_statuses)]
    .groupby("store_number")["loan_id_product_concat"]
    .count()
    .rename("count_good_loans")
    .reset_index()
)

In [28]:
#outer-join the good-loan counts onto the summary; stores without a count get NaN here

agg_summary = agg_summary.merge(agg_good_loans, on="store_number", how="outer")

In [29]:
#fill rest of missing values with zeros

cols_fillna = ['count_good_loans']
# replace 'NaN' with zero in these columns
# the result is assigned back explicitly: filling a selected column in place acts on a
# temporary object and does not update agg_summary when pandas Copy-on-Write is active
agg_summary[cols_fillna] = agg_summary[cols_fillna].fillna(0)

In [30]:
#share of each borrower's loans that count as good, rounded to 2 decimal places

agg_summary["good_loans_repayment_ratio"] = (
    agg_summary["count_good_loans"].div(agg_summary["loan_count"]).round(2)
)

In [31]:
#number of days between the evaluation date and the disbursement date held in the summary
#the evaluation date is yesterday at midnight, in line with Mifos which is Time-1 i.e one day behind

today = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)

agg_summary["num_days_since_last_disbursement"] = pd.to_numeric(
    (today - agg_summary["disbursed_on_date"]).dt.days,
    downcast='integer',
)

In [32]:
#delete test accounts: keep only stores whose largest loan principal is at least 200

agg_summary = agg_summary.loc[agg_summary["max_principal_amount"].ge(200)]

In [33]:
# add column to label inference column

def assign_inference_label(df, good_loans_repayment_ratio_threshold=0.7):
    """
    Function to label each borrower according to their good loans repayment ratio.

    Inputs:
    1) df: dataframe with a "good_loans_repayment_ratio" column,
    2) good_loans_repayment_ratio_threshold: minimum ratio required for the
       rules to be relaxed (defaults to 0.7)

    Outputs:
    A numpy array of string labels, one per row:
    - "relax_rules" when the ratio is at or above the threshold,
    - "No_rules_relaxed" when the ratio is below the threshold,
    - "0" when the ratio is missing (neither comparison holds).
    """
    target_col = df["good_loans_repayment_ratio"]

    conditions = [
        target_col.ge(good_loans_repayment_ratio_threshold),
        target_col.lt(good_loans_repayment_ratio_threshold),
    ]

    choices = [
        "relax_rules",
        "No_rules_relaxed",
    ]

    # an explicit string default keeps the "0" marker for missing ratios and avoids
    # mixing an integer default with string choices
    inference_col = np.select(conditions, choices, default="0")

    return inference_col

#apply function
agg_summary["inference_col"] = assign_inference_label(agg_summary)

In [34]:
def weight_dpd(df):
    """
    Function to assign a weight to each row based on its days past due.

    Inputs (columns read from df):
    1) "inference_col": label indicating whether rules are to be relaxed OR not,
    2) "days_past_due": num of days past due

    Outputs:
    A numpy array of weights, one per row. Rows whose inference label matches
    "relax_rules" get 1 below 30 days past due, 0.9 from 30 up to (not including) 35,
    0.8 from 35 up to 38 and 0.7 from 38 up to 41. Every other row gets 0.
    """
    days_late = df["days_past_due"]
    rules_relaxed = df["inference_col"].str.match("relax_rules")

    # (condition, weight) pairs, checked top to bottom; the first match wins
    weight_rules = [
        (rules_relaxed & days_late.lt(30), 1),
        (rules_relaxed & days_late.ge(30) & days_late.lt(35), 0.9),
        (rules_relaxed & days_late.ge(35) & days_late.lt(38), 0.8),
        (rules_relaxed & days_late.ge(38) & days_late.lt(41), 0.7),
        # exactly 41 days matches no rule and ends up with the same weight of 0
        (days_late.gt(41), 0),
    ]

    conditions = [condition for condition, _ in weight_rules]
    choices = [weight for _, weight in weight_rules]

    return np.select(conditions, choices)

#apply function
agg_summary["weight_dpd"] = weight_dpd(agg_summary)

In [35]:
agg_summary.head(3)

Unnamed: 0,client_mobile_number,store_number,loan_count,loan_status,term_frequency,principal_disbursed,principal_repaid,disbursed_on_date,expected_matured_on_date,closed_on_date,due_date_fixed,days_past_due,bloom_version,loan_repayment_status,max_principal_amount,max_loan_disbursement_date,count_good_loans,good_loans_repayment_ratio,num_days_since_last_disbursement,inference_col,weight_dpd
0,254726604388.0,6.0,7.0,300,7,200000.0,0.0,2022-07-24,2022-07-31,NaT,2022-07-31,-7.0,2.0,current_active,200000.0,2022-07-04,3.0,0.43,0,No_rules_relaxed,0.0
1,,7.0,21.0,300,7,1300.0,0.0,2022-07-23,2022-07-30,NaT,2022-07-30,-6.0,2.0,current_active,200000.0,2022-04-12,10.0,0.48,1,No_rules_relaxed,0.0
2,254720272826.0,11.0,3.0,600,30,5000.0,5000.0,2019-12-01,2019-12-31,2019-12-30,2019-12-31,-1.0,1.0,closed_early_repayment,5000.0,2019-12-01,3.0,1.0,966,relax_rules,1.0


In [42]:
agg_summary[agg_summary['days_past_due'] < 41][['days_past_due', 'inference_col', 'weight_dpd']]

Unnamed: 0,days_past_due,inference_col,weight_dpd
0,-7.00,No_rules_relaxed,0.00
1,-6.00,No_rules_relaxed,0.00
2,-1.00,relax_rules,1.00
3,-6.00,relax_rules,1.00
12,-18.00,relax_rules,1.00
...,...,...,...
45334,-19.00,relax_rules,1.00
45335,-15.00,relax_rules,1.00
45336,-4.00,relax_rules,1.00
45337,-2.00,relax_rules,1.00


In [36]:
agg_summary.shape

(45309, 21)

In [37]:
agg_summary['store_number'].nunique()

45309

In [38]:
agg_summary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45309 entries, 0 to 45339
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   client_mobile_number              45309 non-null  object        
 1   store_number                      45309 non-null  float64       
 2   loan_count                        45309 non-null  float64       
 3   loan_status                       45309 non-null  int64         
 4   term_frequency                    45309 non-null  int64         
 5   principal_disbursed               45309 non-null  float64       
 6   principal_repaid                  45309 non-null  float64       
 7   disbursed_on_date                 45309 non-null  datetime64[ns]
 8   expected_matured_on_date          45309 non-null  datetime64[ns]
 9   closed_on_date                    14642 non-null  datetime64[ns]
 10  due_date_fixed                    45309 non-nu

---
#### Save aggregate summaries

In [39]:
#write the full summary to an Excel workbook at a fixed local path
loans_summary_workbook = "C:\\Project_summaries\\Bloom\\Bloom all_loans\\20220721\\Analysis_summaries\\Bloom_clients_loans_summary_20220721.xlsx"
agg_summary.to_excel(loans_summary_workbook)

In [40]:
#write store_number and its inference label (rows with a store_number only) to a second workbook
inference_summary_workbook = "C:\\Project_summaries\\Bloom\\Bloom all_loans\\20220721\\Analysis_summaries\\Bloom_clients_inference_summary_20220721.xlsx"
has_store_number = agg_summary["store_number"].notnull()
agg_summary.loc[has_store_number, ["store_number","inference_col"]].to_excel(inference_summary_workbook)