In [1]:
#import data processing libraries

import os
import pandas as pd
import numpy as np
import math as math
import datetime as dt
from scipy.stats import mode
import psycopg2

In [2]:
#import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

In [3]:
# Notebook-wide pandas display settings: long text cells (up to 500 characters),
# every column shown, and up to 300 rows rendered before truncation.
for option_name, option_value in (
    ('display.max_colwidth', 500),
    ('display.max_columns', None),
    ('display.max_rows', 300),
):
    pd.set_option(option_name, option_value)

# Render floats with two fixed decimals instead of scientific notation.
pd.set_option('display.float_format', lambda value: format(value, '.2f'))

In [4]:
#path to the folder holding the CSV data dumps
#(absolute Windows path; the trailing separator is included so file names can be appended directly)

path = "C:\\Project_summaries\\Bloom\\Metabase_dumps\\"

In [5]:
# List every entry in the dump folder, then keep the ones whose name ends in 'csv'.
files = os.listdir(path)

files_csv = [entry for entry in files if entry.endswith('csv')]
files_csv

['july 1.csv',
 'july 10.csv',
 'july 11.csv',
 'july 12.csv',
 'july 13.csv',
 'july 14.csv',
 'july 15.csv',
 'july 16.csv',
 'july 17.csv',
 'july 18.csv',
 'july 19.csv',
 'july 2.csv',
 'july 20.csv',
 'july 21.csv',
 'july 22.csv',
 'july 23.csv',
 'july 24.csv',
 'july 3.csv',
 'july 4.csv',
 'july 5.csv',
 'july 6.csv',
 'july 7.csv',
 'july 8.csv',
 'july 9.csv',
 'june 23.csv',
 'june 24.csv',
 'june 25.csv',
 'june 26.csv',
 'june 27.csv',
 'june 28.csv',
 'june 29.csv',
 'june 30.csv']

In [6]:
# Column names applied to every dump, in file order (they replace each file's own header).
headers_list = [
    "Account No", "Amount", "Balance After", "Client Name", "Comments",
    "Created At", "ID", "Phone", "Status", "Store Number", "Transaction ID",
    "Transaction Time", "Type", "Updated At",
]


# Read each CSV in files_csv into its own DataFrame.
# NOTE(review): skiprows=1 discards the file's first line and header=0 then consumes the
# following line as the header that `names` replaces -- confirm the dumps really start
# with two non-data lines, otherwise the first record of every file is lost.
# NOTE(review): on_bad_lines='skip' silently discards malformed rows.
list_of_dataframes = [
    pd.read_csv(
        path + filename,
        names=headers_list,
        header=0,
        skiprows=1,
        on_bad_lines='skip',
        encoding='utf-8',
        engine='python',
    )
    for filename in files_csv
]

# Stack the per-file frames into a single table.
df = pd.concat(list_of_dataframes)

# Normalise the headers: spaces become underscores, then everything is lower-cased.
df.columns = df.columns.str.replace(' ','_').str.lower()

In [7]:
#columns kept for the rest of the analysis
target_columns = ["store_number","phone","transaction_id","amount","transaction_time"]

#give the raw "id" column a clearer name, then trim the data to the target columns
#(the renamed column is not one of them, so the selection drops it)
df = df.rename(columns={"id": "db_transaction_id"})[target_columns]

In [8]:
#columns to parse as datetimes and columns to hold as text
datetime_cols = ["transaction_time"]
int_to_string_cols = ["store_number"]

#parse the date columns; values that cannot be parsed become NaT (errors='coerce')
df = df.assign(**{col: pd.to_datetime(df[col], errors='coerce') for col in datetime_cols})

#convert specific int/float columns to string type
df = df.astype({col: str for col in int_to_string_cols})

In [9]:
#drop any duplicate transactions, keeping the first row seen for each transaction_id
df = df.drop_duplicates(subset="transaction_id", keep="first")


#drop any rows with a blank store number
#store_number has already been cast to str at this point, and that cast can turn
#missing values into the literal text "nan", which notnull() alone does not detect
df = df.loc[df["store_number"].notnull() & df["store_number"].ne("nan")]

In [10]:
# truncate the transaction time columns to the calendar day (midnight) while keeping
# the datetime dtype
# .dt.normalize() passes NaT through unchanged; formatting each value with strftime
# would raise on the NaT values that the earlier errors='coerce' parse can produce
datetime_cols = ["transaction_time"]
df[datetime_cols] = df[datetime_cols].apply(lambda col: col.dt.normalize())

In [11]:
#keep only transactions dated later than 30 days before the moment this cell runs
#(the cutoff is taken from the current clock, so the result depends on when the cell is executed)

df = df.loc[df["transaction_time"].gt(dt.datetime.now() - pd.Timedelta(days=30))]

In [12]:
#preview the first three rows of the cleaned, date-filtered transactions
df.head(3)

Unnamed: 0,store_number,phone,transaction_id,amount,transaction_time
0,7903417,254717000000.0,QG16NBMSH0,150.0,2022-07-01
1,7169099,254711000000.0,QG14NBMUQC,80.0,2022-07-01
2,680565,254718000000.0,QG19NBMUQH,110.0,2022-07-01


In [13]:
#number of distinct store numbers present in the filtered transactions
df['store_number'].nunique()

80289

In [14]:
#(rows, columns) of the filtered transactions
df.shape

(17430284, 5)

---
#### Generate aggregate summaries

In [15]:
#per-store summary over the filtered window: summed amount and earliest transaction date
#NOTE(review): "min" of transaction_time is the EARLIEST date in the window, even though
#the column it feeds is named most_recent_trx_date_past_30_days
agg_summary = (
    df.groupby("store_number")
    .agg(
        approx_30_days_trx_val=("amount", "sum"),
        most_recent_trx_date_past_30_days=("transaction_time", "min"),
    )
    .reset_index()
)

In [16]:
#latest transaction date per store, joined onto the summary (inner join on store_number)

agg_summary = agg_summary.merge(
    df.groupby("store_number")["transaction_time"]
    .max()
    .rename("last_trx_date")
    .reset_index(),
    on="store_number",
)

In [17]:
#expected trx days: days between a store's first and last transaction date, counting both ends

agg_summary["expected_trx_days"] = (
    agg_summary["last_trx_date"]
    .sub(agg_summary["most_recent_trx_date_past_30_days"])
    .dt.days
    .add(1)
)

In [18]:
#get unique trx days
#the per-store count of distinct transaction dates must be merged in BEFORE the ratio
#is taken, because the ratio reads the actual_trx_days column that this merge creates

agg_summary = pd.merge(agg_summary, (df.groupby("store_number")["transaction_time"].nunique().rename("actual_trx_days").reset_index()), on="store_number")

#calculate consistency: share of expected trading days on which the store transacted
agg_summary["page_active_days"] = round(agg_summary["actual_trx_days"]/agg_summary["expected_trx_days"], 2)

In [19]:
#consistency: actual trading days divided by expected trading days, rounded to 2 decimals

agg_summary["page_active_days"] = (
    agg_summary["actual_trx_days"].div(agg_summary["expected_trx_days"]).round(2)
)

In [20]:
#load df with store number inferences
#(only the load happens here: merging in this cell would reference inference_df before
# it is defined, and the merge onto agg_summary is already done in the following cell)

inference_df_path = "C:\\Project_summaries\\Bloom\\Bloom all_loans\\20220721\\Analysis_summaries\\"
inference_df = pd.read_excel(inference_df_path+"Bloom_clients_inference_summary_20220721.xlsx")

#drop the "Unnamed: 0" column carried in the spreadsheet
inference_df = inference_df.drop(columns="Unnamed: 0")

#store numbers are held as text so they match the key type used in agg_summary
inference_df["store_number"] = inference_df["store_number"].astype("str")

In [21]:
#left-join the inference data onto the summary so every store keeps its row,
#whether or not it has an inference entry

agg_summary = pd.merge(agg_summary, inference_df, how="left", on="store_number")

In [22]:
#label stores left without an inference value as "No_rules_relaxed"

cols_fillna = ["inference_col"]
# assign the filled columns back to the frame: calling fillna(inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not update
# the parent frame when copy-on-write is enabled
agg_summary[cols_fillna] = agg_summary[cols_fillna].fillna("No_rules_relaxed")

In [23]:
#get num days since last trx

#midnight at the start of the current day, so the difference below is a whole number of days
today = pd.Timestamp.today().normalize()

#not counting the end date because the scoring refresh might be triggered early in the morning or before working day is over
agg_summary["days_since_last_trx"] = agg_summary["last_trx_date"].rsub(today).dt.days

In [24]:
def recency_check(df):
    """
    Flag whether each store number has transacted recently enough.

    Store numbers whose inference starts with "relax_rules" are allowed up to 7 days
    since their last transaction; every other store number is allowed up to 5 days.

    Inputs:
    df: DataFrame holding
    1) "inference_col": text column indicating whether rules are to be relaxed OR not,
    2) "days_since_last_trx": num of days since store number last had a transaction

    Outputs:
    A numpy array with one "Yes" / "No" entry per row. Rows that satisfy neither
    acceptance condition (including rows with a missing recency) are reported as "No".
    """
    inference_col_target = "relax_rules"
    no_rules_relaxed_recency_threshold = 5
    rules_relaxed_recency_threshold = 7

    #choice responses
    transaction_boolean_accepted = "Yes"
    transaction_boolean_rejected = "No"

    recency_col = df["days_since_last_trx"]
    # na=False keeps the mask strictly boolean even when an inference value is missing
    rules_relaxed = df["inference_col"].str.match(inference_col_target, na=False)

    conditions = [
        recency_col.le(no_rules_relaxed_recency_threshold),
        rules_relaxed & recency_col.le(rules_relaxed_recency_threshold),
    ]

    choices = [
        transaction_boolean_accepted,
        transaction_boolean_accepted,
    ]

    # The rejection is passed as an explicit default: np.select otherwise falls back to
    # the integer 0, which has no common dtype with the string choices (older NumPy
    # silently emits the text "0", newer NumPy raises a TypeError).
    new_col = np.select(conditions, choices, default=transaction_boolean_rejected)

    return new_col

#apply the function to the df to flag whether each store number met its recency threshold
agg_summary["transacted_last_5_days"] = recency_check(agg_summary)

In [25]:
def weight_till_recency(df):
    """
    Assign the till-recency weight for each store number.

    Weights:
    - 1   when the last transaction was at most 5 days ago (any store number)
    - 0.9 when it was more than 5 and at most 6 days ago and the inference starts with "relax_rules"
    - 0.7 when it was more than 6 and at most 7 days ago and the inference starts with "relax_rules"
    - 0   in every other case (including a missing recency)

    Inputs:
    df: DataFrame holding
    1) "inference_col": text column indicating whether rules are to be relaxed OR not,
    2) "days_since_last_trx": num of days since store number last had a transaction

    Outputs:
    A numpy float array with the assigned weight for till recency, one entry per row
    """
    recency = df["days_since_last_trx"]
    inference_col_target = "relax_rules"
    # na=False keeps the mask strictly boolean even when an inference value is missing
    rules_relaxed = df["inference_col"].str.match(inference_col_target, na=False)

    # np.select takes the first matching condition. A separate "relaxed and <= 5 days"
    # branch would never be reached because the first condition already covers it.
    conditions = [
        recency.le(5),
        rules_relaxed & recency.gt(5) & recency.le(6),
        rules_relaxed & recency.gt(6) & recency.le(7),
    ]

    choices = [
        1,
        0.9,
        0.7,
    ]

    # everything not matched above (older than the allowed window) weighs 0
    weight_till_recency_col = np.select(conditions, choices, default=0)

    return weight_till_recency_col

#apply function to assign each store number its till-recency weight
agg_summary["weight_till_recency"] = weight_till_recency(agg_summary)

In [26]:
#column dtypes and non-null counts of the summary after the recency columns were added
agg_summary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80289 entries, 0 to 80288
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   store_number                       80289 non-null  object        
 1   approx_30_days_trx_val             80289 non-null  float64       
 2   most_recent_trx_date_past_30_days  80289 non-null  datetime64[ns]
 3   last_trx_date                      80289 non-null  datetime64[ns]
 4   expected_trx_days                  80289 non-null  int64         
 5   actual_trx_days                    80289 non-null  int64         
 6   page_active_days                   80289 non-null  float64       
 7   inference_col                      80289 non-null  object        
 8   days_since_last_trx                80289 non-null  int64         
 9   transacted_last_5_days             80289 non-null  object        
 10  weight_till_recency               

---
#### Load customer details & map store number to national id

In [27]:
#folder where the till data summaries are written
#(every backslash is escaped; a lone "\T" is an invalid escape sequence that Python warns about)
customers_path = "C:\\Project_summaries\\Bloom\\Bloom all_loans\\20220721\\Till_data_summaries\\"

In [28]:
#save the store-level summary into the till data summaries folder
#(built from customers_path instead of repeating the literal, which contained the invalid escape "\T")
agg_summary.to_excel(customers_path + "agg_summary.xlsx")

In [29]:
# Database connection settings are read from the environment (the variable names follow
# the standard PostgreSQL client ones) so that no host, account name or password is
# stored in the notebook.
# NOTE(review): the password that used to be written here should be treated as exposed and rotated.
host = os.environ.get("PGHOST")
port = int(os.environ.get("PGPORT", "5432"))
dbname = os.environ.get("PGDATABASE")
user = os.environ.get("PGUSER")
password = os.environ.get("PGPASSWORD")


def get_query_results_postgres():
    """
    Fetch the IPRS-validated rows of bloomlive.client_summary_view.

    Connects with the module-level host, port, dbname, user and password settings,
    runs the query and returns the result.

    Outputs:
    A pandas DataFrame holding every column of the rows where is_iprs_validated is true
    """
    sql = "select * from bloomlive.client_summary_view csv2 where is_iprs_validated is true"

    conn = psycopg2.connect(host = host,
                            port = port,
                            database = dbname,
                            user = user,
                            password = password)
    # psycopg2's `with connection:` block only ends the transaction and leaves the
    # connection open, so the connection is closed explicitly, even if the query fails
    try:
        df = pd.read_sql(sql, conn)
    finally:
        conn.close()

    return df

In [30]:
# load the IPRS-validated client records returned by get_query_results_postgres()
customer_details = get_query_results_postgres()

# NOTE(review): this preview prints names, national ids and phone numbers -- clear the output before sharing the notebook
customer_details.head(3)

Unnamed: 0,bloom_version,surrogate_id,mifos_id,mobile_number,store_number,national_id,first_name,middle_name,last_name,iprs_first_name,iprs_other_name,iprs_surname,date_of_birth,gender,status,submitted_on_date,client_type,company_name,provided_first_name,iprs_name_matched,is_iprs_checked,is_iprs_validated
0,2.0,173856,91945,254714958267,7783183,25721072,Edigar,Litunda,Segero,Edigar,Litunda,Segero,2022-04-28,,Active,2022-04-28,,EDIGAR LITUNDA,,True,True,True
1,2.0,174245,92337,254727059406,7357462,28167918,Sylvia,Mumbi,Ngunga,Sylvia,Mumbi,Ngunga,2022-04-29,,Active,2022-04-29,,SYLVIA MUMBI,,True,True,True
2,2.0,174098,92190,254705801996,7119000,33503541,Simon,Kamau,Waruiru,Simon,Kamau,Waruiru,2022-04-29,,Active,2022-04-29,,SIMON KAMAU 4,,True,True,True


In [31]:
#number of distinct bloom_version values among the loaded clients
customer_details['bloom_version'].nunique()

2

In [32]:
#(rows, columns) of the client records before de-duplication
customer_details.shape

(130914, 22)

In [33]:
#keep one row per store_number: the last occurrence in the current row order
#NOTE(review): the query has no ORDER BY, so which row counts as "last" depends on the order the database returned
customer_details = customer_details.loc[
    ~customer_details.duplicated(subset=['store_number'], keep='last')
]

customer_details.shape

(95046, 22)

In [34]:
#distinct store numbers after de-duplication (nunique() leaves missing values out of the count)
customer_details['store_number'].nunique()

95045

In [35]:
#prepare customer_details for the store number -> national id mapping

#normalise the headers: spaces become underscores, then everything is lower-cased
customer_details.columns = customer_details.columns.str.replace(' ','_').str.lower()

#trim customer_details to only have target columns
customer_target_columns = ["store_number", "national_id", 'mobile_number']

customer_details = customer_details.loc[:, customer_target_columns]

In [36]:
#hold the identifier columns as text so they compare as strings in later joins
int_to_string_cols_2 = ["store_number", "national_id", "mobile_number"]

customer_details = customer_details.astype({col: str for col in int_to_string_cols_2})

In [37]:
#confirm the three mapping columns are now stored as object (text) columns
customer_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95046 entries, 0 to 130913
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   store_number   95046 non-null  object
 1   national_id    95046 non-null  object
 2   mobile_number  95046 non-null  object
dtypes: object(3)
memory usage: 2.9+ MB


In [38]:
#left-join the customer mapping onto the summary: every store keeps its row and gains
#national_id and mobile_number where a matching store_number exists

agg_summary = pd.merge(agg_summary, customer_details, how="left", on="store_number")

In [39]:
#rearrange column order: rotate the last column to the front, keeping the rest in place

agg_summary_cols = agg_summary.columns.to_list()

agg_summary_cols = [*agg_summary_cols[-1:], *agg_summary_cols[:-1]]

agg_summary = agg_summary.loc[:, agg_summary_cols]

In [40]:
#preview the first three rows of the final summary
agg_summary.head(3)

Unnamed: 0,mobile_number,store_number,approx_30_days_trx_val,most_recent_trx_date_past_30_days,last_trx_date,expected_trx_days,actual_trx_days,page_active_days,inference_col,days_since_last_trx,transacted_last_5_days,weight_till_recency,national_id
0,254711519875,101212,12530.0,2022-07-04,2022-07-21,18,6,0.33,No_rules_relaxed,4,Yes,1.0,8026792
1,254716180492,105295,543428.75,2022-06-26,2022-07-24,29,28,0.97,relax_rules,1,Yes,1.0,27881033
2,254701582431,105581,3603.0,2022-07-05,2022-07-22,18,4,0.22,No_rules_relaxed,3,Yes,1.0,32339396


In [41]:
#(rows, columns) of the final summary
agg_summary.shape

(80289, 13)

In [42]:
#distinct store numbers in the final summary; equal to the row count when the joins added no duplicate rows
agg_summary['store_number'].nunique()

80289

In [43]:
#write the final summary to an Excel workbook inside the customers_path folder

agg_summary.to_excel(f"{customers_path}Bloom_scoring_trx_data_20220721.xlsx")