# How to use this file

## Introduction

All functions have the form

function(df, column_name, bins = None, method = None, target = None)

There are FIVE groups of functions:

    - Group 1: For each feature, extract the list of bins.
    - Group 2: For each feature, extract the distribution corresponding to the list of bins.
    - Group 3: For each feature, extract the bad rate corresponding to each bin in the list of bins. The 'target' argument must be assigned if using the functions in this group.
    - Group 4: For each feature, determine whether it needs to be re-binned.
    - Group 5: For each feature, perform re-binning by grouping categorical bins together and extract the new distribution table and the new bad rate table.

There are THREE ways to use these functions, selected by the 'method' argument:

    - method == None: You choose bins on your own.
    - method == 'optimal_binning': You use the Optimal Binning method.
    - method == 'statistics': You use the Statistics method.

## Examples

### Group 1: Extract List of bins:

binning_list(df, 'max_dpdall_ref', bins = [0,1])

binning_list_new(df, 'max_dpdall_ref', method = 'statistics')

binning_list_plus(df, 'max_dpdall_ref', method = 'optimal_binning', target = 'del90_mob12_app')

ranking_list(df, 'max_dpdall_ref', method = 'optimal_binning', target = 'del90_mob12_app')

### Group 2: Extract Distribution array, Distribution table, and plot Distribution bar chart

count_binning_final(df, 'max_dpdall_ref', bins = [0,1])

distribution_table_final(df, 'max_dpdall_ref', method = 'statistics')

distribution_plotting(df, 'max_dpdall_ref', method = 'optimal_binning', target = 'del90_mob12_app')

### Group 3: Extract Bad rate array, Bad rate table, and plot Bad rate line chart

del_mob_percent_final(df, 'max_dpdall_ref', bins = [0,1])

del_mob_table_final(df, 'max_dpdall_ref', method = 'statistics')
    
del_mob_plotting(df, 'max_dpdall_ref', method = 'optimal_binning', target = 'del90_mob12_app')

### Group 4: Decide whether a feature needs to be re-binned

Note that the 'target' argument MUST NOT be None.

is_feature_need_to_be_rebinned(df, 'max_dpdall_ref', bins = [0,1], target = 'del90_mob12_app')

is_feature_need_to_be_rebinned(df, 'max_dpdall_ref', method = 'statistics', target = 'del90_mob12_app')

is_feature_need_to_be_rebinned(df, 'max_dpdall_ref', method = 'optimal_binning', target = 'del90_mob12_app')

### Group 5: Group categorical values and extract the updated tables of Distribution and Bad rate

The 'bins' argument MUST be a list of lists, and the 'target' argument MUST NOT be None.

bin1 = [['X-sell 2ND', 'X-sell CDL'], ['X-sell PL'], ['X-sell TW'], ['unknown']]
group_categorical_values(df, 'product_group_dt2', bins = bin1, method = 'statistics', target = 'del90_mob12_app')

bin2 = [['<= 0'], ['<= 1', '> 1'], ['unknown']]
group_categorical_values(df, list_column[4], bins = bin2, method = 'optimal_binning', target = 'del90_mob12_app')

# Import packages

In [1]:
import csv
import logging
import cx_Oracle
import pandas as pd
import math as m
import xlsxwriter
import numpy as np
from io import BytesIO
#import polars as pl
import plotly.graph_objects as go
#import risk_mdl
import datetime as dt
from datetime import date
from optbinning import OptimalBinning, BinningProcess
pd.set_option('mode.chained_assignment', None) # Hide warnings

In [2]:
# Open the Oracle connection (credentials are redacted in this export).
# NOTE(review): `cursor` is created here but the visible code reads through
# `connection` directly via pandas; confirm whether the cursor is needed.
connection = cx_Oracle.connect(user = **********, password = *********,  dsn = **********)
cursor = connection.cursor()
# Query: every column of quan_tbl_final_all plus four derived calendar features.
#   holiday_flag    - 1 when one of the listed fixed dates (MM/DD in the year of
#                     applied_date) lies within 5 days of applied_date, else 0.
#   double_day_flag - 1 when the date whose day number equals the month number
#                     (built from 'YYYY/MM/' + 'MM') lies within 3 days of
#                     applied_date, else 0.
#   day_of_week     - to_char(applied_date, 'day').
#   weekend_flag    - 1 when to_char(applied_date, 'd') is 1 or 7, else 0.
#                     NOTE(review): which weekdays map to 1 and 7 presumably
#                     depends on the session territory setting - confirm.
sql_command = '''
select A.*,
       ------------------------------------------------------------
       (case when
            (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'01/01'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'02/14'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'03/08'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'04/30'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'05/01'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'06/01'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'09/02'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'09/05'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'10/20'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'11/20'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       or   (to_date(concat((substr(to_char(applied_date, 'YYYY/'), 1, 5)),'12/25'), 'yyyy/mm/dd') between (applied_date -5)  and (applied_date + 5))
       then 1
       else 0
       end
       ) as holiday_flag,
       --------------------------
       (case when
            to_date(concat
                     (
                     (substr(to_char(applied_date, 'YYYY/MM/'), 1, 7)),
                     (substr(to_char(applied_date, 'YYYY/MM'), 6, 7))              
                     ), 'yyyy/mm/dd') between (applied_date -3)  and (applied_date + 3)
       then 1
       else 0
       end
       ) as double_day_flag,
       --------------------------
       to_char(applied_date, 'day') as day_of_week,
       case when to_char(applied_date, 'd') in (1,7) then 1 else 0 end as weekend_flag
from quan_tbl_final_all A
'''
# Run the query and load the full result set into a DataFrame.
df = pd.read_sql_query(sql_command, con= connection)

# Lower-case the column names and keep them as a list; later cells refer to
# columns through `list_column`.
df.columns = df.columns.str.lower()
list_column = df.columns.to_list()



# Fix some negative values

In [63]:
# Columns in which a negative reading is replaced by NaN.
_negative_to_nan_columns = [
    'max_dpdall_ref',
    'max_dpd_ref',
    'months_dpd_0_cus_journey',
    'months_dpd_10_cus_journey',
    'months_dpd_30_cus_journey',
    'avg_enr_ratio_ever',
    'avg_enr_ratio_last_3m',
    'avg_enr_ratio_last_3_6m',
    'avg_enr_ratio_last_6_12m',
    'avg_enr_ratio_last_12_24m',
]
for _column_to_fix in _negative_to_nan_columns:
    _current_values = df[_column_to_fix]
    # mask() blanks every entry where the condition holds (negative values);
    # the trailing fillna(np.nan) keeps those entries as NaN.
    df[_column_to_fix] = _current_values.mask(_current_values < 0).fillna(np.nan)

# Binning choices and Classifications (Old)
################################################# Statistics method
# Statistics method: classify every column by the treatment it receives.
# Fixed: a comma was missing after 'applied_month_xsell', which silently merged
# it with 'deal_no' into the single name 'applied_month_xselldeal_no', so
# neither column was actually listed here.
binning_choice_S1 = ['cif_nb', 'app_id', 'applied_date', 'agreement_no',
                     'province', 'rownum_', 'applied_month', 'applied_month_xsell',
                     'deal_no', 'disbursal_dt'] #Do nothing
binning_choice_S2 = ['max_appregregion', 'min_appregregion', 'mode_appregregion',
                     'max_appresregion', 'min_appresregion', 'mode_appresregion'] #Special binning methods
binning_choice_Y = [] #Yes - Need binning methods
binning_choice_YS = [] #Yes with special treatment
binning_choice_N = ['day_of_week'] #No - No need binning methods

# Fill the Y / YS / N lists from the 'binning_choice_statistics' column of
# df_group_column; rows with any other value are left unclassified.
# NOTE(review): df_group_column must already be loaded when this cell runs, and
# the loop starts at row 1, so row 0 is never classified - confirm both are intended.
for i in range(1, len(df_group_column)):
    statistics_choice = df_group_column['binning_choice_statistics'][i]
    if statistics_choice == "Y":
        binning_choice_Y.append(df_group_column['column_name'][i])
    elif statistics_choice == "YS":
        binning_choice_YS.append(df_group_column['column_name'][i])
    elif statistics_choice == "N":
        binning_choice_N.append(df_group_column['column_name'][i])
        
################################################# Optimal Binning method          
# Optimal Binning method: fixed lists first, then the remaining columns are
# classified by their pandas dtype.
optimal_binning_S1 = ['cif_nb', 'app_id', 'applied_date', 'agreement_no', 
                     'province', 'rownum_', 'applied_month',
                     'deal_no', 'disbursal_dt',
                     'app_id_xsell', 'app_status_xsell', 
                     'product_group'] #Do nothing
optimal_binning_S2 = ['max_appregregion', 'min_appregregion', 'mode_appregregion',
                      'max_appresregion', 'min_appresregion', 'mode_appresregion'] #Special binning methods
optimal_binning_N = ['day_of_week'] #Added 'day_of_week' feature
optimal_binning_Y_int = []
optimal_binning_Y_float = []

for column in list_column:
    # S1 / S2 columns only get their missing values labelled 'unknown'.
    if (column in optimal_binning_S1) or (column in optimal_binning_S2):
        df[column] = df[column].fillna(value = 'unknown')
        continue

    column_dtype = df[column].dtype
    if column_dtype == 'object':
        # Text columns are used as-is, one bin per distinct value.
        optimal_binning_N.append(column)
        df[column] = df[column].fillna(value = 'unknown')
    elif column_dtype == 'int64':
        optimal_binning_Y_int.append(column)
        df[column] = df[column].fillna(value = np.nan)
    elif column_dtype == 'float64':
        optimal_binning_Y_float.append(column)
        df[column] = df[column].fillna(value = np.nan)
    # Any other dtype is left unclassified and untouched.

# Binning choices and Classification (new)

In [17]:
# Distinct application months in ascending order (shared x axis of the charts).
list_of_month = sorted(df['applied_month_xsell'].unique().tolist())

# Group column and binning choice
df_group_column = pd.read_excel('distinct_value2.xlsx')

#############################################
groupA = [] #application 
groupB = [] #cic
groupC = [] #behavior
groupD = [] #collection feedback
groupE = [] #linkage

# Sheet code -> destination list; "Another" is filed together with "A".
_group_list_by_code = {
    "A": groupA,
    "Another": groupA,
    "B": groupB,
    "C": groupC,
    "D": groupD,
    "E": groupE,
}

# Row i of the sheet is paired with list_column[i]; position 0 is skipped.
for i in range(1, len(list_column)):
    _destination = _group_list_by_code.get(df_group_column['group'][i])
    if _destination is not None:
        _destination.append(list_column[i])

#################################################
binning_choice_S1 = [] #Do nothing
binning_choice_S2 = [] #Special binning methods
binning_choice_Y = [] #Yes - Need binning methods
binning_choice_YS = [] #Yes with special treatment
binning_choice_N =[]

optimal_binning_S1 = [] #Do nothing
optimal_binning_S2 = [] #Special binning methods
optimal_binning_N = [] #Added 'day_of_week' feature
optimal_binning_Y_int = []
optimal_binning_Y_float = []

# Sheet code -> list that collects the columns carrying that code.
_statistics_list_by_code = {
    "Y": binning_choice_Y,
    "YS": binning_choice_YS,
    "N": binning_choice_N,
    "S1": binning_choice_S1,
    "S2": binning_choice_S2,
}
_optimal_text_list_by_code = {
    "N": optimal_binning_N,
    "S1": optimal_binning_S1,
    "S2": optimal_binning_S2,
}

for column in list_column:
    # First sheet row describing this column (IndexError when there is none).
    idx = df_group_column[df_group_column["column_name"] == column].index.tolist()[0]

    # --- Statistics method ---
    _statistics_list = _statistics_list_by_code.get(df_group_column['binning_choice_statistics'][idx])
    if _statistics_list is not None:
        _statistics_list.append(column)

    # --- Optimal Binning method ---
    _optimal_code = df_group_column['binning_choice_optimal'][idx]
    if _optimal_code == "Y":
        # Numeric columns are split by dtype; other dtypes are not classified.
        _column_dtype = df[column].dtype
        if _column_dtype == 'int64':
            optimal_binning_Y_int.append(column)
            df[column] = df[column].fillna(value = np.nan)
        elif _column_dtype == 'float64':
            optimal_binning_Y_float.append(column)
            df[column] = df[column].fillna(value = np.nan)
    elif _optimal_code in _optimal_text_list_by_code:
        # N / S1 / S2 columns get their missing values labelled 'unknown'.
        _optimal_text_list_by_code[_optimal_code].append(df_group_column['column_name'][idx])
        df[column] = df[column].fillna(value = 'unknown')

# Binning List

In [18]:
def _binning_list_categories(series):
    """Return the sorted distinct values of *series* without missing-value markers.

    The strings 'blank', 'null' and 'unknown', as well as None and NaN, are dropped.
    NaN is detected with math.isnan because `np.nan in some_list` only matches
    when the list happens to hold the very same NaN object.
    """
    placeholders = ('blank', 'null', 'unknown')
    kept = []
    for value in series.unique().tolist():
        if value is None:
            continue
        if isinstance(value, str) and value in placeholders:
            continue
        if isinstance(value, float) and m.isnan(value):
            continue
        kept.append(value)
    return sorted(kept)


def _binning_list_sigma_edges(series, multipliers):
    """Return [-1e9, mean + k*std for each k in *multipliers* (unique, rounded to 5 digits), 1e9]."""
    # np.std / np.mean on a pandas Series skip NaN, so no explicit NaN filter is needed.
    standard_deviation = np.std(series)
    average = np.mean(series)
    cut_points = np.unique([average + k*standard_deviation for k in multipliers]).tolist()
    return [-10**9] + [round(point, 5) for point in cut_points] + [10**9]


def _binning_list_optimal_edges(df, variable, y, finalize):
    """Fit a BinningProcess on one column and turn its transformed values into bin edges.

    *finalize* is applied to every distinct transformed value (ceil for integer
    columns, 5-digit rounding for float columns).
    NOTE(review): the edges come from the distinct values returned by
    `transform`, not from the fitted split points - confirm against the
    optbinning documentation that this is the intended source of the edges.
    """
    features = df[[variable]]
    binning_process = BinningProcess(variable_names=[variable],
                                     #max_n_bins=5,
                                     #min_prebin_size = 0.2,
                                     split_digits=4)
    binning_process.fit(features, y)
    transformed = sorted(np.unique(binning_process.transform(features).values).tolist(),
                         reverse = False)
    edges = [-10**9] + [finalize(value) for value in transformed] + [10**9]
    return np.unique(edges).tolist()


def binning_list(df, column_name, bins = None, method = None, target = None):
    """Return the list of bins (bin edges or categories) for one feature.

    Parameters:
        df: DataFrame holding the feature (unused when method is None).
        column_name: name of the feature column.
        bins: user-chosen cut points; required when method is None. Any
            iterable is accepted (list, tuple, array).
        method: None (use *bins*), 'statistics' or 'optimal_binning'.
        target: name of the target column; required for 'optimal_binning'.

    Returns:
        - method None: [-1e9] + bins + [1e9].
        - 'statistics': categories for N columns, ['big_city', 'another_city']
          for S2 columns, mean +/- k*std edges for Y / YS columns, the string
          'Not suitable for binning' for S1 columns, [] for unclassified columns.
        - 'optimal_binning': [] for S1 columns, ['big_city', 'another_city'] for
          S2 columns, categories for N columns, fitted edges for Y_int / Y_float
          columns, None for unclassified columns.
        - any other method: None.

    Raises:
        TypeError: if method is None and no *bins* were given.
    """
    if method is None:
        if bins is None:
            raise TypeError("'bins' must be provided when 'method' is None")
        # list() lets callers pass a tuple or an array as well as a list.
        return [-10**9] + list(bins) + [10**9]

    if method == 'statistics':
        if column_name in binning_choice_N:
            return _binning_list_categories(df[column_name])
        if column_name in binning_choice_S1:
            return 'Not suitable for binning'
        if column_name in binning_choice_S2:
            return ['big_city', 'another_city']
        if column_name in binning_choice_Y:
            return _binning_list_sigma_edges(
                df[column_name], (-1.282, -0.674, -0.385, 0.385, 0.842, 1.282))
        if column_name in binning_choice_YS:
            return _binning_list_sigma_edges(
                df[column_name], (-1.282, -0.674, 0.674, 1.282))
        return []

    if method == 'optimal_binning':
        # Read the target up front so a missing target fails for every column type.
        y = df[target]
        if column_name in optimal_binning_S1:
            return []
        if column_name in optimal_binning_S2:
            return ['big_city', 'another_city']
        if column_name in optimal_binning_N:
            return _binning_list_categories(df[column_name])
        if column_name in optimal_binning_Y_int:
            return _binning_list_optimal_edges(df, column_name, y, m.ceil)
        if column_name in optimal_binning_Y_float:
            return _binning_list_optimal_edges(
                df, column_name, y, lambda value: round(value, 5))

    return None

def binning_list_new(df, column_name, bins = None, method = None, target = None):
    """Turn the bin edges of a feature into readable labels.

    Columns that are not cut numerically (S1, S2, N) get the output of
    binning_list back unchanged. Otherwise every inner edge e yields '<= e' and
    the last inner edge additionally yields '> e'; with the 'statistics' method
    each label also carries a category name (' - very_low', ...).
    Returns None when the method or the column matches no known case.
    """
    list_of_bin = binning_list(df, column_name, bins, method, target)

    def build_labels(suffixes = None):
        # Index of the last inner edge (the final entry is the +1e9 sentinel).
        last = len(list_of_bin) - 2
        labels = []
        for k in range(last):
            text = '<= ' + str(list_of_bin[k+1])
            if suffixes is not None:
                text = text + ' - ' + suffixes[k]
            labels.append(text)
        tail = '> ' + str(list_of_bin[last])
        if suffixes is not None:
            tail = tail + ' - ' + suffixes[last]
        labels.append(tail)
        return labels

    if method == 'statistics':
        is_passthrough = (
            (column_name in binning_choice_S1)
            or (column_name in binning_choice_S2)
            or (column_name in binning_choice_N)
        )
        if is_passthrough:
            return list_of_bin
        if column_name in binning_choice_YS:
            return build_labels(['very_low','low', 'medium', 'high', 'very_high'])
        if column_name in binning_choice_Y:
            return build_labels(['extremely_low','very_low', 'low', 'medium',
                                 'high', 'very_high', 'extremely_high'])
        return None

    if (method == 'optimal_binning') or (method is None):
        is_passthrough = (
            (column_name in optimal_binning_S1)
            or (column_name in optimal_binning_S2)
            or (column_name in optimal_binning_N)
        )
        if is_passthrough:
            return list_of_bin
        return build_labels()

    return None
        
def ranking_list(df, column_name, bins = None, method = None, target = None): #Only for S2, Y_int, Y_float, Y, YS types
    """Assign every row of *df* to its bin label for one feature.

    Returns:
        - 'Not suitable for ranking' for S1 and N columns.
        - for S2 columns, a list with one entry per row: 'unknown' for the
          value 'unknown', 'another_city' for values below 60, 'big_city' for
          values of 60 or more.
        - otherwise the result of pd.cut over the edges from binning_list, with
          the labels from binning_list_new.
    """
    if column_name in optimal_binning_S1:
        return 'Not suitable for ranking'

    if column_name in optimal_binning_S2:
        ranks = []
        # Iterate over the values themselves: the previous positional lookup
        # df[column_name][i] is label-based and fails on any DataFrame whose
        # index is not exactly 0..n-1 (e.g. after filtering).
        for value in df[column_name].tolist():
            if value == 'unknown':
                ranks.append('unknown')
            elif value < 60:
                ranks.append('another_city')
            elif value >= 60:
                ranks.append('big_city')
            else:
                # NaN satisfies neither comparison; keep one entry per row so
                # the result stays aligned with the DataFrame.
                ranks.append('unknown')
        return ranks

    if (column_name in optimal_binning_N) or (column_name in binning_choice_N):
        return 'Not suitable for ranking'

    list_of_bin = binning_list(df, column_name, bins, method, target)
    label = binning_list_new(df, column_name, bins, method, target)
    return pd.cut(np.array(df[column_name]), list_of_bin, labels= label)

def binning_list_plus(df, column_name, bins = None, method = None, target = None): #Only for Y and YS types
    """Return the bin labels of a feature that actually occur in the data.

    S1 / S2 / N columns get the output of binning_list unchanged. For the other
    columns the labels from binning_list_new are kept, in their original order,
    only when at least one row was ranked into them by ranking_list.
    """
    is_passthrough = (
        (column_name in binning_choice_S1)
        or (column_name in binning_choice_S2)
        or (column_name in binning_choice_N)
        or (column_name in optimal_binning_N)
    )
    if is_passthrough:
        return binning_list(df, column_name, bins, method, target)

    observed_labels = ranking_list(df, column_name, bins, method, target).unique().tolist()
    all_labels = binning_list_new(df, column_name, bins, method, target)
    return [label for label in all_labels if label in observed_labels]

def binning_list_final(df, column_name, bins = None, method = None, target = None):
    """Return the labels from binning_list_plus with 'unknown' added at the end."""
    labels = binning_list_plus(df, column_name, bins, method, target)
    labels.append('unknown')
    return labels

# Distribution Table and Distribution Chart

In [19]:
def count_binning_N(df, column_name, bins = None, method = None, target = None): #Only for type N
    """Cumulative distribution of a categorical feature by application month.

    For every category from binning_list_plus, the cumulative count of
    applications in that category is divided by the cumulative count of all
    applications, month by month. The 'unknown' share is one minus the sum of
    the category shares.

    Returns [table, shares]: the month-by-category table (with an 'unknown'
    column) and the same numbers as a list of per-category lists, 'unknown' last.
    """
    categories = binning_list_plus(df, column_name, bins, method, target)

    # Running total of applications up to and including each month.
    monthly_counts = df.groupby('applied_month_xsell', dropna = False)\
                       .aggregate({'app_id': 'count'})\
                       .reset_index()
    running_totals = np.array(monthly_counts['app_id'].cumsum().tolist())

    # Applications per (month, category), reshaped to one column per category.
    counts = df[['app_id', 'applied_month_xsell', column_name]]\
                 .groupby(['applied_month_xsell', column_name], dropna = False)\
                 .aggregate({'app_id': 'count'})\
                 .reset_index()
    table = counts.pivot(index = 'applied_month_xsell', columns = column_name, values = 'app_id')\
                  .fillna(value = 0)

    ## Get distribution
    shares = []
    for category in categories:
        table[category] = np.array(table[category].cumsum().tolist()) / running_totals
        shares.append(table[category].tolist())

    # Whatever is not covered by the listed categories is reported as 'unknown'.
    unknown_share = np.array([1]*len(shares[0])) - sum(np.array(shares[0:(len(shares))]))
    unknown_share = unknown_share.tolist()
    shares.append(unknown_share)

    table.loc[:,'unknown'] = unknown_share

    return [table, shares]

def count_binning_other(df, column_name, bins = None, method = None, target = None): #For Special type 2, Y, YS
    """Cumulative distribution of a binned (ranked) feature by application month.

    Every row is assigned a bin label with ranking_list. For each label from
    binning_list_plus, the cumulative count of applications in that bin is
    divided by the cumulative count of all applications, month by month. The
    'unknown' share is one minus the sum of the listed bin shares.

    Returns [table, shares]: the month-by-bin table (with an 'unknown' column)
    and the same numbers as a list of per-bin lists, 'unknown' last.
    """
    ## Get table
    # Explicit copy: the 'ranking' column is added to a private frame, never to
    # a view of the caller's DataFrame.
    df2 = df[['app_id', 'applied_month_xsell', column_name]].copy()
    df2.loc[:,'ranking'] = ranking_list(df, column_name, bins, method, target)#.fillna(value = 'unknown')
    list_of_bin = binning_list_plus(df, column_name, bins, method, target)

    ####################
    # dropna = False here as well: the pivot below keeps rows whose month is
    # missing, so the totals must keep them too or the two arrays differ in
    # length (this also matches count_binning_N).
    list_of_total_by_month = df.groupby('applied_month_xsell', dropna = False)\
                               .aggregate({'app_id': 'count'})\
                               .reset_index()\
                               ['app_id']\
                               .cumsum().tolist()
    totals = np.array(list_of_total_by_month)
    df3 = df2[['app_id', 'applied_month_xsell', 'ranking']]\
                            .groupby(['applied_month_xsell', 'ranking'], dropna = False)\
                            .aggregate({'app_id': 'count'})\
                            .reset_index()
    df4 = df3.pivot(index = 'applied_month_xsell', columns = 'ranking', values = 'app_id')\
                    .fillna(value = 0)

    ## Get distribution
    distribution = []
    for binning in list_of_bin:
        df4.loc[:, binning] = np.array(df4[binning].cumsum().tolist()) / totals
        distribution.append(df4[binning].tolist())

    # Whatever is not covered by the listed bins is reported as 'unknown'.
    unknown_share = (np.array([1]*len(distribution[0])) - sum(np.array(distribution))).tolist()
    distribution.append(unknown_share)

    df4.loc[:,'unknown'] = unknown_share

    return [df4, distribution]

def count_binning_final(df, column_name, bins = None, method = None, target = None):
    """Return the distribution lists of a feature, or 'Do nothing' for S1 columns.

    Categorical (N) columns are handled by count_binning_N, all other columns
    by count_binning_other; the second element of their result is returned.
    """
    is_categorical = (column_name in binning_choice_N) or (column_name in optimal_binning_N)
    if (not is_categorical) and (column_name in binning_choice_S1):
        return 'Do nothing'
    compute = count_binning_N if is_categorical else count_binning_other
    return compute(df, column_name, bins, method, target)[1]

def distribution_table_final(df, column_name, bins = None, method = None, target = None):
    """Return the distribution table of a feature, or 'Do nothing' for S1 columns.

    Categorical (N) columns are handled by count_binning_N, all other columns
    by count_binning_other; the first element of their result is returned.
    """
    is_categorical = (column_name in binning_choice_N) or (column_name in optimal_binning_N)
    if (not is_categorical) and (column_name in binning_choice_S1):
        return 'Do nothing'
    compute = count_binning_N if is_categorical else count_binning_other
    return compute(df, column_name, bins, method, target)[0]
    
def distribution_plotting(df, column_name, bins = None, method = None, target = None): #Do not use for column in binning_choice_S1
    """Build a stacked bar chart of the distribution of a feature by month.

    One bar trace is added per label from binning_list_final, using the
    matching list from count_binning_final as its values and the module-level
    list_of_month as its x axis. Returns the plotly Figure.
    """
    list_of_bin = binning_list_final(df, column_name, bins, method, target)
    distribution = count_binning_final(df, column_name, bins, method, target)

    fig = go.Figure()
    for k, bin_name in enumerate(list_of_bin):
        fig.add_trace(
                    go.Bar(
                            x = list_of_month,
                            y = distribution[k],
                            name = bin_name
                            )
                    )

    # The layout does not depend on the traces, so it is applied once here
    # instead of once per trace inside the loop.
    fig.update_layout(
        title="Distribution of {}".format(column_name),
        xaxis_title="Month",
        yaxis_title="Percent",
        font=dict(family="Calibri", size=12, color="Black")
    )
    fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
    fig.update_layout(dragmode='pan', hovermode='closest', hoverdistance=10)
    fig.update_layout(legend=dict(x=1, y=1, bgcolor='rgba(0,0,0,0)'))
    fig.update_layout(margin=dict(b=20, t=25, l=0, r=0))
    fig.update_xaxes(showspikes=True, spikemode='across',
                     spikesnap='cursor', spikedash='dot')
    fig.update_yaxes(showspikes=True, spikemode='across',
                     spikesnap='cursor', spikedash='dot')
    fig.update_xaxes(showgrid=True)
    return fig

# Del_Mob Line Charts

In [27]:
def del_mob_N(df, column_name, bins = None, method = None, target = None): #Only for type N
    """Cumulative target rate of a categorical feature by application month.

    For every category from binning_list_plus (and for 'unknown'), the
    cumulative sum of the target within that category is divided by the
    cumulative count of non-missing target values over all categories, month by
    month. When the data holds no 'unknown' category its rate is all zeros.

    Returns [table, rates]: the month-by-category table and the same numbers as
    a list of per-category lists, 'unknown' last.
    """
    ## Get table
    categories = binning_list_plus(df, column_name, bins, method, target)
    frame = df[['app_id', 'applied_month_xsell', column_name]]
    frame.loc[:,'target'] = df[target].replace('unknown', 0) #It is not necessary

    # Running count of target values up to and including each month.
    monthly_counts = frame.groupby('applied_month_xsell', dropna = False)\
                          .aggregate({'target': 'count'})\
                          .reset_index()
    running_totals = np.array(monthly_counts['target'].cumsum().tolist())

    # Target sum per (month, category), reshaped to one column per category.
    sums = frame[['app_id','applied_month_xsell', column_name, 'target']]\
                .groupby(['applied_month_xsell', column_name], dropna = False)\
                .aggregate({'target': 'sum'})\
                .reset_index()
    table = sums.pivot(index = 'applied_month_xsell', columns = column_name, values = 'target')\
                .fillna(value = 0)

    ## Get percent
    rates = []
    for category in categories:
        table.loc[:, category] = np.array(table[category].cumsum().tolist()) / running_totals
        rates.append(table[category].tolist())

    if 'unknown' in table.columns.to_list():
        unknown_rate = (np.array(table['unknown'].cumsum().tolist()) / running_totals).tolist()
    else:
        unknown_rate = [0]*len(table)
    rates.append(unknown_rate)
    table.loc[:, 'unknown'] = unknown_rate
    return [table, rates]

def del_mob_other(df, column_name, bins = None, method = None, target = None): #For type S_2, Y_int, Y_float
    """Cumulative target rate of a binned (ranked) feature by application month.

    Rows are assigned a bin label with ranking_list. For every label from
    binning_list_plus (and for 'unknown'), the cumulative sum of the target
    within that bin is divided by the cumulative count of non-missing target
    values over all bins, month by month. When no row is labelled 'unknown'
    its rate is all zeros.

    Returns [table, rates]: the month-by-bin table and the same numbers as a
    list of per-bin lists, 'unknown' last.
    """
    ## Get table
    bin_labels = binning_list_plus(df, column_name, bins, method, target)
    frame = df[['app_id', 'applied_month_xsell', column_name]]
    frame.loc[:, 'ranking'] = ranking_list(df, column_name, bins, method, target)
    frame.loc[:, 'target'] = df[target].replace('unknown', 0) #It is not necessary

    # Running count of target values up to and including each month.
    monthly_counts = frame.groupby('applied_month_xsell', dropna = False)\
                          .aggregate({'target': 'count'})\
                          .reset_index()
    running_totals = np.array(monthly_counts['target'].cumsum().tolist())

    # Target sum per (month, bin), reshaped to one column per bin.
    sums = frame.groupby(['applied_month_xsell', 'ranking'], dropna = False)\
                .aggregate({'target': 'sum'})\
                .reset_index()
    table = sums.pivot(index = 'applied_month_xsell', columns = 'ranking', values = 'target')\
                .fillna(value = 0)

    ## Get percent
    rates = []
    for bin_label in bin_labels:
        table.loc[:, bin_label] = np.array(table[bin_label].cumsum().tolist()) / running_totals
        rates.append(table[bin_label].tolist())

    if 'unknown' in table.columns.to_list():
        unknown_rate = (np.array(table['unknown'].cumsum().tolist()) / running_totals).tolist()
    else:
        unknown_rate = [0]*len(table)
    rates.append(unknown_rate)
    table.loc[:, 'unknown'] = unknown_rate
    return [table, rates]

def del_mob_percent_final(df, column_name, bins = None, method = None, target = None):
    """Return the target-rate lists of a feature, or 'Do nothing' for S1 columns.

    Categorical (N) columns are handled by del_mob_N, all other columns by
    del_mob_other; the second element of their result is returned.
    """
    is_categorical = (column_name in binning_choice_N) or (column_name in optimal_binning_N)
    if (not is_categorical) and (column_name in binning_choice_S1):
        return 'Do nothing'
    compute = del_mob_N if is_categorical else del_mob_other
    return compute(df, column_name, bins, method, target)[1]

def del_mob_table_final(df, column_name, bins = None, method = None, target = None):
    """Return the target-rate table of a feature, or 'Do nothing' for S1 columns.

    Categorical (N) columns are handled by del_mob_N, all other columns by
    del_mob_other; the first element of their result is returned.
    """
    is_categorical = (column_name in binning_choice_N) or (column_name in optimal_binning_N)
    if (not is_categorical) and (column_name in binning_choice_S1):
        return 'Do nothing'
    compute = del_mob_N if is_categorical else del_mob_other
    return compute(df, column_name, bins, method, target)[0]
    
def del_mob_plotting(df, column_name, bins = None, method = None, target = None): #Do not use for column in binning_choice_S1
    """Build a line chart of the target rate of a feature by month.

    One line trace is added per label from binning_list_final, using the
    matching list from del_mob_percent_final as its values and the module-level
    list_of_month as its x axis. Returns the plotly Figure.
    """
    list_of_bin = binning_list_final(df, column_name, bins, method, target)
    del_mob = del_mob_percent_final(df, column_name, bins, method, target)

    fig = go.Figure()
    for k, bin_name in enumerate(list_of_bin):
        fig.add_trace(
                    go.Scatter(
                            x = list_of_month,
                            y = del_mob[k],
                            name = bin_name
                            )
                    )

    # The layout does not depend on the traces, so it is applied once here
    # instead of once per trace inside the loop.
    fig.update_layout(
        title="{} % by {}".format(target, column_name),
        xaxis_title="Month",
        yaxis_title="Percent",
        font=dict(family="Calibri", size=12, color="Black")
    )
    fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
    fig.update_layout(dragmode='pan', hovermode='closest', hoverdistance=10)
    fig.update_layout(legend=dict(x=1, y=1, bgcolor='rgba(0,0,0,0)'))
    fig.update_layout(margin=dict(b=20, t=25, l=0, r=0))
    fig.update_xaxes(showspikes=True, spikemode='across',
                     spikesnap='cursor', spikedash='dot')
    fig.update_yaxes(showspikes=True, spikemode='across',
                     spikesnap='cursor', spikedash='dot')
    fig.update_xaxes(showgrid=True)
    return fig

# Binning Decision

In [21]:
def do_two_lines_cut(list1, list2):
    """Report whether two equally long lines cross or touch each other.

    The element-wise difference ``list1 - list2`` is inspected pair by pair:
    if any two consecutive differences have a product ``<= 0`` (a sign change,
    or at least one of them is zero) the lines are considered to cut.

    Returns
    -------
    str
        ``'Need to be re-binned (cut together)'`` when such a pair exists,
        otherwise ``'To be considered in the next step'`` (this includes
        inputs with fewer than two points).
    """
    gaps = (np.array(list1) - np.array(list2)).tolist()
    # Walk over consecutive pairs (gap[i], gap[i+1]).
    crosses = any(left * right <= 0 for left, right in zip(gaps, gaps[1:]))
    if crosses:
        return 'Need to be re-binned (cut together)'
    return 'To be considered in the next step'
    
def do_two_lines_close(list1, list2):
    """Classify two lines by how small their point-wise absolute gap is.

    Two fixed tolerances are used: ``0.005`` (wide) and ``0.003`` (tight).

    - Largest gap ``<= 0.005``: ``'No need'`` if the smallest gap is
      ``>= 0.003``, else ``'Need to be re-binned (close to each other)'``.
    - Largest gap ``> 0.005``: ``'No need'`` if the smallest gap is
      ``>= 0.005``, else ``'To be considered in the next step'``.

    Raises ``ValueError`` (from ``max``) when the inputs are empty.
    """
    wide_tol = 0.005
    tight_tol = 0.003
    gaps = np.absolute(np.array(list1) - np.array(list2)).tolist()
    widest = max(gaps)
    narrowest = min(gaps)
    # The largest gap selects both the threshold for the smallest gap and
    # the verdict returned when that threshold is not met.
    if widest <= wide_tol:
        threshold = tight_tol
        otherwise = 'Need to be re-binned (close to each other)'
    else:
        threshold = wide_tol
        otherwise = 'To be considered in the next step'
    return 'No need' if narrowest >= threshold else otherwise
    
def do_two_lines_parallel(list1, list2):
    """Decide whether two lines run roughly parallel to each other.

    For every pair of consecutive points the change in the absolute gap
    between the lines is divided by a fixed step of ``0.01`` and converted
    to an angle with ``atan``. The lines count as parallel when the mean of
    those angles is at most ``pi / 12``.

    Returns
    -------
    str
        ``'No need'`` when the mean angle is within the cut-off, otherwise
        ``'To be considered in the next step'``.

    Raises ``ZeroDivisionError`` when fewer than two points are supplied,
    because there are no angles to average.
    """
    step = 0.01
    max_mean_angle = m.pi/12
    gaps = np.absolute(np.array(list1) - np.array(list2)).tolist()
    angles = [
        m.atan(abs(later - earlier)/step)
        for earlier, later in zip(gaps, gaps[1:])
    ]
    mean_angle = sum(angles)/len(angles)
    if mean_angle > max_mean_angle:
        return 'To be considered in the next step'
    return 'No need'
    
def do_two_lines_asymptote(list1, list2):
    """Extrapolate both lines and report whether the extended lines cross.

    Each line is copied and extended by six extra points. Every new point is
    the current last point plus ``0.01`` times the mean of the last two
    consecutive steps. The extended lines are then handed to
    ``do_two_lines_cut``.

    Parameters
    ----------
    list1, list2
        Sequences of numbers. They are not modified; copies are extended.
        NOTE(review): the extrapolation reads the last three points, so
        inputs are presumably expected to have at least three elements --
        confirm with callers.

    Returns
    -------
    str
        ``'Need to be re-binned (asymptote)'`` when ``do_two_lines_cut``
        reports a crossing on the extended lines, otherwise
        ``'To be considered manually'``.
    """
    unit = 0.01
    shift_forward = 6
    a = len(list1)
    b = len(list2)
    # Copy the inputs: aliasing them would extend the caller's own lists.
    list3 = list(list1)
    list4 = list(list2)
    for i in range(shift_forward):
        # Index a-1+i is always the current last element of the growing copy.
        list3.append((list3[a-1+i] - list3[a-2+i] + list3[a-2+i] - list3[a-3+i])/2 * unit + list3[a-1+i])
        list4.append((list4[b-1+i] - list4[b-2+i] + list4[b-2+i] - list4[b-3+i])/2 * unit + list4[b-1+i])
    # do_two_lines_cut signals a crossing with exactly this message. The
    # earlier comparison against 'Need to be re-binned' could never match,
    # which made the asymptote verdict unreachable.
    if do_two_lines_cut(list3, list4) == 'Need to be re-binned (cut together)':
        return 'Need to be re-binned (asymptote)'
    return 'To be considered manually'
        
def do_two_lines_re_bin(list1, list2):
    """Run the line-comparison checks in order and return a short verdict code.

    The checks are applied as a cascade; the first decisive one wins:

    1. ``do_two_lines_cut`` reports a cut            -> ``'Y1'``
    2. ``do_two_lines_close`` reports closeness      -> ``'Y2'``
       ``do_two_lines_close`` reports ``'No need'``  -> ``'N'``
    3. ``do_two_lines_parallel`` reports ``'No need'`` -> ``'N'``
    4. ``do_two_lines_asymptote`` reports an asymptote -> ``'Y3'``
    5. anything else                                 -> ``'Considered'``
    """
    if do_two_lines_cut(list1, list2) == 'Need to be re-binned (cut together)':
        return 'Y1'

    closeness = do_two_lines_close(list1, list2)
    if closeness == 'Need to be re-binned (close to each other)':
        return 'Y2'
    if closeness == 'No need':
        return 'N'

    if do_two_lines_parallel(list1, list2) == 'No need':
        return 'N'

    if do_two_lines_asymptote(list1, list2) == 'Need to be re-binned (asymptote)':
        return 'Y3'
    return 'Considered'

def is_feature_need_to_be_rebinned(df, column_name, bins = None, method = None, target = None):
    """Summarise the pairwise re-binning verdicts for one feature.

    The bin list comes from ``binning_list_final`` and the per-bin series
    from ``del_mob_percent_final``. Every pair of distinct bins is compared
    with ``do_two_lines_re_bin``, except that the last bin in the list is
    never part of a pair.
    NOTE(review): the last bin is presumably a catch-all such as 'unknown' --
    confirm that excluding it is intended.

    Returns
    -------
    str
        ``'Y'`` if any pair yields ``'Y1'``, ``'Y2'`` or ``'Y3'``;
        otherwise ``'Considered'`` if any pair yields ``'Considered'``;
        otherwise ``'N'`` (also when there are no pairs to compare).
    """
    list_of_bin = binning_list_final(df, column_name, bins, method, target)
    del_mob = del_mob_percent_final(df, column_name, bins, method, target)

    last_index = len(list_of_bin) - 1  # index of the bin left out of all pairs
    verdicts = [
        do_two_lines_re_bin(del_mob[first], del_mob[second])
        for first in range(last_index - 1)
        for second in range(first + 1, last_index)
    ]

    if any(code in verdicts for code in ('Y1', 'Y2', 'Y3')):
        return 'Y'
    if 'Considered' in verdicts:
        return 'Considered'
    return 'N'

# Group categorical bins

In [22]:
def group_categorical_values(df, column_name, bins = None, method = None, target = None):
    """Merge existing bins into groups by summing their columns.

    ``bins`` is a sequence of groups, each group being an iterable of column
    labels found in the tables returned by ``distribution_table_final`` and
    ``del_mob_table_final``. For every group, the named columns are added
    element-wise and stored under the column name ``str(group)``.

    Returns
    -------
    list
        ``[distribution_frame, del_mob_frame]``: two DataFrames that each
        start with an ``'applied_month_xsell'`` column holding the
        module-level ``list_of_month``, followed by one column per group.

    NOTE(review): the ``del_mob`` columns are summed in the same way as the
    distribution columns. If they hold rates rather than counts, a sum may
    not equal the rate of the merged bin -- confirm this is intended.
    """
    distribution = distribution_table_final(df, column_name, bins, method, target)
    del_mob = del_mob_table_final(df, column_name, bins, method, target)

    def _summed_groups(source):
        # Build one output frame: the month column plus one summed column per group.
        merged = pd.DataFrame()
        merged['applied_month_xsell'] = list_of_month
        for group in bins:
            total = np.array([0] * len(merged))
            for label in group:
                total = total + np.array(source[label])
            merged[str(group)] = total
        return merged

    return [_summed_groups(distribution), _summed_groups(del_mob)]

# Demonstration

In [None]:
# Demonstration cell: runs the re-binning check and the plotting helpers on
# one feature, once per binning choice (manual bins, 'optimal_binning',
# 'statistics'). Relies on `df` and `list_column` defined earlier in the file.
# NOTE(review): `target` is assigned here but the calls below pass the
# literal 'del90_mob12_app' instead of this variable.
target = 'del90_mob12_app'
# Manual bin edges used for the method = None calls.
bin_list = [
            0
            ]
# Index into list_column of the feature being demonstrated.
i = 5
# Re-binning verdict for each of the three binning choices.
print(is_feature_need_to_be_rebinned(df, list_column[i], bins = bin_list, target = 'del90_mob12_app'))
print(is_feature_need_to_be_rebinned(df, list_column[i], method = 'optimal_binning', target = 'del90_mob12_app'))
print(is_feature_need_to_be_rebinned(df, list_column[i], method = 'statistics', target = 'del90_mob12_app'))

# Manual bins: distribution chart, then bad-rate chart.
distribution_plotting(df, list_column[i], bins = bin_list , method = None).show()
del_mob_plotting(df, list_column[i], bins = bin_list , method = None, target = 'del90_mob12_app').show()

# Optimal Binning method: distribution chart, then bad-rate chart.
distribution_plotting(df, list_column[i], method = 'optimal_binning', target = 'del90_mob12_app').show()
del_mob_plotting(df, list_column[i], method = 'optimal_binning', target = 'del90_mob12_app').show()

# Statistics method: distribution chart, then bad-rate chart.
distribution_plotting(df, list_column[i], method = 'statistics').show()
del_mob_plotting(df, list_column[i], method = 'statistics', target = 'del90_mob12_app').show()

# Each inner list names the existing bins to merge into one new group.
# NOTE(review): this call uses list_column[4] while the calls above use
# list_column[i] with i = 5 -- confirm the different feature is intended.
bin2 = [['<= 0'], ['<= 1', '> 1'], ['unknown']]
group_categorical_values(df, list_column[4], bins = bin2, method = 'optimal_binning', target = 'del90_mob12_app')

In [64]:
# Print a summary of df; the captured output of this call follows below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113743 entries, 0 to 113742
Data columns (total 57 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   cif_nb                      113743 non-null  int64         
 1   app_id                      113743 non-null  object        
 2   applied_date                113743 non-null  datetime64[ns]
 3   product_group_dt2           113743 non-null  object        
 4   max_dpdall_ref              104599 non-null  float64       
 5   max_dpd_ref                 104599 non-null  float64       
 6   prev_approved               113743 non-null  int64         
 7   prev_rejected               113743 non-null  int64         
 8   good_response_ratio         113743 non-null  float64       
 9   bad_response_ratio          113743 non-null  float64       
 10  max_nationalid_date         113743 non-null  float64       
 11  min_nationalid_date         113743 non-