In [1]:
import pandas as pd, numpy as np, os, re, math, time

In [2]:
def is_monotonic(temp_series):
    """Return True when the values never decrease or never increase.

    Neighbouring elements are compared through ``temp_series[k]`` lookups for
    k in 0..len-1, so the argument must support ``len()`` and that kind of
    indexing (e.g. a list, or a pandas Series whose index labels are
    0..len-1). Equal neighbours are allowed, and inputs with fewer than two
    elements count as monotonic.
    """
    last = len(temp_series) - 1

    # Non-decreasing scan first; the non-increasing scan only runs if it fails.
    if all(temp_series[k] <= temp_series[k + 1] for k in range(last)):
        return True
    return all(temp_series[k] >= temp_series[k + 1] for k in range(last))

In [3]:
def prepare_bins(bin_data, c_i, target_col, max_bins):
    """Quantile-bin the column ``c_i`` and report how the binning was obtained.

    ``pd.qcut`` is tried with ``max_bins`` bins, then one bin fewer each time
    down to 3. The first binning whose per-bin mean of ``target_col`` is
    monotonic (never decreasing or never increasing) is kept. If none
    qualifies, the column is split into 2 quantile bins instead.

    Parameters
    ----------
    bin_data : pandas.DataFrame
        Frame holding ``c_i`` and ``target_col``. A ``c_i + "_bins"`` column
        is written into this frame, so pass a copy if the caller's frame must
        stay untouched.
    c_i : str
        Name of the column to bin.
    target_col : str
        Name of the column whose per-bin mean is checked for monotonicity.
    max_bins : int
        Largest number of quantile bins to try.

    Returns
    -------
    tuple
        ``(class_col, remarks, frame)``. When binning succeeded, ``class_col``
        is ``c_i + "_bins"``, ``remarks`` is ``"binned monotonically"`` or
        ``"binned forcefully"``, and ``frame`` is a copy of the bin and target
        columns. Otherwise ``class_col`` is ``c_i``, ``remarks`` is
        ``"couldn't bin"``, and ``frame`` is a copy of the raw and target
        columns.
    """
    bin_col = c_i + "_bins"
    remarks = None  # stays None until some binning has been accepted

    # ----------------- Monotonic binning -----------------
    for n_bins in range(max_bins, 2, -1):
        try:
            bin_data[bin_col] = pd.qcut(bin_data[c_i], n_bins).astype(object)
        except ValueError:
            # qcut raises ValueError when the quantile edges are not unique;
            # fewer bins may still work. Any other error is a real problem
            # and is allowed to propagate instead of being hidden.
            continue
        # groupby orders the interval bins, so the means come out in bin order.
        bin_means = bin_data.groupby(bin_col)[target_col].mean()
        if bin_means.is_monotonic_increasing or bin_means.is_monotonic_decreasing:
            remarks = "binned monotonically"
            break

    # ----------------- Force binning -----------------
    if remarks is None:
        bin_data[bin_col] = pd.qcut(bin_data[c_i], 2, duplicates='drop').astype(object)
        # Dropping duplicate edges can leave a single bin; that is not a usable split.
        if bin_data[bin_col].nunique() == 2:
            remarks = "binned forcefully"

    if remarks is None:
        return c_i, "couldn't bin", bin_data[[c_i, target_col]].copy()
    return bin_col, remarks, bin_data[[bin_col, target_col]].copy()

In [13]:
def iv_woe_4iter(binned_data, target_col, class_col):
    """Compute per-class counts, WoE and IV contributions for one feature.

    Missing values are replaced by the string ``"Missing"`` so that they form
    their own class. For every class of ``class_col`` the function counts the
    non-null target rows, the rows where the target equals 0 ("good") and the
    rows where it equals 1 ("bad"), then derives the share of all goods and
    bads falling in the class, ``woe = ln(distbn_good / distbn_bad)`` and
    ``iv = (distbn_good - distbn_bad) * woe``.

    Parameters
    ----------
    binned_data : pandas.DataFrame
        Frame containing ``class_col`` and ``target_col``.
    target_col : str
        Name of the target column; values 0 and 1 are counted as good and bad.
    class_col : str
        Name of the grouping column. When it ends with ``"_bins"`` each class
        also receives an ordinal ``sample_class_label``.

    Returns
    -------
    pandas.DataFrame
        One row per class with the columns ``feature``, ``sample_class``,
        ``sample_class_label``, ``sample_count``, ``good_count``,
        ``bad_count``, ``distbn_good``, ``distbn_bad``, ``woe`` and ``iv``.
    """
    binned_data = binned_data.fillna("Missing")
    temp_groupby = (
        binned_data.groupby(class_col)[target_col]
        .agg(["count", lambda x: (x == 0).sum(), lambda x: (x == 1).sum()])
        .reset_index()
    )
    temp_groupby.columns = ["sample_class", "sample_count", "good_count", "bad_count"]
    temp_groupby["feature"] = class_col

    # Only a trailing "_bins" marks a binned column; a name that merely contains
    # the substring somewhere in the middle must not receive ordinal labels.
    if isinstance(class_col, str) and class_col.endswith("_bins"):
        # Category codes follow the sorted class order; the "Missing" class
        # (code -1 after being turned back into NaN) gets no label.
        temp_groupby["sample_class_label"] = (
            temp_groupby["sample_class"]
            .replace({"Missing": np.nan})
            .astype('category')
            .cat.codes
            .replace({-1: np.nan})
        )
    else:
        temp_groupby["sample_class_label"] = np.nan
    temp_groupby = temp_groupby[["feature", "sample_class", "sample_class_label", "sample_count", "good_count", "bad_count"]]

    # get distribution of good and bad
    # A class with a zero count uses 0.5 in the numerator so the log below
    # never sees a zero share. Totals are computed once, not once per row.
    good = temp_groupby["good_count"]
    bad = temp_groupby["bad_count"]
    temp_groupby['distbn_good'] = good.where(good > 0, good + 0.5) / good.sum()
    temp_groupby['distbn_bad'] = bad.where(bad > 0, bad + 0.5) / bad.sum()

    temp_groupby['woe'] = np.log(temp_groupby['distbn_good'] / temp_groupby['distbn_bad'])
    temp_groupby['iv'] = (temp_groupby['distbn_good'] - temp_groupby['distbn_bad']) * temp_groupby['woe']

    return temp_groupby

In [5]:
def var_iter(data, target_col, max_bins):
    """Build the WoE/IV table for every column of ``data`` except the target.

    Numeric columns with more than two distinct values are first binned with
    ``prepare_bins``; every other column is passed to ``iv_woe_4iter`` as is.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the feature columns and ``target_col``.
    target_col : str
        Name of the target column; it is skipped as a feature.
    max_bins : int
        Forwarded to ``prepare_bins`` as the largest bin count to try.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(woe_iv, remarks)``. ``woe_iv`` stacks the per-feature tables (each
        keeps its own row index); ``remarks`` has one row per feature with the
        columns ``feature`` and ``remarks``. Both are empty frames when
        ``data`` has no feature columns.
    """
    woe_frames = []
    remarks_list = []
    for c_i in data.columns:
        if c_i == target_col:
            continue
        # is_numeric_dtype inspects the column's dtype and also copes with
        # pandas extension dtypes, which np.issubdtype cannot interpret.
        if pd.api.types.is_numeric_dtype(data[c_i]) and data[c_i].nunique() > 2:
            class_col, remarks, binned_data = prepare_bins(data[[c_i, target_col]].copy(), c_i, target_col, max_bins)
            agg_data = iv_woe_4iter(binned_data.copy(), target_col, class_col)
            remarks_list.append({"feature": c_i, "remarks": remarks})
        else:
            agg_data = iv_woe_4iter(data[[c_i, target_col]].copy(), target_col, c_i)
            remarks_list.append({"feature": c_i, "remarks": np.nan})
        woe_frames.append(agg_data)

    # DataFrame.append no longer exists in pandas 2.x; collect the pieces and
    # concatenate once. pd.concat rejects an empty list, hence the guard.
    woe_iv = pd.concat(woe_frames) if woe_frames else pd.DataFrame()
    return woe_iv, pd.DataFrame(remarks_list)

In [10]:
def get_iv_woe(data, target_col, max_bins, fill_by_woe=False, woe_var_list=None):
    """Compute the per-feature IV summary and the per-class WoE table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the feature columns and ``target_col``.
    target_col : str
        Name of the target column.
    max_bins : int
        Largest number of quantile bins tried for numeric features.
    fill_by_woe : bool, optional
        Accepted but not used by this function.
    woe_var_list : list, optional
        Accepted but not used by this function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(iv, woe_iv)``. ``iv`` has one row per feature with the columns
        ``feature``, ``iv`` (sum over classes), ``number_of_classes``,
        ``feature_null_percent`` (share of nulls in ``data``) and ``remarks``.
        ``woe_iv`` has one row per feature class, with the ``"Missing"``
        placeholder turned back into NaN.

    Prints the elapsed time in minutes as a side effect.
    """
    func_start_time = time.time()
    woe_iv, binning_remarks = var_iter(data, target_col, max_bins)

    # Interval classes expose their bounds; any other class value is kept as is.
    woe_iv["sample_class_min"] = woe_iv["sample_class"].apply(
        lambda x: x.left if isinstance(x, pd.Interval) else x)
    woe_iv["sample_class_max"] = woe_iv["sample_class"].apply(
        lambda x: x.right if isinstance(x, pd.Interval) else x)

    # Strip only the trailing "_bins" suffix; an unanchored pattern would also
    # cut "_bins" out of the middle of a feature name and break the lookups below.
    # NOTE(review): a raw column whose own name ends in "_bins" is still renamed here.
    woe_iv["feature"] = woe_iv["feature"].replace(r"_bins$", "", regex=True)
    woe_iv = woe_iv[['feature', 'sample_class', 'sample_class_label', 'sample_class_min', 'sample_class_max',
                     'sample_count', 'good_count', 'bad_count', 'distbn_good', 'distbn_bad', 'woe', 'iv']]

    iv = woe_iv.groupby("feature")[["iv"]].agg(["sum", "count"]).reset_index()
    iv.columns = ["feature", "iv", "number_of_classes"]
    # The null share per column is computed once and looked up per feature.
    null_share = data.isnull().mean()
    iv["feature_null_percent"] = iv["feature"].map(null_share)
    iv = iv.merge(binning_remarks, on="feature", how="left")

    print("Total time elapsed: {} minutes".format(round((time.time() - func_start_time) / 60, 3)))
    return iv, woe_iv.replace({"Missing": np.nan})

## Load data

In [7]:
# Directory holding the input CSV. The default is the original author's local
# folder; set the IV_WOE_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
data_path = os.environ.get("IV_WOE_DATA_DIR", "D:\\ml_from_scratch\\blog_series\\blog_1\\data")
data = pd.read_csv(os.path.join(data_path, "encoded_data_4blog_1_2020_11_10.csv"))
print(data.shape)
data.head(1)

(1000, 8)


Unnamed: 0,number_of_missed_payments,number_of_bank_visits,score,income,use_online_streaming,number_of_bank_accounts,state,bad_customer
0,4.0,,,16601.0,0.0,5.0,West Bengal,1.0


In [14]:
# Run the IV/WoE pipeline on a copy of `data`, trying at most 20 quantile bins
# per numeric feature, then show the shapes of the two result tables.
iv, woe_iv = get_iv_woe(data.copy(), target_col="bad_customer", max_bins=20)
print(iv.shape, woe_iv.shape)

Total time elapsed: 0.015 minutes
(7, 5) (47, 12)


In [17]:
# Features ordered from highest to lowest information value.
iv.sort_values(by="iv", ascending=False)

Unnamed: 0,feature,iv,number_of_classes,feature_null_percent,remarks
3,number_of_missed_payments,0.225253,3,0.028,binned forcefully
2,number_of_bank_visits,0.158359,5,0.322,binned monotonically
5,state,0.100092,26,0.001,
0,income,0.029497,4,0.126,binned monotonically
4,score,0.024453,4,0.226,binned monotonically
1,number_of_bank_accounts,0.013772,3,0.028,binned forcefully
6,use_online_streaming,0.003143,2,0.0,
