## discretization thresholds (CAIM and Infinitesimal Bins)

In [1]:
from libraries.caimcaim import CAIMD
import numpy as np
import pandas as pd

In [2]:
# Toy dataset: two numeric feature columns with a binary label.
binary_X = pd.DataFrame(
    np.array([[1, 2], [3, 4], [5, 6], [7, 8]]),
    columns=['col1', 'col2'],
)

binary_y = pd.Series(np.array([0, 1, 0, 1]))

# This dataset has no categorical columns.
binary_categorical = []

In [3]:
# Toy dataset: one numeric-looking column plus one categorical column.
# NOTE: NumPy coerces the mixed int/str rows to strings, so 'col1' holds
# '1', '3', '5', '7' (strings) rather than integers.
binary_categorical_X = pd.DataFrame(
    np.array([[1, 'a'], [3, 'b'], [5, 'c'], [7, 'a']]),
    columns=['col1', 'col2'],
)

binary_categorical_y = pd.Series(np.array([0, 1, 0, 1]))

# Names of the categorical columns in this dataset.
binary_categorical_categorical = ['col2']

In [4]:
# Toy dataset: each (col1, col2) pair is repeated once for every col3 value 0..2.
ordinal_X = pd.DataFrame(
    [
        [first, second, third]
        for first, second in ((1, 2), (3, 4), (5, 6), (7, 8))
        for third in range(3)
    ],
    columns=['col1', 'col2', 'col3'],
)

# One label per row, in the same row order as ordinal_X.
ordinal_y = pd.Series(
    np.array([
        0, 0, 0,
        1, 0, 0,
        1, 1, 0,
        1, 1, 1,
    ]),
    name='binary_label',
)

# This dataset has no categorical columns.
ordinal_categorical = []

##### CAIM

In [5]:
def discretize_caim(X, y, categorical, use_sbc=False):
    """Compute per-column discretization thresholds with CAIM.

    Non-categorical columns get the split scheme fitted by ``CAIMD``.
    Categorical columns get their sorted unique values (as strings).
    When ``use_sbc`` is true, the last column of ``X`` is kept out of the
    CAIM fit and its sorted distinct values (as floats) are used instead.

    The thresholds and the number of bins per column are printed as a
    side effect.

    Args:
        X: pandas DataFrame of features.
        y: labels, passed unchanged to ``CAIMD.fit_transform``.
        categorical: names of the categorical columns of ``X``.
        use_sbc: if True, treat the last column of ``X`` as the "sbc"
            column and exclude it from the CAIM fit.

    Returns:
        dict mapping each column name to its list of thresholds.
    """
    categorical = list(categorical)
    index_categorical = [X.columns.get_loc(col) for col in categorical]
    caim = CAIMD(categorical)

    # The sbc column (always the last one) must not be seen by CAIM.
    X_aux = X.copy()
    if use_sbc:
        sbc_column = X.columns[-1]
        X_aux = X_aux.drop(columns=[sbc_column])

    # Fitting populates caim.split_scheme; the transformed data is not needed.
    caim.fit_transform(X_aux, y)

    # caim.split_scheme holds one entry per non-categorical column.
    # Entries are matched to column names by position, which assumes the
    # dict iterates in column order -- TODO confirm against the CAIMD copy in use.
    index_non_categorical = [
        i for i in range(X_aux.shape[1]) if i not in index_categorical
    ]
    thresholds = {
        X_aux.columns[index_non_categorical[i]]: [float(val) for val in values]
        for i, values in enumerate(caim.split_scheme.values())
    }

    # Categorical features: the "thresholds" are the sorted unique values
    # themselves (as strings), i.e. one bin per category.
    for col in categorical:
        thresholds[col] = list(np.unique(X[col].astype(str)))

    # sbc column: the thresholds are its distinct values, sorted so the
    # result does not depend on set iteration order.
    if use_sbc:
        thresholds[sbc_column] = sorted({float(val) for val in X[sbc_column]})

    print("\nthresholds ", thresholds)
    print("num of bins: ")
    for key, value in thresholds.items():
        # Decide by column name: the position of a key inside `thresholds`
        # is not the column's index in X (numeric columns are inserted first).
        if key in categorical:
            print(f"  {key}: {len(value)}")
        else:
            # +1 because the number of bins is the number of thresholds + 1
            # e.g. if thresholds are [2, 4, 6], then there are 4 bins: (-inf, 2), [2, 4), [4, 6), [6, inf)
            print(f"  {key}: {len(value)+1}")

    return thresholds

In [6]:
# CAIM thresholds for the all-numeric binary dataset (no categorical columns).
binary_thresholds_caim = discretize_caim(binary_X, binary_y, binary_categorical)

# 0  GLOBAL CAIM  1.1666666666666665
# 1  GLOBAL CAIM  1.1666666666666665

thresholds  {'col1': [1.0, 5.0, 7.0], 'col2': [2.0, 6.0, 8.0]}
num of bins: 
  col1: 4
  col2: 4


In [7]:
# CAIM thresholds for the mixed dataset; 'col2' is declared categorical.
binary_categorical_thresholds_caim = discretize_caim(binary_categorical_X, binary_categorical_y, binary_categorical_categorical)

# 0  GLOBAL CAIM  1.1666666666666665

thresholds  {'col1': [1.0, 5.0, 7.0], 'col2': ['a', 'b', 'c']}
num of bins: 
  col1: 4
  col2: 3


In [8]:
# CAIM thresholds for the ordinal dataset; use_sbc=True keeps the last
# column ('col3') out of the CAIM fit and uses its distinct values instead.
ordinal_thresholds_caim = discretize_caim(ordinal_X, ordinal_y, ordinal_categorical, use_sbc=True)

# 0  GLOBAL CAIM  4.166666666666667
# 1  GLOBAL CAIM  4.166666666666667

thresholds  {'col1': [1.0, 3.0, 7.0], 'col2': [2.0, 4.0, 8.0], 'col3': [0.0, 1.0, 2.0]}
num of bins: 
  col1: 4
  col2: 4
  col3: 4


##### infinitesimal bins

In [9]:
# Numeric columns: thresholds are the midpoints between each pair of consecutive
# sorted unique values. Categorical columns: the unique values themselves.
def discretize_infbins(X, categorical):
    """Compute "infinitesimal bin" thresholds for every column of ``X``.

    For a non-categorical column the thresholds are the midpoints between
    each pair of consecutive sorted unique values. For a categorical column
    they are the sorted unique values themselves (as strings).

    The thresholds and the number of bins per column are printed as a
    side effect.

    Args:
        X: pandas DataFrame of features.
        categorical: names of the categorical columns of ``X``.

    Returns:
        dict mapping each column name to its list of thresholds.
    """
    thresholds = {}
    for col in X.columns:
        if col in categorical:
            thresholds[col] = list(np.unique(X[col].astype(str)))
        else:
            # Convert to float BEFORE np.unique: sorting first would order
            # string-typed numbers lexicographically ('1' < '10' < '2') and
            # produce midpoints between non-adjacent values.
            sorted_col = np.unique(X[col].astype(float))
            col_thresholds = (sorted_col[:-1] + sorted_col[1:]) / 2
            thresholds[col] = col_thresholds.tolist()

    print("\nthresholds ", thresholds)
    print("num of bins: ")
    for key, value in thresholds.items():
        if key in categorical:
            # One bin per category.
            print(f"  {key}: {len(value)}")
        else:
            # n thresholds split the axis into n + 1 bins.
            print(f"  {key}: {len(value)+1}")

    return thresholds

In [10]:
# Midpoint thresholds for the all-numeric binary dataset.
binary_thresholds_infbins = discretize_infbins(binary_X, binary_categorical)


thresholds  {'col1': [2.0, 4.0, 6.0], 'col2': [3.0, 5.0, 7.0]}
num of bins: 
  col1: 4
  col2: 4


In [11]:
# Midpoint thresholds for the mixed dataset; 'col2' is declared categorical.
binary_categorical_thresholds_infbins = discretize_infbins(binary_categorical_X, binary_categorical_categorical)


thresholds  {'col1': [2.0, 4.0, 6.0], 'col2': ['a', 'b', 'c']}
num of bins: 
  col1: 4
  col2: 3


In [12]:
# Midpoint thresholds for the ordinal dataset (all three columns treated as numeric).
ordinal_thresholds_infbins = discretize_infbins(ordinal_X, ordinal_categorical)


thresholds  {'col1': [2.0, 4.0, 6.0], 'col2': [3.0, 5.0, 7.0], 'col3': [0.5, 1.5]}
num of bins: 
  col1: 4
  col2: 4
  col3: 3
