## Encoding (1-out-of-K and differential coding)

In [32]:
import numpy as np
import pandas as pd

In [33]:
# Toy all-numeric dataset: four rows, two integer feature columns.
binary_X = pd.DataFrame(
    np.array([[1, 2], [3, 4], [5, 6], [7, 8]]),
    columns=['col1', 'col2'],
)

# None of the columns is treated as categorical.
binary_categorical = []

# Per-column bin edges used to discretise the numeric features.
binary_thresholds = {'col1': [1.0, 5.0, 7.0], 'col2': [2.0, 6.0, 8.0]}

In [34]:
# Toy mixed dataset. Going through np.array coerces every cell to a string,
# so 'col1' holds the strings '1', '3', '5', '7' until it is cast to float.
binary_categorical_X = pd.DataFrame(
    np.array([[1, 'a'], [3, 'b'], [5, 'c'], [7, 'a']]),
    columns=['col1', 'col2'],
)

# 'col2' is the only categorical column.
binary_categorical_categorical = ['col2']

# 'col1' gets numeric bin edges; 'col2' lists its categories in order.
binary_categorical_thresholds = {'col1': [2.0, 6.0, 8.0], 'col2': ['a', 'b', 'c']}

In [35]:
# Toy ordinal dataset: each (col1, col2) pair is repeated for col3 = 0, 1, 2,
# giving 12 rows in total.
ordinal_X = pd.DataFrame(
    [
        [first, second, third]
        for first, second in [(1, 2), (3, 4), (5, 6), (7, 8)]
        for third in range(3)
    ],
    columns=['col1', 'col2', 'col3'],
)

# None of the columns is treated as categorical.
ordinal_categorical = []

# Per-column bin edges used to discretise the numeric features.
ordinal_thresholds = {
    'col1': [1.0, 3.0, 7.0],
    'col2': [2.0, 4.0, 8.0],
    'col3': [0.0, 1.0, 2.0],
}

##### 1 out of K

In [63]:
def disc_1_out_of_k(X, categorical, thresholds):
    """Encode every column of ``X`` as 1-out-of-K (one-hot) bin indicators.

    Numerical columns are cast to float and binned with ``np.digitize``
    against ``thresholds[col]`` (assumed sorted ascending), giving bin numbers
    ``0 .. len(thresholds[col])``. Categorical columns are mapped to the
    position of each value in ``thresholds[col]``; values not listed there get
    code ``-1``.

    Bin 0 is the reference level and gets no indicator column, so each input
    column contributes ``feat{col}-bin1 .. feat{col}-bin{num_bins - 1}``.
    Rows in bin 0 (or with an unlisted category) are all zeros for that
    column.

    Args:
        X: DataFrame to encode. It is not modified.
        categorical: Collection of column names to treat as categorical.
        thresholds: Mapping from column name to its bin edges (numerical
            columns) or to its ordered list of categories (categorical ones).

    Returns:
        DataFrame of 0/1 integers sharing ``X``'s index, with the indicator
        columns of each input column in numeric bin order.
    """
    encoded_columns = []

    for col in X.columns:
        if col in categorical:
            print(f"Processing categorical column: {col}")
            codes = pd.Categorical(X[col], categories=thresholds[col], ordered=True).codes
            print("bin: ", codes)
            num_bins = len(thresholds[col])
        else:
            print(f"Processing numerical column: {col}")
            # Cast into a local Series: assigning back into X would silently
            # change the dtype of the caller's DataFrame.
            values = X[col].astype(float)
            codes = np.digitize(values, thresholds[col])  # bin number of each row
            print("bin: ", codes)
            num_bins = len(thresholds[col]) + 1

        # Build the indicators directly from the expected bin numbers. This
        # keeps X's index, emits bins absent from the data as all-zero columns,
        # orders columns numerically (bin2 before bin10) and never emits a
        # column for bin 0 or for the "unknown category" code -1.
        bins_df = pd.DataFrame(
            {
                f'feat{col}-bin{i}': (codes == i).astype(int)
                for i in range(1, num_bins)
            },
            index=X.index,
        )
        encoded_columns.append(bins_df)

    if encoded_columns:
        X_disc = pd.concat(encoded_columns, axis=1)
    else:
        # pd.concat raises on an empty list; a frame without columns encodes
        # to a frame without columns.
        X_disc = pd.DataFrame(index=X.index)

    # show encoded version of X
    print("X_disc shape: ", X_disc.shape)
    print("X_disc columns: ", X_disc.columns)
    print("X_disc: ", X_disc)

    return X_disc

In [None]:
# 1-out-of-K encoding of the all-numeric toy dataset.
binary_X_1ook = disc_1_out_of_k(
    X=binary_X,
    categorical=binary_categorical,
    thresholds=binary_thresholds,
)

Processing numerical column: col1
bin:  [1 1 2 3]
Processing numerical column: col2
bin:  [1 1 2 3]
X_disc shape:  (4, 6)
X_disc columns:  Index(['featcol1-bin1', 'featcol1-bin2', 'featcol1-bin3', 'featcol2-bin1',
       'featcol2-bin2', 'featcol2-bin3'],
      dtype='object')
X_disc:     featcol1-bin1  featcol1-bin2  featcol1-bin3  featcol2-bin1  featcol2-bin2  \
0              1              0              0              1              0   
1              1              0              0              1              0   
2              0              1              0              0              1   
3              0              0              1              0              0   

   featcol2-bin3  
0              0  
1              0  
2              0  
3              1  


In [None]:
# 1-out-of-K encoding of the mixed numeric/categorical toy dataset.
binary_categorical_X_1ook = disc_1_out_of_k(
    X=binary_categorical_X,
    categorical=binary_categorical_categorical,
    thresholds=binary_categorical_thresholds,
)

Processing numerical column: col1
bin:  [0 1 1 2]
Processing categorical column: col2
bin:  [0 1 2 0]
X_disc shape:  (4, 5)
X_disc columns:  Index(['featcol1-bin1', 'featcol1-bin2', 'featcol1-bin3', 'featcol2-bin1',
       'featcol2-bin2'],
      dtype='object')
X_disc:     featcol1-bin1  featcol1-bin2  featcol1-bin3  featcol2-bin1  featcol2-bin2
0              0              0              0              0              0
1              1              0              0              1              0
2              1              0              0              0              1
3              0              1              0              0              0


In [None]:
# 1-out-of-K encoding of the ordinal toy dataset.
ordinal_X_1ook = disc_1_out_of_k(
    X=ordinal_X,
    categorical=ordinal_categorical,
    thresholds=ordinal_thresholds,
)

Processing numerical column: col1
bin:  [1 1 1 2 2 2 2 2 2 3 3 3]
Processing numerical column: col2
bin:  [1 1 1 2 2 2 2 2 2 3 3 3]
Processing numerical column: col3
bin:  [1 2 3 1 2 3 1 2 3 1 2 3]
X_disc shape:  (12, 9)
X_disc columns:  Index(['featcol1-bin1', 'featcol1-bin2', 'featcol1-bin3', 'featcol2-bin1',
       'featcol2-bin2', 'featcol2-bin3', 'featcol3-bin1', 'featcol3-bin2',
       'featcol3-bin3'],
      dtype='object')
X_disc:      featcol1-bin1  featcol1-bin2  featcol1-bin3  featcol2-bin1  featcol2-bin2  \
0               1              0              0              1              0   
1               1              0              0              1              0   
2               1              0              0              1              0   
3               0              1              0              0              1   
4               0              1              0              0              1   
5               0              1              0              0        

##### differential coding

In [73]:
def disc_diff_coding(X, categorical, thresholds):
    """Encode every column of ``X`` with differential (cumulative) coding.

    Numerical columns are cast to float and binned with ``np.digitize``
    against ``thresholds[col]`` (assumed sorted ascending), giving bin numbers
    ``0 .. len(thresholds[col])``. Categorical columns are mapped to the
    position of each value in ``thresholds[col]``; values not listed there get
    code ``-1``.

    For each input column the output holds ``feat{col}-bin1 ..
    feat{col}-bin{num_bins - 1}``, where column ``bin{i}`` is 1 for every row
    whose bin number is at least ``i``. A row in bin ``b`` therefore has its
    first ``b`` indicators set; rows in bin 0 (or with an unlisted category)
    are all zeros.

    Args:
        X: DataFrame to encode. It is not modified.
        categorical: Collection of column names to treat as categorical.
        thresholds: Mapping from column name to its bin edges (numerical
            columns) or to its ordered list of categories (categorical ones).

    Returns:
        DataFrame of 0/1 integers sharing ``X``'s index.
    """
    encoded_columns = []

    for col in X.columns:
        if col in categorical:
            print(f"Processing categorical column: {col}")
            # Convert categorical column to codes
            codes = pd.Categorical(X[col], categories=thresholds[col], ordered=True).codes
            print("bin: ", codes)
            num_bins = len(thresholds[col])
        else:
            print(f"Processing numerical column: {col}")
            # Cast into a local Series: assigning back into X would silently
            # change the dtype of the caller's DataFrame.
            values = X[col].astype(float)
            codes = np.digitize(values, thresholds[col])  # bin number of each row
            print("bin: ", codes)
            num_bins = len(thresholds[col]) + 1

        # Indicator i is set for every row that reaches at least bin i.
        bin_df = pd.DataFrame(
            {
                f'feat{col}-bin{i}': (codes >= i).astype(int)
                for i in range(1, num_bins)
            },
            index=X.index,
        )
        encoded_columns.append(bin_df)

    if encoded_columns:
        X_disc = pd.concat(encoded_columns, axis=1)
    else:
        # pd.concat raises on an empty list; a frame without columns encodes
        # to a frame without columns.
        X_disc = pd.DataFrame(index=X.index)

    print("X_disc shape: ", X_disc.shape)
    print("X_disc columns: ", X_disc.columns)
    print("X_disc: ", X_disc)
    return X_disc

In [74]:
# Differential coding of the all-numeric toy dataset.
binary_X_diffcoding = disc_diff_coding(
    X=binary_X,
    categorical=binary_categorical,
    thresholds=binary_thresholds,
)

Processing numerical column: col1
bin:  [1 1 2 3]
Processing numerical column: col2
bin:  [1 1 2 3]
X_disc shape:  (4, 6)
X_disc columns:  Index(['featcol1-bin1', 'featcol1-bin2', 'featcol1-bin3', 'featcol2-bin1',
       'featcol2-bin2', 'featcol2-bin3'],
      dtype='object')
X_disc:     featcol1-bin1  featcol1-bin2  featcol1-bin3  featcol2-bin1  featcol2-bin2  \
0              1              0              0              1              0   
1              1              0              0              1              0   
2              1              1              0              1              1   
3              1              1              1              1              1   

   featcol2-bin3  
0              0  
1              0  
2              0  
3              1  


In [75]:
# Differential coding of the mixed numeric/categorical toy dataset.
binary_categorical_X_diffcoding = disc_diff_coding(
    X=binary_categorical_X,
    categorical=binary_categorical_categorical,
    thresholds=binary_categorical_thresholds,
)

Processing numerical column: col1
bin:  [0 1 1 2]
Processing categorical column: col2
bin:  [0 1 2 0]
X_disc shape:  (4, 5)
X_disc columns:  Index(['featcol1-bin1', 'featcol1-bin2', 'featcol1-bin3', 'featcol2-bin1',
       'featcol2-bin2'],
      dtype='object')
X_disc:     featcol1-bin1  featcol1-bin2  featcol1-bin3  featcol2-bin1  featcol2-bin2
0              0              0              0              0              0
1              1              0              0              1              0
2              1              0              0              1              1
3              1              1              0              0              0


In [76]:
# Differential coding of the ordinal toy dataset.
ordinal_X_diffcoding = disc_diff_coding(
    X=ordinal_X,
    categorical=ordinal_categorical,
    thresholds=ordinal_thresholds,
)

Processing numerical column: col1
bin:  [1 1 1 2 2 2 2 2 2 3 3 3]
Processing numerical column: col2
bin:  [1 1 1 2 2 2 2 2 2 3 3 3]
Processing numerical column: col3
bin:  [1 2 3 1 2 3 1 2 3 1 2 3]
X_disc shape:  (12, 9)
X_disc columns:  Index(['featcol1-bin1', 'featcol1-bin2', 'featcol1-bin3', 'featcol2-bin1',
       'featcol2-bin2', 'featcol2-bin3', 'featcol3-bin1', 'featcol3-bin2',
       'featcol3-bin3'],
      dtype='object')
X_disc:      featcol1-bin1  featcol1-bin2  featcol1-bin3  featcol2-bin1  featcol2-bin2  \
0               1              0              0              1              0   
1               1              0              0              1              0   
2               1              0              0              1              0   
3               1              1              0              1              1   
4               1              1              0              1              1   
5               1              1              0              1        