# LightGBM - Merge Exclusive Features

- Modified from: https://github.com/meanxai/machine_learning/blob/main/12.LGBM/4.merge_features.py
- A detailed description of this code can be found in https://youtu.be/orSRRtWtPwE

In [7]:
# Algorithm 4: Merge Exclusive Features
def merge_features(numData, F):
    """Merge a bundle of features into a single feature (Algorithm 4).

    Every feature in the bundle is given an offset equal to the sum of the
    maximum bin values of the features before it, so that its non-zero bins
    occupy their own range in the merged feature.

    Args:
        numData: Number of rows to merge.
        F: Sequence of 1-D sequences/arrays of bin indices, one per feature
            in the bundle. A value of 0 is skipped and never written to the
            result.

    Returns:
        A tuple ``(newBin, binRanges)``. ``newBin`` is an int array of length
        ``numData`` with the merged bin indices. ``binRanges`` is the list of
        cumulative offsets, starting with 0 and holding one extra entry per
        feature.
    """
    binRanges = [0]
    totalBin = 0
    for f in F:
        # .item() yields a plain Python number, so the running total cannot
        # wrap around when the features are stored in a narrow dtype (uint8...).
        totalBin += np.max(f).item()
        binRanges.append(int(totalBin))  # convert to int for better display

    newBin = np.zeros(numData, dtype=int)
    # Features are applied in bundle order, so when two features are non-zero
    # on the same row (a conflict) the later feature overwrites the earlier.
    for offset, f in zip(binRanges, F):
        values = np.asarray(f)[:numData]
        nonzero = values != 0
        # Widen before adding the offset so narrow dtypes cannot overflow.
        newBin[nonzero] = values[nonzero].astype(np.int64) + offset
    return newBin, binRanges

In [8]:
# modified Algorithm 4 (skip-zero-version)
def merge_features2(numData, F):
    """Merge a bundle of features, seeding the result with the first feature.

    Variant of Algorithm 4: because the first feature's offset is
    ``binRanges[0] == 0``, the result starts as a copy of ``F[0]`` and only
    the remaining features are shifted into it.

    Args:
        numData: Number of rows to merge.
        F: Sequence of 1-D sequences/arrays of bin indices, one per feature
            in the bundle. A value of 0 is skipped and never written to the
            result.

    Returns:
        A tuple ``(newBin, binRanges)``. ``newBin`` is an int array holding
        the merged bin indices (always an int array, matching
        ``merge_features``, regardless of the dtype of ``F[0]``).
        ``binRanges`` is the list of cumulative offsets, starting with 0 and
        holding one extra entry per feature. An empty bundle yields an
        all-zero array of length ``numData``.
    """
    binRanges = [0]
    totalBin = 0
    for f in F:
        # .item() yields a plain Python number, so the running total cannot
        # wrap around when the features are stored in a narrow dtype (uint8...).
        totalBin += np.max(f).item()
        binRanges.append(int(totalBin))  # convert to int for better display

    if len(F) == 0:
        # Nothing to merge: same result merge_features gives for an empty bundle.
        return np.zeros(numData, dtype=int), binRanges

    # Seed with an int copy of F[0]: the input is left untouched and the
    # result is wide enough to hold the shifted values of later features.
    newBin = np.array(F[0], dtype=int)
    merged_rows = newBin[:numData]  # view: writes go through to newBin
    # Later features overwrite earlier ones on rows where both are non-zero.
    for offset, f in zip(binRanges[1:], F[1:]):
        values = np.asarray(f)[:numData]
        nonzero = values != 0
        # Widen before adding the offset so narrow dtypes cannot overflow.
        merged_rows[nonzero] = values[nonzero].astype(np.int64) + offset
    return newBin, binRanges

In [9]:
import numpy as np

# Toy dataset used by the cells below: 10 rows, 5 feature columns.
x = np.array(
    [  # v        v      <--- showing bundle of features 0 and 3 below
        [1, 1, 0, 0, 1],
        [0, 0, 1, 1, 1],
        [1, 2, 0, 0, 2],
        [0, 0, 2, 3, 1],
        [2, 1, 0, 0, 3],
        [3, 3, 0, 0, 1],
        [0, 0, 3, 0, 2],
        [1, 2, 3, 4, 3],  # <-- conflict here: features 0 and 3 are both non-zero
        [1, 0, 1, 0, 0],
        [2, 3, 0, 0, 2],
    ]
)

In [10]:
# The result of Greedy Bundling: each inner list holds the column indices of
# one bundle.
bundles = [[4], [0, 3], [1, 2]]

# Pull out the columns of the second bundle (features 0 and 3 of x).
F = [x[:, column] for column in bundles[1]]
for index, feature in enumerate(F):
    print(f"feature_{index}: {feature}")

# Run both variants of Algorithm 4 on the same bundle and print each result.
for merge in (merge_features, merge_features2):
    newBin, binRanges = merge(x.shape[0], F)
    print("\nnewBin:", newBin)
    print("binRanges:", binRanges)

feature_0: [1 0 1 0 2 3 0 1 1 2]
feature_1: [0 1 0 3 0 0 0 4 0 0]

newBin: [1 4 1 6 2 3 0 7 1 2]
binRanges: [0, 3, 7]

newBin: [1 4 1 6 2 3 0 7 1 2]
binRanges: [0, 3, 7]


In [11]:
# For each feature, show the pair of consecutive binRanges entries that
# belongs to it together with the distinct values the feature takes.
for index, (feature, start, stop) in enumerate(zip(F, binRanges, binRanges[1:])):
    print(f"Feature {index}:")
    print(f"  Range: {start} to {stop}")
    print(f"  Unique values: {np.unique(feature)}")

# Distinct values of the merged feature, computed once and reused below.
merged_values = np.unique(newBin)
print("\nMerged Output (newBin):")
print(f"  Unique values in newBin: {merged_values}")
print(f"  Number of unique merged bins: {len(merged_values)}")

Feature 0:
  Range: 0 to 3
  Unique values: [0 1 2 3]
Feature 1:
  Range: 3 to 7
  Unique values: [0 1 3 4]

Merged Output (newBin):
  Unique values in newBin: [0 1 2 3 4 6 7]
  Number of unique merged bins: 7


In [12]:
import pandas as pd


def visualize_merge(numData, F, newBin):
    """Print the bundled features next to the merged feature as a table.

    Args:
        numData: Accepted for symmetry with the merge functions; not used.
        F: Sequence of features; feature ``i`` becomes column ``feature_i``.
        newBin: Merged feature, shown as column ``merged_feature``.
    """
    # One column per input feature, in bundle order, then the merged column.
    columns = {f"feature_{index}": feature for index, feature in enumerate(F)}
    columns["merged_feature"] = newBin

    # Printing the DataFrame renders the columns side by side, one row per
    # data point.
    print(pd.DataFrame(columns))


# Example usage: show the bundle F next to the merged feature computed above.
visualize_merge(numData=x.shape[0], F=F, newBin=newBin)

   feature_0  feature_1  merged_feature
0          1          0               1
1          0          1               4
2          1          0               1
3          0          3               6
4          2          0               2
5          3          0               3
6          0          0               0
7          1          4               7
8          1          0               1
9          2          0               2
