In [3]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier, LGBMRegressor
from scipy.stats import randint
import pandas as pd
import numpy as np

In [8]:
x, y = make_classification(n_samples=1000, n_classes=2, weights=[0.7, 0.3], n_features=10, random_state=42)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [17]:
# LightGBM uses leaf-wise growth by default Control with num_leaves (max number of leaves)

lgb_model = LGBMClassifier(
    num_leaves=31,  # Maximum leaves (2^max_depth - 1 for balanced tree)
    max_depth=5,    # Limit depth
    n_estimators=100,
    random_state=42
)

#### LightGBM vs XGBoost

| Feature | LightGBM | XGBoost |
|---------|----------|---------|
| **Growth Strategy** | Leaf-wise | Level-wise |
| **Speed** | Faster | Slower |
| **Memory** | Lower | Higher |
| **Categorical** | Native support | Needs encoding |
| **GOSS** | Yes | No |
| **EFB** | Yes | No |
| **Accuracy** | Similar | Similar |


**GOSS**

1. Sort instances by gradient magnitude
2. Keep top `a × 100%` instances (large gradients)
3. Randomly sample `b × 100%` from remaining (small gradients)
4. Use weighted sampling to compensate for bias


In [None]:
# LightGBM with GOSS (automatic)
lgb_model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    num_leaves=31,
    
    # GOSS parameters
    top_rate=0.2,        # Keep top 20% instances with large gradients
    other_rate=0.1,      # Randomly sample 10% of remaining instances

    random_state=42
)

lgb_model.fit(x_train, y_train)

In [10]:
importance = lgb_model.feature_importances_
print(importance)

[ 54  54  37  52  27  45  94  27 134  61]


**EFB (Exclusive Feature Bundling)**

1. Identify mutually exclusive features
2. Bundle them into single features
3. Use different bin boundaries to distinguish
4. Reduces feature count without information loss


In [None]:
# LightGBM with EFB (automatic)
lgb_model_efb = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    num_leaves=31,
    
    # EFB parameters
    max_bin=255,         # Number of bins (affects bundling)
    min_data_in_bin=3,   # Minimum data in bin
    
    random_state=42
)

lgb_model_efb.fit(x_train, y_train)

**LightGBM with Categorical Features**

In [None]:
lgb_model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    num_leaves=31,
    
    # Categorical features (specify indices)
    categorical_feature=[0, 1, 2],  # Column indices
    
    random_state=42
)

lgb_model.fit(x_train, y_train, 
                  categorical_feature=[0, 1, 2])