# Comparing multiclass LightGBM classifiers for imbalanced darasets

## Imports, utilities

In [1]:
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score,accuracy_score,confusion_matrix,roc_auc_score
from OneVsRestClassifierCustomizedLoss import *
from FocalLoss import FocalLoss
import lightgbm as lgb

### Plotting confusion matrix

In [2]:
import itertools
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.3f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [3]:
X, y = make_classification(n_classes=3,
                           n_samples=2000, 
                           n_features=2,
                           n_informative=2,
                           n_redundant =0,
                           n_clusters_per_class=1,
                           weights=[.01, .01, .98], 
                           flip_y=.01, 
                           random_state=42)

le = preprocessing.LabelEncoder()
y_label = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.30, random_state=42)

In [4]:
classes =[]
labeles=np.unique(y_label)
for v in labeles:
    classes.append('Class '+ str(v))
print(classes)

['Class 0', 'Class 1', 'Class 2']


## Multiclass lightgbm 

In [5]:
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train, verbose=0)

LGBMClassifier()

In [6]:
clf = lgb.LGBMClassifier()
init_score=np.full_like(y_train, 0.0, dtype=float)
clf.fit(X_train, y_train,init_score=init_score)

y_test_pred = clf.predict(X_test)
pred_accuracy_score = accuracy_score(y_test, y_test_pred)
pred_recall_score = recall_score(y_test, y_test_pred, average='macro')
print('Prediction accuracy', pred_accuracy_score,' recall ', pred_recall_score)

cnf_matrix = confusion_matrix(y_test, y_test_pred, labels=labeles)
plot_confusion_matrix(cnf_matrix, classes=classes,normalize=True,  title='Confusion matrix')

LightGBMError: Number of class for initial score error

## Multiclass lightgbm with focal loss

In [None]:
loss = FocalLoss(alpha=0.75, gamma=2.0)
loss_fun = lambda y_true, y_pred: (loss.grad(y_true, special.expit(y_pred)), loss.hess(y_true, special.expit(y_pred)))

# Not using early stopping
estimator = lgb.LGBMClassifier(objective=loss_fun)
clf = OneVsRestClassifierCustomizedLoss(estimator=estimator, loss=loss)
clf.fit(X_train, y_train)

# For using early stopping, uncomment the following three lines and comment the two above
#estimator = lgb.LGBMClassifier(objective=loss_fun,metric='custom')
#clf = OneVsRestClassifierCustomizedLoss(estimator=estimator, loss=loss)
#eval_metric = lambda y_true, y_pred: ('focal_loss', loss(y_true, special.expit(y_pred)).sum(), False)
#fit_params = {'eval_set': [(X_test, y_test)], 'eval_metric': eval_metric}
#clf.fit(X_train, y_train, **fit_params)

y_test_pred = clf.predict(X_test)
pred_accuracy_score = accuracy_score(y_test, y_test_pred)
pred_recall_score = recall_score(y_test, y_test_pred, average='macro')
print('prediction accuracy', pred_accuracy_score,' recall ', pred_recall_score)

cnf_matrix = confusion_matrix(y_test, y_test_pred, labels=labeles)
plot_confusion_matrix(cnf_matrix, classes=classes,normalize=True,  title='Confusion matrix')

In [None]:
X, y = make_classification(n_classes=2,
                           n_samples=200, 
                           n_features=2,
                           n_informative=2,
                           n_redundant =0,
                           n_clusters_per_class=1,
                           weights=[.5, .5], 
                           flip_y=.0, 
                           random_state=42)

le = preprocessing.LabelEncoder()
y_label = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.30, random_state=42)

In [None]:
clf = lgb.LGBMClassifier()

In [None]:
clf.fit(X_train, y_train)
yhat = clf.predict(X_train)
y_score = clf.predict(X_train, raw_score=True)
print(y_score)


clf2 = clf
clf2.fit(X_train, y_train, init_score=y_score)
y_score2 = clf2.predict(X_train,raw_score=True) + y_score

print(y_score2)

binary

In [11]:
X, y = make_classification(n_classes=2,
                           n_samples=2000, 
                           n_features=2,
                           n_informative=2,
                           n_redundant =0,
                           n_clusters_per_class=1,
                           weights=[.4, .6], 
                           flip_y=.01, 
                           random_state=42)

le = preprocessing.LabelEncoder()
y_label = le.fit_transform(y)
X_fit, X_val, y_fit, y_val = train_test_split(X, y_label, test_size=0.30, random_state=42)

In [23]:
from sklearn import metrics
def logloss_objective(preds, train_data):
    y = train_data.get_label()
    p = special.expit(preds)
    grad = p - y
    hess = p * (1 - p)
    return grad, hess

def logloss_metric(preds, train_data):
    y = train_data.get_label()
    p = special.expit(preds)

    ll = np.empty_like(p)
    pos = y == 1
    ll[pos] = np.log(p[pos])
    ll[~pos] = np.log(1 - p[~pos])

    is_higher_better = False
    return 'logloss', -ll.mean(), is_higher_better

def logloss_init_score(y):
    p = y.mean()
    p = np.clip(p, 1e-15, 1 - 1e-15)  # never hurts
    log_odds = np.log(p / (1 - p))
    return log_odds

fit = lgb.Dataset(
    X_fit, y_fit,
    init_score=np.full_like(y_fit, logloss_init_score(y_fit), dtype=float)
)

val = lgb.Dataset(
    X_val, y_val,
    init_score=np.full_like(y_val, logloss_init_score(y_fit), dtype=float),
    reference=fit
)

model = lgb.train(
    params={'learning_rate': 0.01},
    train_set=fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    early_stopping_rounds=20,
    verbose_eval=100,
    fobj=logloss_objective,
    feval=logloss_metric
)

# Notice the change here
y_pred = special.expit(logloss_init_score(y_fit) + model.predict(X_test))

print()
print(f"Test's ROC AUC: {metrics.roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {metrics.log_loss(y_test, y_pred):.5f}")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1400, number of used features: 2
Training until validation scores don't improve for 20 rounds
[100]	fit's logloss: 0.272019	val's logloss: 0.302937
[200]	fit's logloss: 0.150033	val's logloss: 0.209395
[300]	fit's logloss: 0.0982027	val's logloss: 0.183269
Early stopping, best iteration is:
[358]	fit's logloss: 0.0807425	val's logloss: 0.181665

Test's ROC AUC: 0.96233
Test's logloss: 0.18167


In [34]:
def logloss_objective_sk(preds, y):
    p = special.expit(preds)
    grad = p - y
    hess = p * (1 - p)
    return grad, hess

def logloss_metric_sk(preds, y):
    p = special.expit(preds)

    ll = np.empty_like(p)
    pos = y == 1
    ll[pos] = np.log(p[pos])
    ll[~pos] = np.log(1 - p[~pos])

    is_higher_better = False
    return 'logloss', -ll.mean(), is_higher_better


model = lgb.LGBMClassifier(objective=logloss_objective_sk,num_boost_round=10000)

init_score=np.full_like(y_fit, logloss_init_score(y_fit), dtype=float)
eval_init_score= [np.full_like(y_val, logloss_init_score(y_fit), dtype=float)]

model.fit(X_fit, y_fit,
              init_score = init_score,
              eval_init_score = eval_init_score,
              early_stopping_rounds=10,
              eval_set=[(X_val, y_val)],
              eval_metric=logloss_metric_sk,
              verbose=False)

# Notice the change here
#y_pred = special.expit(model.predict(X_test))
y_pred = special.expit(logloss_init_score(y_fit) + model.predict(X_test))

print()
print(f"Test's ROC AUC: {metrics.roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {metrics.log_loss(y_test, y_pred):.5f}")


Test's ROC AUC: 0.04481
Test's logloss: 0.69805


Returning raw scores instead.


In [18]:
fit = lgb.Dataset(X_fit, y_fit)
val = lgb.Dataset(X_val, y_val, reference=fit)

model = lgb.train(
    params={
        'learning_rate': 0.01,
        'objective': 'binary'
    },
    train_set=fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    early_stopping_rounds=20,
    verbose_eval=100
)

y_pred = model.predict(X_test)

print()
print(f"Test's ROC AUC: {metrics.roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {metrics.log_loss(y_test, y_pred):.5f}")

[LightGBM] [Info] Number of positive: 826, number of negative: 574
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1400, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.590000 -> initscore=0.363965
[LightGBM] [Info] Start training from score 0.363965
Training until validation scores don't improve for 20 rounds
[100]	fit's binary_logloss: 0.272019	val's binary_logloss: 0.302937
[200]	fit's binary_logloss: 0.150033	val's binary_logloss: 0.209395
[300]	fit's binary_logloss: 0.0982027	val's binary_logloss: 0.183269
Early stopping, best iteration is:
[358]	fit's binary_logloss: 0.0807425	val's binary_logloss: 0.181665

Test's ROC AUC: 0.96233
Test's logloss: 0.18167
