In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

In [3]:
df = pd.read_parquet('features-dask.parquet')
df.shape

(458913, 745)

In [4]:
df = df.dropna(axis=1, thresh=int(0.80 * df.shape[0]))
df.shape

(458913, 620)

In [5]:
y = df['target']
X = df.drop(['target'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

print("X_train Training Data Size :",X_train.shape[0])
print("X_test Testing Data Size   :",X_test.shape[0])

X_train Training Data Size : 367130
X_test Testing Data Size   : 91783


In [24]:
# https://www.kaggle.com/code/inversion/amex-competition-metric-python
# https://www.kaggle.com/code/jpison/custom-lgbm-obj-weighted-logloss-function

MULT_NO4PERC = 5.0
MAX_WEIGHTS = 2.0

def weighted_logloss(preds, dtrain):
    global MULT_NO4PERC, MAX_WEIGHTS
    eps = 1e-16
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    
    # top 4%
    labels_mat = np.transpose(np.array([np.arange(len(labels)), labels, preds]))
    pos_ord = labels_mat[:, 2].argsort()[::-1]
    labels_mat = labels_mat[pos_ord]
    weights_4perc    = np.where(labels_mat[:,1]==0, 20, 1)
    top4   = np.cumsum(weights_4perc) <= int(0.04 * np.sum(weights_4perc))
    top4   = top4[labels_mat[:, 0].argsort()]

    weights = 1+np.exp(-MULT_NO4PERC*np.linspace(MAX_WEIGHTS-1,0,len(top4)))[labels_mat[:, 0].argsort()]
    weights[top4 & (labels==1.0)] = 1.0 # Set to one weights of positive labels in top 4%
    weights[(labels==0.0)] = 1.0 # Set to one weights of negative labels

    grad = (preds - labels) * weights
    hess = np.maximum(preds * (1.0 - preds) * weights , eps)
    return grad, hess

def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

def lgb_amex_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'amex_metric', amex_metric(y_true, y_pred), True


Test simple LightGBM classifier with goss boosting:

In [7]:
model = LGBMClassifier(boosting_type='goss', max_depth=5, random_state=0)
model = model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)
print('Testing accuracy:', accuracy_score(y_pred, y_test))
print('AUC:', roc_auc_score(y_pred, y_test))

Testing accuracy: 0.8917555538607367
AUC: 0.8590256537064745


Use larger LightGBM model:

In [56]:
params = {'boosting_type': 'gbdt',
          'n_estimators': 5000,
          'num_leaves': 50,
          'learning_rate': 0.05,
          'colsample_bytree': 0.9,
          'min_child_samples': 2000,
          'max_bins': 500,
          'reg_alpha': 2,
          'objective': 'binary',
          'random_state': 0
}
lgbm = LGBMClassifier(**params).fit(X_train, y_train,                            
    callbacks=[log_evaluation(500)],
    eval_metric=['auc','binary_logloss']
)

In [57]:
y_pred = lgbm.predict(X_test)
print('Testing accuracy:', accuracy_score(y_pred, y_test))
print('AUC:', roc_auc_score(y_pred, y_test))

Testing accuracy: 0.8967782704858198
AUC: 0.8666642096455957


Using AMEX metric as evaluation function  with custom loss:

In [41]:
params = {
    'objective': 'binary',
    'boosting': 'dart',
    'seed': 42,
    'num_leaves': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.20,
    'bagging_freq': 10,
    'bagging_fraction': 0.50,
    'n_jobs': -1,
    'lambda_l2': 2,
    'min_data_in_leaf': 40,
    'verbose': -1,
    }
lgb_train = lgb.Dataset(X_train, y_train.values)
custom_lgb = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 3000,
    valid_sets = [lgb_train],
    feval = lgb_amex_metric,
    fobj = weighted_logloss,
    callbacks=[log_evaluation(500)]
)

[500]	training's amex_metric: 0.781532
[1000]	training's amex_metric: 0.815322
[1500]	training's amex_metric: 0.842245
[2000]	training's amex_metric: 0.867017
[2500]	training's amex_metric: 0.889338
[3000]	training's amex_metric: 0.909366


In [45]:
preds = custom_lgb.predict(X_test)
score = amex_metric(y_test, preds)
print('AMEX score:', score)
y_pred = np.argsort(np.argsort(preds))/len(preds) 
y_pred = (y_pred > 0.5).astype(int)
print('Testing accuracy:', accuracy_score(y_pred, y_test))
print('AUC:', roc_auc_score(y_pred, y_test))

AMEX score: 0.7729905505508963
Testing accuracy: 0.7538868853709293
AUC: 0.7538842589543411


Load test data and create file for submission

In [47]:
df_test = pd.read_parquet('features-dask-test.parquet')
df_test = df_test[X_train.columns]
df_test.shape

(924621, 619)

In [58]:
y_pred = lgbm.predict_proba(df_test)[:, 1]
# y_pred = custom_lgb.predict(df_test)

In [59]:
submission = pd.DataFrame(data={
    'customer_ID': df_test.reset_index()['customer_ID'], 
    'prediction': y_pred
})
submission.to_csv('submission.csv', index=None)