In [1]:
import itertools
import time
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def get_data(filename):
    """Load attention features from a file of consecutively pickled records.

    The file is read with repeated ``pickle.load`` calls until end-of-file.
    Each record is indexed with the keys ``'ID'``, ``'label'`` and one key per
    ordered pair of distinct tokens among ``'A'``, ``'B'`` and ``'P'``
    (``'AB'``, ``'AP'``, ``'BA'``, ``'BP'``, ``'PA'``, ``'PB'``).  Every pair
    entry is a sequence of arrays; the arrays are combined with an
    element-wise maximum, flattened, and the six results are concatenated in
    the order listed above to form one feature vector.

    A record is skipped entirely (no ID, label or features are emitted) as
    soon as one of its pair entries is found to be empty.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        A tuple ``(ids, data, labels)`` of three parallel lists: record IDs,
        concatenated feature vectors and the raw ``'label'`` values.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising, so this must only be pointed at trusted files.
    pair_keys = [src + dst for src, dst in itertools.permutations(['A', 'B', 'P'], 2)]

    ids = []
    data = []
    labels = []
    with open(filename, 'rb') as f:
        while True:
            # Only the read itself signals end-of-file; keeping the try narrow
            # avoids masking an EOFError raised by anything else.
            try:
                record = pickle.load(f)
            except EOFError:
                break

            features = []
            for key in pair_keys:
                attentions = record[key]
                if len(attentions) == 0:
                    # An empty pair makes the whole record unusable.
                    break
                # Element-wise maximum over every array stored for this pair.
                max_att = attentions[0]
                for att in attentions[1:]:
                    max_att = np.maximum(max_att, att)
                features.append(max_att.flatten())
            else:
                # Reached only when no pair was empty (the loop did not break).
                labels.append(record['label'])
                ids.append(record['ID'])
                data.append(np.concatenate(features))
    return ids, data, labels


def softmax(x, axis=None):
    """Return the softmax of ``x`` along ``axis`` (over all elements if None).

    The maximum along ``axis`` is subtracted before exponentiating so that
    large inputs do not overflow; the result is unchanged by this shift.
    """
    shifted = x - x.max(axis=axis, keepdims=True)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=axis, keepdims=True)
    return exponentials / normaliser


# Integer class index assigned to each label string, in column order.
LABEL_TO_INT = {name: index for index, name in enumerate(('A', 'B', 'Neither'))}

In [3]:
# Read the three feature files. The "validation" and "test" files are pooled
# into the training set; the "development" file is used as the held-out test set.
IDs_train1, X_train1, Y_train1 = get_data('../data/large-atts-mgap-validation.pkl')
IDs_train2, X_train2, Y_train2 = get_data('../data/large-atts-mgap-test.pkl')
IDs_test, X_test, Y_test = get_data('../data/large-atts-mgap-development.pkl')

# Pool the two training sources, keeping IDs, features and labels aligned.
IDs_train = [*IDs_train1, *IDs_train2]
X_train = [*X_train1, *X_train2]

# Map the string labels to their integer class indices.
Y_train = [LABEL_TO_INT[label] for label in Y_train1 + Y_train2]
Y_test = [LABEL_TO_INT[label] for label in Y_test]

In [4]:
# Number of examples kept in the training and test sets.
len(X_train), len(X_test)

(2454, 2000)

In [5]:
# Shape of a single concatenated feature vector.
X_train[0].shape

(2304,)

In [6]:
# Stratified K-fold training of a LightGBM multiclass model.
#
# For every fold a model is trained on the fold's training split with early
# stopping on its validation split. Out-of-fold probabilities are stored in
# OOF_TRAIN and the test-set probabilities of all folds are averaged into
# OOF_TEST.
#
# The fold-local arrays are deliberately NOT called X_train / y_train:
# rebinding X_train inside the loop would replace the full training list with
# one fold's subset, so re-running this cell would silently train on less data.
NTRAIN, NTEST = len(X_train), len(X_test)
X, y = np.array(X_train), np.array(Y_train)
X_test, y_test = np.array(X_test), np.array(Y_test)
print(X.shape, X_test.shape)
N_CLASSES = 3
EARLY_STOPPING = 300
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': N_CLASSES,
    'metric': 'multi_logloss',
    # 'max_depth': 15,
    'num_leaves': 127,
    'feature_fraction': 0.2,
    # 'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    'learning_rate': 0.02,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'verbose': -1,
    'nthread': 12
}

KFOLD, SHUF, RS = 5, True, 123
# Passing the label set explicitly keeps log_loss columns aligned with the
# class indices even if some split happened to miss a class.
CLASS_LABELS = list(range(N_CLASSES))
OOF_TRAIN = np.zeros((NTRAIN, N_CLASSES))
OOF_TEST = np.zeros((NTEST, N_CLASSES))
val_score_list = []
kf = StratifiedKFold(n_splits=KFOLD, shuffle=SHUF, random_state=RS)
for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print("FOLD #", i, end=' ')
    X_fold, y_fold = X[train_idx], y[train_idx]
    X_valid, y_valid = X[val_idx], y[val_idx]

    lgtrain = lgb.Dataset(X_fold, y_fold)
    lgvalid = lgb.Dataset(X_valid, y_valid)

    modelstart = time.time()
    # NOTE(review): the `early_stopping_rounds` and `verbose_eval` keyword
    # arguments were removed from lgb.train in LightGBM 4.0; on newer versions
    # use callbacks=[lgb.early_stopping(EARLY_STOPPING, verbose=False)]
    # instead. Confirm against the installed version before changing.
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=1000,
        valid_sets=[lgvalid],
        valid_names=['valid'],
        early_stopping_rounds=EARLY_STOPPING,
        verbose_eval=False
    )

    # Validation loss of this fold, from the model's own probabilities.
    val_preds = lgb_clf.predict(X_valid)
    err = log_loss(y_valid, val_preds, labels=CLASS_LABELS)

    # Test predictions: raw scores turned into probabilities with softmax,
    # then accumulated so they can be averaged over folds afterwards.
    test_preds = lgb_clf.predict(X_test, raw_score=True)
    test_preds = softmax(test_preds, axis=1)
    err_test = log_loss(y_test, test_preds, labels=CLASS_LABELS)
    OOF_TEST += test_preds

    # Out-of-fold predictions for the rows held out in this fold.
    val_preds = lgb_clf.predict(X_valid, raw_score=True)
    val_preds = softmax(val_preds, axis=1)
    OOF_TRAIN[val_idx] = val_preds

    print('Log Loss: %.5f - %.5f' % (err, err_test))
    val_score_list.append(err)
    print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))

# Average the accumulated per-fold test probabilities.
OOF_TEST /= KFOLD
print("Loss folds", val_score_list)
print("Test log loss: %.5f" % log_loss(y_test, OOF_TEST, labels=CLASS_LABELS))

(2454, 2304) (2000, 2304)
FOLD # 0 Log Loss: 0.27223 - 0.28475
Model Runtime: 0.94 Minutes
FOLD # 1 Log Loss: 0.31708 - 0.28848
Model Runtime: 0.90 Minutes
FOLD # 2 Log Loss: 0.32548 - 0.29182
Model Runtime: 0.90 Minutes
FOLD # 3 Log Loss: 0.30772 - 0.28774
Model Runtime: 0.90 Minutes
FOLD # 4 Log Loss: 0.34796 - 0.29443
Model Runtime: 0.82 Minutes
Loss folds [0.2722298799505061, 0.31708409932120973, 0.32547661697333713, 0.30771756904606656, 0.3479556276601703]
Test log loss: 0.28587


In [7]:
# Start from the sample submission (indexed by ID) and overwrite the three
# probability columns with the fold-averaged test predictions.
submission = pd.read_csv('../input/sample_submission_stage_1.csv', index_col='ID')

for row_id, class_probs in zip(IDs_test, OOF_TEST):
    # Column k of the prediction row belongs to the k-th submission column.
    for k, column in enumerate(('A', 'B', 'NEITHER')):
        submission.loc[row_id, column] = class_probs[k]
submission.to_csv('../output/lightgbm-mgap.csv')