In [1]:
import itertools
import time
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
def max_list(array_list):
    """Return the element-wise maximum over ``array_list`` as a 1-D array.

    The first array seeds the running maximum; each later array is folded in
    with ``np.maximum``. The result is flattened before being returned.
    """
    running_max = array_list[0]
    for position in range(1, len(array_list)):
        running_max = np.maximum(running_max, array_list[position])
    return running_max.flatten()

def get_data(filename):
    """Load pickled records and build one flat feature vector per record.

    ``filename`` must contain dicts pickled back to back. Each dict is read
    for the keys 'AB', 'AP', 'BA', 'BP', 'PA', 'PB', 'PN', 'NP' (sequences of
    arrays; presumably attention maps between the mentions -- TODO confirm),
    plus 'label' and 'ID'.

    For a complete record the vector is the concatenation, in the order
    AB, AP, BA, BP, PA, PB, PN, NP, of ``max_list`` applied to each entry.
    An empty 'PN' or 'NP' entry is replaced by zeros shaped like the pooled
    'AP' vector.

    If any of the six pair entries is empty, the whole record is represented
    by an all-zero vector with the same length and dtype as the complete
    records. The zero vector is filled in after the file has been read, so it
    no longer depends on a previous record existing, and no partially built
    feature list can leak into the output.

    Parameters
    ----------
    filename : str
        Path to the pickle file.

    Returns
    -------
    tuple of (list, list, list)
        ``ids``, ``data`` (one 1-D ``np.ndarray`` per record) and ``labels``,
        all in file order.

    Raises
    ------
    ValueError
        If at least one record is incomplete and the file has no complete
        record to take the feature length from.
    """
    # Directed pairs among A, B and P, in the order AB, AP, BA, BP, PA, PB.
    pair_keys = [src + dst
                 for src, dst in itertools.product(['A', 'B', 'P'], repeat=2)
                 if src != dst]

    ids, data, labels = [], [], []
    incomplete = []  # positions in `data` that still need a zero vector

    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point this at files from a trusted source.
    with open(filename, 'rb') as f:
        while True:
            # Only the load itself signals end-of-file; keeping the try this
            # narrow stops an unrelated EOFError from silently ending the loop.
            try:
                d = pickle.load(f)
            except EOFError:
                break

            if any(len(d[key]) == 0 for key in pair_keys):
                # Feature length may not be known yet: reserve the slot.
                incomplete.append(len(data))
                data.append(None)
            else:
                pooled = {key: max_list(d[key]) for key in pair_keys}
                features = [pooled[key] for key in pair_keys]
                for key in ('PN', 'NP'):
                    if len(d[key]) == 0:
                        features.append(np.zeros(shape=pooled['AP'].shape))
                    else:
                        features.append(max_list(d[key]))
                # Alternative tried by the author: additionally append the
                # differences (PA - PB) and (AP - BP) to the feature list.
                data.append(np.concatenate(features))

            labels.append(d['label'])
            ids.append(d['ID'])

    if incomplete:
        template = next((vec for vec in data if vec is not None), None)
        if template is None:
            raise ValueError(
                'cannot size zero vectors: %r contains no complete record' % (filename,))
        for position in incomplete:
            data[position] = np.zeros_like(template)

    return ids, data, labels


def softmax(x, axis=None):
    """Softmax of array ``x`` along ``axis`` (over all elements if ``None``).

    The maximum along ``axis`` is subtracted before exponentiating, so large
    inputs do not overflow; the output has the same shape as ``x``.
    """
    shifted = x - x.max(axis=axis, keepdims=True)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=axis, keepdims=True)
    return exponentials / normaliser


# Integer class id for each label string: 'A' -> 0, 'B' -> 1, 'Neither' -> 2.
LABEL_TO_INT = dict(zip(('A', 'B', 'Neither'), range(3)))

In [10]:
# Read the three pickled splits: the "validation" and "test" files together
# form the training set, and the "development" file is the held-out test set.
IDs_train1, X_train1, Y_train1 = get_data('../data/large-onames-atts-gap-validation.pkl')
IDs_train2, X_train2, Y_train2 = get_data('../data/large-onames-atts-gap-test.pkl')
IDs_test, X_test, Y_test = get_data('../data/large-onames-atts-gap-development.pkl')

# Concatenate the two training parts, keeping ids, features and labels aligned.
IDs_train = [*IDs_train1, *IDs_train2]
X_train = [*X_train1, *X_train2]

# Encode the string labels as integer class ids.
Y_train = list(map(LABEL_TO_INT.__getitem__, Y_train1 + Y_train2))
Y_test = list(map(LABEL_TO_INT.__getitem__, Y_test))

In [5]:
# Sanity check: display the shape of the first training feature vector.
X_train[0].shape

(3072,)

In [11]:
# Stack the per-record feature vectors into dense matrices.
X, y = np.array(X_train), np.array(Y_train)
X_test, y_test = np.array(X_test), np.array(Y_test)
# A ragged feature list would not form a 2-D matrix; fail here, not inside LightGBM.
assert X.ndim == 2 and X_test.ndim == 2, "feature vectors must all have the same length"
NTRAIN, NTEST = X.shape[0], X_test.shape[0]
print(X.shape, X_test.shape)
N_CLASSES = 3
EARLY_STOPPING = 300
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': N_CLASSES,
    'metric': 'multi_logloss',
    # NOTE(review): a depth-4 tree has at most 2**4 = 16 leaves, so
    # num_leaves=32 is not the binding limit here -- confirm this is intended.
    'max_depth': 4,
    'num_leaves': 32,
    'feature_fraction': 0.1,
    # 'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    'learning_rate': 0.02,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'verbose': -1,
    'nthread': 12
}

KFOLD, SHUF, RS = 5, True, 123
# Out-of-fold class probabilities for the training rows, and the test-set
# probabilities summed over folds (divided by KFOLD after the loop).
OOF_TRAIN = np.zeros((NTRAIN, N_CLASSES))
OOF_TEST = np.zeros((NTEST, N_CLASSES))
val_score_list = []
kf = StratifiedKFold(n_splits=KFOLD, shuffle=SHUF, random_state=RS)
for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print("FOLD #", i, end=' ')
    # Fold-local names: the loop must not rebind X_train, because the top of
    # this cell reads it and a re-run would then train on a single fold subset.
    X_fold_train, y_fold_train = X[train_idx], y[train_idx]
    X_fold_valid, y_fold_valid = X[val_idx], y[val_idx]

    lgtrain = lgb.Dataset(X_fold_train, y_fold_train)
    lgvalid = lgb.Dataset(X_fold_valid, y_fold_valid)

    modelstart = time.time()
    # NOTE(review): the early_stopping_rounds / verbose_eval keywords were
    # removed in LightGBM 4.0; on upgrade, pass
    # callbacks=[lgb.early_stopping(EARLY_STOPPING, verbose=False)] instead.
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=1000,
        valid_sets=[lgvalid],
        valid_names=['valid'],
        early_stopping_rounds=EARLY_STOPPING,
        verbose_eval=False
    )

    # One prediction pass per data set: raw scores turned into probabilities
    # with the notebook's softmax, then reused for scoring and OOF storage.
    val_preds = softmax(lgb_clf.predict(X_fold_valid, raw_score=True), axis=1)
    err = log_loss(y_fold_valid, val_preds)
    OOF_TRAIN[val_idx] = val_preds

    test_preds = softmax(lgb_clf.predict(X_test, raw_score=True), axis=1)
    err_test = log_loss(y_test, test_preds)
    OOF_TEST += test_preds

    print('Log Loss: %.5f - %.5f' % (err, err_test))
    val_score_list.append(err)
    print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))

# Average the per-fold test probabilities into the final test prediction.
OOF_TEST /= KFOLD
print("Average %.5f - %.5f" % ((sum(val_score_list)/KFOLD), log_loss(y_test, OOF_TEST)))

(2454, 3072) (2000, 3072)
FOLD # 0 Log Loss: 0.27091 - 0.28590
Model Runtime: 0.17 Minutes
FOLD # 1 Log Loss: 0.32328 - 0.28494
Model Runtime: 0.15 Minutes
FOLD # 2 Log Loss: 0.33010 - 0.28968
Model Runtime: 0.15 Minutes
FOLD # 3 Log Loss: 0.31534 - 0.28847
Model Runtime: 0.16 Minutes
FOLD # 4 Log Loss: 0.35779 - 0.29453
Model Runtime: 0.13 Minutes
Loss folds [0.27090965770793146, 0.3232841328123608, 0.33009500282383397, 0.3153405310299799, 0.3577861453693777]
Test log loss: 0.28492


In [7]:
# test_raw_df = pd.read_csv('../input/gap-development.tsv', sep='\t')
# res = np.concatenate((OOF_TEST,
#                       np.expand_dims(y_test, axis=1),
#                       np.expand_dims(OOF_TEST[np.arange(len(y_test)), y_test], axis=1)),
#                      axis=1)
# res_df = pd.DataFrame(data=res, columns=['Prob_A', 'Prob_B', 'Prob_N', 'Gold', 'Prob_Gold'])
# res_df = pd.concat((test_raw_df, res_df), axis=1)
# res_df.sort_values('Prob_Gold', inplace=True)
# res_df.to_csv('res_.csv', sep='\t', index=False)

In [8]:
# submission = pd.read_csv('../input/sample_submission_stage_1.csv', index_col='ID')

# for _id, pred in zip(IDs_test, OOF_TEST):
#     submission.loc[_id, 'A'] = pred[0]
#     submission.loc[_id, 'B'] = pred[1]
#     submission.loc[_id, 'NEITHER'] = pred[2]
# submission.to_csv('../output/lightgbm-mgap.csv')