In [1]:
# 0.888
import pandas as pd
import numpy as np

In [2]:
import os

# Input CSVs are read from data_path; res_path is where results are written.
data_path = './data'
res_path = './res'

# Read the three CSV tables in one pass; the order of the file names matches
# the order of the names being assigned.
training_data, test_data, sample = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Model inputs: every training column except the row identifier and the target.
training_features = training_data.drop(columns=['ID_code', 'target'])
# Prediction target, kept as its own Series.
labels = training_data['target']

In [4]:
# Model inputs for the test set: everything except the row identifier.
test_features = test_data.drop(columns=['ID_code'])
# Row identifiers of the test set, in the same row order as test_features.
test_IDs = test_data['ID_code']

In [5]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [6]:
# import xgboost as xgb
def cv(x, y, params=None, splits=5, random_state=None):
    """Cross-validate a LightGBM classifier with shuffled stratified folds.

    Args:
        x: Feature matrix, passed through to ``cross_validate``.
        y: Target labels, passed through to ``cross_validate``.
        params: Keyword arguments for ``lgb.LGBMClassifier``. ``None`` (the
            default) or an empty dict builds the classifier with its defaults.
        splits: Number of stratified folds.
        random_state: Seed for the fold shuffling. The default ``None`` keeps
            the previous behaviour (a different split on every call); pass an
            integer to get repeatable folds.

    Returns:
        The dict produced by ``sklearn.model_selection.cross_validate``:
        timing arrays plus ``test_*`` and ``train_*`` arrays for the
        ``accuracy``, ``f1`` and ``roc_auc`` scorers, one value per fold.
    """
    # A None default avoids sharing one mutable dict between calls.
    clf = lgb.LGBMClassifier(**(params or {}))
    # An XGBoost classifier (xgb.XGBClassifier(**params)) was tried here as an
    # alternative estimator.
    kfold = sklearn.model_selection.StratifiedKFold(
        n_splits=splits, shuffle=True, random_state=random_state)
    return sklearn.model_selection.cross_validate(
        clf, x, y,
        cv=kfold,
        scoring={
            'accuracy': 'accuracy',
            # NOTE: micro-averaged F1 coincides with accuracy for a
            # single-label problem, so this entry repeats the accuracy row.
            'f1': 'f1_micro',
            'roc_auc': 'roc_auc',
        },
        # Requested explicitly: recent scikit-learn releases default this to
        # False, which would drop the train_* entries from the result.
        return_train_score=True,
    )

In [14]:
# Class counts behind scale_pos_weight (negatives per positive).
# Assumes labels are 0/1 so that sum() counts the positives.
positive_count = labels.sum()
negative_count = len(labels) - positive_count

# Keyword arguments handed to the LightGBM classifier.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    max_depth=8,
    num_leaves=200,
    min_child_samples=1000,
    n_estimators=1000,
    learning_rate=0.05,
    scale_pos_weight=float(negative_count) / positive_count,
    boost_from_average=True,
    min_child_weight=4,
    reg_alpha=3,
    reg_lambda=10,
    subsample=0.7,
    colsample_bytree=0.7,
    subsample_freq=1,
    n_jobs=-1,
)

cv_result = cv(training_features.values, labels.values, params=params, splits=5)

# For every entry of the result: the raw per-fold array, then its mean.
for metric_name, fold_values in cv_result.items():
    print('%s: %s' % (metric_name, fold_values))
    print('Average %s: %f' % (metric_name, fold_values.mean()))

fit_time: [30.4914639  30.29646802 31.95959902 30.46685195 29.93957233]
Average fit_time: 30.630791
score_time: [1.38056397 1.30526996 1.31311703 1.319314   1.33913183]
Average score_time: 1.331479
test_accuracy: [0.87370316 0.87335317 0.87565    0.87542189 0.87392185]
Average test_accuracy: 0.874410
train_accuracy: [0.92139326 0.92101201 0.9224625  0.92109424 0.9206255 ]
Average train_accuracy: 0.921318
test_f1: [0.87370316 0.87335317 0.87565    0.87542189 0.87392185]
Average test_f1: 0.874410
train_f1: [0.92139326 0.92101201 0.9224625  0.92109424 0.9206255 ]
Average train_f1: 0.921318
test_roc_auc: [0.891505   0.89153119 0.89038104 0.8988492  0.89559974]
Average test_roc_auc: 0.893573
train_roc_auc: [0.98259488 0.98234633 0.98302549 0.98233931 0.98211916]
Average train_roc_auc: 0.982485


In [15]:
def train(x, y, params=None):
    """Fit a LightGBM classifier on the given data and return it.

    Args:
        x: Feature matrix passed to ``fit``.
        y: Target labels passed to ``fit``.
        params: Keyword arguments for ``lgb.LGBMClassifier``. ``None`` (the
            default) or an empty dict builds the classifier with its defaults.

    Returns:
        The ``lgb.LGBMClassifier`` instance after ``fit(x, y)`` has been
        called on it.
    """
    # A None default avoids sharing one mutable dict between calls.
    clf = lgb.LGBMClassifier(**(params or {}))
    clf.fit(x, y)
    return clf

In [16]:
# Final model: fitted on the complete training set with the tuned parameters.
model = train(
    x=training_features.values,
    y=labels.values,
    params=params,
)

In [17]:
# predict_proba yields one column per class; keep the second column
# (presumably the positive class, given 0/1 labels) as the submitted score.
test_probabilities = model.predict_proba(test_features.values)
result = test_probabilities[:, 1]

In [18]:
# Take the IDs from test_data itself (test_IDs): `result` is predicted from
# test_features row by row, so this keeps every ID next to its own prediction
# without relying on sample_submission.csv listing the rows in the same order.
output = pd.DataFrame({'ID_code': test_IDs})

In [19]:
# Attach the predicted scores as the 'target' column of the submission frame.
output = output.assign(target=result)

In [20]:
# Create the results directory on demand: to_csv does not create missing
# parent directories, so writing would fail when res_path does not exist yet.
os.makedirs(res_path, exist_ok=True)
output.to_csv(os.path.join(res_path, 'res.csv'), index=False)

In [22]:
# Sanity check: (rows, columns) of the training feature frame.
training_features.shape

(200000, 200)