In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sb
import lightgbm as lgb
import math
import matplotlib.pyplot as plt
# from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [2]:
train_data = pd.read_csv("train_small.csv", header=None)

In [3]:
test_data = pd.read_csv("test_small.csv")

In [14]:
# parameters are from grid search and experimentation
param = {
    'bagging_freq': 4, # seems to work
    'bagging_fraction': 1.0, # seems to work ... 0.5
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05, # from grid search and seems to work ... 0.1
    'learning_rate': 0.01,
    'max_depth': -1, # no max depth
    'metric':'auc', 
    'min_data_in_leaf': 50, # limit overfitting ... 40
    'min_sum_hessian_in_leaf': 5, # limit overfitting
    'num_leaves': 15, # limit overfitting
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1
}

In [15]:
# get train and test feature names
features = [c for c in train_data.columns if c not in [0, 1]]
test_features = [c for c in test_data.columns if c not in ['ID_code']]

test_predictions = np.zeros(len(test_data))
train_predictions = np.zeros(len(train_data))
# get target and training data
target = train_data[1]
train = train_data[features]

# using 10 folds
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=False)
skf.get_n_splits(train, target)

# using lightbm over catboost because no categorical features
# using lightbm over xgboost because it's faster
# train one lightbm for every fold, and sum up the predictions made by all folds on the test data set
for train_index, val_index in skf.split(train, target):
    train_fold = lgb.Dataset(train.iloc[train_index][features], label=target.iloc[train_index])
    val_fold = lgb.Dataset(train.iloc[val_index][features], label=target.iloc[val_index])
    clf = lgb.train(param, train_fold, 100000, valid_sets = [train_fold, val_fold], verbose_eval=1000, early_stopping_rounds = 3000)
    
    test_predictions += clf.predict(test_data[test_features], num_iteration=clf.best_iteration) / splits
    train_predictions += clf.predict(train_data[features], num_iterations=clf.best_iteration) / splits

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.925891	valid_1's auc: 0.877603
[2000]	training's auc: 0.944042	valid_1's auc: 0.884338
[3000]	training's auc: 0.956496	valid_1's auc: 0.888079
[4000]	training's auc: 0.966322	valid_1's auc: 0.890212
[5000]	training's auc: 0.974074	valid_1's auc: 0.891406
[6000]	training's auc: 0.980443	valid_1's auc: 0.891934
[7000]	training's auc: 0.98547	valid_1's auc: 0.892233
[8000]	training's auc: 0.989355	valid_1's auc: 0.892106
[9000]	training's auc: 0.992301	valid_1's auc: 0.891907
[10000]	training's auc: 0.994561	valid_1's auc: 0.891777
Early stopping, best iteration is:
[7271]	training's auc: 0.98666	valid_1's auc: 0.892369
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.926476	valid_1's auc: 0.875034
[2000]	training's auc: 0.944432	valid_1's auc: 0.882707
[3000]	training's auc: 0.956868	valid_1's auc: 0.885768
[4000]	training's auc: 0.966606	valid_1's auc: 0.88709

In [16]:
# print some information to get an idea of how well it worked
train_predictions_binary = list(map(lambda x : math.floor(x+0.5), train_predictions))
print(roc_auc_score(target.values, train_predictions_binary))

test_predictions_binary = list(map(lambda x : math.floor(x+0.5), test_predictions))
print(sum(test_predictions_binary)/len(test_predictions_binary))

0.7896743933766868
0.03336901448654821


In [17]:
# save test predictions to csv file with no headers
submission_df = pd.DataFrame(test_predictions_binary)
submission_df.to_csv("submission.csv", encoding='utf-8', index=False)