In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sb
import lightgbm as lgb
import math
import matplotlib.pyplot as plt
# from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [2]:
train_data = pd.read_csv("train_small.csv", header=None)

In [3]:
test_data = pd.read_csv("test_small.csv")

In [6]:
# parameters are from grid search and experimentation
param = {
    'bagging_freq': 4, # seems to work
    'bagging_fraction': 0.4, # seems to work ... 0.5
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.1, # from grid search and seems to work ... 0.1
    'learning_rate': 0.01,
    'max_depth': -1, # no max depth
    'metric':'auc', 
    'min_data_in_leaf': 200, # limit overfitting ... 40
    'min_sum_hessian_in_leaf': 0.01, # overfitting
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1
}

In [7]:
# get train and test feature names
features = [c for c in train_data.columns if c not in [0, 1]]
test_features = [c for c in test_data.columns if c not in ['ID_code']]

test_predictions = np.zeros(len(test_data))
train_predictions = np.zeros(len(train_data))
# get target and training data
target = train_data[1]
train = train_data[features]

# using 10 folds
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=False)
skf.get_n_splits(train, target)

# using lightbm over catboost because no categorical features
# using lightbm over xgboost because it's faster
# train one lightbm for every fold, and sum up the predictions made by all folds on the test data set
for train_index, val_index in skf.split(train, target):
    train_fold = lgb.Dataset(train.iloc[train_index][features], label=target.iloc[train_index])
    val_fold = lgb.Dataset(train.iloc[val_index][features], label=target.iloc[val_index])
    clf = lgb.train(param, train_fold, 100000, valid_sets = [train_fold, val_fold], verbose_eval=1000, early_stopping_rounds = 3000)
    
    test_predictions += clf.predict(test_data[test_features], num_iteration=clf.best_iteration) / splits
    train_predictions += clf.predict(train_data[features], num_iterations=clf.best_iteration) / splits

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.948475	valid_1's auc: 0.885146
[2000]	training's auc: 0.970019	valid_1's auc: 0.891297
[3000]	training's auc: 0.983462	valid_1's auc: 0.892473
[4000]	training's auc: 0.991967	valid_1's auc: 0.893486
[5000]	training's auc: 0.996744	valid_1's auc: 0.893782
[6000]	training's auc: 0.998903	valid_1's auc: 0.892727
[7000]	training's auc: 0.999718	valid_1's auc: 0.892321
Early stopping, best iteration is:
[4532]	training's auc: 0.994894	valid_1's auc: 0.894185
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.948841	valid_1's auc: 0.884462
[2000]	training's auc: 0.97013	valid_1's auc: 0.887839
[3000]	training's auc: 0.983676	valid_1's auc: 0.887722
[4000]	training's auc: 0.992084	valid_1's auc: 0.887792
[5000]	training's auc: 0.996795	valid_1's auc: 0.887019
Early stopping, best iteration is:
[2279]	training's auc: 0.974582	valid_1's auc: 0.888429
Training until vali

In [8]:
# print some information to get an idea of how well it worked
train_predictions_binary = list(map(lambda x : math.floor(x+0.5), train_predictions))
print(roc_auc_score(target.values, train_predictions_binary))

test_predictions_binary = list(map(lambda x : math.floor(x+0.5), test_predictions))
print(sum(test_predictions_binary)/len(test_predictions_binary))

0.7853014281046395
0.030971240990508814


In [9]:
# save test predictions to csv file with no headers
submission_df = pd.DataFrame(test_predictions_binary)
submission_df.to_csv("submission.csv", encoding='utf-8', index=False)