In [31]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sb
import lightgbm as lgb
import math
import matplotlib.pyplot as plt
# from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [22]:
train_data = pd.read_csv("train_small.csv", header=None)

In [23]:
test_data = pd.read_csv("test_small.csv")

In [24]:
# parameters are from grid search and experimentation
param = {
    'bagging_freq': 4, # seems to work
    'bagging_fraction': 0.5, # seems to work
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.1, # from grid search and seems to work
    'learning_rate': 0.01,
    'max_depth': -1, # a lot of data, limits overfitting in other ways
    'metric':'auc', 
    'min_data_in_leaf': 50, # limit overfitting
    'min_sum_hessian_in_leaf': 5, # limit overfitting
    'num_leaves': 15, # limit overfitting
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1
}

In [32]:
# get train and test feature names
features = [c for c in train.columns if c not in [0, 1]]
test_features = [c for c in test.columns if c not in ['ID_code']]

test_predictions = np.zeros(len(test))
train_predictions = np.zeros(len(train))
# get target and training data
target = train_data[1]
train = train_data[features]

# using 10
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=False)
skf.get_n_splits(train, target)

# using lightbm over catboost because no categorical features
# using lightbm over xgboost because it's faster
# train one lightbm for every fold, and sum up the predictions made by all folds on the test data set
for train_index, val_index in skf.split(train, target):
    train_fold = lgb.Dataset(train.iloc[train_index][features], label=target.iloc[train_index])
    val_fold = lgb.Dataset(train.iloc[val_index][features], label=target.iloc[val_index])
    clf = lgb.train(param, train_fold, 100000, valid_sets = [train_fold, val_fold], verbose_eval=1000, early_stopping_rounds = 3000)
    
    test_predictions += clf.predict(test[test_features], num_iteration=clf.best_iteration) / splits
    train_predictions += clf.predict(train[features], num_iterations=clf.best_iteration) / splits

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.947417	valid_1's auc: 0.873757
[2000]	training's auc: 0.970914	valid_1's auc: 0.883702
[3000]	training's auc: 0.983563	valid_1's auc: 0.886764
[4000]	training's auc: 0.99124	valid_1's auc: 0.888399
[5000]	training's auc: 0.995745	valid_1's auc: 0.888479
[6000]	training's auc: 0.998183	valid_1's auc: 0.888153
[7000]	training's auc: 0.999346	valid_1's auc: 0.887713
[8000]	training's auc: 0.999801	valid_1's auc: 0.887247
Early stopping, best iteration is:
[5283]	training's auc: 0.996551	valid_1's auc: 0.888587
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.944326	valid_1's auc: 0.874402
[2000]	training's auc: 0.969547	valid_1's auc: 0.886478
[3000]	training's auc: 0.983206	valid_1's auc: 0.88977
[4000]	training's auc: 0.99131	valid_1's auc: 0.89105
[5000]	training's auc: 0.995977	valid_1's auc: 0.89143
[6000]	training's auc: 0.998347	valid_1's auc: 0.891611
[7

In [33]:
# print some information to get an idea of how well it worked
train_predictions = list(map(lambda x : math.floor(x+0.5), train_predictions))
print(roc_auc_score(target.values, train_predictions))

test_predictions = list(map(lambda x : math.floor(x+0.5), test_predictions))
print(sum(test_predictions)/len(test_predictions))

0.7197365380527642
0.03030043531006922


In [35]:
# save test predictions to csv file with no headers
submission_df = pd.DataFrame(test_predictions)
submission_df.to_csv("submission.csv", encoding='utf-8', index=False)