# LightGBM Modelling

In [1]:
import numpy as np
import lightgbm as lgbm
from process_functions import data_reader

In [2]:
# Read the modelling inputs through the project helper. Below, y_train is used
# as the training label and y_idcode as the submission's ID column.
X_train, y_train, X_test, y_idcode = data_reader('../Data')

# Report both matrix shapes; one call with a newline separator prints the same
# two lines as two separate print statements would.
print(
    "Training Size = {}".format(X_train.shape),
    "Testing Size = {}".format(X_test.shape),
    sep="\n",
)

Training Size = (200000, 200)
Testing Size = (200000, 200)


### Splitting training data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled data as a validation set for early stopping.
# - random_state pins the shuffle so the split (and therefore the best
#   iteration found during training) is reproducible on Restart & Run All.
# - stratify keeps the class ratio of the target identical in both partitions.
# NOTE: despite their names, test_features / test_target are a validation
# split of X_train, not the competition test set (X_test).
train_features, test_features, train_target, test_target = train_test_split(
    X_train, y_train, test_size=0.33, random_state=0, stratify=y_train
)

### Preparing LightGBM data

In [4]:
# Wrap the training split in LightGBM's Dataset container.
train_lgbm = lgbm.Dataset(train_features, label=train_target)

# Derive the validation Dataset from the training one (create_valid) so that it
# is tied to train_lgbm as its reference dataset.
test_lgbm = train_lgbm.create_valid(test_features, label=test_target)

### Training Model

In [12]:
# Booster hyper-parameters for the binary objective, evaluated with AUC.
# The number of boosting rounds is deliberately NOT stored here: LightGBM
# treats `num_boost_round` inside params as an alias that overrides the
# keyword argument (and warns about it), so it is passed to train() instead.
lgbm_parameter = {
    'num_leaves': 3,            # with 3 leaves a tree is at most 2 levels deep,
    'min_data_in_leaf': 20,
    'max_depth': 16,            # ...so this depth limit is never the binding one
    'bagging_fraction': 0.4,
    'bagging_freq': 5,
    'feature_fraction': 0.1,
    'learning_rate': 0.01,
    'boosting': 'gbdt',
    'random_state': 0,
    'objective': 'binary',
    'num_threads': 3,
    'boosting_from_average': True,
    'metric': 'auc',
}

# Train with a very high round ceiling and let early stopping on the held-out
# split pick the effective number of rounds (exposed as bst.best_iteration).
# Progress is logged every 10000 rounds for both the training and validation set.
# NOTE: newer LightGBM releases replace `verbose_eval` / `early_stopping_rounds`
# with the log_evaluation / early_stopping callbacks; the keyword form is kept
# here to match the LightGBM version this notebook was run with.
bst = lgbm.train(
    lgbm_parameter,
    train_set=train_lgbm,
    num_boost_round=100000,
    valid_sets=[train_lgbm, test_lgbm],
    verbose_eval=10000,
    early_stopping_rounds=10000,
)

Training until validation scores don't improve for 10000 rounds.
[10000]	training's auc: 0.90986	valid_1's auc: 0.896584
[20000]	training's auc: 0.92129	valid_1's auc: 0.900365
[30000]	training's auc: 0.92964	valid_1's auc: 0.899897
Early stopping, best iteration is:
[20266]	training's auc: 0.921515	valid_1's auc: 0.900374


In [13]:
# Dataset over ALL labelled rows (no hold-out), used for the final refit.
full_data = lgbm.Dataset(X_train, label=y_train)

In [14]:
# Refit on all labelled data for exactly the number of rounds that early
# stopping selected on the hold-out split.
# Depending on the LightGBM version, bst.params may carry iteration-count or
# early-stopping entries from the tuning run. Any such alias found in params
# takes precedence over the num_boost_round argument (and early stopping would
# require a validation set, which this refit does not have), so strip them.
_ROUND_CONTROL_KEYS = {
    # aliases of num_iterations
    'num_iterations', 'num_iteration', 'n_iter', 'num_tree', 'num_trees',
    'num_round', 'num_rounds', 'nrounds', 'num_boost_round', 'n_estimators',
    'max_iter',
    # aliases of early_stopping_round
    'early_stopping_round', 'early_stopping_rounds', 'early_stopping',
    'n_iter_no_change',
}
final_params = {
    key: value for key, value in bst.params.items()
    if key not in _ROUND_CONTROL_KEYS
}

best_model = lgbm.train(params=final_params, train_set=full_data, num_boost_round=bst.best_iteration)

In [15]:
# Score the competition test set with the refitted booster, using the same
# number of rounds that early stopping selected.
y_pred = best_model.predict(X_test, num_iteration=bst.best_iteration)

In [16]:
# Turn the predicted scores into hard integer labels by rounding to the nearest integer.
final_pred = y_pred.round().astype(int)

In [17]:
# Sum of the rounded integer labels, i.e. how many test rows were rounded to 1.
final_pred.sum()

6809

In [18]:
# Boosting round selected by early stopping during the hold-out training run.
bst.best_iteration

20266

In [19]:
# Show the parameters stored on the refitted booster, as a record of the final configuration.
best_model.params

{'num_leaves': 3,
 'min_data_in_leaf': 20,
 'max_depth': 16,
 'bagging_fraction': 0.4,
 'bagging_freq': 5,
 'feature_fraction': 0.1,
 'learning_rate': 0.01,
 'boosting': 'gbdt',
 'random_state': 0,
 'objective': 'binary',
 'num_threads': 3,
 'boosting_from_average': True,
 'metric': 'auc'}

## LightGBM Scikit-learn API: Hyperparameter Tuning with GridSearchCV

In [None]:
# Base estimator for the grid search: LightGBM's scikit-learn style classifier
# with gradient-boosted decision trees as the boosting type.
lgbm_model = lgbm.LGBMClassifier(boosting_type='gbdt')

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the classifier. Only the active keys are searched; the
# commented entries are candidate ranges kept for wider searches.
grid_parameter = {
    # 'boosting_type': ['gbdt', 'dart', 'goss'],
    # 'num_leaves': [10, 20, 30, 40, 50],
    # 'max_depth': [1, 3, 5, 7, 10, -1],
    # 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1],
    'n_estimators': [100],
    # 'subsample_for_bin': [10000, 50000, 100000, 200000],
    # The target is binary, so the classifier must optimise the binary
    # objective; 'regression' would make its outputs unusable as class
    # probabilities.
    'objective': ['binary'],
    'class_weight': ['balanced'],
}

# 10-fold cross-validated search. Candidates are ranked by ROC AUC so model
# selection uses the same metric as the booster trained earlier in this
# notebook ('metric': 'auc'), rather than the default accuracy.
grid_clf = GridSearchCV(lgbm_model, grid_parameter, scoring='roc_auc', cv=10)

In [None]:
# Run the cross-validated grid search on the full labelled training data.
grid_clf.fit(X_train, y_train)

In [None]:
# Inspect the cross-validation results recorded by the search.
grid_clf.cv_results_

In [None]:
# Predictions of the best grid-search estimator on the competition test set.
# Stored under its own name: `y_pred` holds the booster's probabilities that
# the submission cells below write to disk, and reusing that name here would
# silently replace them with hard class labels on Restart & Run All.
grid_pred = grid_clf.predict(X_test)

# Generating Submission File

In [20]:
# Spot-check a single prediction (row 540) to confirm scores are probabilities.
y_pred[540]

0.881783220169492

In [21]:
# Preview the prediction vector that goes into the submission.
y_pred

array([0.08917919, 0.20815963, 0.21362887, ..., 0.00351414, 0.09946979,
       0.07257135])

In [22]:
import pandas as pd

# Pair each test ID code with its predicted score; these two columns make up
# the submission file.
sub_data = {'ID_code': np.array(y_idcode), 'target': y_pred}
submission = pd.DataFrame(sub_data)

# Write without the index so the CSV contains only ID_code and target.
submission.to_csv('../Submissions/submission_9.csv', index=False)

In [23]:
# Display the submission frame as a final sanity check of IDs and scores.
submission

Unnamed: 0,ID_code,target
0,test_0,0.089179
1,test_1,0.208160
2,test_2,0.213629
3,test_3,0.177947
4,test_4,0.046682
5,test_5,0.001909
6,test_6,0.005185
7,test_7,0.201242
8,test_8,0.002512
9,test_9,0.007757
