# LightGBM Modelling

In [1]:
import numpy as np
import lightgbm as lgbm
from process_functions import data_reader

In [2]:
X_train, y_train, X_test, y_idcode = data_reader('../Data')

print("Training Size = {}".format(X_train.shape))
print("Testing Size = {}".format(X_test.shape))

Training Size = (200000, 200)
Testing Size = (200000, 200)


### Spliting training data

In [None]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_target, test_target = train_test_split(X_train, y_train, test_size=0.2)

### Preparing LightGBM data

In [None]:
# train_lgbm = lgbm.Dataset(data=train_features, label=train_target)
# test_lgbm = train_lgbm.create_valid(data=test_features, label=test_target)

In [66]:
cv_data = lgbm.Dataset(data=X_train, label=y_train)

### Training Model

In [67]:
lgbm_parameter = {
    'num_leaves' : 2,
    'min_data_in_leaf' : 42,
    'max_depth' : 16,
    'bagging_fraction' : 0.3,
    'bagging_freq' : 5,
    'feature_fraction' : 0.2,
    'learning_rate' : 0.01,
    'boosting' : 'gbdt',
    'random_state' : 0,
    'num_boost_round' : 200000,
    'objective' : 'regression',
    'num_threads' : 4,
    'boosting_from_average' : False,
    'metric' : ['auc', 'binary_logloss']
}

bst = lgbm.train(lgbm_parameter, train_set=cv_data)

# cv_model = lgbm.cv(lgbm_parameter, train_set=cv_data, num_boost_round=100, nfold=10)



In [68]:
y_pred = bst.predict(data=X_test)

In [69]:
final_pred = np.array(y_pred.round(), dtype=int)

In [70]:
final_pred.sum()

2344

In [54]:
bst.params

{'num_leaves': 2,
 'min_data_in_leaf': 42,
 'max_depth': 16,
 'bagging_fraction': 0.3,
 'bagging_freq': 5,
 'feature_fraction': 0.2,
 'learning_rate': 0.01,
 'boosting': 'gbdt',
 'random_state': 0,
 'objective': 'binary',
 'num_threads': 4,
 'boosting_from_average': False,
 'metric': ['auc', 'binary_logloss']}

## LightGBM Scikit-Model

In [55]:
lgbm_model = lgbm.LGBMClassifier(boosting_type='gbdt')

In [56]:
from sklearn.model_selection import GridSearchCV

grid_parameter = {
#     'boosting_type' : ['gbdt', 'dart', 'goss'],
#     'num_leaves' : [10, 20, 30, 40, 50],
#     'max_depth ' : [1, 3, 5, 7, 10, -1],
#     'learning_rate' : [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1],
    'n_estimators' : [100],
#     'subsample_for_bin' : [10000, 50000, 100000, 200000],
    'objective' : ['regression'],
    'class_weight' : ['balanced'],
    
}

grid_clf = GridSearchCV(lgbm_model, grid_parameter, cv=10)

In [57]:
grid_clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100], 'objective': ['binary'], 'class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [40]:
grid_clf.cv_results_



{'mean_fit_time': array([ 155.79812521,  253.17120355,  492.64050364, 2399.64586395]),
 'std_fit_time': array([1.13166909e+00, 1.09040949e+00, 2.58332513e+00, 2.46828423e+03]),
 'mean_score_time': array([ 3.17757523,  6.64564902, 16.09894419, 33.39405072]),
 'std_score_time': array([0.09211921, 0.09747835, 0.1893271 , 0.42823631]),
 'param_class_weight': masked_array(data=['balanced', 'balanced', 'balanced', 'balanced'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[3000, 5000, 10000, 20000],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_objective': masked_array(data=['binary', 'binary', 'binary', 'binary'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'class_weight': 'balanced',
   'n_estimators': 3000,
   'objective': 'binary'},
  {'class_weight': 'balanced',

In [41]:
y_pred = grid_clf.predict(X_test)

# Generating Submission File

In [73]:
y_pred[540]

0.6073177377802635

In [74]:
y_pred

array([ 0.15764146,  0.25772026,  0.26451196, ..., -0.07005141,
        0.24042684,  0.13570887])

In [63]:
import pandas as pd
sub_data = {'ID_code': np.array(y_idcode), 'target': y_pred}
submission = pd.DataFrame(data=sub_data)

submission.to_csv('../Submissions/submission_6.csv', index = False)

In [64]:
submission

Unnamed: 0,ID_code,target
0,test_0,0.101918
1,test_1,0.201397
2,test_2,0.190919
3,test_3,0.174248
4,test_4,0.041159
5,test_5,0.001563
6,test_6,0.004625
7,test_7,0.171027
8,test_8,0.001800
9,test_9,0.005609
