# Feature Engineering

In [19]:
import warnings
import numpy as np
import lightgbm as lgbm
from process_functions import train_test_loader

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries used below are hidden as well.
warnings.filterwarnings('ignore')

In [20]:
# Fetch the train/test features and targets via the project helper
# (presumably read from the given directory — see process_functions).
train_features, train_target, test_features, test_target = train_test_loader('../Data/Feature_Engineering')

print("Training Size = {}".format(train_features.shape))
print("Testing Size = {}".format(test_features.shape))

# `.sum()` returns one total per column; take the first total by position with
# `.iloc[0]` rather than `[0]`, which only works through pandas' deprecated
# integer-key fallback when the column label is not the integer 0.
# Assumes the targets are single-column DataFrames — TODO confirm.
print("No. of positive in training = {}".format(train_target.sum().iloc[0]))
print("No. of positive in testing = {}".format(test_target.sum().iloc[0]))

Training Size = (140000, 200)
Testing Size = (60000, 200)
No. of positive in training = 13964
No. of positive in testing = 6134


### Preparing LightGBM data

In [21]:
# Wrap the training split in a LightGBM Dataset.
train_lgbm = lgbm.Dataset(train_features, label=train_target)

# Build the validation Dataset from the training one so it uses train_lgbm
# as its reference.
test_lgbm = train_lgbm.create_valid(test_features, label=test_target)

### Training Model

In [None]:
# Hyper-parameters for the baseline model trained on the features as loaded.
lgbm_parameter = {
    'num_leaves' : 3,
    'min_data_in_leaf' : 20,
    'max_depth' : 16,
    'bagging_fraction' : 0.4,
    'bagging_freq' : 5,
    'feature_fraction' : 0.1,
    'learning_rate' : 0.01,
    'boosting' : 'gbdt',
    'random_state' : 0,
    'num_boost_round' : 100000,
    'objective' : 'binary',
    'num_threads' : 3,
    # The LightGBM key is `boost_from_average`; `boosting_from_average` is not
    # a recognised parameter name.
    'boost_from_average' : True,
    'metric' : 'auc',
}

# Train while tracking AUC on both the training and the validation Dataset;
# progress is reported every 10000 rounds and training stops early once the
# validation score has not improved for 10000 rounds.
bst = lgbm.train(
    lgbm_parameter,
    train_set=train_lgbm,
    valid_sets=[train_lgbm, test_lgbm],
    verbose_eval=10000,
    early_stopping_rounds=10000,
)

# cv_model = lgbm.cv(lgbm_parameter, train_set=cv_data, num_boost_round=100, nfold=10)

Training until validation scores don't improve for 10000 rounds.
[10000]	training's auc: 0.909846	valid_1's auc: 0.895208
[20000]	training's auc: 0.921065	valid_1's auc: 0.899274


#### Feature Importance

In [None]:
import matplotlib.pyplot as plt

# Gain-based importance of every feature in the baseline booster.
feature_imp = bst.feature_importance(importance_type='gain')

# Importances sorted from largest to smallest. The x positions are derived
# from the vector itself instead of a hard-coded 200, so the plot still works
# when the model was trained on a different number of features.
sorted_gain = np.sort(feature_imp)[::-1]

fig, ax = plt.subplots()
ax.bar(np.arange(len(sorted_gain)), sorted_gain)
ax.set_xlabel("Feature rank (sorted by gain)")
ax.set_ylabel("Gain")
ax.set_title("Baseline model feature importance")
plt.show()

In [None]:
# Print the five largest importance values, each followed by the position(s)
# in feature_imp that hold exactly that value.
top_five_gains = np.sort(feature_imp)[::-1][:5]
for gain in top_five_gains:
    print(gain, np.where(feature_imp == gain), sep='\n')

## Adding more features

In [None]:
# Take independent copies of the feature frames. Plain assignment would only
# create a second name for the same object, so the columns added to train_ft /
# test_ft below would also appear in train_features / test_features.
train_ft, test_ft = train_features.copy(), test_features.copy()

# The targets are only read in the cells below, so references are sufficient.
train_tar, test_tar = train_target, test_target

print("Training Features = {}".format(train_ft.shape))
print("Testing Features = {}".format(test_ft.shape))

#### Adding more features to the training data

In [None]:
# Derived training columns, listed in the order they are appended
# (var_200 ... var_204). Four of them are square(x) / sqrt(x) of an existing
# column; var_201 is a plain square.
train_extra = {
    'var_200': np.square(train_ft['var_81']) / np.sqrt(train_ft['var_81']),
    'var_201': np.square(train_ft['var_139']),
    'var_202': np.square(train_ft['var_12']) / np.sqrt(train_ft['var_12']),
    'var_203': np.square(train_ft['var_110']) / np.sqrt(train_ft['var_110']),
    'var_204': np.square(train_ft['var_6']) / np.sqrt(train_ft['var_6']),
}

for column_name, column_values in train_extra.items():
    train_ft[column_name] = column_values

#### Adding more features to the testing data

In [None]:
# Same derived columns as for the training frame, appended to the test frame
# in the same order (var_200 ... var_204).
test_extra = {
    'var_200': np.square(test_ft['var_81']) / np.sqrt(test_ft['var_81']),
    'var_201': np.square(test_ft['var_139']),
    'var_202': np.square(test_ft['var_12']) / np.sqrt(test_ft['var_12']),
    'var_203': np.square(test_ft['var_110']) / np.sqrt(test_ft['var_110']),
    'var_204': np.square(test_ft['var_6']) / np.sqrt(test_ft['var_6']),
}

for column_name, column_values in test_extra.items():
    test_ft[column_name] = column_values

In [None]:
# Report the shapes of the engineered feature frames and of the targets in a
# single print call, one entry per line.
shape_report = [
    "Training Features = {}".format(train_ft.shape),
    "Testing Features = {}".format(test_ft.shape),
    "--------------------------------------------",
    "Training Target = {}".format(train_tar.shape),
    "Testing Target = {}".format(test_tar.shape),
]
print("\n".join(shape_report))

### Preparing LightGBM data

In [None]:
# Dataset holding the engineered training features.
feature_train = lgbm.Dataset(data=train_ft, label=train_tar)

# The validation Dataset is derived from feature_train — the Dataset the model
# below is trained on — rather than from train_lgbm, which was built from the
# baseline features.
feature_test = feature_train.create_valid(data=test_ft, label=test_tar)

### Training Model

In [None]:
# Parameters are grouped by purpose and merged in the same key order as a
# single literal would give.

# Constraints on the shape of each tree.
tree_shape = {
    'num_leaves': 3,
    'min_data_in_leaf': 20,
    'max_depth': 16,
}

# Row (bagging) and column (feature) subsampling.
sampling = {
    'bagging_fraction': 0.4,
    'bagging_freq': 5,
    'feature_fraction': 0.1,
}

# Boosting procedure, objective, evaluation metric and runtime settings.
boosting_setup = {
    'learning_rate': 0.01,
    'boosting': 'gbdt',
    'random_state': 0,
    'num_boost_round': 100000,
    'objective': 'binary',
    'num_threads': 3,
    'boost_from_average': True,
    'metric': 'auc',
}

lgbm_parameter = {**tree_shape, **sampling, **boosting_setup}

# Train on the engineered features, evaluating on both the training and the
# validation Dataset; log every 10000 rounds and stop early after 10000 rounds
# without improvement.
ft_bst = lgbm.train(
    lgbm_parameter,
    train_set=feature_train,
    valid_sets=[feature_train, feature_test],
    verbose_eval=10000,
    early_stopping_rounds=10000,
)

# cv_model = lgbm.cv(lgbm_parameter, train_set=cv_data, num_boost_round=100, nfold=10)

In [None]:
# Display the shape of the originally loaded training features, e.g. to check
# whether the feature-engineering cells above changed it.
train_features.shape