In [10]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline

In [3]:
df = pd.read_csv("./train.csv").drop('id',axis=1)
df_clean = df.drop('defects',axis=1)
print(df.columns)

Index(['loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
       'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
       'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount', 'defects'],
      dtype='object')


In [4]:
for column in df_clean.columns:
    df_clean[column] = np.log1p(df_clean.pop(column))

In [5]:
X = df_clean
y = df['defects']

In [6]:
def lgbm_cl_bo(min_child_samples, colsample_bytree, learning_rate, num_leaves, reg_alpha, reg_lambda):
    
    params_lgbm = {}
    params_lgbm['min_child_samples'] = round(min_child_samples)
    params_lgbm['colsample_bytree'] = colsample_bytree
    params_lgbm['learning_rate'] = learning_rate
    params_lgbm['num_leaves'] = round(num_leaves)
    params_lgbm['reg_alpha'] = reg_alpha
    params_lgbm['reg_lambda'] = reg_lambda    
       
    params_lgbm['boosting_type'] ='gbdt'   # Manual optimization
    params_lgbm['objective'] ='binary'     # Manual optimization
    params_lgbm['subsample'] = 1.0
    params_lgbm['max_bin'] = 1023
    params_lgbm['n_jobs'] = -1

    scores = cross_val_score(LGBMClassifier(**params_lgbm, random_state=2920), X, y, scoring='roc_auc', cv=5).mean()
    score = scores.mean()
    return score

In [None]:
params_lgbm ={'min_child_samples':(800, 1200),
              'colsample_bytree':(0.3, 1.0),
              'learning_rate':(0.005, 0.1),
              'num_leaves':(20, 60),
              'reg_alpha':(0.0, 10.0),
              'reg_lambda':(0.0, 5.0)}

lgbm_bo = BayesianOptimization(lgbm_cl_bo, params_lgbm, random_state=2920)
lgbm_bo.maximize(n_iter=30, init_points=20)

In [8]:
pmax_bayes = lgbm_bo.max['params']
pmax_bayes

{'colsample_bytree': 0.5687937442159554,
 'learning_rate': 0.07712378501966154,
 'min_child_samples': 864.0979874652943,
 'num_leaves': 23.238382472492944,
 'reg_alpha': 0.6515291800331102,
 'reg_lambda': 3.1182674170838522}

In [9]:
model = LGBMClassifier(n_estimators= 20000, 
                       learning_rate= 0.07,
                       objective= 'binary', 
                       boosting_type= 'gbdt', 
                       
                       subsample= 1.0,
                       num_leaves= 23,  
                       max_bin= 1023,
                       n_jobs= -1,
                           
                       reg_alpha= 0.65,
                       reg_lambda= 3.12,
                       colsample_bytree= 0.568,
                       min_child_samples= 864,     
                       random_state= 1920)

In [12]:
transformed = pd.DataFrame(QuantileTransformer(output_distribution='normal').fit_transform(X))

pipeline = make_pipeline(QuantileTransformer(output_distribution='normal'), model)
pipeline.fit(X, y)

[LightGBM] [Info] Number of positive: 23064, number of negative: 78699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8505
[LightGBM] [Info] Number of data points in the train set: 101763, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226644 -> initscore=-1.227357
[LightGBM] [Info] Start training from score -1.227357


In [13]:
cross_val_score(pipeline, X, y, scoring='roc_auc', cv=10).mean()

[LightGBM] [Info] Number of positive: 20757, number of negative: 70829
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8455
[LightGBM] [Info] Number of data points in the train set: 91586, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226639 -> initscore=-1.227385
[LightGBM] [Info] Start training from score -1.227385
[LightGBM] [Info] Number of positive: 20757, number of negative: 70829
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8461
[LightGBM] [Info] Number of data points in the train set: 91586, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226639 -> initscore=-1.227385
[LightGBM] [Info] Start training from score -1.227385
[LightGBM] [

0.746201782733674

In [14]:
test_X = pd.read_csv("./test.csv").drop('id',axis=1)
results = pipeline.predict_proba(test_X)[:, 1]

submission = pd.DataFrame()

submission['id'] =  np.arange(101763,101763 + len(results))
submission['defects'] = results

print(submission)
submission.to_csv("./submission3.0.csv",index=False)

           id   defects
0      101763  0.764512
1      101764  0.636366
2      101765  0.625447
3      101766  0.696000
4      101767  0.150330
...       ...       ...
67837  169600  0.302343
67838  169601  0.139168
67839  169602  0.186478
67840  169603  0.175149
67841  169604  0.687957

[67842 rows x 2 columns]
