In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
print(train.shape)
print(test.shape)

(200000, 202)
(200000, 201)


In [4]:
test['target'] = -1
comb = pd.concat([train,test],axis=0,sort=True)

In [5]:
#frequency encoding
for i in range(200):
    fe = comb['var_'+str(i)].value_counts()
    name = 'var_'+str(i)+'_FE'
    comb[name] = comb['var_'+str(i)].map(fe)
train = comb[:len(train)]
test = comb[len(train):]

In [6]:
print(train.shape)
print(test.shape)

(200000, 402)
(200000, 402)


In [7]:
target = train['target']
del train['target']
del test['target']
oof = np.zeros(len(train))
predictions = test[['ID_code']]

test_id = test['ID_code']

train.drop('ID_code',axis=1 , inplace=True)
test.drop('ID_code',axis=1 , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [17]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 1,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [18]:
# frolds 지정
num_folds = 5
#.columns를 통해 변수 지정
features = [c for c in train.columns ]

#계층별 kfold
folds = StratifiedKFold(n_splits=num_folds, shuffle=False, random_state=2319)

oof = np.zeros(len(train))
predictions = np.zeros(len(target))

print('Light GBM Model')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    clf = lgb.train(param, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Light GBM Model
Fold idx:1
Training until validation scores don't improve for 4000 rounds.
[1000]	training's auc: 0.88686	valid_1's auc: 0.862692
[2000]	training's auc: 0.916524	valid_1's auc: 0.887533
[3000]	training's auc: 0.930331	valid_1's auc: 0.896773
[4000]	training's auc: 0.938656	valid_1's auc: 0.901296
[5000]	training's auc: 0.945267	valid_1's auc: 0.903284
[6000]	training's auc: 0.951194	valid_1's auc: 0.904319
[7000]	training's auc: 0.956445	valid_1's auc: 0.904958
[8000]	training's auc: 0.961253	valid_1's auc: 0.904933
[9000]	training's auc: 0.965682	valid_1's auc: 0.905199
[10000]	training's auc: 0.969807	valid_1's auc: 0.905051
[11000]	training's auc: 0.973506	valid_1's auc: 0.905091
[12000]	training's auc: 0.976883	valid_1's auc: 0.904803
[13000]	training's auc: 0.979916	valid_1's auc: 0.90472
Early stopping, best iteration is:
[9587]	training's auc: 0.968158	valid_1's auc: 0.90534
Fold idx:2
Training until validation scores don't improve for 4000 rounds.
[1000]	trainin

In [19]:
# submission
sub_df = pd.DataFrame({"ID_code":test_id})
sub_df["target"] = predictions
sub_df.to_csv("lgboost_oof_value_count.csv", index=False)