In [2]:
import numpy as np
import pandas as pd
import sklearn as sk
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer, StandardScaler

# Load the raw files. `test_set` keeps its ID_code column because the
# submission cell reads it later.
train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

# ID_code is an identifier, not a feature. After the drop, column 0 of
# train_set is what the code below uses as the label.
train_set = train_set.drop(columns=['ID_code'])

# Names of the raw feature columns (everything after column 0). Captured once,
# before any engineered column is added, so the row statistics below are
# computed from the original features only.
idx = train_set.columns.values[1:]
for df in [test_set, train_set]:
    raw = df[idx]  # slice once instead of once per statistic
    df['sum'] = raw.sum(axis=1)
    df['min'] = raw.min(axis=1)
    df['max'] = raw.max(axis=1)
    df['mean'] = raw.mean(axis=1)
    df['std'] = raw.std(axis=1)
    df['skew'] = raw.skew(axis=1)
    df['kurt'] = raw.kurtosis(axis=1)
    df['med'] = raw.median(axis=1)

# Convert the frame once, then split: column 0 -> y, remaining columns -> X.
train_values = np.array(train_set)
y = train_values[:, 0]
X = train_values[:, 1:]

# Both transformers are fitted on the training matrix here and reused
# (transform only) on the test matrix in a later cell.
scaller = StandardScaler()
X = scaller.fit_transform(X)

# QuantileTransformer estimates its quantiles from a random subsample of rows
# when the data is larger than its `subsample` limit, so it is seeded to make
# the transformed features identical across runs.
q_scaller = QuantileTransformer(random_state=42)
X = q_scaller.fit_transform(X)

np.shape(X)

(200000, 208)

In [3]:
# LightGBM training parameters passed to lgb.train.
# Row bagging is configured only through `bagging_fraction` / `bagging_freq`:
# `subsample` is an alias of `bagging_fraction`, and the original dict set both
# to different values (0.8 vs 0.81). LightGBM ignores the alias when the
# primary name is present, so the conflicting entry was removed.
params = {'num_leaves': 9,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.0123,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8201,
         'bagging_seed': 11,
         'reg_alpha': 1.728910519108444,
         'reg_lambda': 4.9847051755586085,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'min_gain_to_split': 0.01077313523861969,
         'min_child_weight': 19.428902804238373,
         'num_threads': 4}


In [4]:
# Build the test feature matrix with the transformers already fitted on the
# training data (transform only, no re-fitting).
# Column 0 of test_set is skipped -- presumably ID_code, which the training
# frame dropped by name; confirm against test.csv.
test_raw = np.array(test_set)[:, 1:].astype(float)
test_X = q_scaller.transform(scaller.transform(test_raw))

In [5]:
# Stratified 5-fold splitter for cross-validation. The shuffle is seeded so
# the fold assignment is the same every time the notebook is re-run.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
# Sanity check: how many train/validation splits the splitter will yield.
n_folds = folds.get_n_splits(X, y)
n_folds

5

In [9]:
# Train one LightGBM model per CV fold and average their test-set predictions.
# The divisor comes from the splitter itself rather than a hard-coded 5, so
# changing n_splits cannot silently mis-scale the average.
n_folds = folds.get_n_splits()
y_pred = np.zeros(len(test_X))
for train_index, test_index in folds.split(X, y):
    # NOTE: `test_index` selects this fold's held-out validation rows of X,
    # not rows of the test set (`test_X`).
    train_data = lgb.Dataset(X[train_index,:], label = y[train_index])
    val_data = lgb.Dataset(X[test_index,:], label = y[test_index])
    # train_data is listed in valid_sets so its AUC is logged next to the
    # validation AUC every 1000 rounds.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; LightGBM 4.0 removed them in
    # favour of callbacks. Confirm the installed version before upgrading.
    model = lgb.train(params, train_data, num_boost_round=15000,
                  valid_sets=[train_data, val_data],
                  verbose_eval = 1000, early_stopping_rounds = 200)
    # Predict with the early-stopped iteration of this fold's model.
    y_pred += model.predict(test_X,
                           num_iteration=model.best_iteration)
# After the loop `model` holds only the last fold's booster.
y_pred /= n_folds

Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.88941	valid_1's auc: 0.86355
[2000]	training's auc: 0.913579	valid_1's auc: 0.882586
[3000]	training's auc: 0.924802	valid_1's auc: 0.890298
[4000]	training's auc: 0.931667	valid_1's auc: 0.894451
[5000]	training's auc: 0.936712	valid_1's auc: 0.896671
[6000]	training's auc: 0.941098	valid_1's auc: 0.897912
[7000]	training's auc: 0.945359	valid_1's auc: 0.898304
[8000]	training's auc: 0.949333	valid_1's auc: 0.8986
Early stopping, best iteration is:
[7945]	training's auc: 0.949136	valid_1's auc: 0.898662
Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.889969	valid_1's auc: 0.861768
[2000]	training's auc: 0.91458	valid_1's auc: 0.8805
[3000]	training's auc: 0.925959	valid_1's auc: 0.88812
[4000]	training's auc: 0.932749	valid_1's auc: 0.89161
[5000]	training's auc: 0.937657	valid_1's auc: 0.89323
[6000]	training's auc: 0.942025	valid_1's auc: 0.894008
[7000]	tr

In [None]:
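# Single hold-out alternative to the CV loop. X_train / X_test / y_train /
# y_test are not defined in the cells shown here and must be created first.
# train_data = lgb.Dataset(X_train, label = y_train)
# val_data = lgb.Dataset(X_test, label = y_test)
# model = lgb.train(params, train_data, num_boost_round=15000,
#                   valid_sets=[train_data, val_data],
#                   verbose_eval = 1000, early_stopping_rounds = 200)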

In [None]:
# Built-in 10-fold cross-validation on the full training matrix.
# full_data = lgb.Dataset(X, label = y)
# model_2 = lgb.cv(params, full_data, num_boost_round=15000,
#                  nfold = 10, verbose_eval = 1000, 
#                  early_stopping_rounds = 300)

In [None]:
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y, (model.predict(X) > 0.5))
# print(cm)
# print((cm[0,1]+ cm[1,1])/np.sum(cm))

In [10]:
# Build the submission frame with its final column names up front.
# The original renamed the columns with `anwser.set_axis([...], axis=1)` but
# discarded the result; set_axis returns a new frame on current pandas, so the
# prediction column was written to the CSV under the name `0` instead of
# `target`.
anwser = pd.DataFrame({'ID_code': test_set['ID_code'].values,
                       'target': y_pred},
                      columns=['ID_code', 'target'])

anwser.to_csv('anwser.csv', index=False)

# Last expression of the cell so the preview is actually displayed.
anwser.head()

  """


In [None]:
# Persist the booster currently bound to `model`. The CV loop rebinds `model`
# on every fold, so this saves the last fold's model only, not the 5-model
# average behind `y_pred`.
model_path = 'lgbmodel'
model.save_model(model_path)