In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import KFold,StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import os

In [2]:
train_data = pd.read_csv('data/age_train.csv', header=None)
test_data = pd.read_csv('data/age_test.csv', header=None)
train_data.columns = ['uId', 'age_group']
test_data.columns = ['uId']
data_id = pd.concat([train_data, test_data], axis=0, sort=False)['uId']
data_id = data_id.reset_index()
del data_id['index']

In [3]:
f1 = pd.read_csv('lgb_feature/f1.csv')
f2 = pd.read_csv('lgb_feature/f2.csv')
f3 = pd.read_csv('lgb_feature/f3.csv')
f4 = pd.read_csv('lgb_feature/f4.csv')
f5 = pd.read_csv('lgb_feature/f5.csv')
f6 = pd.read_csv('lgb_feature/f6.csv')


In [6]:
# train_lgb_snake = pd.read_hdf('lgb_feature/lgb_633_proba.hdf', key='train')
# test_lgb_snake = pd.read_hdf('lgb_feature/lgb_633_proba.hdf', key='test')
# lgb_snake = pd.concat([train_lgb_snake, test_lgb_snake], sort=False, axis=0)
# del lgb_snake['age_group']
# lgb_snake.columns = ['snake1', 'snake2', 'snake3', 'snake4', 'snake5', 'snake6']
# lgb_snake = lgb_snake.reset_index()
# del lgb_snake['uid']

In [5]:
# train_lgb_snake = pd.read_hdf('lgb_feature/ctr_649.hdf', key='train')
# test_lgb_snake = pd.read_hdf('lgb_feature/ctr_649.hdf', key='test')
# ctr_snake = pd.concat([train_lgb_snake, test_lgb_snake], sort=False, axis=0)
# ctr_snake.columns = ['ctr_snake1', 'ctr_snake2', 'ctr_snake3', 'ctr_snake4',
#                                  'ctr_snake5', 'ctr_snake6']
# ctr_snake = ctr_snake.reset_index()
# del ctr_snake['index']
# ctr_snake

In [7]:
feature = pd.concat([
                    f1, f2, f3,
                    f4, f5, f6,
#     f7,
#                     nn_1, nn_2, nn_3,
#                     lgb_snake, ctr_snake
#                     snake_1
], axis=1, sort=False)
train_feature = feature[:len(train_data)]
train_feature['label'] = train_data['age_group'] - 1
test_feature = feature[len(train_data):]

not_cols = ['label']
cols = [col for col in train_feature.columns if col not in not_cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [12]:
def evaluate_5_fold(train_df, test_df, cols, test=False):
    kf = KFold(n_splits=5, shuffle=True, random_state=1017)
    y_test = 0
    oof_train = np.zeros((train_df.shape[0], 6))
    for i, (train_index, val_index) in enumerate(kf.split(train_df[cols],train_df.label)):
        X_train, y_train = train_df.loc[train_index, cols], train_df.label.values[train_index]
        X_val, y_val = train_df.loc[val_index, cols], train_df.label.values[val_index]

        lgb_train = lgb.Dataset(
            X_train, y_train)
        lgb_eval = lgb.Dataset(
            X_val, y_val,
            reference=lgb_train)
        params = {
            'boosting_type': 'gbdt',
            'learning_rate' : 0.04, 
            'verbose': 0,
            'num_leaves':256,
            # 'max_depth':8, 
            # 'max_bin':10, 
            # 'lambda_l2': 1, 
            'min_child_weight':30,
            "num_class":6,
            'objective':'multiclass', 
            'feature_fraction':0.4,
            'bagging_fraction':0.7, # 0.9是目前最优的
            'bagging_freq':5,  # 3是目前最优的
#           'min_data': 500,
            'seed': 1017,
            'metric':'multi_error',
            'nthread': 50,
            # 'silent': True,
        }

        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=40000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=100,
                        verbose_eval=100,
                        )
        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        if test:
            y_test += gbm.predict(test_df[cols], num_iteration=gbm.best_iteration)
        oof_train[val_index] = y_pred
        break
    auc = accuracy_score(train_df.label.values, np.argmax(oof_train, axis=1))
    y_test /= 5
    feature_list = pd.DataFrame()
    feature_list['names'] = cols
    feature_list['imp'] = gbm.feature_importance()
    feature_list = feature_list.sort_values(by='imp', ascending=False)
    print(feature_list)
    print('5 Fold auc:', auc)
    gc.collect()
    return auc, oof_train, y_test, feature_list

In [13]:
auc, oof_train, y_test, imp = evaluate_5_fold(train_feature, test_feature, cols, True)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_error: 0.39606
[200]	valid_0's multi_error: 0.388186
[300]	valid_0's multi_error: 0.384359
[400]	valid_0's multi_error: 0.38241
[500]	valid_0's multi_error: 0.381095
[600]	valid_0's multi_error: 0.380135
[700]	valid_0's multi_error: 0.379585
[800]	valid_0's multi_error: 0.379226
[900]	valid_0's multi_error: 0.378865
[1000]	valid_0's multi_error: 0.378688
[1100]	valid_0's multi_error: 0.378451
[1200]	valid_0's multi_error: 0.378296
[1300]	valid_0's multi_error: 0.378252
[1400]	valid_0's multi_error: 0.3781
[1500]	valid_0's multi_error: 0.378028
Early stopping, best iteration is:
[1432]	valid_0's multi_error: 0.377975
              names    imp
72    romLeftRation  39549
106  date_std_times  35945
102             app  34946
88      app_len_std  33663
93         date_std  33047
..              ...    ...
81          carrier   5364
82             os_1   4750
66       FFuncTimes   2744
83             os_2  

In [16]:
stack_lgb_train = pd.DataFrame(oof_train)
stack_lgb_test = pd.DataFrame(y_test)
stack_lgb = pd.DataFrame(np.around(pd.concat([stack_lgb_train, stack_lgb_test], axis=0, sort=False), 6))
stack_lgb.to_csv('result/lgb_stack_feature.csv', index=False)

In [17]:
result = np.argmax(y_test, axis=1) + 1
put_result = pd.DataFrame()
put_result['id'] = test_data['uId']
put_result['label'] = result
put_result.to_csv('result/submission.csv', index=False)