In [34]:
import pandas as pd
import numpy as np

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import os
from os.path import expanduser

import json

from sklearn.model_selection  import train_test_split

In [59]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
NB_WORDS_DATA_FILE = 'nb_words.json'

In [36]:
DATA_IN_PATH = expanduser('~/.kaggle/competitions/quora-question-pairs/')
train_data = os.path.join(DATA_IN_PATH, 'train.csv')
test_data = os.path.join(DATA_IN_PATH, 'test.csv')

In [37]:
train_df = pd.read_csv(train_data, encoding='utf-8')
train_df = train_df.dropna() #drop empty

In [38]:
test_df = pd.read_csv(test_data, encoding='utf-8')
test_df = test_df.dropna() #drop empty

In [39]:
train_questions = list(train_df['question1']) + list(train_df['question2'])
train_tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
train_tokenizer.fit_on_texts(train_questions)
word_index = train_tokenizer.word_index

train_question1_word_sequences = train_tokenizer.texts_to_sequences(train_df.question1)
train_question2_word_sequences = train_tokenizer.texts_to_sequences(train_df.question2)

print("Words in index: %d" % len(word_index))

Words in index: 95595


In [40]:
test_questions = list(test_df['question1']) + list(test_df['question2'])
test_tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
test_tokenizer.fit_on_texts(test_questions)
word_index = test_tokenizer.word_index

test_question1_word_sequences = test_tokenizer.texts_to_sequences(test_df.question1)
test_question2_word_sequences = test_tokenizer.texts_to_sequences(test_df.question2)

print("Words in index: %d" % len(word_index))

Words in index: 101312


In [55]:
is_duplicateTrain = train_df.is_duplicate

In [56]:
train_q1_data = pad_sequences(train_question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
train_q2_data = pad_sequences(train_question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = np.array(is_duplicateTrain, dtype=int)
print('Shape of question1 data tensor:', train_q1_data.shape)
print('Shape of question2 data tensor:', train_q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)
Shape of label tensor: (404287,)


In [43]:
test_q1_data = pad_sequences(test_question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_q2_data = pad_sequences(test_question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = np.array(is_duplicateTrain, dtype=int)
print('Shape of question1 data tensor:', test_q1_data.shape)
print('Shape of question2 data tensor:', test_q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (2345790, 25)
Shape of question2 data tensor: (2345790, 25)
Shape of label tensor: (404287,)


In [44]:
data_prepro_configs = {'vocab': train_tokenizer.word_index, 'vocab_size': len(train_tokenizer.word_counts)}

In [45]:
train_input = np.stack((train_q1_data, train_q2_data), axis=1) 
train_label = labels

In [46]:
train_input, eval_input, train_label, eval_label = train_test_split(train_input, train_label, test_size=0.2, random_state=4242)

In [47]:
import xgboost as xgb

In [48]:
params = {} # 파라미터를 통해 XGB모델에 넣어 주자 
params['objective'] = 'binary:logistic' # 로지스틱 예측을 통해서 
params['eval_metric'] = 'rmse' # root mean square error를 사용 

In [49]:
# XGB에서 DMatrix는 텍스트 입력 형식 지원
train_data = xgb.DMatrix(train_input[:,0,:], label=train_label) # 학습 데이터 읽어 오기
eval_data = xgb.DMatrix(eval_input[:,0,:], label=eval_label) # 평가 데이터 읽어 오기

# 손실을 보기 위해 손실 값을 측정하기 위해 값을 쌍으로 넣는다. 
list = [(train_data, 'train'), (eval_data, 'valid')]

In [50]:
bst = xgb.train(params, train_data, 1000, list, early_stopping_rounds=10) # 학습

[15:02:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[0]	train-rmse:0.481806	valid-rmse:0.482044
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[15:02:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[1]	train-rmse:0.471209	valid-rmse:0.471745
[15:02:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[2]	train-rmse:0.464538	valid-rmse:0.46544
[15:02:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[3]	train-rmse:0.460162	valid-rmse:0.46139
[15:02:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[4]	train-rmse:0.457468	valid-rmse:0.458893
[15:02:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 

[15:02:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[52]	train-rmse:0.431124	valid-rmse:0.437094
[15:02:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=6
[53]	train-rmse:0.431037	valid-rmse:0.43703
[15:02:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[54]	train-rmse:0.430526	valid-rmse:0.436727
[15:02:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 0 pruned nodes, max_depth=6
[55]	train-rmse:0.430239	valid-rmse:0.436542
[15:02:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[56]	train-rmse:0.429932	valid-rmse:0.436321
[15:02:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[57]	train-rmse:0.429349	valid-rmse:0.435867
[15:02:57] src/tree/updater_prune.cc:74: tree pruning 

[15:03:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[105]	train-rmse:0.420123	valid-rmse:0.430332
[15:03:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[106]	train-rmse:0.419823	valid-rmse:0.430179
[15:03:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=6
[107]	train-rmse:0.419739	valid-rmse:0.430134
[15:03:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=6
[108]	train-rmse:0.419704	valid-rmse:0.430129
[15:03:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=6
[109]	train-rmse:0.419689	valid-rmse:0.430125
[15:03:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[110]	train-rmse:0.419659	valid-rmse:0.430121
[15:03:18] src/tree/updater_prune.cc:74: tree pruni

[15:03:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[157]	train-rmse:0.411891	valid-rmse:0.426292
[15:03:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=6
[158]	train-rmse:0.411848	valid-rmse:0.426273
[15:03:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[159]	train-rmse:0.411636	valid-rmse:0.426163
[15:03:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 0 pruned nodes, max_depth=6
[160]	train-rmse:0.411411	valid-rmse:0.426059
[15:03:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6
[161]	train-rmse:0.411199	valid-rmse:0.425956
[15:03:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 88 extra nodes, 0 pruned nodes, max_depth=6
[162]	train-rmse:0.411127	valid-rmse:0.425926
[15:03:40] src/tree/updater_prune.cc:74: tree pru

[15:03:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 0 pruned nodes, max_depth=6
[209]	train-rmse:0.406418	valid-rmse:0.423982
[15:03:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[210]	train-rmse:0.406353	valid-rmse:0.423968
[15:04:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[211]	train-rmse:0.406155	valid-rmse:0.423895
[15:04:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[212]	train-rmse:0.405982	valid-rmse:0.423824
[15:04:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=6
[213]	train-rmse:0.405977	valid-rmse:0.423825
[15:04:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[214]	train-rmse:0.405807	valid-rmse:0.423767
[15:04:02] src/tree/updater_prune.cc:74: tree pr

[15:04:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[261]	train-rmse:0.400145	valid-rmse:0.421519
[15:04:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 90 extra nodes, 0 pruned nodes, max_depth=6
[262]	train-rmse:0.400063	valid-rmse:0.421502
[15:04:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 88 extra nodes, 0 pruned nodes, max_depth=6
[263]	train-rmse:0.39999	valid-rmse:0.421496
[15:04:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_depth=6
[264]	train-rmse:0.399982	valid-rmse:0.421496
[15:04:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 0 pruned nodes, max_depth=6
[265]	train-rmse:0.399845	valid-rmse:0.421419
[15:04:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[266]	train-rmse:0.399755	valid-rmse:0.421407
[15:04:24] src/tree/updater_prune.cc:74: tree prunin

[15:04:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[313]	train-rmse:0.395139	valid-rmse:0.419592
[15:04:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 0 pruned nodes, max_depth=6
[314]	train-rmse:0.394974	valid-rmse:0.41952
[15:04:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 110 extra nodes, 0 pruned nodes, max_depth=6
[315]	train-rmse:0.39479	valid-rmse:0.419396
[15:04:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=6
[316]	train-rmse:0.394662	valid-rmse:0.419349
[15:04:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[317]	train-rmse:0.394539	valid-rmse:0.419301
[15:04:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[318]	train-rmse:0.394429	valid-rmse:0.419219
[15:04:45] src/tree/updater_prune.cc:74: tree prun

[15:05:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=6
[365]	train-rmse:0.390285	valid-rmse:0.417763
[15:05:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=6
[366]	train-rmse:0.390122	valid-rmse:0.41769
[15:05:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[367]	train-rmse:0.390077	valid-rmse:0.417697
[15:05:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=6
[368]	train-rmse:0.390045	valid-rmse:0.417696
[15:05:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 0 pruned nodes, max_depth=6
[369]	train-rmse:0.389991	valid-rmse:0.417683
[15:05:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 110 extra nodes, 0 pruned nodes, max_depth=6
[370]	train-rmse:0.389797	valid-rmse:0.417617
[15:05:07] src/tree/updater_prune.cc:74: tree pruni

[15:05:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[417]	train-rmse:0.386201	valid-rmse:0.416303
[15:05:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 0 pruned nodes, max_depth=6
[418]	train-rmse:0.386124	valid-rmse:0.416296
[15:05:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[419]	train-rmse:0.385969	valid-rmse:0.416255
[15:05:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[420]	train-rmse:0.385818	valid-rmse:0.416227
[15:05:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[421]	train-rmse:0.385682	valid-rmse:0.416179
[15:05:28] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=6
[422]	train-rmse:0.385648	valid-rmse:0.416181
[15:05:28] src/tree/updater_prune.cc:74: tree pr

[15:05:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[469]	train-rmse:0.382044	valid-rmse:0.415073
[15:05:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 0 pruned nodes, max_depth=6
[470]	train-rmse:0.381971	valid-rmse:0.415032
[15:05:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[471]	train-rmse:0.381848	valid-rmse:0.414974
[15:05:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[472]	train-rmse:0.381738	valid-rmse:0.414933
[15:05:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 0 pruned nodes, max_depth=6
[473]	train-rmse:0.381703	valid-rmse:0.414934
[15:05:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=6
[474]	train-rmse:0.381676	valid-rmse:0.414934
[15:05:51] src/tree/updater_prune.cc:74: tree prun

[15:06:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 90 extra nodes, 0 pruned nodes, max_depth=6
[521]	train-rmse:0.377746	valid-rmse:0.41359
[15:06:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=6
[522]	train-rmse:0.377738	valid-rmse:0.413591
[15:06:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=6
[523]	train-rmse:0.377734	valid-rmse:0.413591
[15:06:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[524]	train-rmse:0.377615	valid-rmse:0.413565
[15:06:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 88 extra nodes, 0 pruned nodes, max_depth=6
[525]	train-rmse:0.377554	valid-rmse:0.413572
[15:06:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[526]	train-rmse:0.37754	valid-rmse:0.413573
[15:06:12] src/tree/updater_prune.cc:74: tree pruning 

[15:06:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=6
[573]	train-rmse:0.372689	valid-rmse:0.412053
[15:06:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 0 pruned nodes, max_depth=6
[574]	train-rmse:0.372599	valid-rmse:0.412047
[15:06:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[575]	train-rmse:0.372539	valid-rmse:0.412025
[15:06:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 68 extra nodes, 0 pruned nodes, max_depth=6
[576]	train-rmse:0.372493	valid-rmse:0.412016
[15:06:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=6
[577]	train-rmse:0.372481	valid-rmse:0.412019
[15:06:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 80 extra nodes, 0 pruned nodes, max_depth=6
[578]	train-rmse:0.372403	valid-rmse:0.412003
[15:06:35] src/tree/updater_prune.cc:74: tree pruni

[15:06:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=6
[625]	train-rmse:0.369539	valid-rmse:0.411296
[15:06:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=6
[626]	train-rmse:0.369537	valid-rmse:0.411297
[15:06:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 0 pruned nodes, max_depth=6
[627]	train-rmse:0.369495	valid-rmse:0.411268
[15:06:57] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=6
[628]	train-rmse:0.369446	valid-rmse:0.411254
[15:06:57] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 88 extra nodes, 0 pruned nodes, max_depth=6
[629]	train-rmse:0.369418	valid-rmse:0.411249
[15:06:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=6
[630]	train-rmse:0.369413	valid-rmse:0.41125
[15:06:58] src/tree/updater_prune.cc:74: tree pruning

[15:07:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 80 extra nodes, 0 pruned nodes, max_depth=6
[677]	train-rmse:0.365865	valid-rmse:0.410213
[15:07:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 90 extra nodes, 0 pruned nodes, max_depth=6
[678]	train-rmse:0.365755	valid-rmse:0.410173
[15:07:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[679]	train-rmse:0.365602	valid-rmse:0.410128
[15:07:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 110 extra nodes, 0 pruned nodes, max_depth=6
[680]	train-rmse:0.365436	valid-rmse:0.410044
[15:07:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=6
[681]	train-rmse:0.365361	valid-rmse:0.410017
[15:07:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[682]	train-rmse:0.365263	valid-rmse:0.41
[15:07:21] src/tree/updater_prune.cc:74: tree pruning 

[15:07:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=6
[729]	train-rmse:0.36181	valid-rmse:0.408957
[15:07:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[730]	train-rmse:0.361746	valid-rmse:0.408946
[15:07:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6
[731]	train-rmse:0.361654	valid-rmse:0.408901
[15:07:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[732]	train-rmse:0.361561	valid-rmse:0.408884
[15:07:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=6
[733]	train-rmse:0.361506	valid-rmse:0.408871
[15:07:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=6
[734]	train-rmse:0.361492	valid-rmse:0.408869
[15:07:43] src/tree/updater_prune.cc:74: tree pruni

[15:08:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 0 pruned nodes, max_depth=6
[781]	train-rmse:0.357927	valid-rmse:0.407865
[15:08:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[782]	train-rmse:0.357867	valid-rmse:0.407848
[15:08:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=6
[783]	train-rmse:0.35786	valid-rmse:0.407852
[15:08:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=6
[784]	train-rmse:0.357852	valid-rmse:0.407852
[15:08:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6
[785]	train-rmse:0.35777	valid-rmse:0.407829
[15:08:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=6
[786]	train-rmse:0.357767	valid-rmse:0.407829
[15:08:05] src/tree/updater_prune.cc:74: tree pruning

[15:08:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 0 pruned nodes, max_depth=6
[833]	train-rmse:0.354432	valid-rmse:0.407106
[15:08:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=6
[834]	train-rmse:0.354405	valid-rmse:0.407092
[15:08:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[835]	train-rmse:0.354373	valid-rmse:0.407077
[15:08:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=6
[836]	train-rmse:0.354334	valid-rmse:0.407074
[15:08:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=6
[837]	train-rmse:0.35431	valid-rmse:0.407076
[15:08:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 90 extra nodes, 0 pruned nodes, max_depth=6
[838]	train-rmse:0.354289	valid-rmse:0.40707
[15:08:27] src/tree/updater_prune.cc:74: tree pruning

[15:08:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 0 pruned nodes, max_depth=6
[885]	train-rmse:0.351751	valid-rmse:0.406548
[15:08:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[886]	train-rmse:0.351733	valid-rmse:0.406547
[15:08:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 90 extra nodes, 0 pruned nodes, max_depth=6
[887]	train-rmse:0.351708	valid-rmse:0.40654
[15:08:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[888]	train-rmse:0.351546	valid-rmse:0.406474
[15:08:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[889]	train-rmse:0.351512	valid-rmse:0.406475
[15:08:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[890]	train-rmse:0.351494	valid-rmse:0.406468
[15:08:49] src/tree/updater_prune.cc:74: tree prunin

In [51]:
test_input = np.stack((test_q1_data, test_q2_data), axis=1) 

In [52]:
test_input.shape

(2345790, 2, 25)

In [53]:
test_data = xgb.DMatrix(test_input[:,0,:])
test_predict = bst.predict(test_data)

In [60]:
sub = pd.DataFrame()
sub['test_id'] = test_df['test_id']
sub['is_duplicate'] = test_predict
sub.to_csv(DATA_OUT_PATH+'simple_xgb.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: './data_out/simple_xgb.csv'