In [1]:
import pandas as pd
import numpy as np

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import os
from os.path import expanduser

import json

from sklearn.model_selection  import train_test_split

In [2]:
# ---- Notebook configuration ----

# Directory the inputs are read from and directory results are written to.
DATA_OUT_PATH = './data_out/'
DATA_IN_PATH = './data_in/'

# File names of the training arrays (question 1, question 2, labels).
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'

# File names of the test arrays (question 1, question 2).
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'

# Vocabulary / sequence settings. These are not referenced by the visible cells;
# presumably they mirror the preprocessing step — TODO confirm.
NB_WORDS_DATA_FILE = 'nb_words.json'
MAX_SEQUENCE_LENGTH = 25
MAX_NB_WORDS = 200000

In [3]:
# Point DATA_IN_PATH at the local Kaggle download of the Quora Question Pairs data.
# NOTE(review): this replaces the './data_in/' value set in the configuration cell, so every
# later `DATA_IN_PATH + <file>` load resolves inside this directory too — confirm that is intended.
DATA_IN_PATH = expanduser('~/.kaggle/competitions/quora-question-pairs/')

# Read the raw test set and drop every row that contains a missing value.
test_csv_path = os.path.join(DATA_IN_PATH, 'test.csv')
test_data = pd.read_csv(test_csv_path, encoding='utf-8').dropna()

In [4]:
# Load the training arrays (question 1, question 2, labels) from DATA_IN_PATH.
# np.load is given the file path rather than an already-open file object, so NumPy
# opens and closes the file itself and no file handle is left dangling.
train_q1_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q1_DATA_FILE))
train_q2_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q2_DATA_FILE))
labels = np.load(os.path.join(DATA_IN_PATH, TRAIN_LABEL_DATA_FILE))

In [5]:
# Pair the two question arrays along a new axis 1, so row i holds (question 1, question 2).
question_pair_arrays = [train_q1_data, train_q2_data]
train_input = np.stack(question_pair_arrays, axis=1)

# The labels are used unchanged.
train_label = labels

In [7]:
# Display the stacked question-pair array (NumPy abbreviates the repr of large arrays).
train_input

array([[[   2,    3,    1, ...,    0,    0,    0],
        [   2,    3,    1, ...,    0,    0,    0]],

       [[   2,    3,    1, ...,    0,    0,    0],
        [   2,   43,  182, ...,    0,    0,    0]],

       [[   4,   13,    5, ...,    0,    0,    0],
        [   4,   13,  361, ...,    0,    0,    0]],

       ...,

       [[   2,    3,   49, ...,    0,    0,    0],
        [  75,   65, 4389, ...,    0,    0,    0]],

       [[   2,    3,    1, ...,    0,    0,    0],
        [   5,   72,  309, ..., 5021,   12, 6555]],

       [[   2,    3,   39, ...,    0,    0,    0],
        [   2,    3,   19, ...,    0,    0,    0]]], dtype=int32)

In [8]:
# Hold out 20% of the pairs for evaluation; the fixed random_state keeps the split reproducible.
# NOTE(review): this cell rebinds train_input / train_label, so re-running it on its own
# splits the already-reduced training portion again.
split_parts = train_test_split(
    train_input,
    train_label,
    test_size=0.2,
    random_state=4242,
)
train_input, eval_input, train_label, eval_label = split_parts

In [9]:
import xgboost as xgb

In [10]:
# Settings passed to the XGBoost model.
params = {
    'objective': 'binary:logistic',  # logistic objective for the binary label
    'eval_metric': 'rmse',           # evaluate with root mean square error
}

In [14]:
# Shape of the training array: (rows, 2 stacked questions, values per question).
train_input.shape

(323429, 2, 25)

In [18]:
# Summing over axis 1 adds the two question rows element-wise, collapsing each pair into one row.
train_input.sum(axis=1).shape

(323429, 25)

In [19]:
# Build the feature matrices: each pair is reduced to one row by summing its two
# question rows element-wise (axis 1).
train_features = train_input.sum(axis=1)
eval_features = eval_input.sum(axis=1)

# Wrap features and labels in XGBoost's DMatrix container.
train_data = xgb.DMatrix(train_features, label=train_label)  # training set
eval_data = xgb.DMatrix(eval_features, label=eval_label)  # evaluation set

# (DMatrix, display name) pairs for the sets to be evaluated.
# NOTE(review): the name `list` shadows the Python builtin; it is left unchanged so that
# any cell referring to it keeps working.
list = [(train_data, 'train'), (eval_data, 'valid')]

In [27]:
# Train the booster for at most 1000 rounds, stopping early once the metric on the
# last watched set ('valid') has not improved for 10 consecutive rounds.
# The watch-list is spelled out here instead of being read from a global called `list`:
# that name shadows the builtin, so on a fresh kernel where the defining cell has not
# run, the builtin type itself would be passed as `evals`.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=[(train_data, 'train'), (eval_data, 'valid')],
    early_stopping_rounds=10,
)

[16:18:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[0]	train-rmse:0.478157	valid-rmse:0.478636
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[16:18:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[1]	train-rmse:0.465495	valid-rmse:0.466397
[16:18:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[2]	train-rmse:0.457994	valid-rmse:0.459145
[16:18:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[3]	train-rmse:0.453083	valid-rmse:0.454668
[16:19:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[4]	train-rmse:0.449342	valid-rmse:0.451254
[16:19:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12

[16:19:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 82 extra nodes, 0 pruned nodes, max_depth=6
[52]	train-rmse:0.4216	valid-rmse:0.429076
[16:19:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[53]	train-rmse:0.421417	valid-rmse:0.428901
[16:19:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[54]	train-rmse:0.421341	valid-rmse:0.428904
[16:19:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[55]	train-rmse:0.420929	valid-rmse:0.428697
[16:19:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6
[56]	train-rmse:0.420793	valid-rmse:0.428585
[16:19:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 96 extra nodes, 0 pruned nodes, max_depth=6
[57]	train-rmse:0.42065	valid-rmse:0.428519
[16:19:27] src/tree/updater_prune.cc:74: tree pruning end, 

[16:19:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[105]	train-rmse:0.410683	valid-rmse:0.423735
[16:19:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[106]	train-rmse:0.410539	valid-rmse:0.423683
[16:19:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=6
[107]	train-rmse:0.410368	valid-rmse:0.423622
[16:19:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[108]	train-rmse:0.410268	valid-rmse:0.423546
[16:19:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[109]	train-rmse:0.410193	valid-rmse:0.42353
[16:19:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=6
[110]	train-rmse:0.410174	valid-rmse:0.423523
[16:19:54] src/tree/updater_prune.cc:74: tree pruni

[16:20:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[157]	train-rmse:0.402854	valid-rmse:0.420519
[16:20:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[158]	train-rmse:0.402609	valid-rmse:0.420397
[16:20:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=6
[159]	train-rmse:0.402537	valid-rmse:0.420395
[16:20:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[160]	train-rmse:0.402367	valid-rmse:0.420373
[16:20:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 0 pruned nodes, max_depth=6
[161]	train-rmse:0.402245	valid-rmse:0.420362
[16:20:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[162]	train-rmse:0.402182	valid-rmse:0.42036
[16:20:19] src/tree/updater_prune.cc:74: tree prun

[16:20:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=6
[209]	train-rmse:0.39652	valid-rmse:0.418792
[16:20:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[210]	train-rmse:0.39631	valid-rmse:0.418729
[16:20:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[211]	train-rmse:0.396037	valid-rmse:0.418612
[16:20:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[212]	train-rmse:0.395838	valid-rmse:0.418513
[16:20:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[213]	train-rmse:0.395797	valid-rmse:0.418509
[16:20:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 0 pruned nodes, max_depth=6
[214]	train-rmse:0.395669	valid-rmse:0.418471
[16:20:45] src/tree/updater_prune.cc:74: tree pru

[16:21:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=6
[261]	train-rmse:0.389892	valid-rmse:0.417003
[16:21:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[262]	train-rmse:0.389865	valid-rmse:0.416999
[16:21:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 0 pruned nodes, max_depth=6
[263]	train-rmse:0.389756	valid-rmse:0.416971
[16:21:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[264]	train-rmse:0.389604	valid-rmse:0.416943
[16:21:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[265]	train-rmse:0.389579	valid-rmse:0.416939
[16:21:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=6
[266]	train-rmse:0.389566	valid-rmse:0.416941
[16:21:10] src/tree/updater_prune.cc:74: tree pruni

[16:21:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[313]	train-rmse:0.383967	valid-rmse:0.41549
[16:21:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[314]	train-rmse:0.38391	valid-rmse:0.415496
[16:21:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 0 pruned nodes, max_depth=6
[315]	train-rmse:0.38375	valid-rmse:0.415469
[16:21:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=6
[316]	train-rmse:0.38372	valid-rmse:0.415463
[16:21:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 0 pruned nodes, max_depth=6
[317]	train-rmse:0.383639	valid-rmse:0.415438
[16:21:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[318]	train-rmse:0.383573	valid-rmse:0.41542
[16:21:36] src/tree/updater_prune.cc:74: tree pruning 

[16:21:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[365]	train-rmse:0.379256	valid-rmse:0.414514
[16:21:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=6
[366]	train-rmse:0.379245	valid-rmse:0.414517
[16:21:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[367]	train-rmse:0.379115	valid-rmse:0.414488
[16:22:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 0 pruned nodes, max_depth=6
[368]	train-rmse:0.378982	valid-rmse:0.41446
[16:22:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[369]	train-rmse:0.378842	valid-rmse:0.414445
[16:22:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[370]	train-rmse:0.378789	valid-rmse:0.414451
[16:22:01] src/tree/updater_prune.cc:74: tree prun

[16:22:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=6
[417]	train-rmse:0.374009	valid-rmse:0.413331
[16:22:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=6
[418]	train-rmse:0.373992	valid-rmse:0.413331
[16:22:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[419]	train-rmse:0.373974	valid-rmse:0.413341
[16:22:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=6
[420]	train-rmse:0.37396	valid-rmse:0.413343
[16:22:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[421]	train-rmse:0.373767	valid-rmse:0.413248
[16:22:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[422]	train-rmse:0.373597	valid-rmse:0.413208
[16:22:26] src/tree/updater_prune.cc:74: tree pruni

[16:22:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=6
[469]	train-rmse:0.369108	valid-rmse:0.412336
[16:22:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=6
[470]	train-rmse:0.369088	valid-rmse:0.412334
[16:22:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=6
[471]	train-rmse:0.369082	valid-rmse:0.412335
[16:22:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=6
[472]	train-rmse:0.369048	valid-rmse:0.412339
[16:22:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=6
[473]	train-rmse:0.369032	valid-rmse:0.412342
[16:22:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6
[474]	train-rmse:0.368934	valid-rmse:0.412312
[16:22:52] src/tree/updater_prune.cc:74: tree pruni

In [21]:
# Load the test arrays (question 1, question 2) from DATA_IN_PATH.
# np.load is given the file path rather than an already-open file object, so NumPy
# opens and closes the file itself and no file handle is left dangling.
test_q1_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q1_DATA_FILE))
test_q2_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q2_DATA_FILE))

In [22]:
# Pair the two test question arrays along a new axis 1, mirroring the training input layout.
test_pair_arrays = [test_q1_data, test_q2_data]
test_input = np.stack(test_pair_arrays, axis=1)

In [23]:
# Shape check of the stacked test pairs.
test_input.shape

(2345790, 2, 25)

In [25]:
# Reduce each test pair to one row the same way as for training (sum over axis 1),
# wrap the result in a DMatrix and score it with the trained booster.
test_features = test_input.sum(axis=1)
test_run = xgb.DMatrix(test_features)
test_predict = bst.predict(test_run)

In [26]:
# Build the submission table: one prediction per test_id.
# Fail early with a clear message if the predictions do not line up with the test rows.
if len(test_predict) != len(test_data):
    raise ValueError(
        'prediction count (%d) does not match number of test rows (%d)'
        % (len(test_predict), len(test_data))
    )

sub = pd.DataFrame({
    'test_id': test_data['test_id'],
    'is_duplicate': test_predict,
})

# Create the output directory if needed so to_csv does not fail when it is missing.
os.makedirs(DATA_OUT_PATH, exist_ok=True)
sub.to_csv(os.path.join(DATA_OUT_PATH, 'simple_xgb.csv'), index=False)