In [1]:
import pandas as pd
import numpy as np

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import os
from os.path import expanduser

import json

from sklearn.cross_validation  import train_test_split



In [2]:
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25

Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

In [3]:
quora_path = expanduser('~/.kaggle/competitions/quora-question-pairs/')
quora_train = os.path.join(quora_path, 'train.csv')
quora_test = os.path.join(quora_path, 'test.csv')

In [4]:
df_train = pd.read_csv(quora_train, encoding='utf-8')
df_train = df_train.dropna() #drop empty

In [5]:
df_test = pd.read_csv(quora_test, encoding='utf-8')
df_test = df_test.dropna() #drop empty

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
questionsTrain = list(df_train['question1']) + list(df_train['question2'])
tokenizerTrain = Tokenizer(num_words=MAX_NB_WORDS)
tokenizerTrain.fit_on_texts(questionsTrain)
word_index = tokenizerTrain.word_index

question1WordSequencesTrain = tokenizerTrain.texts_to_sequences(df_train.question1)
question2WordSequencesTrain = tokenizerTrain.texts_to_sequences(df_train.question2)

print("Words in index: %d" % len(word_index))

Words in index: 95595


In [7]:
questionsTest = list(df_test['question1']) + list(df_test['question2'])
tokenizerTest = Tokenizer(num_words=MAX_NB_WORDS)
tokenizerTest.fit_on_texts(questionsTest)
word_index = tokenizerTest.word_index

question1WordSequencesTest = tokenizerTest.texts_to_sequences(df_test.question1)
question2WordSequencesTest = tokenizerTest.texts_to_sequences(df_test.question2)

print("Words in index: %d" % len(word_index))

Words in index: 101312


In [8]:
is_duplicateTrain = df_train.is_duplicate

In [9]:
q1_dataTrain = pad_sequences(question1WordSequencesTrain, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
q2_dataTrain = pad_sequences(question2WordSequencesTrain, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = np.array(is_duplicateTrain, dtype=int)
print('Shape of question1 data tensor:', q1_dataTrain.shape)
print('Shape of question2 data tensor:', q2_dataTrain.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)
Shape of label tensor: (404287,)


In [10]:
q1_dataTest = pad_sequences(question1WordSequencesTest, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
q2_dataTest = pad_sequences(question2WordSequencesTest, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print('Shape of question1 data tensor:', q1_dataTest.shape)
print('Shape of question2 data tensor:', q2_dataTest.shape)

Shape of question1 data tensor: (3563466, 25)
Shape of question2 data tensor: (3563466, 25)


In [11]:
data_prepro_configs = {'vocab': tokenizerTrain.word_index, 'vocab_size': len(tokenizerTrain.word_counts)}

In [12]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), q1_dataTrain)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), q2_dataTrain)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels)

In [13]:
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump(data_prepro_configs, f)

In [14]:
q1_data = np.load(open(Q1_TRAINING_DATA_FILE, 'rb'))
q2_data = np.load(open(Q2_TRAINING_DATA_FILE, 'rb'))
labels = np.load(open(LABEL_TRAINING_DATA_FILE, 'rb'))

In [15]:
xTrain = np.stack((q1_data, q2_data), axis=1) 
yTrain = labels

In [16]:
xTrain, xValid, yTrain, yValid = train_test_split(xTrain, yTrain, test_size=0.2, random_state=4242)

In [17]:
import xgboost as xgb

In [18]:
params = {} # 파라미터를 통해 XGB모델에 넣어 주자 
params['objective'] = 'binary:logistic' # 로지스틱 예측을 통해서 
params['eval_metric'] = 'rmse' # root mean square error를 사용 

In [19]:
# XGB에서 DMatrix는 텍스트 입력 형식 지원
dTrain = xgb.DMatrix(xTrain[:,0,:], label=yTrain) # 학습 데이터 읽어 오기
dValid = xgb.DMatrix(xValid[:,0,:], label=yValid) # 평가 데이터 읽어 오기

# 손실을 보기 위해 손실 값을 측정하기 위해 값을 쌍으로 넣는다. 
list = [(dTrain, 'train'), (dValid, 'valid')]

In [20]:
bst = xgb.train(params, dTrain, 1000, list, early_stopping_rounds=10) # 학습

[0]	train-rmse:0.481806	valid-rmse:0.482044
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[1]	train-rmse:0.471209	valid-rmse:0.471745
[2]	train-rmse:0.464538	valid-rmse:0.46544
[3]	train-rmse:0.460162	valid-rmse:0.46139
[4]	train-rmse:0.457468	valid-rmse:0.458893
[5]	train-rmse:0.455024	valid-rmse:0.456556
[6]	train-rmse:0.453488	valid-rmse:0.455215
[7]	train-rmse:0.452004	valid-rmse:0.453863
[8]	train-rmse:0.451075	valid-rmse:0.453064
[9]	train-rmse:0.449758	valid-rmse:0.451913
[10]	train-rmse:0.449161	valid-rmse:0.45142
[11]	train-rmse:0.448302	valid-rmse:0.450652
[12]	train-rmse:0.44642	valid-rmse:0.448803
[13]	train-rmse:0.445608	valid-rmse:0.448078
[14]	train-rmse:0.445248	valid-rmse:0.447842
[15]	train-rmse:0.444906	valid-rmse:0.447605
[16]	train-rmse:0.444447	valid-rmse:0.447267
[17]	train-rmse:0.443626	valid-rmse:0.446542
[18]	train-rmse:0.443237	valid-rmse:0.446238
[19]	train-rms

[179]	train-rmse:0.40941	valid-rmse:0.425217
[180]	train-rmse:0.409402	valid-rmse:0.425219
[181]	train-rmse:0.409395	valid-rmse:0.425218
[182]	train-rmse:0.409109	valid-rmse:0.425013
[183]	train-rmse:0.408943	valid-rmse:0.424988
[184]	train-rmse:0.408748	valid-rmse:0.424868
[185]	train-rmse:0.408474	valid-rmse:0.424706
[186]	train-rmse:0.408323	valid-rmse:0.424656
[187]	train-rmse:0.408169	valid-rmse:0.424603
[188]	train-rmse:0.408083	valid-rmse:0.424597
[189]	train-rmse:0.408006	valid-rmse:0.424576
[190]	train-rmse:0.40793	valid-rmse:0.424557
[191]	train-rmse:0.40789	valid-rmse:0.424549
[192]	train-rmse:0.407772	valid-rmse:0.424481
[193]	train-rmse:0.407729	valid-rmse:0.424472
[194]	train-rmse:0.407712	valid-rmse:0.424471
[195]	train-rmse:0.407606	valid-rmse:0.424444
[196]	train-rmse:0.407434	valid-rmse:0.424382
[197]	train-rmse:0.407318	valid-rmse:0.42434
[198]	train-rmse:0.407199	valid-rmse:0.424307
[199]	train-rmse:0.406974	valid-rmse:0.424184
[200]	train-rmse:0.406865	valid-rmse:0

[359]	train-rmse:0.390753	valid-rmse:0.417868
[360]	train-rmse:0.390686	valid-rmse:0.417863
[361]	train-rmse:0.390635	valid-rmse:0.417863
[362]	train-rmse:0.390575	valid-rmse:0.417857
[363]	train-rmse:0.390523	valid-rmse:0.417858
[364]	train-rmse:0.390404	valid-rmse:0.417833
[365]	train-rmse:0.390285	valid-rmse:0.417763
[366]	train-rmse:0.390122	valid-rmse:0.41769
[367]	train-rmse:0.390077	valid-rmse:0.417697
[368]	train-rmse:0.390045	valid-rmse:0.417696
[369]	train-rmse:0.389991	valid-rmse:0.417683
[370]	train-rmse:0.389797	valid-rmse:0.417617
[371]	train-rmse:0.389709	valid-rmse:0.417581
[372]	train-rmse:0.38947	valid-rmse:0.417452
[373]	train-rmse:0.389345	valid-rmse:0.41738
[374]	train-rmse:0.389242	valid-rmse:0.417359
[375]	train-rmse:0.389132	valid-rmse:0.417316
[376]	train-rmse:0.388974	valid-rmse:0.417246
[377]	train-rmse:0.388905	valid-rmse:0.41722
[378]	train-rmse:0.388853	valid-rmse:0.417204
[379]	train-rmse:0.388796	valid-rmse:0.417183
[380]	train-rmse:0.388707	valid-rmse:0

[538]	train-rmse:0.376109	valid-rmse:0.4131
[539]	train-rmse:0.375931	valid-rmse:0.413066
[540]	train-rmse:0.375901	valid-rmse:0.413053
[541]	train-rmse:0.375751	valid-rmse:0.413023
[542]	train-rmse:0.375572	valid-rmse:0.41301
[543]	train-rmse:0.3755	valid-rmse:0.412989
[544]	train-rmse:0.375468	valid-rmse:0.412983
[545]	train-rmse:0.37539	valid-rmse:0.412955
[546]	train-rmse:0.375239	valid-rmse:0.412889
[547]	train-rmse:0.375125	valid-rmse:0.412858
[548]	train-rmse:0.374985	valid-rmse:0.412791
[549]	train-rmse:0.3749	valid-rmse:0.412779
[550]	train-rmse:0.374841	valid-rmse:0.412769
[551]	train-rmse:0.374727	valid-rmse:0.412732
[552]	train-rmse:0.374653	valid-rmse:0.41271
[553]	train-rmse:0.374591	valid-rmse:0.41269
[554]	train-rmse:0.374576	valid-rmse:0.412693
[555]	train-rmse:0.374451	valid-rmse:0.412663
[556]	train-rmse:0.374323	valid-rmse:0.412637
[557]	train-rmse:0.374214	valid-rmse:0.412595
[558]	train-rmse:0.37418	valid-rmse:0.412604
[559]	train-rmse:0.37406	valid-rmse:0.412572


[718]	train-rmse:0.363288	valid-rmse:0.409457
[719]	train-rmse:0.36321	valid-rmse:0.409421
[720]	train-rmse:0.363185	valid-rmse:0.409405
[721]	train-rmse:0.363099	valid-rmse:0.40937
[722]	train-rmse:0.36305	valid-rmse:0.409368
[723]	train-rmse:0.363014	valid-rmse:0.409377
[724]	train-rmse:0.362966	valid-rmse:0.409368
[725]	train-rmse:0.362795	valid-rmse:0.409321
[726]	train-rmse:0.36269	valid-rmse:0.409302
[727]	train-rmse:0.362586	valid-rmse:0.409272
[728]	train-rmse:0.362518	valid-rmse:0.409246
[729]	train-rmse:0.362446	valid-rmse:0.409219
[730]	train-rmse:0.362312	valid-rmse:0.409181
[731]	train-rmse:0.362186	valid-rmse:0.40915
[732]	train-rmse:0.362065	valid-rmse:0.409117
[733]	train-rmse:0.361966	valid-rmse:0.409072
[734]	train-rmse:0.361808	valid-rmse:0.409011
[735]	train-rmse:0.361667	valid-rmse:0.408974
[736]	train-rmse:0.361533	valid-rmse:0.408935
[737]	train-rmse:0.361499	valid-rmse:0.408943
[738]	train-rmse:0.361468	valid-rmse:0.408948
[739]	train-rmse:0.361321	valid-rmse:0.

In [21]:
xTest = np.stack((q1_dataTest, q2_dataTest), axis=1) 

In [22]:
xTest.shape

(3563466, 2, 25)

In [23]:
dTest = xgb.DMatrix(xTest[:,0,:])
pTest = bst.predict(dTest)

In [24]:
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = pTest
sub.to_csv('simple_xgb.csv', index=False)