In [1]:
import json
import os
from os.path import expanduser

import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

try:
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # train_test_split lives in sklearn.model_selection since 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for legacy environments that predate sklearn.model_selection.
    from sklearn.cross_validation  import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
# Preprocessing limits
MAX_NB_WORDS = 200000  # passed to Tokenizer(num_words=...)
MAX_SEQUENCE_LENGTH = 25  # passed to pad_sequences(maxlen=...)

# Output files for the preprocessed arrays and the vocabulary metadata
# (written relative to the current working directory)
Q1_TRAINING_DATA_FILE = 'q1_train.npy'  # padded question1 sequences
Q2_TRAINING_DATA_FILE = 'q2_train.npy'  # padded question2 sequences
LABEL_TRAINING_DATA_FILE = 'label_train.npy'  # is_duplicate labels
NB_WORDS_DATA_FILE = 'nb_words.json'  # vocabulary and vocabulary size as JSON

In [3]:
# Location of the Kaggle "quora-question-pairs" download inside the home directory
quora_path = expanduser('~/.kaggle/competitions/quora-question-pairs/')
# Train / test CSV paths inside that directory
quora_train, quora_test = (
    os.path.join(quora_path, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Load the training pairs and discard every row that contains a missing value
df_train = pd.read_csv(quora_train, encoding='utf-8').dropna()

In [5]:
# Fit a single vocabulary over both question columns so that question1 and
# question2 share the same word -> index mapping.
questions = df_train.question1.tolist() + df_train.question2.tolist()

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
word_index = tokenizer.word_index

# Convert each question column into lists of word indices
question1_word_sequences, question2_word_sequences = (
    tokenizer.texts_to_sequences(question_column)
    for question_column in (df_train.question1, df_train.question2)
)

print("Words in index: %d" % len(word_index))

Words in index: 95595


In [6]:
# Target column from the training frame
is_duplicate = df_train['is_duplicate']

In [7]:
# Labels as a plain integer array
labels = np.array(is_duplicate, dtype=int)

# Pad (at the end) both question columns to MAX_SEQUENCE_LENGTH tokens
q1_data, q2_data = (
    pad_sequences(word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for word_sequences in (question1_word_sequences, question2_word_sequences)
)

print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)
Shape of label tensor: (404287,)


In [8]:
# Vocabulary metadata that is written to NB_WORDS_DATA_FILE further down:
# the word -> index mapping and the number of distinct words the tokenizer counted.
data_prepro_configs = dict(
    vocab=tokenizer.word_index,
    vocab_size=len(tokenizer.word_counts),
)

In [9]:
# Persist the preprocessed arrays.
# np.save is given the path (not an open() handle) so NumPy opens and closes
# the file itself; the original left three write handles unclosed. The file
# names already end in ".npy", so np.save does not alter them.
np.save(Q1_TRAINING_DATA_FILE, q1_data)
np.save(Q2_TRAINING_DATA_FILE, q2_data)
np.save(LABEL_TRAINING_DATA_FILE, labels)

In [10]:
# Store the vocabulary metadata as JSON next to the .npy arrays
with open(NB_WORDS_DATA_FILE, 'w') as config_file:
    config_file.write(json.dumps(data_prepro_configs))

In [11]:
# Reload the arrays written by the np.save cell.
# np.load is given the path (not an open() handle) so NumPy opens and closes
# the file itself; the original left three read handles unclosed.
q1_data = np.load(Q1_TRAINING_DATA_FILE)
q2_data = np.load(Q2_TRAINING_DATA_FILE)
labels = np.load(LABEL_TRAINING_DATA_FILE)

In [12]:
# Pair the two question matrices along a new axis 1:
# xTrain[:, 0, :] is q1_data and xTrain[:, 1, :] is q2_data.
yTrain = labels
xTrain = np.stack([q1_data, q2_data], axis=1)

In [14]:
# Hold out 20% of the pairs for validation (fixed seed for reproducibility).
# The split reads from the unsplit arrays (q1_data, q2_data, labels) instead of
# from xTrain/yTrain: the original overwrote its own inputs, so re-running the
# cell split the already-reduced training set again. On a first run the result
# is the same, because xTrain/yTrain are exactly this stack and these labels.
xTrain, xValid, yTrain, yValid = train_test_split(
    np.stack((q1_data, q2_data), axis=1),
    labels,
    test_size=0.2,
    random_state=4242,
)

In [15]:
import xgboost as xgb

In [16]:
# Parameters handed to the XGBoost model
params = {
    'objective': 'binary:logistic',  # logistic objective for the binary target
    'eval_metric': 'rmse',           # evaluate with root mean square error
}

In [17]:
# Wrap the feature arrays and labels in xgb.DMatrix objects for XGBoost.
# NOTE(review): index 0 on axis 1 is the q1_data slice of the earlier
# np.stack((q1_data, q2_data), axis=1), so question2 is not given to the
# model here -- confirm this is intended.
dTrain = xgb.DMatrix(xTrain[:,0,:], label=yTrain) # training data
dValid = xgb.DMatrix(xValid[:,0,:], label=yValid) # validation data


# (DMatrix, name) pairs: the datasets whose loss should be reported.
# NOTE(review): this name shadows the builtin `list`; once this cell has run,
# any later `list(...)` call in the same kernel (e.g. re-running the tokenizer
# cell) will fail. Rename it together with every place that reads it.
list = [(dTrain, 'train'), (dValid, 'valid')]

In [None]:
# Train the booster.
# `evals` and the round settings are passed by keyword: current XGBoost
# releases accept only `params` and `dtrain` positionally. The watch list is
# spelled out here instead of being read from a global named `list`, which
# shadows the builtin. With several entries in `evals`, early stopping
# monitors the last one ('valid') and stops after 10 rounds without improvement.
bst = xgb.train(
    params,
    dTrain,
    num_boost_round=1000,
    evals=[(dTrain, 'train'), (dValid, 'valid')],
    early_stopping_rounds=10,
)

In [None]:
# Validation accuracy of the trained booster.
# The original called `forest.score(yTrain, yValid)`: `forest` is never defined
# in this notebook and the two label arrays have different lengths. Instead,
# threshold the predictions at 0.5 and compare them with the held-out labels.
valid_pred = bst.predict(dValid)
valid_accuracy = np.mean((valid_pred > 0.5).astype(int) == yValid)
print("Accuracy: %f" % valid_accuracy)

In [None]:
# `dfTest` and `xTest` were never defined in this notebook, so build them here.
# Features must match what the booster was trained on: question1 only
# (xTrain[:, 0, :]), tokenized with the tokenizer fitted on the training
# questions and padded to MAX_SEQUENCE_LENGTH. Keep this in sync with the
# training features if those change.
dfTest = pd.read_csv(quora_test, encoding='utf-8')
# Missing questions become empty strings so every test_id still gets a prediction
test_q1_texts = dfTest['question1'].fillna('').astype(str)
xTest = pad_sequences(
    tokenizer.texts_to_sequences(test_q1_texts),
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

dTest = xgb.DMatrix(xTest)
pTest = bst.predict(dTest)

# Submission file: one prediction per test_id
sub = pd.DataFrame()
sub['test_id'] = dfTest['test_id']
sub['is_duplicate'] = pTest
sub.to_csv('simple_xgb.csv', index=False)

In [None]:
>> kaggle competitions submit -c quora-question-pairs -f simple_xgb.csv -m "Message"