In [1]:
import pandas as pd 
import numpy as np 
import os 
import json 
from sklearn.model_selection import train_test_split

In [2]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'

train_q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
train_q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
train_labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))

In [3]:
train_input = np.stack((train_q1_data, train_q2_data), axis = 1)
#넘파이 스택은 콘케터네이션이랑은 다른데 이 축을 새로 생성해서 붙인다는것이다. 

In [4]:
print(train_q1_data.shape, train_q2_data.shape)

(298526, 31) (298526, 31)


In [5]:
print(train_input.shape)
#axis 1 에 해당하는 축을 새로 만들어 붙인것임 

(298526, 2, 31)


In [6]:
print(train_input[0])

[[    4    21     1    87 22993   229    84     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    2    11     1   512    10     1    87  7287   229     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]]


In [8]:
train_input, eval_input, train_label, eval_label = train_test_split(
    train_input, train_labels, test_size = 0.2, random_state=4242)

# 설치
- (lstm-env) $ conda install -c anaconda py-xgboost

In [9]:
import xgboost as xgb
#예는 입력데이터를 d모델(콘다에서 운영)로 바꿔줘야 들어간다.

In [10]:
train_data = xgb.DMatrix(train_input.sum(axis =1), label=train_label)
eval_data = xgb.DMatrix(eval_input.sum(axis =1), label=eval_label)

data_list = [(train_data, 'train'), (eval_data, 'valid')]


In [12]:
print(data_list[0])
#DMatrix객체가 들어간다.

(<xgboost.core.DMatrix object at 0x0000024C192D9520>, 'train')


In [14]:
#하이퍼파라메팅
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'rmse' #rmse는 root mean square error

bst = xgb.train(params, train_data, num_boost_round = 1000, evals = data_list,
               early_stopping_rounds = 10)

#num_boost_round는 몇번 반복할 것이냐를 지정해주는 부분임 
#evals 는 검증시에 사용할 데이터
#early stopping round는 몇회이상 향상이 안될경우 그만둠

[0]	train-rmse:0.48366	valid-rmse:0.48404
[1]	train-rmse:0.47329	valid-rmse:0.47416
[2]	train-rmse:0.46646	valid-rmse:0.46766
[3]	train-rmse:0.46192	valid-rmse:0.46333
[4]	train-rmse:0.45839	valid-rmse:0.46016
[5]	train-rmse:0.45574	valid-rmse:0.45771
[6]	train-rmse:0.45351	valid-rmse:0.45574
[7]	train-rmse:0.45162	valid-rmse:0.45393
[8]	train-rmse:0.45000	valid-rmse:0.45248
[9]	train-rmse:0.44889	valid-rmse:0.45152
[10]	train-rmse:0.44726	valid-rmse:0.45007
[11]	train-rmse:0.44587	valid-rmse:0.44889
[12]	train-rmse:0.44515	valid-rmse:0.44829
[13]	train-rmse:0.44263	valid-rmse:0.44591
[14]	train-rmse:0.44207	valid-rmse:0.44549
[15]	train-rmse:0.44164	valid-rmse:0.44515
[16]	train-rmse:0.44120	valid-rmse:0.44484
[17]	train-rmse:0.43968	valid-rmse:0.44342
[18]	train-rmse:0.43914	valid-rmse:0.44307
[19]	train-rmse:0.43890	valid-rmse:0.44293
[20]	train-rmse:0.43724	valid-rmse:0.44143
[21]	train-rmse:0.43680	valid-rmse:0.44109
[22]	train-rmse:0.43605	valid-rmse:0.44061
[23]	train-rmse:0.435

In [15]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE,'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE,'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE,'rb'), 
                      allow_pickle=True)


In [16]:
test_input = np.stack((test_q1_data, test_q2_data), axis = 1)
test_data = xgb.DMatrix(test_input.sum(axis=1))
test_predict = bst.predict(test_data)

In [30]:
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)
    
output = pd.DataFrame({'test_id':test_id_data, 'is_duplicate':test_predict})
output.to_csv(DATA_OUT_PATH + 'simple_xgb.csv', index=False)

ValueError: All arrays must be of the same length

In [31]:
len(test_id_data)

2345796

In [32]:
len(test_predict)

298526

In [28]:
test_id_data[:3]

array([0, 1, 2], dtype=object)

In [29]:
test_predict[:3]

array([0.13279204, 0.01544462, 0.09106369], dtype=float32)