In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [6]:
path = r'round1_ijcai_18_train_20180301.txt'
train_data = pd.read_table(path, delim_whitespace=True, header=0)

In [28]:
train_data.shape

(478138, 27)

In [12]:
data_to_use =['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_gender_id',
       'user_age_level', 'user_occupation_id', 'user_star_level', 'context_page_id','shop_review_num_level',
              'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description']

In [13]:
train_x = train_data[data_to_use]
train_y = train_data['is_trade']

In [27]:
train_x.head(5)

Unnamed: 0,item_price_level,item_sales_level,item_collected_level,item_pv_level,user_gender_id,user_age_level,user_occupation_id,user_star_level,context_page_id,shop_review_num_level,shop_review_positive_rate,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description
0,3,3,4,14,1,3,5,3,6,4,1.0,2,1.0,1.0,1.0
1,3,3,4,14,0,2,5,6,1,4,1.0,2,1.0,1.0,1.0
2,3,3,4,14,0,3,5,4,1,4,1.0,2,1.0,1.0,1.0
3,3,3,4,14,1,4,5,6,16,4,1.0,2,1.0,1.0,1.0
4,3,3,4,14,0,2,5,1,1,4,1.0,2,1.0,1.0,1.0


In [44]:
train_data = train_x[:448138]
train_target = train_y[:448138]

valid_data = train_x[448138:]
valid_target = train_y[448138:]

In [46]:
train_data.columns

Index(['item_price_level', 'item_sales_level', 'item_collected_level',
       'item_pv_level', 'user_gender_id', 'user_age_level',
       'user_occupation_id', 'user_star_level', 'context_page_id',
       'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level',
       'shop_score_service', 'shop_score_delivery', 'shop_score_description'],
      dtype='object')

In [56]:
dtrain= xgb.DMatrix(train_data, label=train_target)
dvalid = xgb.DMatrix(valid_data)

params={'booster':'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth':7,
    'lambda':10,
    'subsample':0.75,
    'colsample_bytree':0.75,
    'min_child_weight':2,
    'eta': 0.020,
    'seed':0,
    'nthread':8,
     'silent':1}

watchlist = [(dtrain,'train')]

In [57]:
bst=xgb.train(params,dtrain,num_boost_round=300,evals=watchlist)

y_pred=bst.predict(dvalid)


[0]	train-auc:0.560586
[1]	train-auc:0.560586
[2]	train-auc:0.560586
[3]	train-auc:0.602611
[4]	train-auc:0.602611
[5]	train-auc:0.602611
[6]	train-auc:0.602611
[7]	train-auc:0.602973
[8]	train-auc:0.602973
[9]	train-auc:0.620537
[10]	train-auc:0.620118
[11]	train-auc:0.620118
[12]	train-auc:0.62152
[13]	train-auc:0.62152
[14]	train-auc:0.622145
[15]	train-auc:0.621723
[16]	train-auc:0.621723
[17]	train-auc:0.623112
[18]	train-auc:0.623488
[19]	train-auc:0.628037
[20]	train-auc:0.630229
[21]	train-auc:0.629203
[22]	train-auc:0.629033
[23]	train-auc:0.629221
[24]	train-auc:0.63015
[25]	train-auc:0.63015
[26]	train-auc:0.633747
[27]	train-auc:0.632307
[28]	train-auc:0.632495
[29]	train-auc:0.633932
[30]	train-auc:0.633094
[31]	train-auc:0.63514
[32]	train-auc:0.635066
[33]	train-auc:0.634739
[34]	train-auc:0.635067
[35]	train-auc:0.635038
[36]	train-auc:0.635035
[37]	train-auc:0.635595
[38]	train-auc:0.63613
[39]	train-auc:0.636133
[40]	train-auc:0.636028
[41]	train-auc:0.637964
[42]	tra

In [58]:
def logloss(y_label,y_prev):
    assert len(y_label)==len(y_prev)
    N = len(y_label)
    logloss = -(y_label*np.log(y_prev)+(1-y_label)*np.log(1-y_prev)).sum()/N
    return logloss

In [59]:
print(logloss(valid_target, y_pred))

0.08562151947437475


In [61]:
path_a = r'round1_ijcai_18_test_a_20180301.txt'
path_b = r'round1_ijcai_18_test_b_20180418.txt'
test_data_a = pd.read_table(path_a, delim_whitespace=True, header=0)
test_data_b = pd.read_table(path_b, delim_whitespace=True, header=0)
frames = [test_data_a, test_data_b]

In [62]:
test_data = pd.concat(frames)

In [66]:
test_data_final = test_data[data_to_use]
dtest = xgb.DMatrix(test_data_final)

In [67]:
y_final_prev = bst.predict(dtest)

In [70]:
y_final_prev

array([0.02456573, 0.03058962, 0.0299583 , ..., 0.02119252, 0.04408941,
       0.02029177], dtype=float32)

In [71]:
from pandas import DataFrame
result_y = DataFrame(y_final_prev, columns=['predicted_score'])
test_data_id = test_data['instance_id']

In [84]:
test_data_id.shape

(61259,)

In [85]:
final_result = DataFrame(test_data_id).join(result_y)

In [86]:
final_result.head(5)

Unnamed: 0,instance_id,predicted_score
0,2475218615076601065,0.024566
0,55144604125888,0.024566
1,398316874173557226,0.03059
1,221669167755726,0.03059
2,6586402638209028583,0.029958


In [91]:
np.savetxt('tianchi_ali.txt', final_result, fmt='%d %f', delimiter="\n", header="instance_id predicted_score")

In [95]:
test_b_final = test_data_b[data_to_use]
d_test_b = xgb.DMatrix(test_b_final )
y_final_prev_b = bst.predict(d_test_b)

result_y_b = DataFrame(y_final_prev_b, columns=['predicted_score'])
test_data_id_b = test_data_b['instance_id']
final_result_b = DataFrame(test_data_id_b).join(result_y_b)

In [96]:
np.savetxt('round1_ijcai_18_test_b_20180418.txt', final_result_a, fmt='%d %f', delimiter="\n", header="instance_id predicted_score")