In [None]:
# coding:utf-8
# created by Phoebe_px on 2017/6/26
import os
import pickle
import pandas as pd
from collections import defaultdict
from datetime import datetime,timedelta
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV

### 处理数据

In [2]:
#字符日期转换成datetime日期型
def convert2date(date):
    """Parse a ``YYYY-MM-DD`` value into a ``datetime``.

    The argument is converted with ``str()`` first, so non-string inputs are
    accepted.  A value that does not match the format yields
    ``np.datetime64('NaT')`` instead of raising.
    """
    text = str(date)
    try:
        parsed = datetime.strptime(text, '%Y-%m-%d')
    except ValueError:
        parsed = np.datetime64('NaT')
    return parsed

In [None]:
# EDA过后，将 rank 当作类别变量处理 
def convert_rank(rank):
    """Bucket a raw ``rank`` value into one of six category codes.

    3 -> 1, 5 -> 2, 7 -> 3, 9 -> 4, anything in (9, 15] -> 5 and every other
    value (including NaN) -> 6.
    """
    for raw_value, bucket in ((3, 1), (5, 2), (7, 3), (9, 4)):
        if rank == raw_value:
            return bucket
    return 5 if 9 < rank <= 15 else 6

In [3]:
#脏数据列，EDA 发现‘user_confirmtime’列存在部分数据值大于0，此处我将其直接取负；同样的处理对于’user_avgadvanceddate’列
def dirty_advaceddate(data):
    """Flip a negative value to positive; any other value is returned unchanged."""
    return -data if data < 0 else data
def dirty_confirmtime(data):
    """Flip a positive value to negative; any other value is returned unchanged."""
    return -data if data > 0 else data

### 清洗数据，增加特征

In [4]:
def clean_data(datadir,istrain=True):
    """Build the first-stage cleaned feature table, or load it from the cache.

    The raw tab-separated file at ``datadir`` is enriched with last-order
    comparison features, discount/price gaps and per-order minimum prices.
    Numeric columns are then standardised and the categorical columns are
    one-hot encoded.  The result is written to
    ``../dataset/cleandata/cleantrain.csv`` (``istrain=True``) or
    ``../dataset/cleandata/cleantest.csv`` (``istrain=False``); when that file
    already exists it is simply read back.

    The raw column names are pickled once to ``../dataset/origin_feature.pkl``.

    Parameters
    ----------
    datadir : str
        Path of the raw tab-delimited data file.
    istrain : bool, default True
        Selects which cache file is read or written.

    Returns
    -------
    pandas.DataFrame
        The cleaned table (freshly computed, or re-read from the cached CSV).
    """
    cleandir='../dataset/cleandata/'
    origin_feature_path='../dataset/origin_feature.pkl'
    cleanpath=cleandir+('cleantrain.csv' if istrain else 'cleantest.csv')

    # Remember the raw column names once; only the header row is needed for that.
    if not os.path.exists(origin_feature_path):
        origin_feature=pd.read_csv(datadir,delimiter='\t',nrows=0).columns
        with open(origin_feature_path,'wb') as fh:
            pickle.dump(origin_feature,fh)

    # Cache hit: test for the exact output file rather than counting directory
    # entries, and skip parsing the raw file altogether.
    if os.path.exists(cleanpath):
        return pd.read_csv(cleanpath)

    data=pd.read_csv(datadir,delimiter='\t')

    # days between this order and the previous order (NaN when a date cannot be parsed)
    order_day=pd.to_datetime(data['orderdate'].map(convert2date))
    last_day=pd.to_datetime(data['orderdate_lastord'].map(convert2date))
    data['diff_date']=(order_day-last_day).dt.days
    # rank is bucketed into six classes (3, 5, 7, 9, (9,15], everything else)
    data['rank']=data['rank'].map(convert_rank)
    data['rank_lastord']=data['rank_lastord'].map(convert_rank)
    # 1 when the candidate matches the previous order on rank / hotel / basic room / room / star
    data['rank_diff']=(data['rank']==data['rank_lastord']).map(int)
    data['hotel_diff']=(data.hotelid==data.hotelid_lastord).astype(int)
    data['basicroomid_diff']=(data.basicroomid==data.basicroomid_lastord).astype(int)
    data['roomid_diff']=(data.roomid==data.roomid_lastord).astype(int)
    data['star_diff']=(data['star']==data['star_lastord']).map(int)
    # dirty columns: force a single sign (see dirty_advaceddate / dirty_confirmtime)
    data['user_avgadvanceddate']=data['user_avgadvanceddate'].map(dirty_advaceddate)
    data['user_confirmtime']=data['user_confirmtime'].map(dirty_confirmtime)
    # discount ratio of the previous order, of this candidate, and their difference
    data['discount_lastord']=data['return_lastord']/data['price_last_lastord']
    data['discount']=data['returnvalue']/data['price_deduct']
    data['discount_gap']=data['discount']-data['discount_lastord']
    # difference in net price (price minus return) between candidate and previous order
    data['price_gap']=(data['price_deduct']-data['returnvalue'])-(data['price_last_lastord']-data['return_lastord'])
    # previous order: price above the hotel / basic-room minimum price
    data['pirce_hotelmin_last_gap']=(data['price_last_lastord'])-(data['hotel_minprice_lastord'])
    data['price_basicmin_last_gap']=(data['price_last_lastord'])-(data['basic_minprice_lastord'])
    # room-service differences versus the previous order
    for k in (2,3,4,5,6,8):
        col='roomservice_%d'%k
        data[col+'_gap']=data[col]-data[col+'_lastord']
    # room-tag differences; tags 4-6 are stored as an equality flag, not a difference
    for k in (2,3):
        col='roomtag_%d'%k
        data[col+'_gap']=data[col]-data[col+'_lastord']
    for k in (4,5,6):
        col='roomtag_%d'%k
        data[col+'_gap']=(data[col]==data[col+'_lastord']).map(int)
    # candidate price above the cheapest room of the same hotel / basic room within the order
    hotel_keys=['orderid','uid','hotelid']
    basic_keys=hotel_keys+['basicroomid']
    df_hotel_minprice=data[hotel_keys+['price_deduct']].groupby(hotel_keys,as_index=False).min().rename(columns={'price_deduct': 'hotel_minprice'})
    df_basic_minprice=data[basic_keys+['price_deduct']].groupby(basic_keys,as_index=False).min().rename(columns={'price_deduct': 'basic_minprice'})
    data=pd.merge(data,df_hotel_minprice,how='left',on=hotel_keys)
    data=pd.merge(data,df_basic_minprice,how='left',on=basic_keys)
    data['pirce_hotelmin_gap']=(data['price_deduct'])-(data['hotel_minprice'])
    data['price_basicmin_gap']=(data['price_deduct'])-(data['basic_minprice'])
    # string-typed columns
    strCols=['orderid','uid','orderdate','hotelid','basicroomid','roomid','orderid_lastord','orderdate_lastord','hotelid_lastord','roomid_lastord','basicroomid_lastord']
    # categorical features
    classCols=['star', 'rank','rank_diff','hotel_diff','basicroomid_diff','roomservice_1', 'roomservice_2', 'roomservice_3', 'roomservice_4', 'roomservice_5', 'roomservice_6', 'roomservice_7', 'roomservice_8', 'roomtag_1', 'roomtag_4', 'roomtag_5', 'roomtag_6','roomservice_2_lastord','roomservice_3_lastord','roomservice_4_lastord','roomservice_5_lastord','roomservice_6_lastord','roomservice_8_lastord','roomtag_4_lastord','roomtag_5_lastord','roomtag_6_lastord','star_lastord','rank_lastord','roomtag_4_gap','roomtag_5_gap','roomtag_6_gap']
    # discarded features
    delCols=['orderdate','orderid_lastord','orderbehavior_3_ratio_1month','orderbehavior_4_ratio_1month','orderbehavior_5_ratio_1month']
    # standard scaling of every remaining column; mean/std come from this file alone
    numCols=list(set(list(data.columns)).difference(set(strCols+classCols+delCols+['orderlabel'])))
    for co in numCols:
        data[co]=(data[co]-data[co].mean())/data[co].std(ddof=0)

    # one-hot encode (missing values become their own -1 level); the original
    # categorical columns are kept, and everything is concatenated in one pass
    dummies=[pd.get_dummies(data[co].fillna(-1), prefix=co) for co in classCols]
    data=pd.concat([data]+dummies,axis=1)

    if not os.path.isdir(cleandir):
        os.makedirs(cleandir)
    data.to_csv(cleanpath,index=None)
    return data

### 产生训练集、测试集

**Notes：** `train1，train2`分别是训练集中4月14-16日、4月18-20日的`[feature,label]`矩阵（DataFrame）,`train_val`是训练集中4月17日的`[feature,label]`矩阵，用于验证模型，评估准确性，`train_val_index`是对应的`['orderid','roomid','orderlabel']`,`test`是处理后的测试集的feature矩阵,`test_index`与test相对应的`['orderid','roomid']`

In [6]:
def gene_trainset():
    """Split the first-stage cleaned training table into train and validation sets.

    All positive rows are kept; negative rows are kept only when
    ``price_deduct`` lies in [-3, 3].  Rows dated 2013-04-14..16 form
    ``train1``, rows dated 2013-04-18..20 form ``train2`` and rows dated
    2013-04-17 form ``train_val``.  Identifier, raw categorical and discarded
    columns are dropped from the three feature tables.

    Results are cached as CSV files under ``../dataset/train_and_pred/`` and
    read back when ``train1.csv`` already exists.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train1, train2, train_val, train_val_index)`` where
        ``train_val_index`` holds ``['orderid','roomid','orderlabel']`` for the
        rows of ``train_val``.
    """
    if os.path.exists('../dataset/train_and_pred/train1.csv'):
        train1=pd.read_csv('../dataset/train_and_pred/train1.csv')
        train2=pd.read_csv('../dataset/train_and_pred/train2.csv')
        train_val=pd.read_csv('../dataset/train_and_pred/train_val.csv')
        # Read back the index file this function writes (the original read
        # 'new_train_val_index.csv', which this function never produces).
        train_val_index=pd.read_csv('../dataset/train_and_pred/train_val_index.csv')
        return train1,train2,train_val,train_val_index

    data=pd.read_csv('../dataset/cleandata/cleantrain.csv')
    pos=data.loc[data['orderlabel']==1]
    # Reduce negative samples
    neg=data.loc[(data['price_deduct']>=-3)&(data['price_deduct']<=3)&(data['orderlabel']==0)]
    data=pd.concat([pos,neg],axis=0,ignore_index=True)
    strCols=['orderid','uid','orderdate','hotelid','basicroomid','roomid','orderid_lastord','orderdate_lastord','hotelid_lastord','roomid_lastord','basicroomid_lastord']
    classCols=['star', 'rank','roomservice_1', 'roomservice_2', 'roomservice_3', 'roomservice_4', 'roomservice_5', 'roomservice_6', 'roomservice_7', 'roomservice_8', 'roomtag_1', 'roomtag_4', 'roomtag_5', 'roomtag_6','roomservice_2_lastord','roomservice_3_lastord','roomservice_4_lastord','roomservice_5_lastord','roomservice_6_lastord','roomservice_8_lastord','roomtag_4_lastord','roomtag_5_lastord','roomtag_6_lastord','star_lastord','rank_lastord','roomtag_4_gap','roomtag_5_gap','roomtag_6_gap']
    delCols=['orderdate','orderid_lastord','orderbehavior_3_ratio_1month','orderbehavior_4_ratio_1month','orderbehavior_5_ratio_1month']
    dropCols=strCols+classCols+delCols
    # split by order date into three parts; train_val is held out for evaluation
    train1=data.loc[data.orderdate.isin(['2013-04-14', '2013-04-15', '2013-04-16'])].drop(dropCols,axis=1)
    train2=data.loc[data.orderdate.isin(['2013-04-18', '2013-04-19', '2013-04-20'])].drop(dropCols,axis=1)
    train_val=data.loc[data.orderdate=='2013-04-17']

    # 'orderlabel' is included because evaluation() needs it, matching gene_trainset2()
    train_val_index=train_val[['orderid','roomid','orderlabel']].copy()
    train_val=train_val.drop(dropCols,axis=1)
    train1.to_csv('../dataset/train_and_pred/train1.csv',index=None)
    train2.to_csv('../dataset/train_and_pred/train2.csv',index=None)
    train_val.to_csv('../dataset/train_and_pred/train_val.csv',index=None)
    train_val_index.to_csv('../dataset/train_and_pred/train_val_index.csv',index=None)
    return train1,train2,train_val,train_val_index

In [7]:
def gene_testset():
    """Produce the test feature table and its ``['orderid','roomid']`` index.

    Rows of ``cleantest.csv`` whose ``price_deduct`` lies outside [-3, 3] are
    discarded, then identifier, raw categorical and discarded columns are
    dropped.  Both outputs are cached as CSV under
    ``../dataset/train_and_pred/`` and read back when ``test_feature.csv``
    already exists.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(features, index)``.
    """
    # Cached result available: just load it.
    if os.path.exists('../dataset/train_and_pred/test_feature.csv'):
        features=pd.read_csv('../dataset/train_and_pred/test_feature.csv')
        index=pd.read_csv('../dataset/train_and_pred/test_index.csv')
        return features,index

    strCols=['orderid','uid','orderdate','hotelid','basicroomid','roomid',
             'orderid_lastord','orderdate_lastord','hotelid_lastord',
             'roomid_lastord','basicroomid_lastord']
    classCols=['star', 'rank','roomservice_1', 'roomservice_2', 'roomservice_3',
               'roomservice_4', 'roomservice_5', 'roomservice_6', 'roomservice_7',
               'roomservice_8', 'roomtag_1', 'roomtag_4', 'roomtag_5', 'roomtag_6',
               'roomservice_2_lastord','roomservice_3_lastord','roomservice_4_lastord',
               'roomservice_5_lastord','roomservice_6_lastord','roomservice_8_lastord',
               'roomtag_4_lastord','roomtag_5_lastord','roomtag_6_lastord',
               'star_lastord','rank_lastord','roomtag_4_gap','roomtag_5_gap','roomtag_6_gap']
    delCols=['orderdate','orderid_lastord','orderbehavior_3_ratio_1month',
             'orderbehavior_4_ratio_1month','orderbehavior_5_ratio_1month']

    cleaned=pd.read_csv('../dataset/cleandata/cleantest.csv')
    kept=cleaned.loc[cleaned['price_deduct'].between(-3,3)]
    index=kept[['orderid','roomid']]
    features=kept.drop(strCols+classCols+delCols,axis=1)
    features.to_csv('../dataset/train_and_pred/test_feature.csv',index=None)
    index.to_csv('../dataset/train_and_pred/test_index.csv',index=None)
    return features,index

### 进一步增加特征，产生训练集、测试集 （feature - label ）

In [5]:
# further generate features
def _is_user_top_level(data, value_col, ratio_cols):
    """Return a 0/1 Series: 1 where ``value_col`` equals the position (within ``ratio_cols``) of the row's largest ratio."""
    level_of = dict((col, pos) for pos, col in enumerate(ratio_cols))
    # idxmax gives the column name of the largest ratio; names that are not in
    # ratio_cols (e.g. NaN) map to NaN and therefore never compare equal.
    top_level = data[ratio_cols].idxmax(axis=1).map(level_of)
    return (data[value_col] == top_level).astype(int)


def further_clean_data(datadir,istrain=True):
    """Build the second-stage feature table, or load it from the cache.

    Starting again from the raw tab-separated file at ``datadir``, this adds
    features comparing the candidate room with the user's historical averages
    (rank, star, room area, prices), two flags telling whether
    ``roomservice_4`` / ``roomservice_6`` equal the level with the user's
    largest ratio, and a rescaled, rounded ``basic_comment_ratio``.  Only the
    new columns plus ``orderid`` and ``roomid`` are kept.

    The result is written to ``../dataset/cleandata/cleantrain2.csv``
    (``istrain=True``) or ``cleantest2.csv`` (``istrain=False``); when that
    file already exists it is simply read back.

    Parameters
    ----------
    datadir : str
        Path of the raw tab-delimited data file.
    istrain : bool, default True
        Selects which cache file is read or written.

    Returns
    -------
    pandas.DataFrame
        The new feature columns together with ``orderid`` and ``roomid``.
    """
    cleandir = '../dataset/cleandata/'
    cleanpath = cleandir + ('cleantrain2.csv' if istrain else 'cleantest2.csv')

    # Cache hit: test for the exact output file.  The previous directory-size
    # test (<= 4 entries) was still true once all four cache files existed,
    # so the cached branch was never taken.
    if os.path.exists(cleanpath):
        return pd.read_csv(cleanpath)

    data = pd.read_csv(datadir, delimiter='\t')
    origin_feature = set(data.columns)

    # gap between rank and the user's rank ratio, and between star and the user's average star levels
    data['rank_avg_diff']=data['rank']-data['user_rank_ratio']
    data['star_avg_diff']=data['star']-data['user_avgstar']
    data['star_avggold_diff']=data['star']-data['user_avggoldstar']
    data['star_recommend_diff']=data['star']-data['user_avgrecommendlevel']
    # user's average room area versus the basic room's min / max area
    data['compare_basic_minarea']=data['user_avgroomarea']-data['basic_minarea']
    data['compare_basic_maxarea'] = data['basic_maxarea'] - data['user_avgroomarea']
    # whether roomservice_4 / roomservice_6 is the level with the user's largest ratio
    data['roomservice_4_ismax']=_is_user_top_level(
        data,'roomservice_4',
        ['user_roomservice_4_0ratio','user_roomservice_4_1ratio','user_roomservice_4_2ratio',
         'user_roomservice_4_3ratio','user_roomservice_4_4ratio','user_roomservice_4_5ratio'])
    data['roomservice_6_ismax']=_is_user_top_level(
        data,'roomservice_6',
        ['user_roomservice_6_0ratio', 'user_roomservice_6_1ratio', 'user_roomservice_6_2ratio'])
    # candidate price versus the user's holiday / workday / average / maximum prices
    data['compare_price_holiday']=data['price_deduct']-data['user_avgdealpriceholiday']
    data['compare_price_workday'] = data['price_deduct'] - data['user_avgdealpriceworkday']
    data['compare_price_aveprice']=data['price_deduct']-data['user_avgprice']
    data['compare_price_max']=data['user_maxprice']-data['price_deduct']
    # basic_comment_ratio: divide by the column maximum, then round to one
    # decimal; negative values are collapsed to -1
    data['basic_comment_ratio'] = data['basic_comment_ratio'] / data['basic_comment_ratio'].max()
    data['basic_comment_ratio'] = data['basic_comment_ratio'].apply(lambda p: -1 if p < 0 else round(p, 1))

    # New columns in creation order (a set difference would make the column
    # order of the output file vary from run to run).
    addCols=[co for co in data.columns if co not in origin_feature]+['basic_comment_ratio']
    newfeature=addCols+['orderid','roomid']
    # the two flags and the bucketed ratio are categorical, so they are not standardised
    notScaled=('roomservice_4_ismax','roomservice_6_ismax','basic_comment_ratio')
    for co in addCols:
        if co in notScaled:
            continue
        data[co] = (data[co] - data[co].mean()) / data[co].std(ddof=0)
    data=data[newfeature]

    if not os.path.isdir(cleandir):
        os.makedirs(cleandir)
    data.to_csv(cleanpath,index=None)
    return data

In [9]:
def gene_trainset2():
    """Build train/validation sets from both cleaned training tables.

    ``cleantrain.csv`` (without its ``basic_comment_ratio`` column) is
    left-joined with ``cleantrain2.csv`` on ``orderid`` and ``roomid``.  All
    positive rows are kept, negative rows only when ``price_deduct`` lies in
    [-3, 3].  Rows dated 2013-04-14..16 form ``train1``, 2013-04-18..20 form
    ``train2`` and 2013-04-17 forms ``train_val``.

    Outputs are cached as CSV under ``../dataset/train_and_pred/`` and read
    back when ``new_train1.csv`` already exists.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train1, train2, train_val, train_val_index)``; the index frame
        holds ``['orderid','roomid','orderlabel']`` of the validation rows.
    """
    # Cached result available: just load it.
    if os.path.exists('../dataset/train_and_pred/new_train1.csv'):
        train1 = pd.read_csv('../dataset/train_and_pred/new_train1.csv')
        train2 = pd.read_csv('../dataset/train_and_pred/new_train2.csv')
        train_val = pd.read_csv('../dataset/train_and_pred/new_train_val.csv')
        train_val_index = pd.read_csv('../dataset/train_and_pred/recent_train_val_index.csv')
        return train1, train2, train_val, train_val_index

    strCols = ['orderid', 'uid', 'orderdate', 'hotelid', 'basicroomid', 'roomid', 'orderid_lastord',
               'orderdate_lastord', 'hotelid_lastord', 'roomid_lastord', 'basicroomid_lastord']
    classCols = ['star', 'rank', 'roomservice_1', 'roomservice_2', 'roomservice_3', 'roomservice_4',
                 'roomservice_5', 'roomservice_6', 'roomservice_7', 'roomservice_8', 'roomtag_1', 'roomtag_4',
                 'roomtag_5', 'roomtag_6', 'roomservice_2_lastord', 'roomservice_3_lastord',
                 'roomservice_4_lastord', 'roomservice_5_lastord', 'roomservice_6_lastord', 'roomservice_8_lastord',
                 'roomtag_4_lastord', 'roomtag_5_lastord', 'roomtag_6_lastord', 'star_lastord', 'rank_lastord',
                 'roomtag_4_gap', 'roomtag_5_gap', 'roomtag_6_gap']
    delCols = ['orderdate', 'orderid_lastord', 'orderbehavior_3_ratio_1month', 'orderbehavior_4_ratio_1month',
               'orderbehavior_5_ratio_1month']
    unused = strCols + classCols + delCols

    first_stage = pd.read_csv('../dataset/cleandata/cleantrain.csv').drop('basic_comment_ratio', axis=1)
    second_stage = pd.read_csv('../dataset/cleandata/cleantrain2.csv')
    merged = pd.merge(first_stage, second_stage, how='left', on=['orderid', 'roomid'])

    # keep every positive row, but only the negatives inside the price window
    positives = merged.loc[merged['orderlabel'] == 1]
    negatives = merged.loc[merged['price_deduct'].between(-3, 3) & (merged['orderlabel'] == 0)]
    samples = pd.concat([positives, negatives], axis=0, ignore_index=True)

    early_days = samples.orderdate.isin(['2013-04-14', '2013-04-15', '2013-04-16'])
    late_days = samples.orderdate.isin(['2013-04-18', '2013-04-19', '2013-04-20'])
    held_out = samples.loc[samples.orderdate == '2013-04-17']

    train1 = samples.loc[early_days].drop(unused, axis=1)
    train2 = samples.loc[late_days].drop(unused, axis=1)
    train_val_index = held_out[['orderid', 'roomid', 'orderlabel']]
    train_val = held_out.drop(unused, axis=1)

    train1.to_csv('../dataset/train_and_pred/new_train1.csv', index=None)
    train2.to_csv('../dataset/train_and_pred/new_train2.csv', index=None)
    train_val.to_csv('../dataset/train_and_pred/new_train_val.csv', index=None)
    train_val_index.to_csv('../dataset/train_and_pred/recent_train_val_index.csv', index=None)
    return train1, train2, train_val, train_val_index

def gene_testset2():
    """Build the test feature table from both cleaned test tables.

    ``cleantest.csv`` (without its ``basic_comment_ratio`` column) is
    left-joined with ``cleantest2.csv`` on ``orderid`` and ``roomid``; rows
    whose ``price_deduct`` lies outside [-3, 3] are discarded and identifier,
    raw categorical and discarded columns are dropped.  Both outputs are
    cached as CSV under ``../dataset/train_and_pred/`` and read back when
    ``new_test_feature.csv`` already exists.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(features, index)`` where ``index`` holds ``['orderid','roomid']``.
    """
    # Cached result available: just load it.
    if os.path.exists('../dataset/train_and_pred/new_test_feature.csv'):
        features=pd.read_csv('../dataset/train_and_pred/new_test_feature.csv')
        index=pd.read_csv('../dataset/train_and_pred/new_test_index.csv')
        return features,index

    strCols=['orderid','uid','orderdate','hotelid','basicroomid','roomid',
             'orderid_lastord','orderdate_lastord','hotelid_lastord',
             'roomid_lastord','basicroomid_lastord']
    classCols=['star', 'rank','roomservice_1', 'roomservice_2', 'roomservice_3',
               'roomservice_4', 'roomservice_5', 'roomservice_6', 'roomservice_7',
               'roomservice_8', 'roomtag_1', 'roomtag_4', 'roomtag_5', 'roomtag_6',
               'roomservice_2_lastord','roomservice_3_lastord','roomservice_4_lastord',
               'roomservice_5_lastord','roomservice_6_lastord','roomservice_8_lastord',
               'roomtag_4_lastord','roomtag_5_lastord','roomtag_6_lastord',
               'star_lastord','rank_lastord','roomtag_4_gap','roomtag_5_gap','roomtag_6_gap']
    delCols=['orderdate','orderid_lastord','orderbehavior_3_ratio_1month',
             'orderbehavior_4_ratio_1month','orderbehavior_5_ratio_1month']

    first_stage=pd.read_csv('../dataset/cleandata/cleantest.csv').drop('basic_comment_ratio',axis=1)
    second_stage=pd.read_csv('../dataset/cleandata/cleantest2.csv')
    merged=pd.merge(first_stage,second_stage,how='left',on=['orderid', 'roomid'])
    kept=merged.loc[merged['price_deduct'].between(-3,3)]
    index=kept[['orderid','roomid']]
    features=kept.drop(strCols+classCols+delCols,axis=1)
    features.to_csv('../dataset/train_and_pred/new_test_feature.csv',index=None)
    index.to_csv('../dataset/train_and_pred/new_test_index.csv',index=None)
    return features,index

### GridSearchCV 对XGboost 调参，auc最大的参数训练模型

In [None]:
def xgbmodel_train(train):
    """Grid-search XGBoost hyper-parameters by 5-fold cross-validated ROC AUC.

    The best parameter set is printed and pickled to
    ``../result/<score>_do_train.pkl``.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature columns plus the binary ``orderlabel`` target column.

    Returns
    -------
    tuple
        ``(best_parameters, score)``: the winning parameter dict and its mean
        cross-validated AUC.
    """
    xgb_model = xgb.XGBClassifier()
    train_feature, train_label = train.drop('orderlabel', axis=1), train['orderlabel']

    parameters = {'nthread': [4],
                  'objective': ['binary:logistic'],
                  'learning_rate': [0.05,0.06,0.1],
                  'max_depth': [5, 6],
                  'min_child_weight': [1, 3],
                  'silent': [1],
                  'gamma': [0, 0.1],
                  'subsample': [0.6, 0.7, 0.8],
                  'colsample_bytree': [0.7, 0.5, 0.6],
                  'n_estimators': [5],
                  'missing': [-999],
                  'seed': [12455]}

    # Seed the shuffled folds so that repeated searches are comparable.
    folds = StratifiedKFold(train_label, n_folds=5, shuffle=True, random_state=12455)
    clf = GridSearchCV(xgb_model, parameters, n_jobs=1,
                       cv=folds,
                       scoring='roc_auc',
                       verbose=2, refit=True)

    clf.fit(train_feature, train_label)
    # Public attributes holding the best parameter set and its mean CV score.
    best_parameters, score = clf.best_params_, clf.best_score_
    print('AUC score:', score)
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))

    with open('../result/' + str(score) + '_do_train.pkl', 'wb') as fh:
        pickle.dump(best_parameters, fh)
    return best_parameters, score

### XGboost 训练模型

In [13]:
def xgbmodel(train,train_val_feature,train_val_index,test,test_index,i,eta,max_depth,rounds,min_child_weight,subsample,colsample_bytree):
    """Train (or load) XGBoost model number ``i`` and score the validation and test rows.

    When ``../model/new_and_scale_xgb_<i>.model`` does not exist, a booster is
    trained for ``rounds`` rounds on a 70/30 split of ``train`` and saved
    there; otherwise the saved booster is loaded.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature columns plus the ``orderlabel`` target.
    train_val_feature, test : pandas.DataFrame
        Feature tables to score.
    train_val_index, test_index : pandas.DataFrame
        Frames that receive the predictions.  They are modified in place: a
        column named ``'<i>_score'`` is added to each.
    i : int or str
        Model identifier used in the model file name and the score column.
    eta, max_depth, rounds, min_child_weight, subsample, colsample_bytree
        Learning rate, tree depth, number of boosting rounds and the remaining
        booster parameters of the same names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(test_index, train_val_index)`` — the same objects that were passed in.
    """
    score_col=str(i)+'_'+'score'
    model_path='../model/new_and_scale_xgb_'+str(i)+'.model'
    if not os.path.exists(model_path):
        train_feature,train_label=train.drop('orderlabel', axis=1), train['orderlabel']
        # weight positives by the negative/positive ratio of the full training table
        nu=train_label.values
        scale_pos_weight = (len(nu[nu == 0])) / float(len(nu[nu == 1]))
        # NOTE(review): the original dict also carried 'n_estimators', 'missing'
        # and 'early_stopping_rounds'.  Those are not booster parameters, so
        # xgb.train() never acted on them (early stopping in particular has to
        # be passed to xgb.train() as an argument).  They are dropped here,
        # which leaves training unchanged; switching early stopping on would
        # change the trained model and is left as a deliberate decision.
        parameters = {'nthread': 4,'objective': 'binary:logistic','learning_rate': eta,'max_depth': max_depth,
                      'min_child_weight': min_child_weight,'silent': 0,'gamma': 0,'subsample': subsample,
                      'colsample_bytree': colsample_bytree,'scale_pos_weight': scale_pos_weight,
                      'seed': 4789,'eval_metric':'auc'}
        X_train, X_test, y_train, y_test = train_test_split(train_feature,train_label, test_size=0.3, random_state=4789)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        # both sets are only monitored (AUC is reported for each round)
        evallist = [(dtest, 'eval'), (dtrain, 'train')]
        bst = xgb.train(parameters, dtrain,num_boost_round=rounds, evals=evallist)
        bst.save_model(model_path)
    else:
        bst = xgb.Booster({'nthread': 4})  # init model
        bst.load_model(model_path)  # load the previously trained model

    train_val_f=xgb.DMatrix(train_val_feature)
    test_f = xgb.DMatrix(test)
    train_val_index[score_col]=bst.predict(train_val_f)
    test_index[score_col] = bst.predict(test_f)
    return test_index,train_val_index

### 模型准确性评估

In [15]:
# 评测函数，data列=['orderid','roomid','orderlabel',score_name]
def evaluation(data,score_name):
    """Return the share of orders whose top-scored room is the booked room.

    For every ``orderid`` the row with the highest ``score_name`` is taken as
    the prediction and compared with the rows labelled ``orderlabel == 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``orderid``, ``roomid``, ``orderlabel`` and the score column.
    score_name : str
        Name of the column holding the model score.

    Returns
    -------
    float
        Hits divided by the number of labelled (order, room) pairs, or NaN
        when ``data`` contains no positive label at all.
    """
    # highest-scoring room of each order; only the two columns needed for the join
    predicted=(data.sort_values([score_name], ascending=False)
                   .drop_duplicates('orderid')[['orderid','roomid']]
                   .rename(columns={'roomid': 'pred'}))
    booked=data.loc[data['orderlabel']==1,['orderid','roomid']].rename(columns={'roomid': 'true'})
    result=pd.merge(booked, predicted, how='inner', on='orderid')
    if result.shape[0]==0:
        # nothing to score: the accuracy is undefined rather than a ZeroDivisionError
        return float('nan')
    tp=(result['true']==result['pred']).sum()
    # float() keeps this a true division on Python 2 as well
    return float(tp)/result.shape[0]

### 特征重要性分析

In [16]:
# 使用seaborn 作图查看重要性特征，XGboost.plot_importance()也可以
import seaborn as sns
def ceate_feature_map(features, fmap_path='../model/xgb.fmap'):
    """Write a feature-map file with one ``<index>\\t<name>\\tq`` line per feature.

    Parameters
    ----------
    features : iterable of str
        Feature names; each is numbered from 0 in iteration order.
    fmap_path : str, optional
        Destination file.  Defaults to ``'../model/xgb.fmap'``, the path that
        plot_feature_importance() reads.
    """
    # 'with' guarantees the file is closed even if a write fails
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
def  plot_feature_importance(i,j):
    """Save and plot the normalised feature importance of saved model ``i``.

    The booster is loaded from ``../model/new_and_scale_xgb_<i>.model`` and its
    f-scores are read using the feature map ``../model/xgb.fmap``.  The scores
    are normalised to sum to 1, written to
    ``../model/feat_importance_<i>.csv`` and the ``j`` highest ones are drawn
    as a bar plot saved to ``../model/images_feature_importance/<i>.png``.

    Parameters
    ----------
    i : int or str
        Model identifier used in the file names.
    j : int
        Number of top features to plot.
    """
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model('../model/new_and_scale_xgb_' + str(i) + '.model')
    importance = bst.get_fscore(fmap='../model/xgb.fmap')
    importance = sorted(importance.items(), key=lambda d:d[1], reverse = True)
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv('../model/feat_importance_'+str(i)+'.csv', index=False)
    # Positional slice: exactly j rows.  The former label-based `.ix[:j]` was
    # inclusive (j+1 rows) and `.ix` no longer exists in current pandas.
    ax=sns.barplot(x='fscore',y='feature',data=df.iloc[:j,:])
    ax.figure.savefig('../model/images_feature_importance/'+str(i)+'.png')

In [None]:
if __name__=='__main__':
    # Build (or load from cache) the cleaned feature tables for both data sets.
    clean_data('../dataset/competition_train.txt')
    clean_data('../dataset/competition_test.txt',istrain=False)
    further_clean_data('../dataset/competition_train.txt')
    further_clean_data('../dataset/competition_test.txt',istrain=False)
    # Load the data.  The results of gene_trainset()/gene_testset() are
    # overwritten by the *2 variants on the next lines; the calls still
    # write their CSV files when those are missing.
    train1,train2,train_val,train_val_index_origin=gene_trainset()
    train1,train2,train_val,train_val_index_origin=gene_trainset2()
    test,test_index_origin=gene_testset()
    test,test_index_origin=gene_testset2()
    feature_name=list(train1.columns)
    feature_name.remove('orderlabel')
    # give the test features the same column order as the training features
    test=test[feature_name]
    #train_val=train_val.fillna(-999)
    #xgbmodel_train(train_val)
    train_val_feature,train_val_label=train_val.drop('orderlabel', axis=1),train_val['orderlabel']

    # Parameter setting of the best submission (online score = 0.458311).
    # NOTE: `train` includes train_val, so the validation accuracy printed
    # below is measured on rows the model was trained on.
    train=pd.concat([train1,train2,train_val],axis=0,ignore_index=True)
    test_index,train_val_index=xgbmodel(train,train_val_feature,train_val_index_origin,test,test_index_origin,1,0.1,6,1200,3,0.7,0.5)
    with open('../result/new_and_scale_train_val_index_1.pkl','wb') as fh:
        pickle.dump(train_val_index,fh)
    with open('../result/new_and_scale_test_index_1.pkl','wb') as fh:
        pickle.dump(test_index,fh)
    test_index.to_csv('../result/new_and_scale_test_index_1.csv', index=False)
    train_val_index.to_csv('../result/new_and_scale_train_val_index_1.csv', index=False)
    # ASCII parentheses: the original full-width ones are a syntax error
    print(evaluation(train_val_index,'1_score'))  #Validation set accuracy
    # Write the final prediction: the highest-scoring room of every order.
    test_index.sort_values(['1_score'], ascending=False).drop_duplicates('orderid').drop(['1_score'],axis=1).to_csv('../result/new_and_scale_1_result.csv',index=None)
    ceate_feature_map(feature_name)
    plot_feature_importance(1,30)  # plot the top 30 features