In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost

In [None]:
# Load the previously generated feature sets (labels, shop features, time features)
month_label, shop_feature, time_feature = (
    pd.read_csv(csv_path)
    for csv_path in ('month_label.csv', 'shop_feature.csv', 'time_feature.csv')
)

In [None]:
# Fill the gaps in selected shop features with the mean of each column
for col_name in ('ads_times', 'cate_num', 'pid_num'):
    shop_feature[col_name] = shop_feature[col_name].fillna(shop_feature[col_name].mean())

In [None]:
# Split the 'dt' string into integer columns: characters 0-3 -> year, characters 5-6 -> month
# (assumes dt is formatted like 'YYYY-MM...' -- TODO confirm against time_feature.csv)
time_feature['year'] = time_feature['dt'].str[:4].astype(int)
time_feature['month'] = time_feature['dt'].str[5:7].astype(int)

In [None]:
# Seed both holiday columns with the first seven characters of 'dt';
# that prefix has the same shape as the keys of the holiday lookup tables.
time_feature['holidy'] = time_feature['dt'].str[:7]
time_feature['jd_holidy'] = time_feature['holidy']

In [None]:
# Per-month lookup table keyed by 'YYYY-MM'
# (presumably the number of holiday / rest days in that month -- TODO confirm)
holiday = {
    '2016-08': 8,
    '2016-09': 8,
    '2016-10': 13,
    '2016-11': 8,
    '2016-12': 9,
    '2017-01': 12,
    '2017-02': 9,
    '2017-03': 8,
    '2017-04': 11,
    '2017-05': 10,
    '2017-06': 8,
    '2017-07': 10,
}

In [None]:
# Per-month lookup table keyed by 'YYYY-MM'
# (presumably the number of JD promotion days in that month -- TODO confirm)
jd_holiday = {
    '2016-08': 0,
    '2016-09': 10,
    '2016-10': 0,
    '2016-11': 30,
    '2016-12': 3,
    '2017-01': 9,
    '2017-02': 0,
    '2017-03': 26,
    '2017-04': 21,
    '2017-05': 24,
    '2017-06': 11,
    '2017-07': 21,
}

In [None]:
# Generate the time features: replace each 'YYYY-MM' key with its value from the lookup tables
# (keys missing from a table become NaN)
time_feature = time_feature.assign(
    holidy=time_feature['holidy'].map(holiday),
    jd_holidy=time_feature['jd_holidy'].map(jd_holiday),
)

In [None]:
# Running month index: number of months counted from the start of 2016 (January 2016 -> 1)
time_feature['base_month'] = 12 * (time_feature['year'] - 2016) + time_feature['month']

In [None]:
# One-hot encode the calendar month into 'month_<n>' indicator columns.
# dtype is pinned to uint8 (the pandas < 2.0 default): pandas >= 2.0 defaults to bool,
# and bool columns mixed with numeric ones turn DataFrame.values into an object array.
month_dumm = pd.get_dummies(time_feature['month'], prefix='month', dtype=np.uint8)

In [None]:
# Append the month indicator columns to the time features (aligned on the row index)
time_feature = pd.concat((time_feature, month_dumm), axis='columns')

In [None]:
# Assemble the full dataset: time features, then shop features (by shop_id),
# then the monthly label (by shop_id and dt). Left joins keep every time-feature row,
# so rows without a matching label end up with label = NaN.
data = (
    time_feature
    .merge(shop_feature, on='shop_id', how='left')
    .merge(month_label, on=['shop_id', 'dt'], how='left')
)

In [None]:
# Model inputs: every column of data except the target and the raw date fields
feature = data.columns[~data.columns.isin(['label', 'year', 'dt'])].tolist()

In [None]:
# Rows with a known label form the training set; rows without one are the rows to predict.
# Explicit copies make both frames independent of `data`, so adding or overwriting
# columns on them later does not raise SettingWithCopyWarning.
train = data[data['label'].notnull()].copy()
test = data[data['label'].isnull()].copy()

In [None]:
# Build the xgboost matrices for training and for the rows to predict.
# NOTE(review): xgbVal is built from exactly the same rows as xgbTrain, so the
# 'eval' entry of the watchlist reports training error, not held-out error.
train_X = train[feature].values
train_y = train['label']
xgbTrain = xgboost.DMatrix(train_X, label=train_y, missing=np.nan)
xgbVal = xgboost.DMatrix(train_X, label=train_y, missing=np.nan)
watchlist = [(xgbTrain, 'train'), (xgbVal, 'eval')]
xgbtest = xgboost.DMatrix(test[feature].values, missing=np.nan)

In [None]:
# Custom objective / metric functions used in place of xgboost's built-in regression objective
def mapeObj(preds, dtrain):
    """MAPE-style custom objective for xgboost.

    Args:
        preds: array of current predictions.
        dtrain: object exposing ``get_label()`` (e.g. an xgboost DMatrix)
            that returns the true labels as an array.

    Returns:
        Tuple ``(grad, hess)``:
        grad = sign(pred - label) / |label|, the derivative of |pred - label| / |label|;
        hess = 1 / |pred - label|, a heuristic surrogate (the true second
        derivative of the absolute value is zero).

    Both denominators are clamped to a small positive epsilon so that a zero
    label or an exact prediction yields a large finite value instead of inf/nan.
    """
    eps = 1e-6  # lower bound for the denominators
    gaps = dtrain.get_label()
    residual = preds - gaps
    grad = np.sign(residual) / np.maximum(np.abs(gaps), eps)
    hess = 1 / np.maximum(np.abs(residual), eps)
    return grad, hess

def huber_approx_obj(preds, dtrain):
    """Pseudo-Huber custom objective for xgboost.

    Args:
        preds: array of current predictions.
        dtrain: object exposing ``get_label()`` that returns the true labels
            (drop the ``get_label()`` call when used through the sklearn API).

    Returns:
        Tuple ``(grad, hess)`` with, for residual r = pred - label and
        delta = 1e7: grad = r / sqrt(1 + (r/delta)^2) and
        hess = 1 / (1 + (r/delta)^2)^(3/2).
    """
    delta = 10000000  # transition scale between the quadratic and linear regimes
    residual = preds - dtrain.get_label()
    ratio_sq_plus_one = 1 + (residual / delta) ** 2
    root = np.sqrt(ratio_sq_plus_one)
    return residual / root, 1 / ratio_sq_plus_one / root

def log_cosh_obj(preds, dtrain):
    """Log-cosh custom objective for xgboost.

    Args:
        preds: array of current predictions.
        dtrain: object exposing ``get_label()`` that returns the true labels.

    Returns:
        Tuple ``(grad, hess)`` for the loss log(cosh(pred - label)):
        grad = tanh(x) and hess = 1 / cosh(x)^2.

    The hessian is computed through the identity 1/cosh(x)^2 == 1 - tanh(x)^2,
    because cosh(x) overflows to inf for large residuals while tanh saturates at +/-1.
    """
    x = preds - dtrain.get_label()
    grad = np.tanh(x)
    hess = 1 - grad ** 2
    return grad, hess

def fair_obj(preds, dtrain):
    """Fair-loss custom objective for xgboost.

    For residual x = pred - label and c = 100000 the returned derivatives are
    grad = c * x / (|x| + c) and hess = c^2 / (|x| + c)^2, i.e. those of the
    loss c^2 * (|x| / c - log(1 + |x| / c)).

    Args:
        preds: array of current predictions.
        dtrain: object exposing ``get_label()`` that returns the true labels.

    Returns:
        Tuple ``(grad, hess)``.
    """
    c = 100000
    residual = preds - dtrain.get_label()
    denom = np.abs(residual) + c
    return (c * residual) / denom, (c * c) / (denom * denom)

def wmaeEval(preds, dtrain):
    """Custom xgboost evaluation metric: total absolute error divided by the sum of labels.

    Args:
        preds: array of current predictions.
        dtrain: object exposing ``get_label()`` that returns the true labels.

    Returns:
        Tuple ``('error', value)`` in the (name, score) form used for custom metrics.
    """
    actual = dtrain.get_label()
    total_abs_error = np.sum(np.abs(preds - actual))
    label_total = np.sum(actual)
    return 'error', total_abs_error / label_total

In [None]:
# Booster hyper-parameters
# NOTE(review): 'silent' was superseded by 'verbosity' in newer xgboost releases --
# confirm against the installed version before changing it.
param = {
    'eta': 0.1,                # learning rate
    'max_depth': 8,
    'min_child_weight': 4,     # was misspelled 'mmin_child_weight', so it was never applied
    'subsample': 0.95,         # row sampling ratio per tree
    'colsample_bytree': 0.5,   # column sampling ratio per tree
    'silent': 1,
}

In [None]:
# Train for at most num_round boosting rounds with the fair-loss objective and the
# weighted-MAE metric, using an early-stopping patience of 100 rounds.
num_round = 500
modle = xgboost.train(
    params=param,
    dtrain=xgbTrain,
    num_boost_round=num_round,
    evals=watchlist,
    obj=fair_obj,
    feval=wmaeEval,
    early_stopping_rounds=100,
)

In [None]:
# Predict the label for the unlabeled rows with the trained booster
predsx = modle.predict(xgbtest)

In [None]:
# Store the predictions as the 'label' column of the test rows.
# NOTE(review): test is a boolean-mask selection of data; unless it was taken
# with .copy(), pandas may emit SettingWithCopyWarning on this assignment.
test['label']=predsx

In [None]:
# Keep only the identifying columns and the predicted label
result = test.loc[:, ['shop_id', 'dt', 'label']]

In [None]:
# Sum the predictions of the last three months per shop to obtain the 90-day sales forecast
sale_amt = result.groupby('shop_id', as_index=False)['label'].sum()
sale_amt.columns = ['shop_id', 'label']

In [None]:
# Write the per-shop totals to CSV without the index column and without a header row
sale_amt.to_csv('2017121102last.csv',index=False,header=False)