In [1]:
import ccf2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import warnings
import lightgbm as lgb
import xgboost
from sklearn.model_selection import KFold,StratifiedKFold,train_test_split,cross_validate
from sklearn.metrics import mean_squared_error as mse
from sklearn.grid_search import GridSearchCV
warnings.filterwarnings('ignore')

path = '../input/'



# load data

In [2]:
data = ccf2.loaddata(path)

In [3]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,mt
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,292.0,1
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,466.0,1
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,257.0,1
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,408.0,1
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,610.0,1


# feature

## 构建特征

### 平移12特征

In [3]:
def genShitFeat(date,shift_list):
    shift_feat = []
    data['model_adcode'] = data['adcode'] + data['model']
    data['model_adcode_mt'] = data['model_adcode'] * 100 + data['mt']
    for i in shift_list:  ## 平移12个月
        shift_feat.append('shift_model_adcode_mt_label_{0}'.format(i))
        data['model_adcode_mt_{0}'.format(i)] = data['model_adcode_mt'] + i
        data_last = data[~data.label.isnull()].set_index('model_adcode_mt_{0}'.format(i))
        data['shift_model_adcode_mt_label_{0}'.format(i)] = data['model_adcode_mt'].map(data_last['label'])
    return data,shift_feat
# num_feat = ['regYear'] + shift_feat
# cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

# features = num_feat + cate_feat

### 分组特征

In [6]:
def genStatFeat(data,fea_list):
    df = data.copy()
    print('统计列的sum,mean,max,min,分位数0.2,分位数0.5,分位数0.8')
    stat_feat = []
    for f in fea_list:
        print('构造特征:',f)
        g1 = df.groupby([f])
        df1 = g1.agg({'label':["sum","mean","max","min"]})
        df1.columns = [f+'_sum',f+'_mean',f+'_max',f+'_mim']
        df1['%s_median2' % f] = g1['label'].quantile(0.2)
        df1['%s_median5' % f] = g1['label'].quantile(0.5)
        df1['%s_median8' % f] = g1['label'].quantile(0.8)
        df1.reset_index(inplace=True)
        df = df.merge(df1,'left',on=[f])
        stat_feat = stat_feat+list(df1.columns)
    return df,stat_feat

In [7]:
fea_list = ['adcode','bodyType','model','regMonth','mt']
data,stat_feat = genStatFeature(data,fea_list)

统计列的sum,mean,max,min,分位数0.2,分位数0.5,分位数0.8
构造特征: adcode
构造特征: bodyType
构造特征: model
构造特征: regMonth
构造特征: mt


In [9]:
data.columns

Index(['adcode', 'bodyType', 'id', 'model', 'province', 'regMonth', 'regYear',
       'popularity', 'carCommentVolum', 'newsReplyVolum', 'label', 'mt',
       'model_adcode', 'model_adcode_mt', 'model_adcode_mt_12',
       'shift_model_adcode_mt_label_12', 'adcode_sum', 'adcode_mean',
       'adcode_max', 'adcode_mim', 'adcode_median2', 'adcode_median5',
       'adcode_median8', 'bodyType_sum', 'bodyType_mean', 'bodyType_max',
       'bodyType_mim', 'bodyType_median2', 'bodyType_median5',
       'bodyType_median8', 'model_sum', 'model_mean', 'model_max', 'model_mim',
       'model_median2', 'model_median5', 'model_median8', 'regMonth_sum',
       'regMonth_mean', 'regMonth_max', 'regMonth_mim', 'regMonth_median2',
       'regMonth_median5', 'regMonth_median8', 'mt_sum', 'mt_mean', 'mt_max',
       'mt_mim', 'mt_median2', 'mt_median5', 'mt_median8'],
      dtype='object')

In [10]:
cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
num_feat = ['regYear'] + shift_feat
features = cate_feat+num_feat+stat_feat

## label

In [11]:
import math
# data['model_weight'] = data.groupby('model')['label'].transform('mean') 
# data['n_label'] = data['label'] / data['model_weight'] 
data['log_label'] = data['label'].apply(lambda x: math.log(x))

# 构建模型

## 拆分数据

In [12]:
train_idx = (data['mt'] <= 24)
test_idx = (data['mt'] > 24)

trainSet = data.loc[train_idx]
testSet = data.loc[test_idx]
# train_x = data[train_idx][features]
# train_y = data[train_idx]['n_label']
# train_y2 = data[train_idx]['label']
# valid_idx = (data['mt'].between(21, 24))
# valid_x = data[valid_idx][features]
# valid_y = data[valid_idx]['n_label']

In [13]:
trainSet.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,...,regMonth_median5,regMonth_median8,mt_sum,mt_mean,mt_max,mt_mim,mt_median2,mt_median5,mt_median8,log_label
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,...,445.0,1016.2,1090983.0,826.502273,9433.0,13.0,236.8,525.0,1219.0,5.676754
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,...,445.0,1016.2,1090983.0,826.502273,9433.0,13.0,236.8,525.0,1219.0,6.144186
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,...,445.0,1016.2,1090983.0,826.502273,9433.0,13.0,236.8,525.0,1219.0,5.549076
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,...,445.0,1016.2,1090983.0,826.502273,9433.0,13.0,236.8,525.0,1219.0,6.011267
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,...,445.0,1016.2,1090983.0,826.502273,9433.0,13.0,236.8,525.0,1219.0,6.413459


## cv 交叉验证

In [14]:
lgb_model = lgb.LGBMRegressor(
        num_leaves=32, reg_alpha=1, reg_lambda=0.1, objective='mse',
        max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=np.random.randint(1000),
        n_estimators=5000, subsample=0.8, colsample_bytree=0.8,
    )

In [35]:
from sklearn.utils.multiclass import type_of_target
type_of_target(train_y)

'continuous'

In [15]:
kfd = KFold(n_splits=5,random_state=2020)
# res_prob = pd.DataFrame()
res = pd.DataFrame()

index = 1
for train_idx,test_idx in kfd.split(trainSet):
    print('-------------------','Kflod:',index,'------------------')
    
    lgb_model.fit(trainSet.loc[train_idx][features],trainSet.loc[train_idx]['log_label'], 
                  eval_set=[(trainSet.loc[train_idx][features],trainSet.loc[train_idx]['log_label']),
                            (trainSet.loc[test_idx][features],trainSet.loc[test_idx]['log_label'])],
                  categorical_feature=cate_feat, 
                  early_stopping_rounds=500,
                  verbose=100)
#     x_pred = lgb_model.predict(train_x[test_idx], num_iteration=lgb_model.best_iteration_)
    test_pred = lgb_model.predict(testSet[features], num_iteration=lgb_model.best_iteration_)
    res['prob_%s' % str(index)] = test_pred
    index = index+1

------------------- Kflod: 1 ------------------
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.111143	valid_1's l2: 0.273483
[200]	valid_0's l2: 0.0700075	valid_1's l2: 0.233471
[300]	valid_0's l2: 0.052738	valid_1's l2: 0.208684
[400]	valid_0's l2: 0.0446922	valid_1's l2: 0.201287
[500]	valid_0's l2: 0.0383955	valid_1's l2: 0.193044
[600]	valid_0's l2: 0.0334198	valid_1's l2: 0.187273
[700]	valid_0's l2: 0.0300569	valid_1's l2: 0.182358
[800]	valid_0's l2: 0.0271794	valid_1's l2: 0.176842
[900]	valid_0's l2: 0.0246294	valid_1's l2: 0.172476
[1000]	valid_0's l2: 0.0226258	valid_1's l2: 0.169775
[1100]	valid_0's l2: 0.0209663	valid_1's l2: 0.167508
[1200]	valid_0's l2: 0.0195687	valid_1's l2: 0.165397
[1300]	valid_0's l2: 0.0182099	valid_1's l2: 0.163148
[1400]	valid_0's l2: 0.0171791	valid_1's l2: 0.161783
[1500]	valid_0's l2: 0.0162108	valid_1's l2: 0.160643
[1600]	valid_0's l2: 0.0152529	valid_1's l2: 0.158974
[1700]	valid_0's l2: 0.0143842	valid

In [18]:
res.head()

Unnamed: 0,prob_1,prob_2,prob_3,prob_4,prob_5
0,273.230513,224.957759,303.018339,261.854215,253.321045
1,391.160666,313.691556,379.821034,313.162821,375.968877
2,159.261279,157.783039,187.892173,165.665984,178.203365
3,365.938111,292.096219,395.036865,325.225275,355.636621
4,444.689267,368.81492,462.291657,343.868262,439.648305


In [17]:
for i in range(1,6):  
    res['prob_%s' % str(i)] = res['prob_%s' % str(i)].apply(lambda x : math.exp(x))

In [19]:
sum_pred = res.sum(axis=1) / 5
sub = data[data['mt']>24][['id']]
sub.reset_index(drop=True,inplace=True)

sub['forecastVolum'] = sum_pred.astype(int)

In [20]:
sub.to_csv(path+'sub/sub_cv2.csv',index=False,header=True)