In [1]:
import ccf2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import warnings
import lightgbm as lgb
import xgboost
from sklearn.model_selection import KFold,StratifiedKFold,train_test_split,cross_validate
from sklearn.metrics import mean_squared_error as mse
from sklearn.grid_search import GridSearchCV
warnings.filterwarnings('ignore')

path = '../input/'



# load data

In [2]:
data = ccf2.loaddata(path)

In [3]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,mt
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,292.0,1
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,466.0,1
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,257.0,1
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,408.0,1
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,610.0,1


# feature

## 构建特征

In [4]:
shift_feat = []

data['model_adcode'] = data['adcode'] + data['model']
data['model_adcode_mt'] = data['model_adcode'] * 100 + data['mt']
for i in [12]:  ## 平移12个月
    shift_feat.append('shift_model_adcode_mt_label_{0}'.format(i))
    data['model_adcode_mt_{0}'.format(i)] = data['model_adcode_mt'] + i
    data_last = data[~data.label.isnull()].set_index('model_adcode_mt_{0}'.format(i))
    data['shift_model_adcode_mt_label_{0}'.format(i)] = data['model_adcode_mt'].map(data_last['label'])

num_feat = ['regYear'] + shift_feat
cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

features = num_feat + cate_feat

In [39]:
features

['regYear',
 'shift_model_adcode_mt_label_12',
 'adcode',
 'bodyType',
 'model',
 'regMonth']

In [12]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,mt,model_adcode,model_adcode_mt,model_adcode_mt_12,shift_model_adcode_mt_label_12
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,292.0,1,310000,31000001,31000013,
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,466.0,1,530000,53000001,53000013,
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,257.0,1,150000,15000001,15000013,
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,408.0,1,110000,11000001,11000013,
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,610.0,1,510000,51000001,51000013,


## label

In [6]:
import math
data['model_weight'] = data.groupby('model')['label'].transform('mean') 
data['n_label'] = data['label'] / data['model_weight'] # 把目标标准化

data['log_label'] = data['label'].apply(lambda x: math.log(x))

# 构建模型

## 拆分数据

In [7]:
train_idx = (data['mt'] <= 24)
test_idx = (data['mt'] > 24)

trainSet = data.loc[train_idx]
testSet = data.loc[test_idx]
# train_x = data[train_idx][features]
# train_y = data[train_idx]['n_label']
# train_y2 = data[train_idx]['label']
# valid_idx = (data['mt'].between(21, 24))
# valid_x = data[valid_idx][features]
# valid_y = data[valid_idx]['n_label']

In [10]:
trainSet.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,mt,model_adcode,model_adcode_mt,model_adcode_mt_12,shift_model_adcode_mt_label_12,model_weight,n_label,log_label
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,292.0,1,310000,31000001,31000013,,444.518939,0.65689,5.676754
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,466.0,1,530000,53000001,53000013,,444.518939,1.048324,6.144186
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,257.0,1,150000,15000001,15000013,,444.518939,0.578153,5.549076
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,408.0,1,110000,11000001,11000013,,444.518939,0.917846,6.011267
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,610.0,1,510000,51000001,51000013,,444.518939,1.37227,6.413459


## cv 交叉验证

In [8]:
lgb_model = lgb.LGBMRegressor(
        num_leaves=32, reg_alpha=1, reg_lambda=0.1, objective='mse',
        max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=np.random.randint(1000),
        n_estimators=5000, subsample=0.8, colsample_bytree=0.8,
    )

In [35]:
from sklearn.utils.multiclass import type_of_target
type_of_target(train_y)

'continuous'

In [17]:
kfd = KFold(n_splits=5,random_state=2020)
# res_prob = pd.DataFrame()
res = pd.DataFrame()

index = 1
for train_idx,test_idx in kfd.split(trainSet):
    print('-------------------','Kflod:',index,'------------------')
    
    lgb_model.fit(trainSet.loc[train_idx][features],trainSet.loc[train_idx]['log_label'], 
                  eval_names=['train', 'valid'],
                  eval_set=[(trainSet.loc[train_idx][features],trainSet.loc[train_idx]['log_label']),
                            (trainSet.loc[test_idx][features],trainSet.loc[test_idx]['log_label'])],
                  categorical_feature=cate_feat, 
                  early_stopping_rounds=100, 
                  verbose=100)
#     x_pred = lgb_model.predict(train_x[test_idx], num_iteration=lgb_model.best_iteration_)
    test_pred = lgb_model.predict(testSet[features], num_iteration=lgb_model.best_iteration_)
    res['prob_%s' % str(index)] = test_pred
    index = index+1

------------------- Kflod: 1 ------------------
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.135951	valid_1's l2: 0.329237
[200]	valid_0's l2: 0.0820924	valid_1's l2: 0.27149
[300]	valid_0's l2: 0.0635164	valid_1's l2: 0.251641
[400]	valid_0's l2: 0.0536618	valid_1's l2: 0.243775
[500]	valid_0's l2: 0.0466546	valid_1's l2: 0.237612
[600]	valid_0's l2: 0.0419263	valid_1's l2: 0.23349
[700]	valid_0's l2: 0.0385063	valid_1's l2: 0.230328
[800]	valid_0's l2: 0.0346838	valid_1's l2: 0.225853
[900]	valid_0's l2: 0.0321341	valid_1's l2: 0.222667
[1000]	valid_0's l2: 0.0296277	valid_1's l2: 0.219298
[1100]	valid_0's l2: 0.0276604	valid_1's l2: 0.216835
[1200]	valid_0's l2: 0.0258542	valid_1's l2: 0.21419
[1300]	valid_0's l2: 0.0246499	valid_1's l2: 0.212409
[1400]	valid_0's l2: 0.0234724	valid_1's l2: 0.209851
[1500]	valid_0's l2: 0.0224182	valid_1's l2: 0.208485
[1600]	valid_0's l2: 0.0214813	valid_1's l2: 0.207616
[1700]	valid_0's l2: 0.0205434	valid_1

In [38]:
res.head()

Unnamed: 0,prob_1,prob_2,prob_3,prob_4,prob_5
0,284,264,335,269,240
1,409,392,449,385,355
2,163,174,173,190,189
3,394,338,368,294,339
4,461,449,425,376,445


In [40]:
def genSub(res,n_split):
    res2 = pd.DataFrame()
    for i in range(1,6):  
        res2['prob_%s' % str(i)] = res['prob_%s' % str(i)].apply(lambda x : math.exp(x))

    sum_pred = res2.sum(axis=1) / 5
    sub = data[data['mt']>24][['id']]
    sub.reset_index(drop=True,inplace=True)

    sub['forecastVolum'] = sum_pred.astype(int)
    return sub

In [35]:
sub.to_csv(path+'sub/sub_cv.csv',index=False,header=True)

In [41]:
lgb_model.feature_importances_

array([ 217, 1405, 2053,  513, 2608,  892])

In [37]:
features

['regYear',
 'shift_model_adcode_mt_label_12',
 'adcode',
 'bodyType',
 'model',
 'regMonth']