In [1]:
#模块导入
path = '../input'
import ccf2
import sys
import numpy as np
import pandas as pd
import os 
import gc
# from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
# import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
## load data
data = ccf2.loaddata(path)

In [3]:
import math
# data['model_weight'] = data.groupby('model')['label'].transform('mean') 
# data['n_label'] = data['label'] / data['model_weight'] 
data['label'] = data['label'].apply(lambda x: math.log(x+1))
data['popularity'] = data['popularity'].apply(lambda x: math.log(x+1))
# data['log_carCommentVolum'] = data['carCommentVolum'].apply(lambda x: math.log(x+1))
# data['log_newsReplyVolum'] = data['newsReplyVolum'].apply(lambda x: math.log(x+1))

In [4]:
#评价指标

def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)	

In [5]:
#模型选择

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):   
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              categorical_feature=cate_feat, 
              early_stopping_rounds=100, verbose=100)      
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000, 
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9, 
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse' 
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              early_stopping_rounds=100, verbose=100)   
    return model

In [7]:
#模型训练

def get_train_model(df_, m, m_type='lgb'):
    df = df_.copy()
    # 数据集划分
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )  
    # 最终确认
    train_x = df[train_idx][features]
    train_y = df[train_idx]['label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['label']   
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)  
    # offline
    df['pred_label'] = model.predict(df[features])
    best_score = score(df[valid_idx]) 
    # online
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features]) 
    print('valid mean:',df[valid_idx]['pred_label'].mean())
    print('true  mean:',df[valid_idx]['label'].mean())
    print('test  mean:',df[test_idx]['forecastVolum'].mean())
    # 阶段结果
    sub = df[test_idx][['id']]
#     sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)  
    sub['forecastVolum'] = df[test_idx]['forecastVolum']
    return sub,df[valid_idx]['pred_label']

In [8]:
#逐步预测

for month in [25,26,27,28]: 
    m_type = 'lgb' 
    
    data_df, stat_feat = ccf2.get_shift_feat2(data,['label','popularity'],range(1,13))
    
    num_feat = ['regYear'] + stat_feat
    cate_feat = ['adcode','bodyType','model','regMonth']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
# sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales.csv', index=False)

29 29
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0340572	valid_1's l2: 0.0562501
[200]	training's l2: 0.0220145	valid_1's l2: 0.0564519
Early stopping, best iteration is:
[108]	training's l2: 0.0324629	valid_1's l2: 0.0560316
0.935023903351
valid mean: 5.802413559652968
true  mean: 5.936753606760708
test  mean: 5.688307280318899
29 29
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.0347522	valid_1's l2: 0.0848145
[200]	training's l2: 0.0227288	valid_1's l2: 0.0839579
[300]	training's l2: 0.0176424	valid_1's l2: 0.08323
[400]	training's l2: 0.0143803	valid_1's l2: 0.0830078
[500]	training's l2: 0.0119228	valid_1's l2: 0.0826286
[600]	training's l2: 0.01003	valid_1's l2: 0.0821587
[700]	training's l2: 0.00857684	valid_1's l2: 0.0818848
[800]	training's l2: 0.00736456	valid_1

In [10]:
sub['forecastVolum'] = sub['forecastVolum'].apply(lambda x:math.exp(x)).round().astype(int)

In [12]:
sub[['id','forecastVolum']].to_csv('CCF_sales2.csv', index=False)

In [11]:
sub.head()

Unnamed: 0,id,forecastVolum
31680,1,255
31681,2,335
31682,3,163
31683,4,276
31684,5,407


In [13]:
features

['regYear',
 'shift_model_adcode_mt_label_1',
 'shift_model_adcode_mt_label_2',
 'shift_model_adcode_mt_label_3',
 'shift_model_adcode_mt_label_4',
 'shift_model_adcode_mt_label_5',
 'shift_model_adcode_mt_label_6',
 'shift_model_adcode_mt_label_7',
 'shift_model_adcode_mt_label_8',
 'shift_model_adcode_mt_label_9',
 'shift_model_adcode_mt_label_10',
 'shift_model_adcode_mt_label_11',
 'shift_model_adcode_mt_label_12',
 'shift_model_adcode_mt_popularity_1',
 'shift_model_adcode_mt_popularity_2',
 'shift_model_adcode_mt_popularity_3',
 'shift_model_adcode_mt_popularity_4',
 'shift_model_adcode_mt_popularity_5',
 'shift_model_adcode_mt_popularity_6',
 'shift_model_adcode_mt_popularity_7',
 'shift_model_adcode_mt_popularity_8',
 'shift_model_adcode_mt_popularity_9',
 'shift_model_adcode_mt_popularity_10',
 'shift_model_adcode_mt_popularity_11',
 'shift_model_adcode_mt_popularity_12',
 'adcode',
 'bodyType',
 'model',
 'regMonth']