In [2]:
# Module imports

import sys
import numpy as np
import pandas as pd
import os 
import gc
# from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
# import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [9]:
# Data processing: load the raw tables and assemble one modelling frame.

path = 'input/'
train_sales = pd.read_csv(path + 'Train/train_sales_data.csv')
train_search = pd.read_csv(path + 'Train/train_search_data.csv')
train_user = pd.read_csv(path + 'Train/train_user_reply_data.csv')
evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
submit_example = pd.read_csv(path + 'submit_example.csv')

# Stack the sales rows on top of the evaluation_public rows, then left-join the
# search table (keyed by province/adcode/model/year/month) and the user-reply
# table (keyed by model/year/month).
search_keys = ['province', 'adcode', 'model', 'regYear', 'regMonth']
reply_keys = ['model', 'regYear', 'regMonth']
data = (
    pd.concat([train_sales, evaluation_public], ignore_index=True)
    .merge(train_search, how='left', on=search_keys)
    .merge(train_user, how='left', on=reply_keys)
)

# 'label' starts out as a copy of salesVolume.
data['label'] = data['salesVolume']
# Rows with a missing id get id 0.
data['id'] = data['id'].fillna(0).astype(int)

# Look up each model's bodyType from its first occurrence in the sales table.
body_type_by_model = train_sales.drop_duplicates('model').set_index('model')['bodyType']
data['bodyType'] = data['model'].map(body_type_by_model)

# Replace bodyType / model values with integer codes in order of first appearance.
for i in ['bodyType', 'model']:
    code_of = dict(zip(data[i].unique(), range(data[i].nunique())))
    data[i] = data[i].map(code_of)

# Running month index: 1 for 2016-01, 13 for 2017-01, 25 for 2018-01.
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

# Feature extraction

def get_stat_feature(df_, cols=('label', 'popularity'), lags=range(1, 13)):
    """Add lagged copies of ``cols`` for every (adcode, model) series.

    For each column ``c`` in ``cols`` and each lag ``i`` in ``lags`` the column
    ``shift_model_adcode_mt_<c>_<i>`` holds the value of ``c`` found on the row
    with the same ``adcode + model`` key and month index ``mt - i`` (NaN when
    no such row has a non-null ``c``).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain ``adcode``, ``model``, ``mt`` and every column in ``cols``.
        It is not modified; a copy is returned.
    cols : iterable of str, optional
        Columns to lag. Defaults to ``('label', 'popularity')``.
    lags : iterable of int, optional
        Lags in months. Defaults to 1..12.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The augmented copy and the names of the lag feature columns, in the
        order they were created. The helper key columns ``model_adcode``,
        ``model_adcode_mt`` and ``model_adcode_mt_<c>_<i>`` are also left in
        the returned frame.

    Notes
    -----
    The series key is ``(adcode + model) * 100 + mt``. This assumes that
    ``adcode + model`` is distinct for every (adcode, model) pair and that
    ``mt + lag`` stays below 100 -- TODO confirm if other data or larger lags
    are used. ``Series.map`` also needs the lookup index to be unique, i.e. at
    most one non-null row per key.
    """
    df = df_.copy()
    stat_feat = []
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']
    for col in cols:
        # Only rows where the value is known may act as a lag source.
        known = df[col].notnull()
        for lag in lags:
            key_name = 'model_adcode_mt_{}_{}'.format(col, lag)
            feat_name = 'shift_' + key_name
            stat_feat.append(feat_name)
            # Key of the row that should receive this row's value `lag` months later.
            df[key_name] = df['model_adcode_mt'] + lag
            lookup = df.loc[known].set_index(key_name)[col]
            df[feat_name] = df['model_adcode_mt'].map(lookup)
    return df, stat_feat
# Evaluation metric

def score(data, pred='pred_label', label='label', group='model'):
    """Return ``1 - mean(NRMSE)`` where NRMSE is computed per ``group``.

    Predictions are clipped at 0 and rounded to integers first. For every
    group the RMSE between prediction and label is divided by the group's mean
    label; the score is one minus the average of those ratios. The score is
    also printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``pred``, ``label`` and ``group`` columns. It is not
        modified (the rounding is applied to an internal copy).
    pred : str
        Name of the prediction column.
    label : str
        Name of the ground-truth column.
    group : str
        Name of the column to group by.

    Returns
    -------
    float
        ``1 - mean`` of the per-group NRMSE values.
    """
    # Work on a copy so the caller's frame (often a slice) is left untouched.
    frame = data[[group, pred, label]].copy()
    frame[pred] = frame[pred].clip(lower=0).round().astype(int)

    nrmse_score = []
    # observed=True: skip categories of a categorical key that have no rows.
    for _, part in frame.groupby(group, observed=True):
        err = part[pred].to_numpy(dtype=float) - part[label].to_numpy(dtype=float)
        rmse = np.sqrt(np.mean(err ** 2))
        nrmse_score.append(rmse / part[label].mean())

    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

In [10]:
# Model selection

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build and fit a gradient-boosting regressor with early stopping.

    Parameters
    ----------
    train_x, train_y : training features / target.
    valid_x, valid_y : validation features / target; used as the second
        evaluation set, which drives early stopping (100 rounds).
    m_type : {'lgb', 'xgb'}
        ``'lgb'`` fits a ``lightgbm.LGBMRegressor`` and reads the module-level
        ``cate_feat`` list for ``categorical_feature``.
        ``'xgb'`` fits an ``xgboost.XGBRegressor`` (xgboost is imported lazily).

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``m_type`` is not one of the supported values.
    """
    if m_type not in ('lgb', 'xgb'):
        # Previously this fell through to `return model` and failed with an
        # UnboundLocalError.
        raise ValueError("unknown m_type {!r}; expected 'lgb' or 'xgb'".format(m_type))

    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments
        # are what this notebook's LightGBM accepts; newer LightGBM releases
        # expect callbacks instead -- confirm against the installed version.
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # The module-level `import xgboost as xgb` is commented out, so import
        # here to keep this branch usable without affecting the 'lgb' path.
        import xgboost as xgb
        # NOTE(review): `min_child_samples` is a LightGBM parameter name;
        # XGBoost's counterpart is `min_child_weight` -- confirm intent.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

In [18]:
# Model training

def get_train_model(df_, m, m_type='lgb'):
    """Fit a model on past months and forecast month index ``m``.

    Split (by the ``mt`` month index):
      * train: ``13 .. m-5``
      * valid: ``m-4`` (drives early stopping)
      * test : ``m``

    Reads the module-level ``features`` list and delegates model construction
    and fitting to ``get_model_type``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Feature frame containing ``mt``, ``id``, ``label`` and every column in
        ``features``. It is not modified.
    m : int
        Month index to forecast.
    m_type : str
        Model type forwarded to ``get_model_type``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``sub`` with columns ``id`` and ``forecastVolum`` (clipped at 0,
        rounded, int) for the test rows, and the raw predictions for the
        validation rows.
    """
    df = df_.copy()
    # Dataset split
    st = 13
    train_idx = df['mt'].between(st, m - 5)
    valid_idx = df['mt'].between(m - 4, m - 4)
    test_idx = df['mt'].between(m, m)
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )

    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']

    model = get_model_type(train_x, train_y, valid_x, valid_y, m_type)

    # The model is not refitted on all months, so the offline ('pred_label')
    # and online ('forecastVolum') predictions are the same array; predict once.
    predictions = model.predict(df[features])
    df['pred_label'] = predictions
    df['forecastVolum'] = predictions

    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())

    # Result for this step: explicit copy so the new column is not written
    # into a slice of `df`.
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

In [19]:
# Step-wise forecasting: predict month indexes 25..28 (2018-01 .. 2018-04) one
# at a time, writing each month's forecast back into `data` so that the next
# iteration's lag features can use it.

m_type = 'lgb'
cate_feat = ['adcode','bodyType','model','regMonth']

for month in [25,26,27,28]:
    # Rebuild the lag features: `data['label']` changed in the previous step.
    data_df, stat_feat = get_stat_feature(data)

    num_feat = ['regYear'] + stat_feat

    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()
        # Plain loop: the tqdm import is commented out at the top of the notebook.
        for i in cate_feat:
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))

    # `features` and `cate_feat` are read as globals by the training helpers.
    features = num_feat + cate_feat
    print(len(features), len(set(features)))

    # Forward m_type so the encoding chosen above matches the fitted model.
    sub,val_pred = get_train_model(data_df, month, m_type)

    # Month index 25 is 2018-01, hence regMonth == month - 24 within 2018.
    # Values are assigned by position: `sub` keeps the row order of `data`.
    target_rows = (data.regMonth==(month-24))&(data.regYear==2018)
    data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
    data.loc[target_rows, 'label'      ] = sub['forecastVolum'].values

# Collect every 2018 row and write the submission file.
sub = (
    data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
    .rename(columns={'salesVolume': 'forecastVolum'})
)
sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales.csv', index=False)

29 29
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 8492.89	valid_1's l2: 35153.6
[200]	training's l2: 4546.93	valid_1's l2: 31997.7
[300]	training's l2: 2975.08	valid_1's l2: 31250.1
[400]	training's l2: 2097.97	valid_1's l2: 30863.9
[500]	training's l2: 1548.26	valid_1's l2: 30670.3
[600]	training's l2: 1186.17	valid_1's l2: 30450.2
[700]	training's l2: 941.965	valid_1's l2: 30312.3
[800]	training's l2: 752.927	valid_1's l2: 30220
[900]	training's l2: 614.003	valid_1's l2: 30146.6
[1000]	training's l2: 507.21	valid_1's l2: 30122.4
[1100]	training's l2: 417.38	valid_1's l2: 30072.3
[1200]	training's l2: 349.745	valid_1's l2: 30037.1
[1300]	training's l2: 293.352	valid_1's l2: 30013.1
[1400]	training's l2: 248.668	valid_1's l2: 30001.2
Early stopping, best iteration is:
[1379]	training's l2: 257.61	valid_1's l2: 29993.3
valid mean: 597.5013335217335
true  mean: 649.3121212121212
te

In [14]:
# Display the feature list assembled by the forecasting loop (numeric lag features + categorical columns).
features

['regYear',
 'shift_model_adcode_mt_label_1',
 'shift_model_adcode_mt_label_2',
 'shift_model_adcode_mt_label_3',
 'shift_model_adcode_mt_label_4',
 'shift_model_adcode_mt_label_5',
 'shift_model_adcode_mt_label_6',
 'shift_model_adcode_mt_label_7',
 'shift_model_adcode_mt_label_8',
 'shift_model_adcode_mt_label_9',
 'shift_model_adcode_mt_label_10',
 'shift_model_adcode_mt_label_11',
 'shift_model_adcode_mt_label_12',
 'shift_model_adcode_mt_popularity_1',
 'shift_model_adcode_mt_popularity_2',
 'shift_model_adcode_mt_popularity_3',
 'shift_model_adcode_mt_popularity_4',
 'shift_model_adcode_mt_popularity_5',
 'shift_model_adcode_mt_popularity_6',
 'shift_model_adcode_mt_popularity_7',
 'shift_model_adcode_mt_popularity_8',
 'shift_model_adcode_mt_popularity_9',
 'shift_model_adcode_mt_popularity_10',
 'shift_model_adcode_mt_popularity_11',
 'shift_model_adcode_mt_popularity_12',
 'adcode',
 'bodyType',
 'model',
 'regMonth']

In [21]:
# Summary statistics of the feature frame left over from the last loop iteration.
data_df.describe()

Unnamed: 0,forecastVolum,id,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,mt,model_adcode,...,model_adcode_mt_popularity_8,shift_model_adcode_mt_popularity_8,model_adcode_mt_popularity_9,shift_model_adcode_mt_popularity_9,model_adcode_mt_popularity_10,shift_model_adcode_mt_popularity_10,model_adcode_mt_popularity_11,shift_model_adcode_mt_popularity_11,model_adcode_mt_popularity_12,shift_model_adcode_mt_popularity_12
count,0.0,36960.0,36960.0,35640.0,31680.0,31680.0,31680.0,35640.0,36960.0,36960.0,...,36960.0,26400.0,36960.0,25080.0,36960.0,23760.0,36960.0,22440.0,36960.0,21120.0
mean,,382.452381,2016.714286,591.362823,2311.965941,250.479167,1989.809722,591.362823,14.5,347756.772727,...,34775700.0,2346.297008,34775700.0,2354.941348,34775700.0,2368.60101,34775700.0,2384.834804,34775700.0,2400.706723
std,,1104.799221,0.699864,762.726701,2980.547086,318.648963,2324.517694,762.726701,8.077856,136281.364257,...,13628140.0,2940.363155,13628140.0,2969.226412,13628140.0,2995.808904,13628140.0,3026.478218,13628140.0,3058.69944
min,,0.0,2016.0,0.0,25.0,0.0,0.0,0.0,1.0,110000.0,...,11000010.0,38.0,11000010.0,38.0,11000010.0,44.0,11000010.0,44.0,11000010.0,44.0
25%,,0.0,2016.0,175.0,700.0,54.0,335.0,175.0,7.75,230029.75,...,23002990.0,703.0,23002990.0,702.0,23002990.0,703.75,23002990.0,705.0,23002990.0,705.0
50%,,0.0,2017.0,353.0,1445.5,149.5,1432.5,353.0,14.5,355029.5,...,35502970.0,1473.0,35502970.0,1474.0,35502970.0,1482.0,35502980.0,1496.0,35502980.0,1507.0
75%,,0.0,2017.0,715.0,2816.0,307.0,2712.0,715.0,21.25,440029.25,...,44002950.0,2884.25,44002960.0,2894.25,44002960.0,2908.0,44002960.0,2924.0,44002960.0,2939.25
max,,5368.0,2018.0,15317.0,116300.0,2834.0,20770.0,15317.0,28.0,610059.0,...,61005940.0,116300.0,61005940.0,116300.0,61005940.0,116300.0,61005940.0,116300.0,61005940.0,116300.0
