In [1]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse
import lightgbm as lgb

from tqdm import tqdm
import gc
import warnings
# Ignore FutureWarning explicitly.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): this second filter ignores every warning category, which makes the
# call above redundant and hides any warning raised by the cells below.
warnings.filterwarnings('ignore')

# 读取数据

In [2]:
# Directory that holds the input CSV files (relative to this notebook).
data_path = '../../input/Round1/'

# Read the four input tables in their original order; each file is decoded as UTF-8.
train_user_reply_data, train_search_data, train_sales_data, evaluation_public = (
    pd.read_csv(data_path + file_name, encoding='utf-8')
    for file_name in (
        'train_user_reply_data.csv',
        'train_search_data.csv',
        'train_sales_data.csv',
        'evaluation_public.csv',
    )
)

# Attach the search table's columns to every sales row. The join is an inner join
# (the pandas default), so sales rows without a matching search row are dropped.
train_sales_data = pd.merge(
    train_sales_data,
    train_search_data,
    how='inner',
    on=['province', 'adcode', 'model', 'regYear', 'regMonth'],
)

# 固定车型和省份顺序-样本顺序

In [3]:
# Fixed iteration order of the car-model ids. The feature builders below loop over
# this list (outer loop), so it determines the row order of every feature frame,
# and the preparation cells encode 'model' as the position of the id in this list.
cars = ['f8a6975573af1b33', '2a2ab41f8f6ff1cb', 'd4efbebb087fd03f', '3e21824be728cbec',
        'ea489c253676aafc', '6155b214590c66e6', 'fc32b1a017b34efe', '9c1c7ee8ebdda299',
        'fde95ea242abd896', '7a7885e2d7c00bcf', '7245e0ee27b195cd', 'b25c4e2e3856af22',
        '7aab7fca2470987e', 'feabbf46658382b9', '04e66e578f653ab9', '5d7fb682edd0f937',
        'b4be3a4917289c82', '54fc07138d70374c', 'ef76a85c4b39f693', 'bb9fbec9a2833839',
        '3c974920a76ac9c1', '212083a9246d2fd3', '4f79773e600518a6', 'af6f4f548684e14d',
        '936168bd4850913d', 'cd5841d44fd7625e', '0797526c057dcf5b', 'a207df29ec9583f0',
        '3d7554f1f56dd664', '7023efdab9cedc03', 'da457d15788fe8ee', '12f8b7e14947c34d',
        '28e29f2c03dcd84c', '63065128401bb3ff', 'a432c483b5beb856', '37aa9169b575ef79',
        '17bc272c93f19d56', '61e73e32ad101892', '4a103c30d593fbbe', '2d0d2c3403909fdb',
        '6858d6dfe680bdf7', '17363f08d683d52b', '346393c2c6305fb1', '5b1c11c3efed5312',
        '97f15de12cfabbd5', 'a9a43d1a7ecbe75d', '7cf283430b3b5e38', 'c6833cb891626c17',
        'a28bb927b6fcb33c', 'dff803b4024d261d', '02aab221aabc03b9', 'f5d69960089c3614',
        '06880909932890ca', '79de4e4b24c35b04', 'd0f245b8781e3631', 'c06a2a387c0ee510',
        'cc21c7e91a3b5a0c', 'f270f6a489c6a9d7', '8c915fe4632fb9fa', 'c6cd4e0e073f5ac2']
# Fixed iteration order of the provinces (inner loop of the feature builders).
provinces = ['浙江', '福建', '四川', '陕西', '安徽', '湖南', '广东', '云南', '上海', '山东',
             '湖北', '黑龙江', '江苏', '广西', '内蒙古', '辽宁', '北京', '重庆', '河北', '山西',
             '江西', '河南']

# 评估函数

In [4]:
def metrics(y_true, y_pred, model):
    """Return one minus the mean per-model normalised RMSE.

    Rows are grouped by ``model``. For each group the RMSE between the true
    and predicted values is divided by the mean of the true values; that
    ratio is printed and collected. Negative predictions are replaced by
    their absolute value before scoring.

    Args:
        y_true: Sequence of true sales volumes.
        y_pred: Sequence of predicted sales volumes (same length).
        model: Sequence of group keys, one per row.

    Returns:
        ``1 - mean(per-group normalised RMSE)``.

    Raises:
        ZeroDivisionError: If there are no rows (and therefore no groups).
    """
    frame = pd.DataFrame({'model': model, 'salesVolume': y_true, 'label': y_pred})
    # Mirror negative forecasts to their absolute value.
    frame['label'] = frame['label'].abs()

    group_scores = []
    for _, group in frame.groupby('model'):
        actual = group['salesVolume'].values
        forecast = group['label'].values
        squared_error = np.sum((actual - forecast) ** 2)
        normalised_rmse = np.sqrt(squared_error / len(actual)) / np.mean(actual)
        group_scores.append(normalised_rmse)
        print(normalised_rmse)
    return 1 - (sum(group_scores) / len(group_scores))

# 获取训练/测试数据索引下标

In [5]:
def get_train_feature(windows_size, before):
    """Build the labelled training rows for every (car model, province) series.

    For each pair taken from the global ``cars`` and ``provinces`` lists, the
    matching rows of the global ``train_sales_data`` are selected, a ``label``
    column is added holding ``salesVolume`` shifted ``windows_size`` rows into
    the future, and the positional slice ``[before : 24 - windows_size]`` is
    kept. The upper bound assumes 24 rows per series; with that length it
    drops exactly the trailing rows whose label is NaN.

    Rows are taken in file order.
    NOTE(review): this presumably relies on ``train_sales_data`` being sorted
    chronologically within each series -- confirm against the input file.

    Args:
        windows_size: Forecast horizon in rows (the label is the value this
            many rows ahead).
        before: Number of leading rows of each series to skip.

    Returns:
        A DataFrame with the columns of ``train_sales_data`` plus ``label``
        and a fresh 0..n-1 index; an empty DataFrame if there are no series.
    """
    parts = []
    for car in cars:
        for province in provinces:
            mask = (train_sales_data['model'] == car) & (train_sales_data['province'] == province)
            # Work on an explicit copy so adding 'label' never targets a slice
            # of the global frame (same approach as get_basic_feature).
            series = train_sales_data[mask].copy()
            series['label'] = series['salesVolume'].shift(-windows_size)
            parts.append(series.iloc[before: 24 - windows_size])

    if not parts:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop, which is
    # quadratic and relies on concatenating with an empty DataFrame.
    return pd.concat(parts, axis=0, ignore_index=True)


def get_test_feature(windows_size, before):
    """Build the prediction rows: the last row of every (car model, province) series.

    For each pair taken from the global ``cars`` and ``provinces`` lists, the
    matching rows of the global ``train_sales_data`` are selected, a ``label``
    column (``salesVolume`` shifted ``windows_size`` rows into the future) is
    added, and only the final row is kept. For a positive ``windows_size``
    that row's label is NaN; the column is kept so the frame has the same
    columns as the training frame.

    Args:
        windows_size: Forecast horizon in rows, used for the label shift.
        before: Unused; accepted for signature parity with get_train_feature.

    Returns:
        A DataFrame with one row per series and a fresh 0..n-1 index; an
        empty DataFrame if there are no series.
    """
    parts = []
    for car in cars:
        for province in provinces:
            mask = (train_sales_data['model'] == car) & (train_sales_data['province'] == province)
            # Explicit copy so adding 'label' never targets a slice of the
            # global frame (same approach as get_basic_feature).
            series = train_sales_data[mask].copy()
            series['label'] = series['salesVolume'].shift(-windows_size)
            parts.append(series.iloc[-1:])

    if not parts:
        return pd.DataFrame()
    # Single concatenation instead of growing a frame inside the loop.
    return pd.concat(parts, axis=0, ignore_index=True)

# 特征提取

In [6]:
def get_basic_feature(windows_size, before, data_set_name):
    """Build lag, difference and rolling-history features for every series.

    For each (car model, province) pair from the global ``cars`` and
    ``provinces`` lists, the matching rows of the global ``train_sales_data``
    are copied and extended with:

    * natural-log transformed ``salesVolume`` and ``popularity``;
    * two Spring Festival calendar columns (fixed 24-entry lists, so every
      series must contain exactly 24 rows or pandas raises ``ValueError``);
    * for each lag ``1 .. before - 1``: the shifted value, the difference and
      the ratio to the shifted value, for both sales and popularity;
    * for each lag ``1 .. before - 2``: the first difference of the sales
      difference (second-order difference);
    * max / min / average / variance / range of the last (up to) 7 log-sales
      values including the current row, and of their first differences.

    The ``popularity`` column itself is dropped from the result.

    Args:
        windows_size: Forecast horizon; only used to trim the training slice.
        before: Number of leading rows skipped for training; also bounds the
            number of lags generated.
        data_set_name: ``'train'`` keeps rows ``[before : 24 - windows_size]``
            of each series; any other value keeps only the last row.

    Returns:
        The concatenated feature frame with a fresh 0..n-1 index (empty
        DataFrame if there are no series). Its column index is printed.
    """
    # Calendar features, one entry per row of a 24-row series.
    spring_festival_flag = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    spring_festival_distance = [1, 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2]
    history_len = 7

    def pth(array):
        # Peak-to-peak range of the window.
        return np.max(array) - np.min(array)

    statistics = [('max', np.max), ('min', np.min), ('aver', np.average), ('var', np.var), ('pth', pth)]

    def stat_or_zero(func, values):
        # An empty window (e.g. the differences of a single value) scores 0.
        return 0 if len(values) == 0 else func(values)

    parts = []
    for car in cars:
        for province in provinces:
            mask = (train_sales_data['model'] == car) & (train_sales_data['province'] == province)
            part = train_sales_data[mask].copy()
            part['popularity'] = np.log(part['popularity'])
            part['salesVolume'] = np.log(part['salesVolume'])

            # Spring Festival marker features
            part['is_pring_festival'] = spring_festival_flag
            part['distance_spring_festival'] = spring_festival_distance

            # First-order differences, lags and ratios
            for lag in range(1, before):
                suffix = str(lag)
                part['salesVolume_' + suffix] = part['salesVolume'].shift(lag)
                part['salesVolume_diff_' + suffix] = part['salesVolume'].diff(lag)
                part['salesVolume_qoq_' + suffix] = part['salesVolume'] / part['salesVolume_' + suffix]

                part['popularity_' + suffix] = part['popularity'].shift(lag)
                part['popularity_diff_' + suffix] = part['popularity'].diff(lag)
                part['popularity_hb_' + suffix] = part['popularity'] / part['popularity_' + suffix]

            # Second-order differences
            for lag in range(1, before - 1):
                part['salesVolume_diff2_{}'.format(lag)] = part['salesVolume_diff_' + str(lag)].diff(1)

            # Historical statistics over the trailing window (current row included).
            # The start is clamped at 0: a negative start would wrap around to the
            # end of the list and give an empty window for the first six rows.
            sales_history = list(part['salesVolume'])
            windows = [sales_history[max(0, end - history_len): end]
                       for end in range(1, len(sales_history) + 1)]
            window_diffs = [np.diff(window) for window in windows]
            for name, func in statistics:
                part['salesVolume_his_' + name] = [stat_or_zero(func, window) for window in windows]
                part['salesVolume_his_diff_' + name] = [stat_or_zero(func, diffs) for diffs in window_diffs]

            # Dataset split
            if data_set_name == 'train':
                part = part.iloc[before: 24 - windows_size]
            else:
                part = part.iloc[-1:]

            # drop() returns a new frame, so nothing is modified through a slice.
            parts.append(part.drop(['popularity'], axis=1))

    # Concatenate once instead of growing a frame inside the loop.
    features = pd.concat(parts, axis=0, ignore_index=True) if parts else pd.DataFrame()
    print(features.columns)
    return features

# Begin

In [7]:
# Accumulator for the forecasts: each monthly training cell below concatenates
# its `submit` frame onto this one, and the result is merged into
# `evaluation_public` at the end of the notebook.
test_prob_collection = pd.DataFrame()

# Model-LightGBM - 一月

In [8]:
# size: forecast horizon in rows (the label is salesVolume `size` rows ahead).
# pre:  number of leading rows skipped per series; also bounds the lags built.
size, pre = 1, 10  # 4
train_feature = get_train_feature(size, pre)
test_feature = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

# Positional indices of the categorical columns in the final feature matrix
# (i.e. after 'province' and 'label' are dropped below). Reused by later cells.
# NOTE(review): presumably adcode, model, bodyType, regYear, regMonth and the two
# Spring Festival columns -- verify against train_feature.columns.
categorial_name = [0, 1, 2, 3, 4, 6, 7]
# Raw columns replaced by the log-transformed / derived ones from get_basic_feature.
drop_cols = ['salesVolume', 'popularity']

temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')

# Keys of the rows to forecast. Taken before 'model' is integer-encoded, and
# copied so the assignments below do not write into a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 1
###############################

# Hold out the rows of 2017-11 for validation; every other row is used for training.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 11)].index)


def drop_duplicate(n, _held_out=frozenset(test_index)):
    """Return True when row position ``n`` is not part of the validation hold-out."""
    # Membership is tested against a frozenset bound at definition time, so
    # filtering all rows is linear instead of scanning a list for every row.
    return n not in _held_out


train_index = list(filter(drop_duplicate, range(len(train_feature))))

# Raw model ids of the training / validation rows (captured before encoding);
# val_model is the grouping key passed to metrics().
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Encode 'model' as its position in `cars` and 'bodyType' with a fixed mapping.
model_set = {car: position for position, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

# Separate the target from the features (copy: a new column is added below).
train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# The model is fitted on log2(label) + 1; the training cell inverts this with 2 ** (p - 1).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

Index(['province', 'adcode', 'model', 'bodyType', 'regYear', 'regMonth',
       'salesVolume', 'is_pring_festival', 'distance_spring_festival',
       'salesVolume_1', 'salesVolume_diff_1', 'salesVolume_qoq_1',
       'popularity_1', 'popularity_diff_1', 'popularity_hb_1', 'salesVolume_2',
       'salesVolume_diff_2', 'salesVolume_qoq_2', 'popularity_2',
       'popularity_diff_2', 'popularity_hb_2', 'salesVolume_3',
       'salesVolume_diff_3', 'salesVolume_qoq_3', 'popularity_3',
       'popularity_diff_3', 'popularity_hb_3', 'salesVolume_4',
       'salesVolume_diff_4', 'salesVolume_qoq_4', 'popularity_4',
       'popularity_diff_4', 'popularity_hb_4', 'salesVolume_5',
       'salesVolume_diff_5', 'salesVolume_qoq_5', 'popularity_5',
       'popularity_diff_5', 'popularity_hb_5', 'salesVolume_6',
       'salesVolume_diff_6', 'salesVolume_qoq_6', 'popularity_6',
       'popularity_diff_6', 'popularity_hb_6', 'salesVolume_7',
       'salesVolume_diff_7', 'salesVolume_qoq_7', 'populari

In [9]:
# LightGBM model
params = {
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'metric': ['rmse'],   # 'l2', 'binary_logloss',
    'learning_rate': 0.03,
    'num_leaves': 2 ** 5 - 1,    # 2 ** 5 - 1
    # 'min_child_samples': 100,
    'max_depth': 6,    # 6
    'subsample': 0.8,   # 0.8
    'subsample_freq': 5,
    'colsample_bytree': 0.8,
    'seed': 2020,
    'nthread': -1,
    'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# Stage 1: fit on the training rows with early stopping on the held-out month.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of
# older LightGBM releases; newer releases expect callbacks instead -- confirm the
# installed version before upgrading.
# num_boost_round: 5000   early_stopping_rounds:100
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=lgb_eval,
                   early_stopping_rounds=100,
                   categorical_feature=categorial_name,
                   verbose_eval=100)

# feature importance
importance = module.feature_importance()
print('importance:\n', importance)

# Validation score on the original scale: the target was log2(y) + 1, so
# 2 ** (p - 1) maps predictions and labels back to sales volumes.
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.ravel() - 1)
# metrics() returns 1 - mean per-model normalised RMSE (higher is better).
nrmse = metrics(y_true, val, val_model.ravel())

# Stage 2: refit on training + validation rows, using the early-stopped round
# count plus 100 extra rounds.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params,
                  lgb_data,
                  num_boost_round=iters,
                  categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print('predict:\n', predict)

print('model train over, rmse:', nrmse)
# assign() returns a new frame, so nothing is written through a slice of test_feature.
submit = submit.assign(forecastVolum=predict)
# Make the cell safe to re-run: remove forecasts already stored for this target
# month before appending, otherwise the month would be duplicated in the
# collection and, after the final merge, in the submission.
if not test_prob_collection.empty:
    already_stored = (test_prob_collection['regYear'].isin(submit['regYear'])
                      & test_prob_collection['regMonth'].isin(submit['regMonth']))
    test_prob_collection = test_prob_collection[~already_stored]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)
print('train_feature.shape: ', train_feature.shape)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.504982
[200]	valid_0's rmse: 0.477841
[300]	valid_0's rmse: 0.467237
[400]	valid_0's rmse: 0.462656
[500]	valid_0's rmse: 0.459201
[600]	valid_0's rmse: 0.457963
[700]	valid_0's rmse: 0.457999
Early stopping, best iteration is:
[636]	valid_0's rmse: 0.45705
importance:
 [1276 2810   82  164 1650  664  206  157  191  129  138   68  199  172
  203  269  233   74  143  153  122  215  151   49   82  110  118  125
  150   58  104   80  140  134  131   63   99   98  142  149  141   48
  118   96  134  157  167   50  101   99  186  146  147   76  102   85
  313  246  170   63  100   97  219  285  231  298  243  220  158  260
   85  184  127   95  237   34  172  127  160  143]
0.235664444383198
0.2639890713606915
0.3146114751514538
0.20079176350543748
0.3653391928575023
0.8147338099487671
0.2289502493130332
0.1838861452194883
0.1919407442466147
0.15769741247671323
0.32897853943959576
0.21132295023313058
0.110

# Model-LightGBM - 二月

In [10]:
# size: forecast horizon in rows (the label is salesVolume `size` rows ahead).
# pre:  number of leading rows skipped per series; also bounds the lags built.
size, pre = 2, 9  # 4
train_feature = get_train_feature(size, pre)
test_feature = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

# `drop_cols` is defined in the first preparation cell.
temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')

# Keys of the rows to forecast. Taken before 'model' is integer-encoded, and
# copied so the assignments below do not write into a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 2
######################################

# Hold out the rows of 2017-10 for validation; every other row is used for training.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 10)].index)


def drop_duplicate(n, _held_out=frozenset(test_index)):
    """Return True when row position ``n`` is not part of the validation hold-out."""
    # Membership is tested against a frozenset bound at definition time, so
    # filtering all rows is linear instead of scanning a list for every row.
    return n not in _held_out


train_index = list(filter(drop_duplicate, range(len(train_feature))))

# Raw model ids of the training / validation rows (captured before encoding);
# val_model is the grouping key passed to metrics().
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Encode 'model' as its position in `cars` and 'bodyType' with a fixed mapping.
model_set = {car: position for position, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

# Separate the target from the features (copy: a new column is added below).
train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# The model is fitted on log2(label) + 1; the training cell inverts this with 2 ** (p - 1).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

Index(['province', 'adcode', 'model', 'bodyType', 'regYear', 'regMonth',
       'salesVolume', 'is_pring_festival', 'distance_spring_festival',
       'salesVolume_1', 'salesVolume_diff_1', 'salesVolume_qoq_1',
       'popularity_1', 'popularity_diff_1', 'popularity_hb_1', 'salesVolume_2',
       'salesVolume_diff_2', 'salesVolume_qoq_2', 'popularity_2',
       'popularity_diff_2', 'popularity_hb_2', 'salesVolume_3',
       'salesVolume_diff_3', 'salesVolume_qoq_3', 'popularity_3',
       'popularity_diff_3', 'popularity_hb_3', 'salesVolume_4',
       'salesVolume_diff_4', 'salesVolume_qoq_4', 'popularity_4',
       'popularity_diff_4', 'popularity_hb_4', 'salesVolume_5',
       'salesVolume_diff_5', 'salesVolume_qoq_5', 'popularity_5',
       'popularity_diff_5', 'popularity_hb_5', 'salesVolume_6',
       'salesVolume_diff_6', 'salesVolume_qoq_6', 'popularity_6',
       'popularity_diff_6', 'popularity_hb_6', 'salesVolume_7',
       'salesVolume_diff_7', 'salesVolume_qoq_7', 'populari

In [11]:
# LightGBM model
params = {
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'metric': ['rmse'],   # 'l2', 'binary_logloss',
    'learning_rate': 0.03,
    'num_leaves': 2 ** 5 - 1,
    # 'min_child_samples': 100,
    'max_depth': 6,
    'subsample': 0.8,
    'subsample_freq': 5,
    'colsample_bytree': 0.8,
    'seed': 2020,
    'nthread': -1,
    'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# categorial_name = ['adcode', 'model', 'bodyType', 'regYear', 'regMonth']

# Stage 1: fit on the training rows with early stopping on the held-out month.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of
# older LightGBM releases; newer releases expect callbacks instead -- confirm the
# installed version before upgrading.
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=lgb_eval,
                   early_stopping_rounds=100,
                   categorical_feature=categorial_name,
                   verbose_eval=100)

# Validation score on the original scale: the target was log2(y) + 1, so
# 2 ** (p - 1) maps predictions and labels back to sales volumes.
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.ravel() - 1)
# metrics() returns 1 - mean per-model normalised RMSE (higher is better).
nrmse = metrics(y_true, val, val_model.ravel())

# Stage 2: refit on training + validation rows, using the early-stopped round
# count plus 100 extra rounds.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params, lgb_data, num_boost_round=iters, categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print('predict: \n', predict)

print('model train over, rmse:', nrmse)
# assign() returns a new frame, so nothing is written through a slice of test_feature.
submit = submit.assign(forecastVolum=predict)
# Make the cell safe to re-run: remove forecasts already stored for this target
# month before appending, otherwise the month would be duplicated in the
# collection and, after the final merge, in the submission.
if not test_prob_collection.empty:
    already_stored = (test_prob_collection['regYear'].isin(submit['regYear'])
                      & test_prob_collection['regMonth'].isin(submit['regMonth']))
    test_prob_collection = test_prob_collection[~already_stored]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.602171
[200]	valid_0's rmse: 0.562741
[300]	valid_0's rmse: 0.555046
[400]	valid_0's rmse: 0.552141
[500]	valid_0's rmse: 0.551111
[600]	valid_0's rmse: 0.55082
Early stopping, best iteration is:
[513]	valid_0's rmse: 0.550694
0.17529446590420064
0.20748214473161702
0.3432403129963316
0.2721301228695746
0.4532390604901013
1.0033628189408994
0.20162253932826815
0.17817894299515946
0.1725769416269705
0.16339106831396036
0.32634754073980893
0.2548127158585516
0.1986308262046992
0.39298416332255925
0.2094806622593866
0.2515491391377793
0.15461615103333526
0.19960941230984003
0.10733900727572827
0.44050889388663855
0.2526560721752099
0.3804634605345667
0.09128347167908937
0.432040859136412
0.15247841640016732
0.5276635615778361
0.28116641811371124
0.26310379723230093
0.08119645022436948
0.3323371205154294
0.374451602728627
0.17258252319409287
0.3143193504351198
0.12166167385144219
0.7429617172141403
0.3480

# Model-LightGBM - 三月

In [12]:
# size: forecast horizon in rows (the label is salesVolume `size` rows ahead).
# pre:  number of leading rows skipped per series; also bounds the lags built.
size, pre = 3, 8   # 5
train_feature = get_train_feature(size, pre)
test_feature = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

# `drop_cols` is defined in the first preparation cell.
temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')

# Keys of the rows to forecast. Taken before 'model' is integer-encoded, and
# copied so the assignments below do not write into a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 3
##############################

# Hold out the rows of 2017-09 for validation; every other row is used for training.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 9)].index)


def drop_duplicate(n, _held_out=frozenset(test_index)):
    """Return True when row position ``n`` is not part of the validation hold-out."""
    # Membership is tested against a frozenset bound at definition time, so
    # filtering all rows is linear instead of scanning a list for every row.
    return n not in _held_out


train_index = list(filter(drop_duplicate, range(len(train_feature))))

# Raw model ids of the training / validation rows (captured before encoding);
# val_model is the grouping key passed to metrics().
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Encode 'model' as its position in `cars` and 'bodyType' with a fixed mapping.
model_set = {car: position for position, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

# Separate the target from the features (copy: a new column is added below).
train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# The model is fitted on log2(label) + 1; the training cell inverts this with 2 ** (p - 1).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

Index(['province', 'adcode', 'model', 'bodyType', 'regYear', 'regMonth',
       'salesVolume', 'is_pring_festival', 'distance_spring_festival',
       'salesVolume_1', 'salesVolume_diff_1', 'salesVolume_qoq_1',
       'popularity_1', 'popularity_diff_1', 'popularity_hb_1', 'salesVolume_2',
       'salesVolume_diff_2', 'salesVolume_qoq_2', 'popularity_2',
       'popularity_diff_2', 'popularity_hb_2', 'salesVolume_3',
       'salesVolume_diff_3', 'salesVolume_qoq_3', 'popularity_3',
       'popularity_diff_3', 'popularity_hb_3', 'salesVolume_4',
       'salesVolume_diff_4', 'salesVolume_qoq_4', 'popularity_4',
       'popularity_diff_4', 'popularity_hb_4', 'salesVolume_5',
       'salesVolume_diff_5', 'salesVolume_qoq_5', 'popularity_5',
       'popularity_diff_5', 'popularity_hb_5', 'salesVolume_6',
       'salesVolume_diff_6', 'salesVolume_qoq_6', 'popularity_6',
       'popularity_diff_6', 'popularity_hb_6', 'salesVolume_7',
       'salesVolume_diff_7', 'salesVolume_qoq_7', 'populari

In [13]:
# LightGBM model
params = {
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'metric': ['rmse'],   # 'l2', 'binary_logloss',
    'learning_rate': 0.03,
    'num_leaves': 2 ** 5 - 1,
    # 'min_child_samples': 100,
    'max_depth': 6,
    'subsample': 0.8,
    'subsample_freq': 5,
    'colsample_bytree': 0.8,
    'seed': 2020,
    'nthread': -1,
    'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# categorial_name = ['adcode', 'model', 'bodyType', 'regYear', 'regMonth']

# Stage 1: fit on the training rows with early stopping on the held-out month.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of
# older LightGBM releases; newer releases expect callbacks instead -- confirm the
# installed version before upgrading.
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=lgb_eval,
                   early_stopping_rounds=100,
                   categorical_feature=categorial_name,
                   verbose_eval=100)

# Validation score on the original scale: the target was log2(y) + 1, so
# 2 ** (p - 1) maps predictions and labels back to sales volumes.
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.ravel() - 1)
# metrics() returns 1 - mean per-model normalised RMSE (higher is better).
nrmse = metrics(y_true, val, val_model.ravel())

# Stage 2: refit on training + validation rows, using the early-stopped round
# count plus 100 extra rounds.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params, lgb_data, num_boost_round=iters, categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print('predict:\n', predict)

print('model train over, rmse:', nrmse)
# assign() returns a new frame, so nothing is written through a slice of test_feature.
submit = submit.assign(forecastVolum=predict)
# Make the cell safe to re-run: remove forecasts already stored for this target
# month before appending, otherwise the month would be duplicated in the
# collection and, after the final merge, in the submission.
if not test_prob_collection.empty:
    already_stored = (test_prob_collection['regYear'].isin(submit['regYear'])
                      & test_prob_collection['regMonth'].isin(submit['regMonth']))
    test_prob_collection = test_prob_collection[~already_stored]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.634187
[200]	valid_0's rmse: 0.616264
Early stopping, best iteration is:
[155]	valid_0's rmse: 0.612924
0.36038565164423464
0.28104214146461737
0.35174189486108703
0.3734426225439029
0.46096870672375884
1.0247092744986555
0.3183834916191204
0.36539171826472694
0.24399587629289776
0.15624739058109216
0.470694579111279
0.29815345311169844
0.14722613217262928
0.32396964233771824
0.34214213271604
0.26999258098189344
0.1607605920801131
0.23172039572037653
0.1745219100211342
0.5970547354287056
0.18640184257962397
0.4306250980198308
0.11918953271404335
0.439271119585673
0.2921898824770153
0.5916487890285171
0.39087591234770513
0.34168284428626894
0.11857371250377957
0.38345217795511033
0.25428468671480464
0.23634920331071232
0.3086089413551588
0.09420842535312948
1.0101239411820622
0.5069624553199465
0.16806750244636934
0.6195690479548907
0.20206585541809063
0.12174839142702812
1.4751921233320966
0.295120277

# Model-LightGBM - 四月

In [14]:
# size: forecast horizon in rows (the label is salesVolume `size` rows ahead).
# pre:  number of leading rows skipped per series; also bounds the lags built.
size, pre = 4, 7   # 5
train_feature = get_train_feature(size, pre)
test_feature = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

# `drop_cols` is defined in the first preparation cell.
temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')

# Keys of the rows to forecast. Taken before 'model' is integer-encoded, and
# copied so the assignments below do not write into a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 4
###############################

# Hold out the rows of 2017-08 for validation; every other row is used for training.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 8)].index)


def drop_duplicate(n, _held_out=frozenset(test_index)):
    """Return True when row position ``n`` is not part of the validation hold-out."""
    # Membership is tested against a frozenset bound at definition time, so
    # filtering all rows is linear instead of scanning a list for every row.
    return n not in _held_out


train_index = list(filter(drop_duplicate, range(len(train_feature))))

# Raw model ids of the training / validation rows (captured before encoding);
# val_model is the grouping key passed to metrics().
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Encode 'model' as its position in `cars` and 'bodyType' with a fixed mapping.
model_set = {car: position for position, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

# Separate the target from the features (copy: a new column is added below).
train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# The model is fitted on log2(label) + 1; the training cell inverts this with 2 ** (p - 1).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

Index(['province', 'adcode', 'model', 'bodyType', 'regYear', 'regMonth',
       'salesVolume', 'is_pring_festival', 'distance_spring_festival',
       'salesVolume_1', 'salesVolume_diff_1', 'salesVolume_qoq_1',
       'popularity_1', 'popularity_diff_1', 'popularity_hb_1', 'salesVolume_2',
       'salesVolume_diff_2', 'salesVolume_qoq_2', 'popularity_2',
       'popularity_diff_2', 'popularity_hb_2', 'salesVolume_3',
       'salesVolume_diff_3', 'salesVolume_qoq_3', 'popularity_3',
       'popularity_diff_3', 'popularity_hb_3', 'salesVolume_4',
       'salesVolume_diff_4', 'salesVolume_qoq_4', 'popularity_4',
       'popularity_diff_4', 'popularity_hb_4', 'salesVolume_5',
       'salesVolume_diff_5', 'salesVolume_qoq_5', 'popularity_5',
       'popularity_diff_5', 'popularity_hb_5', 'salesVolume_6',
       'salesVolume_diff_6', 'salesVolume_qoq_6', 'popularity_6',
       'popularity_diff_6', 'popularity_hb_6', 'salesVolume_diff2_1',
       'salesVolume_diff2_2', 'salesVolume_diff2_3', 

In [15]:
# LightGBM model
params = {
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'metric': ['rmse'],   # 'l2', 'binary_logloss',
    'learning_rate': 0.03,
    'num_leaves': 2 ** 5 - 1,
    # 'min_child_samples': 100,
    'max_depth': 6,
    'subsample': 0.8,
    'subsample_freq': 5,
    'colsample_bytree': 0.8,
    'seed': 2020,
    'nthread': -1,
    'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# categorial_name = ['adcode', 'model', 'bodyType', 'regYear', 'regMonth']

# Stage 1: fit on the training rows with early stopping on the held-out month.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of
# older LightGBM releases; newer releases expect callbacks instead -- confirm the
# installed version before upgrading.
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=lgb_eval,
                   early_stopping_rounds=100,
                   categorical_feature=categorial_name,
                   verbose_eval=100)

# Validation score on the original scale: the target was log2(y) + 1, so
# 2 ** (p - 1) maps predictions and labels back to sales volumes.
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.ravel() - 1)
# metrics() returns 1 - mean per-model normalised RMSE (higher is better).
nrmse = metrics(y_true, val, val_model.ravel())

# Stage 2: refit on training + validation rows, using the early-stopped round
# count plus 100 extra rounds.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params, lgb_data, num_boost_round=iters, categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print(predict)

print('model train over, rmse:', nrmse)
# assign() returns a new frame, so nothing is written through a slice of test_feature.
submit = submit.assign(forecastVolum=predict)
# Make the cell safe to re-run: remove forecasts already stored for this target
# month before appending, otherwise the month would be duplicated in the
# collection and, after the final merge, in the submission.
if not test_prob_collection.empty:
    already_stored = (test_prob_collection['regYear'].isin(submit['regYear'])
                      & test_prob_collection['regMonth'].isin(submit['regMonth']))
    test_prob_collection = test_prob_collection[~already_stored]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.669032
[200]	valid_0's rmse: 0.64649
[300]	valid_0's rmse: 0.637652
[400]	valid_0's rmse: 0.634277
[500]	valid_0's rmse: 0.634494
Early stopping, best iteration is:
[474]	valid_0's rmse: 0.633342
0.3316704080482528
0.2568025226371792
0.41891640981415323
0.09862534434881365
0.3749036515320365
1.0784307265452855
0.1786504669472129
0.1890259656433429
0.3617409424317229
0.15471318904954245
0.5341985116231796
0.19886703890955662
0.15946327900008986
0.5379378013805817
0.36201729196960025
0.27797725273290247
0.1240418501097146
0.21696517969277654
0.20729458531302236
0.5252144304959323
0.27195199400639203
0.27774656674449294
0.1666924246269019
0.32464176169237274
0.3750744022885043
0.4951248481741097
0.2518198829675532
0.17720362160304495
0.22608656592364906
0.39232688337559096
0.1841100050062942
0.16994869067258278
0.4805518659430191
0.2030636440104387
1.0271065985101475
0.46908525943678814
0.126541239423038

In [16]:
# (rows, columns) of the feature matrix left over from the most recent
# preparation cell (size=4, pre=7).
train_feature.shape

(17160, 59)

In [17]:
# Preview the first rows of the collected forecasts.
test_prob_collection.head()

Unnamed: 0,province,adcode,model,regYear,regMonth,forecastVolum
0,浙江,330000,f8a6975573af1b33,2018,1,856.817857
1,福建,350000,f8a6975573af1b33,2018,1,445.930417
2,四川,510000,f8a6975573af1b33,2018,1,657.344499
3,陕西,610000,f8a6975573af1b33,2018,1,161.944001
4,安徽,340000,f8a6975573af1b33,2018,1,640.131221


In [18]:
# Key columns shared by the evaluation template and the collected forecasts.
merge_keys = ['province', 'adcode', 'model', 'regYear', 'regMonth']

test_prob_collection.index = range(len(test_prob_collection))

# Keep one forecast per key (the most recently appended one), so a training cell
# that was run more than once cannot duplicate rows of the submission.
forecasts = test_prob_collection.drop_duplicates(subset=merge_keys, keep='last')

# Remove every forecast column already on the template (its own 'forecastVolum'
# column, or suffixed leftovers from a previous run of this cell) before merging.
# The merge then yields a single 'forecastVolum' column and the cell can be
# re-run safely.
stale_cols = [col for col in evaluation_public.columns if col.startswith('forecastVolum')]
evaluation_public = evaluation_public.drop(stale_cols, axis=1).merge(
    forecasts[merge_keys + ['forecastVolum']], on=merge_keys, how='left')

# Fail with a clear message instead of an opaque NaN-to-int conversion error.
missing = int(evaluation_public['forecastVolum'].isnull().sum())
if missing:
    raise ValueError('{} evaluation rows have no forecast; run all monthly training cells first'.format(missing))

# Round to the nearest integer (np.round semantics, as before) and store as integers.
evaluation_public['forecastVolum'] = evaluation_public['forecastVolum'].round().astype('int64')
evaluation_public['forecastVolum'].mean()

472.5691287878788

In [20]:
# Write the submission file: only the id and the integer forecast, no index column.
evaluation_public.to_csv(
    '../../sub/sub_method_one.csv',
    columns=['id', 'forecastVolum'],
    index=False,
    encoding='utf-8',
)

In [21]:
# Mean forecast per target month, as a quick sanity check of the submission.
# The former `evaluation_public.describe()` line was removed: it was not the last
# expression of the cell, so its result was computed and silently discarded.
evaluation_public.groupby(['regMonth'], as_index=False)['forecastVolum'].mean()

Unnamed: 0,regMonth,forecastVolum
0,1,559.806061
1,2,374.103788
2,3,473.390152
3,4,482.976515
